Files
quartz/quartz/components/scripts/search.inline.ts
うろちょろ ec26ebcc9e
Some checks failed
Build and Test / build-and-test (ubuntu-latest) (push) Has been skipped
Build and Test / publish-tag (push) Has been skipped
Build and Test / build-and-test (macos-latest) (push) Has been cancelled
Build and Test / build-and-test (windows-latest) (push) Has been cancelled
feat: improve search tokenization for CJK languages (#2231)
* feat: improve search tokenization for CJK languages

Enhance the encoder function to properly tokenize CJK (Chinese, Japanese,
Korean) characters while maintaining English word tokenization. This fixes
search issues where CJK text was not searchable due to whitespace-only
splitting.

Changes:
- Tokenize CJK characters (Hiragana, Katakana, Kanji, Hangul) individually
- Preserve whitespace-based tokenization for non-CJK text
- Support mixed CJK/English content in search queries

This addresses the CJK search issues reported in #2109 where Japanese text
like "て以来" was not searchable because the encoder only split on whitespace.

Tested with Japanese, Chinese, and Korean content to verify character-level
tokenization works correctly while maintaining English search functionality.

* perf: optimize CJK search encoder with manual buffer tracking

Replace regex-based tokenization with index-based buffer management.
This improves performance by ~2.93x according to benchmark results.

- Use explicit buffer start/end indices instead of string concatenation
- Replace split(/\s+/) with direct whitespace code point checks
- Remove redundant filter() operations
- Add CJK Extension A support (U+20000-U+2A6DF)

Performance: ~878ms → ~300ms (100 iterations, mixed CJK/English text)

* test: add comprehensive unit tests for CJK search encoder

Add 21 unit tests covering:
- English word tokenization
- CJK character-level tokenization (Japanese, Korean, Chinese)
- Mixed CJK/English content
- Edge cases

All tests pass, confirming the encoder correctly handles CJK text.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

---------

Co-authored-by: Claude <noreply@anthropic.com>
2025-12-02 10:04:38 -08:00

541 lines
18 KiB
TypeScript

import FlexSearch, { DefaultDocumentSearchResults } from "flexsearch"
import { ContentDetails } from "../../plugins/emitters/contentIndex"
import { registerEscapeHandler, removeAllChildren } from "./util"
import { FullSlug, normalizeRelativeURLs, resolveRelative } from "../../util/path"
interface Item {
id: number
slug: FullSlug
title: string
content: string
tags: string[]
[key: string]: any
}
// Can be expanded with things like "term" in the future
type SearchType = "basic" | "tags"
let searchType: SearchType = "basic"
let currentSearchTerm: string = ""
const encoder = (str: string): string[] => {
const tokens: string[] = []
let bufferStart = -1
let bufferEnd = -1
const lower = str.toLowerCase()
let i = 0
for (const char of lower) {
const code = char.codePointAt(0)!
const isCJK =
(code >= 0x3040 && code <= 0x309f) ||
(code >= 0x30a0 && code <= 0x30ff) ||
(code >= 0x4e00 && code <= 0x9fff) ||
(code >= 0xac00 && code <= 0xd7af) ||
(code >= 0x20000 && code <= 0x2a6df)
const isWhitespace = code === 32 || code === 9 || code === 10 || code === 13
if (isCJK) {
if (bufferStart !== -1) {
tokens.push(lower.slice(bufferStart, bufferEnd))
bufferStart = -1
}
tokens.push(char)
} else if (isWhitespace) {
if (bufferStart !== -1) {
tokens.push(lower.slice(bufferStart, bufferEnd))
bufferStart = -1
}
} else {
if (bufferStart === -1) bufferStart = i
bufferEnd = i + char.length
}
i += char.length
}
if (bufferStart !== -1) {
tokens.push(lower.slice(bufferStart))
}
return tokens
}
let index = new FlexSearch.Document<Item>({
encode: encoder,
document: {
id: "id",
tag: "tags",
index: [
{
field: "title",
tokenize: "forward",
},
{
field: "content",
tokenize: "forward",
},
{
field: "tags",
tokenize: "forward",
},
],
},
})
const p = new DOMParser()
const fetchContentCache: Map<FullSlug, Element[]> = new Map()
const contextWindowWords = 30
const numSearchResults = 8
const numTagResults = 5
const tokenizeTerm = (term: string) => {
const tokens = term.split(/\s+/).filter((t) => t.trim() !== "")
const tokenLen = tokens.length
if (tokenLen > 1) {
for (let i = 1; i < tokenLen; i++) {
tokens.push(tokens.slice(0, i + 1).join(" "))
}
}
return tokens.sort((a, b) => b.length - a.length) // always highlight longest terms first
}
function highlight(searchTerm: string, text: string, trim?: boolean) {
const tokenizedTerms = tokenizeTerm(searchTerm)
let tokenizedText = text.split(/\s+/).filter((t) => t !== "")
let startIndex = 0
let endIndex = tokenizedText.length - 1
if (trim) {
const includesCheck = (tok: string) =>
tokenizedTerms.some((term) => tok.toLowerCase().startsWith(term.toLowerCase()))
const occurrencesIndices = tokenizedText.map(includesCheck)
let bestSum = 0
let bestIndex = 0
for (let i = 0; i < Math.max(tokenizedText.length - contextWindowWords, 0); i++) {
const window = occurrencesIndices.slice(i, i + contextWindowWords)
const windowSum = window.reduce((total, cur) => total + (cur ? 1 : 0), 0)
if (windowSum >= bestSum) {
bestSum = windowSum
bestIndex = i
}
}
startIndex = Math.max(bestIndex - contextWindowWords, 0)
endIndex = Math.min(startIndex + 2 * contextWindowWords, tokenizedText.length - 1)
tokenizedText = tokenizedText.slice(startIndex, endIndex)
}
const slice = tokenizedText
.map((tok) => {
// see if this tok is prefixed by any search terms
for (const searchTok of tokenizedTerms) {
if (tok.toLowerCase().includes(searchTok.toLowerCase())) {
const regex = new RegExp(searchTok.toLowerCase(), "gi")
return tok.replace(regex, `<span class="highlight">$&</span>`)
}
}
return tok
})
.join(" ")
return `${startIndex === 0 ? "" : "..."}${slice}${
endIndex === tokenizedText.length - 1 ? "" : "..."
}`
}
function highlightHTML(searchTerm: string, el: HTMLElement) {
const p = new DOMParser()
const tokenizedTerms = tokenizeTerm(searchTerm)
const html = p.parseFromString(el.innerHTML, "text/html")
const createHighlightSpan = (text: string) => {
const span = document.createElement("span")
span.className = "highlight"
span.textContent = text
return span
}
const highlightTextNodes = (node: Node, term: string) => {
if (node.nodeType === Node.TEXT_NODE) {
const nodeText = node.nodeValue ?? ""
const regex = new RegExp(term.toLowerCase(), "gi")
const matches = nodeText.match(regex)
if (!matches || matches.length === 0) return
const spanContainer = document.createElement("span")
let lastIndex = 0
for (const match of matches) {
const matchIndex = nodeText.indexOf(match, lastIndex)
spanContainer.appendChild(document.createTextNode(nodeText.slice(lastIndex, matchIndex)))
spanContainer.appendChild(createHighlightSpan(match))
lastIndex = matchIndex + match.length
}
spanContainer.appendChild(document.createTextNode(nodeText.slice(lastIndex)))
node.parentNode?.replaceChild(spanContainer, node)
} else if (node.nodeType === Node.ELEMENT_NODE) {
if ((node as HTMLElement).classList.contains("highlight")) return
Array.from(node.childNodes).forEach((child) => highlightTextNodes(child, term))
}
}
for (const term of tokenizedTerms) {
highlightTextNodes(html.body, term)
}
return html.body
}
async function setupSearch(searchElement: Element, currentSlug: FullSlug, data: ContentIndex) {
const container = searchElement.querySelector(".search-container") as HTMLElement
if (!container) return
const sidebar = container.closest(".sidebar") as HTMLElement | null
const searchButton = searchElement.querySelector(".search-button") as HTMLButtonElement
if (!searchButton) return
const searchBar = searchElement.querySelector(".search-bar") as HTMLInputElement
if (!searchBar) return
const searchLayout = searchElement.querySelector(".search-layout") as HTMLElement
if (!searchLayout) return
const idDataMap = Object.keys(data) as FullSlug[]
const appendLayout = (el: HTMLElement) => {
searchLayout.appendChild(el)
}
const enablePreview = searchLayout.dataset.preview === "true"
let preview: HTMLDivElement | undefined = undefined
let previewInner: HTMLDivElement | undefined = undefined
const results = document.createElement("div")
results.className = "results-container"
appendLayout(results)
if (enablePreview) {
preview = document.createElement("div")
preview.className = "preview-container"
appendLayout(preview)
}
function hideSearch() {
container.classList.remove("active")
searchBar.value = "" // clear the input when we dismiss the search
if (sidebar) sidebar.style.zIndex = ""
removeAllChildren(results)
if (preview) {
removeAllChildren(preview)
}
searchLayout.classList.remove("display-results")
searchType = "basic" // reset search type after closing
searchButton.focus()
}
function showSearch(searchTypeNew: SearchType) {
searchType = searchTypeNew
if (sidebar) sidebar.style.zIndex = "1"
container.classList.add("active")
searchBar.focus()
}
let currentHover: HTMLInputElement | null = null
async function shortcutHandler(e: HTMLElementEventMap["keydown"]) {
if (e.key === "k" && (e.ctrlKey || e.metaKey) && !e.shiftKey) {
e.preventDefault()
const searchBarOpen = container.classList.contains("active")
searchBarOpen ? hideSearch() : showSearch("basic")
return
} else if (e.shiftKey && (e.ctrlKey || e.metaKey) && e.key.toLowerCase() === "k") {
// Hotkey to open tag search
e.preventDefault()
const searchBarOpen = container.classList.contains("active")
searchBarOpen ? hideSearch() : showSearch("tags")
// add "#" prefix for tag search
searchBar.value = "#"
return
}
if (currentHover) {
currentHover.classList.remove("focus")
}
// If search is active, then we will render the first result and display accordingly
if (!container.classList.contains("active")) return
if (e.key === "Enter" && !e.isComposing) {
// If result has focus, navigate to that one, otherwise pick first result
if (results.contains(document.activeElement)) {
const active = document.activeElement as HTMLInputElement
if (active.classList.contains("no-match")) return
await displayPreview(active)
active.click()
} else {
const anchor = document.getElementsByClassName("result-card")[0] as HTMLInputElement | null
if (!anchor || anchor.classList.contains("no-match")) return
await displayPreview(anchor)
anchor.click()
}
} else if (e.key === "ArrowUp" || (e.shiftKey && e.key === "Tab")) {
e.preventDefault()
if (results.contains(document.activeElement)) {
// If an element in results-container already has focus, focus previous one
const currentResult = currentHover
? currentHover
: (document.activeElement as HTMLInputElement | null)
const prevResult = currentResult?.previousElementSibling as HTMLInputElement | null
currentResult?.classList.remove("focus")
prevResult?.focus()
if (prevResult) currentHover = prevResult
await displayPreview(prevResult)
}
} else if (e.key === "ArrowDown" || e.key === "Tab") {
e.preventDefault()
// The results should already been focused, so we need to find the next one.
// The activeElement is the search bar, so we need to find the first result and focus it.
if (document.activeElement === searchBar || currentHover !== null) {
const firstResult = currentHover
? currentHover
: (document.getElementsByClassName("result-card")[0] as HTMLInputElement | null)
const secondResult = firstResult?.nextElementSibling as HTMLInputElement | null
firstResult?.classList.remove("focus")
secondResult?.focus()
if (secondResult) currentHover = secondResult
await displayPreview(secondResult)
}
}
}
const formatForDisplay = (term: string, id: number) => {
const slug = idDataMap[id]
return {
id,
slug,
title: searchType === "tags" ? data[slug].title : highlight(term, data[slug].title ?? ""),
content: highlight(term, data[slug].content ?? "", true),
tags: highlightTags(term.substring(1), data[slug].tags),
}
}
function highlightTags(term: string, tags: string[]) {
if (!tags || searchType !== "tags") {
return []
}
return tags
.map((tag) => {
if (tag.toLowerCase().includes(term.toLowerCase())) {
return `<li><p class="match-tag">#${tag}</p></li>`
} else {
return `<li><p>#${tag}</p></li>`
}
})
.slice(0, numTagResults)
}
function resolveUrl(slug: FullSlug): URL {
return new URL(resolveRelative(currentSlug, slug), location.toString())
}
const resultToHTML = ({ slug, title, content, tags }: Item) => {
const htmlTags = tags.length > 0 ? `<ul class="tags">${tags.join("")}</ul>` : ``
const itemTile = document.createElement("a")
itemTile.classList.add("result-card")
itemTile.id = slug
itemTile.href = resolveUrl(slug).toString()
itemTile.innerHTML = `
<h3 class="card-title">${title}</h3>
${htmlTags}
<p class="card-description">${content}</p>
`
itemTile.addEventListener("click", (event) => {
if (event.altKey || event.ctrlKey || event.metaKey || event.shiftKey) return
hideSearch()
})
const handler = (event: MouseEvent) => {
if (event.altKey || event.ctrlKey || event.metaKey || event.shiftKey) return
hideSearch()
}
async function onMouseEnter(ev: MouseEvent) {
if (!ev.target) return
const target = ev.target as HTMLInputElement
await displayPreview(target)
}
itemTile.addEventListener("mouseenter", onMouseEnter)
window.addCleanup(() => itemTile.removeEventListener("mouseenter", onMouseEnter))
itemTile.addEventListener("click", handler)
window.addCleanup(() => itemTile.removeEventListener("click", handler))
return itemTile
}
async function displayResults(finalResults: Item[]) {
removeAllChildren(results)
if (finalResults.length === 0) {
results.innerHTML = `<a class="result-card no-match">
<h3>No results.</h3>
<p>Try another search term?</p>
</a>`
} else {
results.append(...finalResults.map(resultToHTML))
}
if (finalResults.length === 0 && preview) {
// no results, clear previous preview
removeAllChildren(preview)
} else {
// focus on first result, then also dispatch preview immediately
const firstChild = results.firstElementChild as HTMLElement
firstChild.classList.add("focus")
currentHover = firstChild as HTMLInputElement
await displayPreview(firstChild)
}
}
async function fetchContent(slug: FullSlug): Promise<Element[]> {
if (fetchContentCache.has(slug)) {
return fetchContentCache.get(slug) as Element[]
}
const targetUrl = resolveUrl(slug).toString()
const contents = await fetch(targetUrl)
.then((res) => res.text())
.then((contents) => {
if (contents === undefined) {
throw new Error(`Could not fetch ${targetUrl}`)
}
const html = p.parseFromString(contents ?? "", "text/html")
normalizeRelativeURLs(html, targetUrl)
return [...html.getElementsByClassName("popover-hint")]
})
fetchContentCache.set(slug, contents)
return contents
}
async function displayPreview(el: HTMLElement | null) {
if (!searchLayout || !enablePreview || !el || !preview) return
const slug = el.id as FullSlug
const innerDiv = await fetchContent(slug).then((contents) =>
contents.flatMap((el) => [...highlightHTML(currentSearchTerm, el as HTMLElement).children]),
)
previewInner = document.createElement("div")
previewInner.classList.add("preview-inner")
previewInner.append(...innerDiv)
preview.replaceChildren(previewInner)
// scroll to longest
const highlights = [...preview.getElementsByClassName("highlight")].sort(
(a, b) => b.innerHTML.length - a.innerHTML.length,
)
highlights[0]?.scrollIntoView({ block: "start" })
}
async function onType(e: HTMLElementEventMap["input"]) {
if (!searchLayout || !index) return
currentSearchTerm = (e.target as HTMLInputElement).value
searchLayout.classList.toggle("display-results", currentSearchTerm !== "")
searchType = currentSearchTerm.startsWith("#") ? "tags" : "basic"
let searchResults: DefaultDocumentSearchResults<Item>
if (searchType === "tags") {
currentSearchTerm = currentSearchTerm.substring(1).trim()
const separatorIndex = currentSearchTerm.indexOf(" ")
if (separatorIndex != -1) {
// search by title and content index and then filter by tag (implemented in flexsearch)
const tag = currentSearchTerm.substring(0, separatorIndex)
const query = currentSearchTerm.substring(separatorIndex + 1).trim()
searchResults = await index.searchAsync({
query: query,
// return at least 10000 documents, so it is enough to filter them by tag (implemented in flexsearch)
limit: Math.max(numSearchResults, 10000),
index: ["title", "content"],
tag: { tags: tag },
})
for (let searchResult of searchResults) {
searchResult.result = searchResult.result.slice(0, numSearchResults)
}
// set search type to basic and remove tag from term for proper highlightning and scroll
searchType = "basic"
currentSearchTerm = query
} else {
// default search by tags index
searchResults = await index.searchAsync({
query: currentSearchTerm,
limit: numSearchResults,
index: ["tags"],
})
}
} else if (searchType === "basic") {
searchResults = await index.searchAsync({
query: currentSearchTerm,
limit: numSearchResults,
index: ["title", "content"],
})
}
const getByField = (field: string): number[] => {
const results = searchResults.filter((x) => x.field === field)
return results.length === 0 ? [] : ([...results[0].result] as number[])
}
// order titles ahead of content
const allIds: Set<number> = new Set([
...getByField("title"),
...getByField("content"),
...getByField("tags"),
])
const finalResults = [...allIds].map((id) => formatForDisplay(currentSearchTerm, id))
await displayResults(finalResults)
}
document.addEventListener("keydown", shortcutHandler)
window.addCleanup(() => document.removeEventListener("keydown", shortcutHandler))
searchButton.addEventListener("click", () => showSearch("basic"))
window.addCleanup(() => searchButton.removeEventListener("click", () => showSearch("basic")))
searchBar.addEventListener("input", onType)
window.addCleanup(() => searchBar.removeEventListener("input", onType))
registerEscapeHandler(container, hideSearch)
await fillDocument(data)
}
/**
* Fills flexsearch document with data
* @param index index to fill
* @param data data to fill index with
*/
let indexPopulated = false
async function fillDocument(data: ContentIndex) {
if (indexPopulated) return
let id = 0
const promises: Array<Promise<unknown>> = []
for (const [slug, fileData] of Object.entries<ContentDetails>(data)) {
promises.push(
index.addAsync(id++, {
id,
slug: slug as FullSlug,
title: fileData.title,
content: fileData.content,
tags: fileData.tags,
}),
)
}
await Promise.all(promises)
indexPopulated = true
}
document.addEventListener("nav", async (e: CustomEventMap["nav"]) => {
const currentSlug = e.detail.url
const data = await fetchData
const searchElement = document.getElementsByClassName("search")
for (const element of searchElement) {
await setupSearch(element, currentSlug, data)
}
})