From f48db2ef190e77d399c9e0a0beb1d0efddcd45a6 Mon Sep 17 00:00:00 2001 From: Aaron Pham Date: Sat, 16 Nov 2024 19:54:49 -0500 Subject: [PATCH] feat(llm): add llm.txt Signed-off-by: Aaron Pham --- package-lock.json | 125 ++++++++++++ package.json | 1 + quartz/components/renderPage.tsx | 35 ++-- quartz/plugins/emitters/index.ts | 1 + quartz/plugins/emitters/llm.tsx | 317 +++++++++++++++++++++++++++++++ 5 files changed, 464 insertions(+), 15 deletions(-) create mode 100644 quartz/plugins/emitters/llm.tsx diff --git a/package-lock.json b/package-lock.json index 7466ec441..d4034b11c 100644 --- a/package-lock.json +++ b/package-lock.json @@ -26,6 +26,7 @@ "gray-matter": "^4.0.3", "hast-util-to-html": "^9.0.3", "hast-util-to-jsx-runtime": "^2.3.2", + "hast-util-to-mdast": "^10.1.0", "hast-util-to-string": "^3.0.1", "is-absolute-url": "^4.0.1", "js-yaml": "^4.1.0", @@ -3643,6 +3644,20 @@ "node": ">= 0.4" } }, + "node_modules/hast-util-embedded": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/hast-util-embedded/-/hast-util-embedded-3.0.0.tgz", + "integrity": "sha512-naH8sld4Pe2ep03qqULEtvYr7EjrLK2QHY8KJR6RJkTUjPGObe1vnx585uzem2hGra+s1q08DZZpfgDVYRbaXA==", + "license": "MIT", + "dependencies": { + "@types/hast": "^3.0.0", + "hast-util-is-element": "^3.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/hast-util-from-dom": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/hast-util-from-dom/-/hast-util-from-dom-5.0.0.tgz", @@ -3743,6 +3758,19 @@ "resolved": "https://registry.npmjs.org/@types/unist/-/unist-3.0.2.tgz", "integrity": "sha512-dqId9J8K/vGi5Zr7oo212BGii5m3q5Hxlkwy3WpYuKPklmBEvsbMYYyLxAQpSffdLl/gdW0XUpKWFvYmyoWCoQ==" }, + "node_modules/hast-util-has-property": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/hast-util-has-property/-/hast-util-has-property-3.0.0.tgz", + "integrity": 
"sha512-MNilsvEKLFpV604hwfhVStK0usFY/QmM5zX16bo7EjnAEGofr5YyI37kzopBlZJkHD4t887i+q/C8/tr5Q94cA==", + "license": "MIT", + "dependencies": { + "@types/hast": "^3.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/hast-util-heading-rank": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/hast-util-heading-rank/-/hast-util-heading-rank-3.0.0.tgz", @@ -3755,6 +3783,19 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/hast-util-is-body-ok-link": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/hast-util-is-body-ok-link/-/hast-util-is-body-ok-link-3.0.1.tgz", + "integrity": "sha512-0qpnzOBLztXHbHQenVB8uNuxTnm/QBFUOmdOSsEn7GnBtyY07+ENTWVFBAnXd/zEgd9/SUG3lRY7hSIBWRgGpQ==", + "license": "MIT", + "dependencies": { + "@types/hast": "^3.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/hast-util-is-element": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/hast-util-is-element/-/hast-util-is-element-3.0.0.tgz", @@ -3767,6 +3808,23 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/hast-util-minify-whitespace": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/hast-util-minify-whitespace/-/hast-util-minify-whitespace-1.0.1.tgz", + "integrity": "sha512-L96fPOVpnclQE0xzdWb/D12VT5FabA7SnZOUMtL1DbXmYiHJMXZvFkIZfiMmTCNJHUeO2K9UYNXoVyfz+QHuOw==", + "license": "MIT", + "dependencies": { + "@types/hast": "^3.0.0", + "hast-util-embedded": "^3.0.0", + "hast-util-is-element": "^3.0.0", + "hast-util-whitespace": "^3.0.0", + "unist-util-is": "^6.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/hast-util-parse-selector": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/hast-util-parse-selector/-/hast-util-parse-selector-4.0.0.tgz", @@ -3779,6 +3837,23 @@ "url": 
"https://opencollective.com/unified" } }, + "node_modules/hast-util-phrasing": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/hast-util-phrasing/-/hast-util-phrasing-3.0.1.tgz", + "integrity": "sha512-6h60VfI3uBQUxHqTyMymMZnEbNl1XmEGtOxxKYL7stY2o601COo62AWAYBQR9lZbYXYSBoxag8UpPRXK+9fqSQ==", + "license": "MIT", + "dependencies": { + "@types/hast": "^3.0.0", + "hast-util-embedded": "^3.0.0", + "hast-util-has-property": "^3.0.0", + "hast-util-is-body-ok-link": "^3.0.0", + "hast-util-is-element": "^3.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/hast-util-raw": { "version": "9.0.1", "resolved": "https://registry.npmjs.org/hast-util-raw/-/hast-util-raw-9.0.1.tgz", @@ -3891,6 +3966,32 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/hast-util-to-mdast": { + "version": "10.1.0", + "resolved": "https://registry.npmjs.org/hast-util-to-mdast/-/hast-util-to-mdast-10.1.0.tgz", + "integrity": "sha512-DsL/SvCK9V7+vfc6SLQ+vKIyBDXTk2KLSbfBYkH4zeF/uR1yBajHRhkzuaUSGOB1WJSTieJBdHwxlC+HLKvZZw==", + "license": "MIT", + "dependencies": { + "@types/hast": "^3.0.0", + "@types/mdast": "^4.0.0", + "@ungap/structured-clone": "^1.0.0", + "hast-util-phrasing": "^3.0.0", + "hast-util-to-html": "^9.0.0", + "hast-util-to-text": "^4.0.0", + "hast-util-whitespace": "^3.0.0", + "mdast-util-phrasing": "^4.0.0", + "mdast-util-to-hast": "^13.0.0", + "mdast-util-to-string": "^4.0.0", + "rehype-minify-whitespace": "^6.0.0", + "trim-trailing-lines": "^2.0.0", + "unist-util-position": "^5.0.0", + "unist-util-visit": "^5.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/hast-util-to-parse5": { "version": "8.0.0", "resolved": "https://registry.npmjs.org/hast-util-to-parse5/-/hast-util-to-parse5-8.0.0.tgz", @@ -6185,6 +6286,20 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/rehype-minify-whitespace": { + 
"version": "6.0.2", + "resolved": "https://registry.npmjs.org/rehype-minify-whitespace/-/rehype-minify-whitespace-6.0.2.tgz", + "integrity": "sha512-Zk0pyQ06A3Lyxhe9vGtOtzz3Z0+qZ5+7icZ/PL/2x1SHPbKao5oB/g/rlc6BCTajqBb33JcOe71Ye1oFsuYbnw==", + "license": "MIT", + "dependencies": { + "@types/hast": "^3.0.0", + "hast-util-minify-whitespace": "^1.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/rehype-parse": { "version": "9.0.0", "resolved": "https://registry.npmjs.org/rehype-parse/-/rehype-parse-9.0.0.tgz", @@ -7499,6 +7614,16 @@ "url": "https://github.com/sponsors/wooorm" } }, + "node_modules/trim-trailing-lines": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/trim-trailing-lines/-/trim-trailing-lines-2.1.0.tgz", + "integrity": "sha512-5UR5Biq4VlVOtzqkm2AZlgvSlDJtME46uV0br0gENbwN4l5+mMKT4b9gJKqWtuL2zAIqajGJGuvbCbcAJUZqBg==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/trough": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/trough/-/trough-2.1.0.tgz", diff --git a/package.json b/package.json index e7f8abd9f..7eec1bd40 100644 --- a/package.json +++ b/package.json @@ -52,6 +52,7 @@ "gray-matter": "^4.0.3", "hast-util-to-html": "^9.0.3", "hast-util-to-jsx-runtime": "^2.3.2", + "hast-util-to-mdast": "^10.1.0", "hast-util-to-string": "^3.0.1", "is-absolute-url": "^4.0.1", "js-yaml": "^4.1.0", diff --git a/quartz/components/renderPage.tsx b/quartz/components/renderPage.tsx index 9c530967b..0363e1809 100644 --- a/quartz/components/renderPage.tsx +++ b/quartz/components/renderPage.tsx @@ -58,17 +58,11 @@ export function pageResources( } } -export function renderPage( - cfg: GlobalConfiguration, - slug: FullSlug, - componentData: QuartzComponentProps, - components: RenderComponents, - pageResources: StaticResources, -): string { - // make a deep copy of the tree so we don't remove the 
transclusion references - // for the file cached in contentMap in build.ts - const root = clone(componentData.tree) as Root - +export function transcludeFinal( + root: Root, + { cfg, fileData, allFiles }: QuartzComponentProps, +): Root { + const slug = fileData.slug! // process transcludes in componentData visit(root, "element", (node, _index, _parent) => { if (node.tagName === "blockquote") { @@ -76,7 +70,7 @@ export function renderPage( if (classNames.includes("transclude")) { const inner = node.children[0] as Element const transcludeTarget = inner.properties["data-slug"] as FullSlug - const page = componentData.allFiles.find((f) => f.slug === transcludeTarget) + const page = allFiles.find((f) => f.slug === transcludeTarget) if (!page) { return } @@ -119,7 +113,7 @@ export function renderPage( if (!(el.type === "element" && el.tagName.match(headerRegex))) continue const depth = Number(el.tagName.substring(1)) - // lookin for our blockref + // looking for our blockref if (startIdx === undefined || startDepth === undefined) { // skip until we find the blockref that matches if (el.properties?.id === blockRef) { @@ -184,9 +178,20 @@ export function renderPage( } } }) + return root +} - // set componentData.tree to the edited html that has transclusions rendered - componentData.tree = root +export function renderPage( + cfg: GlobalConfiguration, + slug: FullSlug, + componentData: QuartzComponentProps, + components: RenderComponents, + pageResources: StaticResources, +): string { + // make a deep copy of the tree so we don't remove the transclusion references + // for the file cached in contentMap in build.ts + // then set componentData.tree to the edited html that has transclusions rendered + componentData.tree = transcludeFinal(clone(componentData.tree) as Root, componentData) const { head: Head, diff --git a/quartz/plugins/emitters/index.ts b/quartz/plugins/emitters/index.ts index bc378c47b..943d0401a 100644 --- a/quartz/plugins/emitters/index.ts +++ 
b/quartz/plugins/emitters/index.ts
@@ -8,3 +8,4 @@ export { Static } from "./static"
 export { ComponentResources } from "./componentResources"
 export { NotFoundPage } from "./404"
 export { CNAME } from "./cname"
+export { LLMText } from "./llm"
diff --git a/quartz/plugins/emitters/llm.tsx b/quartz/plugins/emitters/llm.tsx
new file mode 100644
index 000000000..8a384e233
--- /dev/null
+++ b/quartz/plugins/emitters/llm.tsx
@@ -0,0 +1,313 @@
+import { visit } from "unist-util-visit"
+import { Root, Element, Node, Text } from "hast"
+import { Blockquote, Code } from "mdast"
+import { QuartzEmitterPlugin } from "../types"
+import { QuartzComponentProps } from "../../components/types"
+import HeaderConstructor from "../../components/Header"
+import BodyConstructor from "../../components/Body"
+import { pageResources, transcludeFinal } from "../../components/renderPage"
+import { FullPageLayout } from "../../cfg"
+import { clone, FilePath, pathToRoot } from "../../util/path"
+import { write } from "./helpers"
+import { toMdast, defaultHandlers as hastToMdastHandlers } from "hast-util-to-mdast"
+import { toMarkdown, defaultHandlers as mdastToTextHandlers } from "mdast-util-to-markdown"
+import { gfmToMarkdown } from "mdast-util-gfm"
+import { InlineMath, Math, mathToMarkdown } from "mdast-util-math"
+import { defaultContentPageLayout, sharedPageComponents } from "../../../quartz.layout"
+import { Content } from "../../components"
+import DepGraph from "../../depgraph"
+
+/**
+ * Returns the first element in `tree` (document order, `tree` itself included) that satisfies
+ * `test`, or `undefined` when no element matches.
+ */
+function findElement(tree: Node, test: (el: Element) => boolean): Element | undefined {
+  let found: Element | undefined
+  visit(tree, "element", (el: Element) => {
+    if (!test(el)) return undefined
+    found = el
+    return false // stop at the first match
+  })
+  return found
+}
+
+/** Concatenates the value of every text node found in `tree`, in document order. */
+function textContent(tree: Node): string {
+  let text = ""
+  visit(tree, "text", (node: Text) => {
+    text += node.value
+  })
+  return text
+}
+
+/**
+ * Emits a Markdown rendition of every page, written under the page slug with the `.html.md`
+ * extension.
+ *
+ * Each page's hast tree is cloned, its transclusions are resolved with `transcludeFinal`, and the
+ * result is converted hast -> mdast -> Markdown. Custom handlers rebuild fenced code blocks,
+ * math, mermaid diagrams and callouts from their rendered HTML shape.
+ */
+export const LLMText: QuartzEmitterPlugin<Partial<FullPageLayout>> = (userOpts) => {
+  const opts: FullPageLayout = {
+    ...sharedPageComponents,
+    ...defaultContentPageLayout,
+    pageBody: Content(),
+    ...userOpts,
+  }
+
+  const { head: Head, header, beforeBody, pageBody, afterBody, left, right, footer: Footer } = opts
+  const Header = HeaderConstructor()
+  const Body = BodyConstructor()
+
+  return {
+    name: "LLMText",
+    getQuartzComponents() {
+      return [
+        Head,
+        Header,
+        Body,
+        ...header,
+        ...beforeBody,
+        pageBody,
+        ...afterBody,
+        ...left,
+        ...right,
+        Footer,
+      ]
+    },
+    async getDependencyGraph() {
+      return new DepGraph<FilePath>()
+    },
+    async emit(ctx, content, resources): Promise<FilePath[]> {
+      const cfg = ctx.cfg.configuration
+      const fps: Promise<FilePath>[] = []
+      const allFiles = content.map((c) => c[1].data)
+
+      for (const [tree, file] of content) {
+        const slug = file.data.slug!
+
+        const externalResources = pageResources(pathToRoot(slug), resources)
+        const componentData: QuartzComponentProps = {
+          ctx,
+          fileData: file.data,
+          externalResources,
+          cfg,
+          children: [],
+          tree,
+          allFiles,
+        }
+
+        // work on a copy so the cached tree keeps its transclusion references
+        const root = transcludeFinal(clone(tree) as Root, componentData)
+        const mdast = toMdast(root, {
+          handlers: {
+            // handle ast parsed by rehype-pretty-code
+            figure(h, node) {
+              if (node.properties?.dataRehypePrettyCodeFigure !== "")
+                return hastToMdastHandlers.figure(h, node)
+
+              const pre = findElement(node, (el) => el.tagName === "pre")
+              // only search inside <pre> once it is known to exist
+              const code = pre && findElement(pre, (el) => el.tagName === "code")
+              if (!pre || !code) return hastToMdastHandlers.figure(h, node)
+
+              // Get title from figcaption
+              const figcaption = findElement(
+                node,
+                (el) =>
+                  el.tagName === "figcaption" && el.properties?.dataRehypePrettyCodeTitle === "",
+              )
+              const title = figcaption ? textContent(figcaption) : ""
+
+              // Get language
+              const lang = pre.properties?.dataLanguage
+
+              const lines: string[] = []
+              const highlightedLines: number[] = []
+              const highlightedWords: string[] = []
+              visit(code, "element", (line: Element) => {
+                if (line.properties?.dataLine === undefined) return
+                lines.push(textContent(line))
+
+                // line numbers in the meta string are 1-based and count only `data-line` spans,
+                // not the whitespace text nodes that sit between them
+                if (line.properties?.dataHighlightedLine === "") {
+                  highlightedLines.push(lines.length)
+                }
+
+                // FIX: word highlights are still not picked up reliably
+                visit(line, "element", (el: Element) => {
+                  if (el.tagName === "mark" && el.properties?.dataHighlightedCharsMark) {
+                    highlightedWords.push(textContent(el))
+                  }
+                })
+              })
+
+              // Build meta string
+              const meta = [
+                title ? `title="${title}"` : "",
+                highlightedLines.length ? `{${highlightedLines.join(",")}}` : "",
+                highlightedWords.length ? `/${highlightedWords.join("/")}/` : "",
+              ]
+                .filter(Boolean)
+                .join(" ")
+
+              const result: Code = {
+                type: "code",
+                lang: (lang as string | null) ?? null,
+                meta: meta || null,
+                value: lines.join("\n").trimEnd(),
+              }
+
+              h.patch(node, result)
+              return result
+            },
+            // handle math node correctly
+            span(h, node) {
+              const classNames = (node.properties?.className ?? []) as string[]
+              // katex: inline-math, katex-display: block-math
+              const isDisplay = classNames.includes("katex-display")
+              if (!isDisplay && !classNames.includes("katex")) {
+                return hastToMdastHandlers.span(h, node)
+              }
+
+              // the TeX source is read from the <annotation encoding="application/x-tex"> element
+              const annotation = findElement(
+                node,
+                (el) =>
+                  el.tagName === "annotation" &&
+                  el.properties?.encoding === "application/x-tex" &&
+                  el.children[0]?.type === "text",
+              )
+              const source = annotation ? (annotation.children[0] as Text).value : ""
+              if (!source) {
+                console.warn(
+                  `[emit:LLMText] Could not extract LaTeX source from KaTeX node (slug: ${slug})`,
+                )
+                return hastToMdastHandlers.span(h, node)
+              }
+
+              const result: Math | InlineMath = {
+                type: isDisplay ? "math" : "inlineMath",
+                value: source,
+              }
+              h.patch(node, result)
+              return result
+            },
+            // handle mermaid
+            pre(h, node) {
+              const codeEl = findElement(
+                node,
+                (el) =>
+                  el.tagName === "code" &&
+                  ((el.properties?.className ?? []) as string[]).includes("mermaid"),
+              )
+              if (!codeEl) return hastToMdastHandlers.pre(h, node)
+
+              // the diagram source is expected as a JSON string in `data-clipboard`; fall back to
+              // the element's text when it is missing or malformed instead of failing the build
+              let value = textContent(codeEl)
+              const clipboard = codeEl.properties?.dataClipboard
+              if (typeof clipboard === "string") {
+                try {
+                  const parsed: unknown = JSON.parse(clipboard)
+                  if (typeof parsed === "string") value = parsed
+                } catch {
+                  // keep the text fallback
+                }
+              }
+
+              const result: Code = {
+                type: "code",
+                lang: "mermaid",
+                value,
+              }
+              h.patch(node, result)
+              return result
+            },
+            // handle callout correctly
+            blockquote(h, node) {
+              const classNames = (node.properties?.className ?? []) as string[]
+              if (!classNames.includes("callout")) {
+                return hastToMdastHandlers.blockquote(h, node)
+              }
+
+              const hasClass = (el: Element, name: string) =>
+                ((el.properties?.className ?? []) as string[]).includes(name)
+
+              // Get callout type
+              const callout = node.properties?.dataCallout
+              const type = typeof callout === "string" ? callout : "note"
+
+              // Get title from callout-title-inner
+              const titleNode = findElement(node, (el) => hasClass(el, "callout-title-inner"))
+              const title = titleNode ? textContent(titleNode).trim() : ""
+
+              // Check collapse state
+              const isCollapsible = classNames.includes("is-collapsible")
+              const isCollapsed = classNames.includes("is-collapsed")
+              const collapseChar = isCollapsible ? (isCollapsed ? "-" : "+") : ""
+
+              // Get remaining content
+              const contentNode = findElement(node, (el) => hasClass(el, "callout-content"))
+              const content = (contentNode ? h.all(contentNode) : []) as Blockquote["children"]
+
+              const result: Blockquote = {
+                type: "blockquote",
+                children: [
+                  {
+                    type: "paragraph",
+                    children: [
+                      {
+                        type: "text",
+                        value: `[!${type}]${collapseChar}${title ? ` ${title}` : ""}`,
+                        data: { unescaped: true },
+                      },
+                    ],
+                  },
+                  ...content,
+                ],
+              }
+
+              h.patch(node, result)
+              return result
+            },
+          },
+        })
+        const fp = write({
+          ctx,
+          content: toMarkdown(mdast, {
+            extensions: [
+              {
+                handlers: {
+                  code(node, _parent, _context, _info) {
+                    const { lang, meta, value } = node
+                    const info = [lang, meta].filter(Boolean).join(" ")
+                    // grow the fence until the code itself cannot terminate the block
+                    let fence = "```"
+                    while (value.includes(fence)) fence += "`"
+                    return `${fence}${info}\n${value}\n${fence}`
+                  },
+                  text(node, parent, context, info) {
+                    if (node.data?.unescaped) {
+                      return node.value
+                    }
+                    return mdastToTextHandlers.text(node, parent, context, info)
+                  },
+                },
+              },
+              mathToMarkdown(),
+              gfmToMarkdown(),
+            ],
+          }),
+          slug,
+          ext: ".html.md",
+        })
+        fps.push(fp)
+      }
+
+      return await Promise.all(fps)
+    },
+  }
+}