feat(llm): add llm.txt

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
This commit is contained in:
Aaron Pham 2024-11-16 19:54:49 -05:00
parent 137d55eb1b
commit f48db2ef19
No known key found for this signature in database
GPG Key ID: 18974753009D2BFA
5 changed files with 464 additions and 15 deletions

125
package-lock.json generated
View File

@ -26,6 +26,7 @@
"gray-matter": "^4.0.3",
"hast-util-to-html": "^9.0.3",
"hast-util-to-jsx-runtime": "^2.3.2",
"hast-util-to-mdast": "^10.1.0",
"hast-util-to-string": "^3.0.1",
"is-absolute-url": "^4.0.1",
"js-yaml": "^4.1.0",
@ -3643,6 +3644,20 @@
"node": ">= 0.4"
}
},
"node_modules/hast-util-embedded": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/hast-util-embedded/-/hast-util-embedded-3.0.0.tgz",
"integrity": "sha512-naH8sld4Pe2ep03qqULEtvYr7EjrLK2QHY8KJR6RJkTUjPGObe1vnx585uzem2hGra+s1q08DZZpfgDVYRbaXA==",
"license": "MIT",
"dependencies": {
"@types/hast": "^3.0.0",
"hast-util-is-element": "^3.0.0"
},
"funding": {
"type": "opencollective",
"url": "https://opencollective.com/unified"
}
},
"node_modules/hast-util-from-dom": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/hast-util-from-dom/-/hast-util-from-dom-5.0.0.tgz",
@ -3743,6 +3758,19 @@
"resolved": "https://registry.npmjs.org/@types/unist/-/unist-3.0.2.tgz",
"integrity": "sha512-dqId9J8K/vGi5Zr7oo212BGii5m3q5Hxlkwy3WpYuKPklmBEvsbMYYyLxAQpSffdLl/gdW0XUpKWFvYmyoWCoQ=="
},
"node_modules/hast-util-has-property": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/hast-util-has-property/-/hast-util-has-property-3.0.0.tgz",
"integrity": "sha512-MNilsvEKLFpV604hwfhVStK0usFY/QmM5zX16bo7EjnAEGofr5YyI37kzopBlZJkHD4t887i+q/C8/tr5Q94cA==",
"license": "MIT",
"dependencies": {
"@types/hast": "^3.0.0"
},
"funding": {
"type": "opencollective",
"url": "https://opencollective.com/unified"
}
},
"node_modules/hast-util-heading-rank": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/hast-util-heading-rank/-/hast-util-heading-rank-3.0.0.tgz",
@ -3755,6 +3783,19 @@
"url": "https://opencollective.com/unified"
}
},
"node_modules/hast-util-is-body-ok-link": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/hast-util-is-body-ok-link/-/hast-util-is-body-ok-link-3.0.1.tgz",
"integrity": "sha512-0qpnzOBLztXHbHQenVB8uNuxTnm/QBFUOmdOSsEn7GnBtyY07+ENTWVFBAnXd/zEgd9/SUG3lRY7hSIBWRgGpQ==",
"license": "MIT",
"dependencies": {
"@types/hast": "^3.0.0"
},
"funding": {
"type": "opencollective",
"url": "https://opencollective.com/unified"
}
},
"node_modules/hast-util-is-element": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/hast-util-is-element/-/hast-util-is-element-3.0.0.tgz",
@ -3767,6 +3808,23 @@
"url": "https://opencollective.com/unified"
}
},
"node_modules/hast-util-minify-whitespace": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/hast-util-minify-whitespace/-/hast-util-minify-whitespace-1.0.1.tgz",
"integrity": "sha512-L96fPOVpnclQE0xzdWb/D12VT5FabA7SnZOUMtL1DbXmYiHJMXZvFkIZfiMmTCNJHUeO2K9UYNXoVyfz+QHuOw==",
"license": "MIT",
"dependencies": {
"@types/hast": "^3.0.0",
"hast-util-embedded": "^3.0.0",
"hast-util-is-element": "^3.0.0",
"hast-util-whitespace": "^3.0.0",
"unist-util-is": "^6.0.0"
},
"funding": {
"type": "opencollective",
"url": "https://opencollective.com/unified"
}
},
"node_modules/hast-util-parse-selector": {
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/hast-util-parse-selector/-/hast-util-parse-selector-4.0.0.tgz",
@ -3779,6 +3837,23 @@
"url": "https://opencollective.com/unified"
}
},
"node_modules/hast-util-phrasing": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/hast-util-phrasing/-/hast-util-phrasing-3.0.1.tgz",
"integrity": "sha512-6h60VfI3uBQUxHqTyMymMZnEbNl1XmEGtOxxKYL7stY2o601COo62AWAYBQR9lZbYXYSBoxag8UpPRXK+9fqSQ==",
"license": "MIT",
"dependencies": {
"@types/hast": "^3.0.0",
"hast-util-embedded": "^3.0.0",
"hast-util-has-property": "^3.0.0",
"hast-util-is-body-ok-link": "^3.0.0",
"hast-util-is-element": "^3.0.0"
},
"funding": {
"type": "opencollective",
"url": "https://opencollective.com/unified"
}
},
"node_modules/hast-util-raw": {
"version": "9.0.1",
"resolved": "https://registry.npmjs.org/hast-util-raw/-/hast-util-raw-9.0.1.tgz",
@ -3891,6 +3966,32 @@
"url": "https://opencollective.com/unified"
}
},
"node_modules/hast-util-to-mdast": {
"version": "10.1.0",
"resolved": "https://registry.npmjs.org/hast-util-to-mdast/-/hast-util-to-mdast-10.1.0.tgz",
"integrity": "sha512-DsL/SvCK9V7+vfc6SLQ+vKIyBDXTk2KLSbfBYkH4zeF/uR1yBajHRhkzuaUSGOB1WJSTieJBdHwxlC+HLKvZZw==",
"license": "MIT",
"dependencies": {
"@types/hast": "^3.0.0",
"@types/mdast": "^4.0.0",
"@ungap/structured-clone": "^1.0.0",
"hast-util-phrasing": "^3.0.0",
"hast-util-to-html": "^9.0.0",
"hast-util-to-text": "^4.0.0",
"hast-util-whitespace": "^3.0.0",
"mdast-util-phrasing": "^4.0.0",
"mdast-util-to-hast": "^13.0.0",
"mdast-util-to-string": "^4.0.0",
"rehype-minify-whitespace": "^6.0.0",
"trim-trailing-lines": "^2.0.0",
"unist-util-position": "^5.0.0",
"unist-util-visit": "^5.0.0"
},
"funding": {
"type": "opencollective",
"url": "https://opencollective.com/unified"
}
},
"node_modules/hast-util-to-parse5": {
"version": "8.0.0",
"resolved": "https://registry.npmjs.org/hast-util-to-parse5/-/hast-util-to-parse5-8.0.0.tgz",
@ -6185,6 +6286,20 @@
"url": "https://opencollective.com/unified"
}
},
"node_modules/rehype-minify-whitespace": {
"version": "6.0.2",
"resolved": "https://registry.npmjs.org/rehype-minify-whitespace/-/rehype-minify-whitespace-6.0.2.tgz",
"integrity": "sha512-Zk0pyQ06A3Lyxhe9vGtOtzz3Z0+qZ5+7icZ/PL/2x1SHPbKao5oB/g/rlc6BCTajqBb33JcOe71Ye1oFsuYbnw==",
"license": "MIT",
"dependencies": {
"@types/hast": "^3.0.0",
"hast-util-minify-whitespace": "^1.0.0"
},
"funding": {
"type": "opencollective",
"url": "https://opencollective.com/unified"
}
},
"node_modules/rehype-parse": {
"version": "9.0.0",
"resolved": "https://registry.npmjs.org/rehype-parse/-/rehype-parse-9.0.0.tgz",
@ -7499,6 +7614,16 @@
"url": "https://github.com/sponsors/wooorm"
}
},
"node_modules/trim-trailing-lines": {
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/trim-trailing-lines/-/trim-trailing-lines-2.1.0.tgz",
"integrity": "sha512-5UR5Biq4VlVOtzqkm2AZlgvSlDJtME46uV0br0gENbwN4l5+mMKT4b9gJKqWtuL2zAIqajGJGuvbCbcAJUZqBg==",
"license": "MIT",
"funding": {
"type": "github",
"url": "https://github.com/sponsors/wooorm"
}
},
"node_modules/trough": {
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/trough/-/trough-2.1.0.tgz",

View File

@ -52,6 +52,7 @@
"gray-matter": "^4.0.3",
"hast-util-to-html": "^9.0.3",
"hast-util-to-jsx-runtime": "^2.3.2",
"hast-util-to-mdast": "^10.1.0",
"hast-util-to-string": "^3.0.1",
"is-absolute-url": "^4.0.1",
"js-yaml": "^4.1.0",

View File

@ -58,17 +58,11 @@ export function pageResources(
}
}
export function renderPage(
cfg: GlobalConfiguration,
slug: FullSlug,
componentData: QuartzComponentProps,
components: RenderComponents,
pageResources: StaticResources,
): string {
// make a deep copy of the tree so we don't remove the transclusion references
// for the file cached in contentMap in build.ts
const root = clone(componentData.tree) as Root
export function transcludeFinal(
root: Root,
{ cfg, fileData, allFiles }: QuartzComponentProps,
): Root {
const slug = fileData.slug!
// process transcludes in componentData
visit(root, "element", (node, _index, _parent) => {
if (node.tagName === "blockquote") {
@ -76,7 +70,7 @@ export function renderPage(
if (classNames.includes("transclude")) {
const inner = node.children[0] as Element
const transcludeTarget = inner.properties["data-slug"] as FullSlug
const page = componentData.allFiles.find((f) => f.slug === transcludeTarget)
const page = allFiles.find((f) => f.slug === transcludeTarget)
if (!page) {
return
}
@ -119,7 +113,7 @@ export function renderPage(
if (!(el.type === "element" && el.tagName.match(headerRegex))) continue
const depth = Number(el.tagName.substring(1))
// lookin for our blockref
// looking for our blockref
if (startIdx === undefined || startDepth === undefined) {
// skip until we find the blockref that matches
if (el.properties?.id === blockRef) {
@ -184,9 +178,20 @@ export function renderPage(
}
}
})
return root
}
// set componentData.tree to the edited html that has transclusions rendered
componentData.tree = root
export function renderPage(
cfg: GlobalConfiguration,
slug: FullSlug,
componentData: QuartzComponentProps,
components: RenderComponents,
pageResources: StaticResources,
): string {
// make a deep copy of the tree so we don't remove the transclusion references
// for the file cached in contentMap in build.ts
// then set componentData.tree to the edited html that has transclusions rendered
componentData.tree = transcludeFinal(clone(componentData.tree) as Root, componentData)
const {
head: Head,

View File

@ -8,3 +8,4 @@ export { Static } from "./static"
export { ComponentResources } from "./componentResources"
export { NotFoundPage } from "./404"
export { CNAME } from "./cname"
export { LLMText } from "./llm"

View File

@ -0,0 +1,317 @@
import { visit } from "unist-util-visit"
import { Root, Element, Node, Text } from "hast"
import { Blockquote, Code } from "mdast"
import { QuartzEmitterPlugin } from "../types"
import { QuartzComponentProps } from "../../components/types"
import HeaderConstructor from "../../components/Header"
import BodyConstructor from "../../components/Body"
import { pageResources, transcludeFinal } from "../../components/renderPage"
import { FullPageLayout } from "../../cfg"
import { clone, FilePath, pathToRoot } from "../../util/path"
import { write } from "./helpers"
import { toMdast, defaultHandlers as hastToMdastHandlers } from "hast-util-to-mdast"
import { toMarkdown, defaultHandlers as mdastToTextHandlers } from "mdast-util-to-markdown"
import { gfmToMarkdown } from "mdast-util-gfm"
import { InlineMath, Math, mathToMarkdown } from "mdast-util-math"
import { defaultContentPageLayout, sharedPageComponents } from "../../../quartz.layout"
import { Content } from "../../components"
import DepGraph from "../../depgraph"
/**
 * Emitter plugin that writes a Markdown rendition of every content page.
 *
 * For each `[tree, file]` entry the HTML tree is cloned, transclusions are
 * resolved with `transcludeFinal`, the result is converted to mdast with
 * custom handlers (rehype-pretty-code figures, KaTeX spans, mermaid blocks and
 * callouts), serialized with `toMarkdown`, and handed to `write` with the
 * `.html.md` extension.
 *
 * @param userOpts - Partial layout overrides merged on top of the shared and
 *   default content page layouts.
 * @returns The emitter plugin object (`name`, `getQuartzComponents`,
 *   `getDependencyGraph`, `emit`).
 */
export const LLMText: QuartzEmitterPlugin<Partial<FullPageLayout>> = (userOpts) => {
  const opts: FullPageLayout = {
    ...sharedPageComponents,
    ...defaultContentPageLayout,
    pageBody: Content(),
    ...userOpts,
  }

  const { head: Head, header, beforeBody, pageBody, afterBody, left, right, footer: Footer } = opts
  const Header = HeaderConstructor()
  const Body = BodyConstructor()

  return {
    name: "LLMText",
    getQuartzComponents() {
      return [
        Head,
        Header,
        Body,
        ...header,
        ...beforeBody,
        pageBody,
        ...afterBody,
        ...left,
        ...right,
        Footer,
      ]
    },
    // This emitter reports an empty dependency graph.
    async getDependencyGraph() {
      return new DepGraph<FilePath>()
    },
    async emit(ctx, content, resources): Promise<FilePath[]> {
      const cfg = ctx.cfg.configuration
      const fps: Promise<FilePath>[] = []
      const allFiles = content.map((c) => c[1].data)

      for (const [tree, file] of content) {
        const slug = file.data.slug!
        const externalResources = pageResources(pathToRoot(slug), resources)
        const componentData: QuartzComponentProps = {
          ctx,
          fileData: file.data,
          externalResources,
          cfg,
          children: [],
          tree,
          allFiles,
        }

        // Work on a copy so the tree held by the caller keeps its
        // transclusion references.
        const root = transcludeFinal(clone(tree) as Root, componentData)

        const mdast = toMdast(root, {
          handlers: {
            // handle ast parsed by rehype-pretty-code
            figure(h, node) {
              if (node.properties?.dataRehypePrettyCodeFigure !== "")
                return hastToMdastHandlers.figure(h, node)

              let pre: Element | undefined
              let code: Element | undefined
              let figcaption: Element | undefined

              visit(node, "element", (el: Element) => {
                if (
                  el.tagName === "figcaption" &&
                  el.properties?.dataRehypePrettyCodeTitle === ""
                ) {
                  figcaption = el
                  return false
                }
              })

              visit(node, "element", (el: Element) => {
                if (el.tagName === "pre") {
                  pre = el
                  return false
                }
              })

              // Bail out before descending: traversing an undefined node throws.
              if (!pre) return hastToMdastHandlers.figure(h, node)

              visit(pre, "element", (el: Element) => {
                if (el.tagName === "code") {
                  code = el
                  return false
                }
              })

              if (!code) return hastToMdastHandlers.figure(h, node)

              // Get language
              const lang = pre.properties?.dataLanguage

              // Get title from figcaption (first text child only)
              let title = ""
              if (figcaption) {
                title = (figcaption.children[0] as Text | undefined)?.value ?? ""
              }

              const highlightedLines: number[] = []
              const highlightedWords: string[] = []

              // Line numbers are 1-based and counted over `data-line` spans only,
              // so other children of <code> do not shift the numbering.
              let lineNumber = 0
              for (const child of code.children) {
                if (child.type !== "element") continue

                if (child.properties?.dataLine !== undefined) {
                  lineNumber += 1
                  if (child.properties.dataHighlightedLine === "") {
                    highlightedLines.push(lineNumber)
                  }
                }

                // NOTE(review): the original author flagged word highlighting as
                // not working yet; the match condition is left unchanged. Confirm
                // the attribute name/value that rehype-pretty-code actually emits.
                visit(child, "element", (el: Element) => {
                  if (el.tagName === "mark" && el.properties?.dataHighlightedCharsMark) {
                    const word = el.children
                      .map((part) => {
                        if (part.type === "text") return part.value
                        if (part.type === "element") {
                          return (part.children[0] as Text | undefined)?.value ?? ""
                        }
                        return ""
                      })
                      .join("")
                    highlightedWords.push(word)
                  }
                })
              }

              // Build code content from the text inside each `data-line` span
              let codeContent = ""
              visit(code, "element", (span: Element) => {
                if (span.properties?.dataLine !== undefined) {
                  visit(span, "text", (text: Text) => {
                    codeContent += text.value
                  })
                  codeContent += "\n"
                }
              })

              // Build meta string
              const meta = [
                title ? `title="${title}"` : "",
                highlightedLines.length ? `{${highlightedLines.join(",")}}` : "",
                highlightedWords.length ? `/${highlightedWords.join("/")}/` : "",
              ]
                .filter(Boolean)
                .join(" ")

              const result: Code = {
                type: "code",
                lang: (lang as string | null) ?? null,
                meta: meta || null,
                value: codeContent.trimEnd(),
              }

              h.patch(node, result)
              return result
            },
            // handle math node correctly
            span(h, node) {
              const classNames = (node.properties?.className ?? []) as string[]
              // katex: inline-math, katex-display: block-math
              if (classNames.includes("katex") || classNames.includes("katex-display")) {
                const inline = !classNames.includes("katex-display")

                let source: string | null = null
                visit(node, "element", (el) => {
                  if (
                    el.tagName === "annotation" &&
                    el.properties?.encoding === "application/x-tex"
                  ) {
                    if (el.children?.[0]?.type === "text") {
                      source = el.children[0].value
                      return false // stop traversal
                    }
                  }
                })

                if (!source) {
                  console.warn(
                    `[emit:LLMText] Could not extract LaTeX source from KaTeX node (slug: ${slug})`,
                  )
                  return hastToMdastHandlers.span(h, node)
                }

                const results: Math | InlineMath = {
                  type: inline ? "inlineMath" : "math",
                  value: source,
                }
                h.patch(node, results)
                return results
              } else {
                return hastToMdastHandlers.span(h, node)
              }
            },
            // handle mermaid
            pre(h, node) {
              let codeEl: Element | undefined
              visit(node, "element", (el) => {
                if (
                  el.tagName === "code" &&
                  ((el.properties?.className ?? []) as string[]).includes("mermaid")
                ) {
                  codeEl = el
                  return false
                }
              })
              if (!codeEl) return hastToMdastHandlers.pre(h, node)

              // The diagram source is read from the JSON-encoded `dataClipboard`
              // property. Fall back to the default handler instead of aborting
              // the whole emit when it is missing or malformed.
              const raw = codeEl.properties?.dataClipboard
              let diagram: unknown
              try {
                diagram = typeof raw === "string" ? JSON.parse(raw) : undefined
              } catch {
                diagram = undefined
              }
              if (typeof diagram !== "string") {
                console.warn(
                  `[emit:LLMText] Could not read mermaid source from clipboard data (slug: ${slug})`,
                )
                return hastToMdastHandlers.pre(h, node)
              }

              const results: Code = {
                type: "code",
                lang: "mermaid",
                value: diagram,
              }
              h.patch(node, results)
              return results
            },
            // handle callout correctly
            blockquote(h, node) {
              const classNames = (node.properties?.className ?? []) as string[]
              if (!classNames.includes("callout")) {
                return hastToMdastHandlers.blockquote(h, node)
              }

              // Get callout type
              const type = node.properties?.dataCallout as string

              // Get title from callout-title-inner
              let title = ""
              let titleNode: Element | undefined

              visit(node, "element", (el: Element) => {
                if ((el.properties?.className as string[])?.includes("callout-title-inner")) {
                  titleNode = el
                  return false
                }
              })

              if (titleNode) {
                // Reads the first text of the first child; optional chaining keeps
                // a title without that shape from throwing.
                const firstChild = titleNode.children[0] as Element | undefined
                title = (firstChild?.children?.[0] as Text | undefined)?.value ?? ""
              }

              // Check collapse state
              const isCollapsible = classNames.includes("is-collapsible")
              const isCollapsed = classNames.includes("is-collapsed")
              const collapseChar = isCollapsible ? (isCollapsed ? "-" : "+") : ""

              // Get remaining content; `h.all` converts the children of the
              // callout body to mdast nodes.
              let content: any[] = []
              visit(node, "element", (el: Element) => {
                if ((el.properties?.className as string[])?.includes("callout-content")) {
                  content = h.all(el)
                  return false
                }
              })

              const result: Blockquote = {
                type: "blockquote",
                children: [
                  {
                    type: "paragraph",
                    children: [
                      {
                        type: "text",
                        value: `[!${type}]${collapseChar}${title ? ` ${title.trim()}` : ""}`,
                        // Read by the custom `text` serializer below so the
                        // `[!type]` marker is emitted without escaping.
                        data: { unescaped: true },
                      },
                    ],
                  },
                  ...content,
                ],
              }

              h.patch(node, result)
              return result
            },
          },
        })

        const fp = write({
          ctx,
          content: toMarkdown(mdast, {
            extensions: [
              {
                handlers: {
                  code(node, _parent, _context, _info) {
                    const { lang, meta, value } = node
                    const info = [lang, meta].filter(Boolean).join(" ")
                    // Use a fence longer than any backtick run inside the code
                    // so embedded ``` sequences cannot close the block early.
                    const runs: string[] = value.match(/`+/g) ?? []
                    const longestRun = runs.reduce(
                      (longest: number, run: string) => Math.max(longest, run.length),
                      0,
                    )
                    const fence = "`".repeat(Math.max(3, longestRun + 1))
                    return fence + info + "\n" + value + "\n" + fence
                  },
                  text(node, parent, context, info) {
                    if (node.data?.unescaped) {
                      return node.value
                    }
                    return mdastToTextHandlers.text(node, parent, context, info)
                  },
                },
              },
              mathToMarkdown(),
              gfmToMarkdown(),
            ],
          }),
          slug,
          ext: ".html.md",
        })
        fps.push(fp)
      }

      return await Promise.all(fps)
    },
  }
}