diff --git a/packages/website/docs/v4/askai-markdown-indexing.mdx b/packages/website/docs/v4/askai-markdown-indexing.mdx index 5b645ebde..c95b07337 100644 --- a/packages/website/docs/v4/askai-markdown-indexing.mdx +++ b/packages/website/docs/v4/askai-markdown-indexing.mdx @@ -82,19 +82,28 @@ For users who need advanced customization or want to understand the underlying c indexName: "my-markdown-index", pathsToMatch: ["https://example.com/docs/**"], recordExtractor: ({ $, url, helpers }) => { - const text = helpers.markdown("main"); // Change "main" to match your content tag (e.g., "main", "article", etc.) + // Target only the main content, excluding navigation + const text = helpers.markdown( + "main > *:not(nav):not(header):not(.breadcrumb)", + ); + if (text === "") return []; - // Extract language or other attributes as needed. Optional const language = $("html").attr("lang") || "en"; + const title = $("head > title").text(); + + // Get the main heading for better searchability + const h1 = $("main h1").first().text(); + return helpers.splitTextIntoRecords({ text, baseRecord: { url, objectID: url, - title: $("head > title").text(), - lang: language, // Add more attributes as needed + title: title || h1, + heading: h1, // Add main heading as separate field + lang: language, }, maxRecordBytes: 100000, // Higher = fewer, larger records. Lower = more, smaller records. // Note: Increasing this value may increase the token count for LLMs, which can affect context size and cost. @@ -110,12 +119,15 @@ For users who need advanced customization or want to understand the underlying c ```js // initialIndexSettings: { ..., "my-markdown-index": { - attributesForFaceting: ["lang"], // Add more if you extract more attributes + attributesForFaceting: ["lang"], ignorePlurals: true, - minProximity: 4, + minProximity: 1, removeStopWords: false, - searchableAttributes: ["unordered(title)", "unordered(text)"], - removeWordsIfNoResults: "allOptional" // This will help if the LLM finds no results. A graceful fallback. + searchableAttributes: ["title", "heading", "unordered(text)"], + removeWordsIfNoResults: "lastWords", + attributesToHighlight: ["title", "text"], + typoTolerance: false, + advancedSyntax: false, }, // ...}, ``` @@ -397,20 +409,28 @@ import TabItem from '@theme/TabItem'; indexName: "my-markdown-index", pathsToMatch: ["https://example.com/**"], recordExtractor: ({ $, url, helpers }) => { - const text = helpers.markdown("main"); // Change "main" to match your content tag (e.g., "main", "article", etc.) + // Target only the main content, excluding navigation + const text = helpers.markdown( + "main > *:not(nav):not(header):not(.breadcrumb)", + ); + if (text === "") return []; - // Customize selectors or meta extraction as needed. Optional const language = $("html").attr("lang") || "en"; + const title = $("head > title").text(); + + // Get the main heading for better searchability + const h1 = $("main h1").first().text(); + return helpers.splitTextIntoRecords({ text, baseRecord: { url, objectID: url, - title: $("head > title").text(), - // Add more optional attributes to the record - lang: language + title: title || h1, + heading: h1, // Add main heading as separate field + lang: language, }, maxRecordBytes: 100000, // Higher = fewer, larger records. Lower = more, smaller records. // Note: Increasing this value may increase the token count for LLMs, which can affect context size and cost. @@ -424,10 +444,13 @@ import TabItem from '@theme/TabItem'; "my-markdown-index": { attributesForFaceting: ["lang"], // Recommended if you add more attributes outside of objectID ignorePlurals: true, - minProximity: 4, + minProximity: 1, removeStopWords: false, - searchableAttributes: ["unordered(title)", "unordered(text)"], - removeWordsIfNoResults: "allOptional" // This will help if the LLM finds no results. A graceful fallback. + searchableAttributes: ["title", "heading", "unordered(text)"], + removeWordsIfNoResults: "lastWords", + attributesToHighlight: ["title", "text"], + typoTolerance: false, + advancedSyntax: false, }, // ...}, ``` @@ -446,7 +469,11 @@ import TabItem from '@theme/TabItem'; indexName: "my-markdown-index", pathsToMatch: ["https://example.com/docs/**"], recordExtractor: ({ $, url, helpers }) => { - const text = helpers.markdown("main"); // Change "main" to match your content tag (e.g., "main", "article", etc.) + // Target only the main content, excluding navigation + const text = helpers.markdown( + "main > *:not(nav):not(header):not(.breadcrumb)", + ); + if (text === "") return []; // Extract meta tag values. These are required for Docusaurus @@ -457,12 +484,18 @@ import TabItem from '@theme/TabItem'; const docusaurus_tag = $('meta[name="docsearch:docusaurus_tag"]').attr("content") || ""; + const title = $("head > title").text(); + + // Get the main heading for better searchability + const h1 = $("main h1").first().text(); + return helpers.splitTextIntoRecords({ text, baseRecord: { url, objectID: url, - title: $("head > title").text(), + title: title || h1, + heading: h1, // Add main heading as separate field lang: language, // Required for Docusaurus language, // Required for Docusaurus version: version.split(","), // in case there are multiple versions. Required for Docusaurus @@ -483,10 +516,13 @@ import TabItem from '@theme/TabItem'; "my-markdown-index": { attributesForFaceting: ["lang", "language", "version", "docusaurus_tag"], // Required for Docusaurus ignorePlurals: true, - minProximity: 4, + minProximity: 1, removeStopWords: false, - searchableAttributes: ["unordered(title)", "unordered(text)"], - removeWordsIfNoResults: "allOptional" // This will help if the LLM finds no results. A graceful fallback. + searchableAttributes: ["title", "heading", "unordered(text)"], + removeWordsIfNoResults: "lastWords", + attributesToHighlight: ["title", "text"], + typoTolerance: false, + advancedSyntax: false, }, // ...}, ``` @@ -505,19 +541,27 @@ import TabItem from '@theme/TabItem'; indexName: "my-markdown-index", pathsToMatch: ["https://example.com/docs/**"], recordExtractor: ({ $, url, helpers }) => { - const text = helpers.markdown("main"); // Change "main" to match your content tag (e.g., "main", "article", etc.) + // Target only the main content, excluding navigation + const text = helpers.markdown( + "main > *:not(nav):not(header):not(.breadcrumb)", + ); + if (text === "") return []; - // Extract meta tag values. These are required for VitePress const language = $("html").attr("lang") || "en"; + const title = $("head > title").text(); + + // Get the main heading for better searchability + const h1 = $("main h1").first().text(); return helpers.splitTextIntoRecords({ text, baseRecord: { url, - title: $("head > title").text(), objectID: url, + title: title || h1, + heading: h1, // Add main heading as separate field lang: language, // Required for VitePress }, maxRecordBytes: 100000, // Higher = fewer, larger records. Lower = more, smaller records. @@ -532,10 +576,13 @@ import TabItem from '@theme/TabItem'; "my-markdown-index": { attributesForFaceting: ["lang"], // Required for VitePress ignorePlurals: true, - minProximity: 4, + minProximity: 1, removeStopWords: false, - searchableAttributes: ["unordered(title)", "unordered(text)"], - removeWordsIfNoResults: "allOptional" // This will help if the LLM finds no results. A graceful fallback. + searchableAttributes: ["title", "heading", "unordered(text)"], + removeWordsIfNoResults: "lastWords", + attributesToHighlight: ["title", "text"], + typoTolerance: false, + advancedSyntax: false, }, // ...}, ``` @@ -554,19 +601,27 @@ import TabItem from '@theme/TabItem'; indexName: "my-markdown-index", pathsToMatch: ["https://example.com/docs/**"], recordExtractor: ({ $, url, helpers }) => { - const text = helpers.markdown("main"); // Change "main" to match your content tag (e.g., "main", "article", etc.) + // Target only the main content, excluding navigation + const text = helpers.markdown( + "main > *:not(nav):not(header):not(.breadcrumb)", + ); + if (text === "") return []; - // Extract meta tag values. These are required for Astro/StarLight const language = $("html").attr("lang") || "en"; + const title = $("head > title").text(); + + // Get the main heading for better searchability + const h1 = $("main h1").first().text(); return helpers.splitTextIntoRecords({ text, baseRecord: { url, - title: $("head > title").text(), objectID: url, + title: title || h1, + heading: h1, // Add main heading as separate field lang: language, // Required for Astro/StarLight }, maxRecordBytes: 100000, // Higher = fewer, larger records. Lower = more, smaller records. @@ -581,10 +636,13 @@ import TabItem from '@theme/TabItem'; "my-markdown-index": { attributesForFaceting: ["lang"], // Required for Astro/StarLight ignorePlurals: true, - minProximity: 4, + minProximity: 1, removeStopWords: false, - searchableAttributes: ["unordered(title)", "unordered(text)"], - removeWordsIfNoResults: "allOptional" // This will help if the LLM finds no results. A graceful fallback. + searchableAttributes: ["title", "heading", "unordered(text)"], + removeWordsIfNoResults: "lastWords", + attributesToHighlight: ["title", "text"], + typoTolerance: false, + advancedSyntax: false, }, // ...}, ```