Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 91 additions & 33 deletions packages/website/docs/v4/askai-markdown-indexing.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -82,19 +82,28 @@ For users who need advanced customization or want to understand the underlying c
indexName: "my-markdown-index",
pathsToMatch: ["https://example.com/docs/**"],
recordExtractor: ({ $, url, helpers }) => {
const text = helpers.markdown("main"); // Change "main" to match your content tag (e.g., "main", "article", etc.)
// Target only the main content, excluding navigation
const text = helpers.markdown(
"main > *:not(nav):not(header):not(.breadcrumb)",
);

if (text === "") return [];

// Extract language or other attributes as needed. Optional
const language = $("html").attr("lang") || "en";

const title = $("head > title").text();

// Get the main heading for better searchability
const h1 = $("main h1").first().text();

return helpers.splitTextIntoRecords({
text,
baseRecord: {
url,
objectID: url,
title: $("head > title").text(),
lang: language, // Add more attributes as needed
title: title || h1,
heading: h1, // Add main heading as separate field
lang: language,
},
maxRecordBytes: 100000, // Higher = fewer, larger records. Lower = more, smaller records.
// Note: Increasing this value may increase the token count for LLMs, which can affect context size and cost.
Expand All @@ -110,12 +119,15 @@ For users who need advanced customization or want to understand the underlying c
```js
// initialIndexSettings: { ...,
"my-markdown-index": {
attributesForFaceting: ["lang"], // Add more if you extract more attributes
attributesForFaceting: ["lang"],
ignorePlurals: true,
minProximity: 4,
minProximity: 1,
removeStopWords: false,
searchableAttributes: ["unordered(title)", "unordered(text)"],
removeWordsIfNoResults: "allOptional" // This will help if the LLM finds no results. A graceful fallback.
searchableAttributes: ["title", "heading", "unordered(text)"],
removeWordsIfNoResults: "lastWords",
attributesToHighlight: ["title", "text"],
typoTolerance: false,
advancedSyntax: false,
},
// ...},
```
Expand Down Expand Up @@ -397,20 +409,28 @@ import TabItem from '@theme/TabItem';
indexName: "my-markdown-index",
pathsToMatch: ["https://example.com/**"],
recordExtractor: ({ $, url, helpers }) => {
const text = helpers.markdown("main"); // Change "main" to match your content tag (e.g., "main", "article", etc.)
// Target only the main content, excluding navigation
const text = helpers.markdown(
"main > *:not(nav):not(header):not(.breadcrumb)",
);

if (text === "") return [];

// Customize selectors or meta extraction as needed. Optional
const language = $("html").attr("lang") || "en";

const title = $("head > title").text();

// Get the main heading for better searchability
const h1 = $("main h1").first().text();

return helpers.splitTextIntoRecords({
text,
baseRecord: {
url,
objectID: url,
title: $("head > title").text(),
// Add more optional attributes to the record
lang: language
title: title || h1,
heading: h1, // Add main heading as separate field
lang: language,
},
maxRecordBytes: 100000, // Higher = fewer, larger records. Lower = more, smaller records.
// Note: Increasing this value may increase the token count for LLMs, which can affect context size and cost.
Expand All @@ -424,10 +444,13 @@ import TabItem from '@theme/TabItem';
"my-markdown-index": {
attributesForFaceting: ["lang"], // Recommended if you add more attributes outside of objectID
ignorePlurals: true,
minProximity: 4,
minProximity: 1,
removeStopWords: false,
searchableAttributes: ["unordered(title)", "unordered(text)"],
removeWordsIfNoResults: "allOptional" // This will help if the LLM finds no results. A graceful fallback.
searchableAttributes: ["title", "heading", "unordered(text)"],
removeWordsIfNoResults: "lastWords",
attributesToHighlight: ["title", "text"],
typoTolerance: false,
advancedSyntax: false,
},
// ...},
```
Expand All @@ -446,7 +469,11 @@ import TabItem from '@theme/TabItem';
indexName: "my-markdown-index",
pathsToMatch: ["https://example.com/docs/**"],
recordExtractor: ({ $, url, helpers }) => {
const text = helpers.markdown("main"); // Change "main" to match your content tag (e.g., "main", "article", etc.)
// Target only the main content, excluding navigation
const text = helpers.markdown(
"main > *:not(nav):not(header):not(.breadcrumb)",
);

if (text === "") return [];

// Extract meta tag values. These are required for Docusaurus
Expand All @@ -457,12 +484,18 @@ import TabItem from '@theme/TabItem';
const docusaurus_tag =
$('meta[name="docsearch:docusaurus_tag"]').attr("content") || "";

const title = $("head > title").text();

// Get the main heading for better searchability
const h1 = $("main h1").first().text();

return helpers.splitTextIntoRecords({
text,
baseRecord: {
url,
objectID: url,
title: $("head > title").text(),
title: title || h1,
heading: h1, // Add main heading as separate field
lang: language, // Required for Docusaurus
language, // Required for Docusaurus
version: version.split(","), // in case there are multiple versions. Required for Docusaurus
Expand All @@ -483,10 +516,13 @@ import TabItem from '@theme/TabItem';
"my-markdown-index": {
attributesForFaceting: ["lang", "language", "version", "docusaurus_tag"], // Required for Docusaurus
ignorePlurals: true,
minProximity: 4,
minProximity: 1,
removeStopWords: false,
searchableAttributes: ["unordered(title)", "unordered(text)"],
removeWordsIfNoResults: "allOptional" // This will help if the LLM finds no results. A graceful fallback.
searchableAttributes: ["title", "heading", "unordered(text)"],
removeWordsIfNoResults: "lastWords",
attributesToHighlight: ["title", "text"],
typoTolerance: false,
advancedSyntax: false,
},
// ...},
```
Expand All @@ -505,19 +541,27 @@ import TabItem from '@theme/TabItem';
indexName: "my-markdown-index",
pathsToMatch: ["https://example.com/docs/**"],
recordExtractor: ({ $, url, helpers }) => {
const text = helpers.markdown("main"); // Change "main" to match your content tag (e.g., "main", "article", etc.)
// Target only the main content, excluding navigation
const text = helpers.markdown(
"main > *:not(nav):not(header):not(.breadcrumb)",
);

if (text === "") return [];

// Read the page language from the <html lang> attribute. Required for VitePress
const language = $("html").attr("lang") || "en";

const title = $("head > title").text();

// Get the main heading for better searchability
const h1 = $("main h1").first().text();

return helpers.splitTextIntoRecords({
text,
baseRecord: {
url,
title: $("head > title").text(),
objectID: url,
title: title || h1,
heading: h1, // Add main heading as separate field
lang: language, // Required for VitePress
},
maxRecordBytes: 100000, // Higher = fewer, larger records. Lower = more, smaller records.
Expand All @@ -532,10 +576,13 @@ import TabItem from '@theme/TabItem';
"my-markdown-index": {
attributesForFaceting: ["lang"], // Required for VitePress
ignorePlurals: true,
minProximity: 4,
minProximity: 1,
removeStopWords: false,
searchableAttributes: ["unordered(title)", "unordered(text)"],
removeWordsIfNoResults: "allOptional" // This will help if the LLM finds no results. A graceful fallback.
searchableAttributes: ["title", "heading", "unordered(text)"],
removeWordsIfNoResults: "lastWords",
attributesToHighlight: ["title", "text"],
typoTolerance: false,
advancedSyntax: false,
},
// ...},
```
Expand All @@ -554,19 +601,27 @@ import TabItem from '@theme/TabItem';
indexName: "my-markdown-index",
pathsToMatch: ["https://example.com/docs/**"],
recordExtractor: ({ $, url, helpers }) => {
const text = helpers.markdown("main"); // Change "main" to match your content tag (e.g., "main", "article", etc.)
// Target only the main content, excluding navigation
const text = helpers.markdown(
"main > *:not(nav):not(header):not(.breadcrumb)",
);

if (text === "") return [];

// Read the page language from the <html lang> attribute. Required for Astro/StarLight
const language = $("html").attr("lang") || "en";

const title = $("head > title").text();

// Get the main heading for better searchability
const h1 = $("main h1").first().text();

return helpers.splitTextIntoRecords({
text,
baseRecord: {
url,
title: $("head > title").text(),
objectID: url,
title: title || h1,
heading: h1, // Add main heading as separate field
lang: language, // Required for Astro/StarLight
},
maxRecordBytes: 100000, // Higher = fewer, larger records. Lower = more, smaller records.
Expand All @@ -581,10 +636,13 @@ import TabItem from '@theme/TabItem';
"my-markdown-index": {
attributesForFaceting: ["lang"], // Required for Astro/StarLight
ignorePlurals: true,
minProximity: 4,
minProximity: 1,
removeStopWords: false,
searchableAttributes: ["unordered(title)", "unordered(text)"],
removeWordsIfNoResults: "allOptional" // This will help if the LLM finds no results. A graceful fallback.
searchableAttributes: ["title", "heading", "unordered(text)"],
removeWordsIfNoResults: "lastWords",
attributesToHighlight: ["title", "text"],
typoTolerance: false,
advancedSyntax: false,
},
// ...},
```
Expand Down