Skip to content

Commit 9d675a3

Browse files
committed
fix: improve backtick handling in markdown parity check
1 parent 306a438 commit 9d675a3

2 files changed

Lines changed: 129 additions & 5 deletions

File tree

src/checks/observability/markdown-content-parity.ts

Lines changed: 17 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -439,9 +439,18 @@ function extractMarkdownText(markdown: string): string {
439439
codeSpans.push(trimmed);
440440
return `\x00CODE${idx}\x00`;
441441
});
442-
text = text.replace(/(?<!`)`([^`]+)`(?!`)/g, (_match, content) => {
442+
// Single-backtick spans: content may include backtick runs of length != 1
443+
// (e.g. ` ``` ` where the triple backtick is content, not a closer).
444+
// Both opening and closing delimiters require (?<!`) and (?!`) to ensure
445+
// they are standalone single backticks, not part of a multi-backtick run.
446+
// This prevents bare ``` in prose from cascading into distant backtick pairing.
447+
text = text.replace(/(?<!`)`(?!`)((?:[^`]|`{2,})+)(?<!`)`(?!`)/g, (_match, content) => {
443448
const idx = codeSpans.length;
444-
codeSpans.push(content);
449+
let trimmed = content;
450+
if (trimmed.startsWith(' ') && trimmed.endsWith(' ') && trimmed.trim().length > 0) {
451+
trimmed = trimmed.slice(1, -1);
452+
}
453+
codeSpans.push(trimmed);
445454
return `\x00CODE${idx}\x00`;
446455
});
447456

@@ -459,10 +468,13 @@ function extractMarkdownText(markdown: string): string {
459468
// misinterpreted as an emphasis marker)
460469
.replace(/^[\s]*[-*+]\s+/gm, '')
461470
.replace(/^[\s]*\d+\.\s+/gm, '')
462-
// Remove emphasis markers (* only — underscores are too common in
463-
// code identifiers like mongoc_client_get_database and cause false
464-
// mismatches when stripped as emphasis)
471+
// Remove emphasis markers. * emphasis is stripped unconditionally.
472+
// _ emphasis is stripped only at word boundaries (per CommonMark,
473+
// _text_ is emphasis only when the opening _ is not preceded by, and the closing _ is not followed by, an alphanumeric).
474+
// This preserves code identifiers like mongoc_client_get_database
475+
// that appear as plain text (not inside backticks).
465476
.replace(/(\*{1,3})(.*?)\1/g, '$2')
477+
.replace(/(?<!\w)(_{1,3})(.*?)\1(?!\w)/g, '$2')
466478
// Remove blockquote markers
467479
.replace(/^>\s?/gm, '')
468480
// Remove horizontal rules

test/unit/checks/markdown-content-parity.test.ts

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1842,6 +1842,118 @@ Use a linter to automatically verify code fence syntax across all your documents
18421842
expect(pageResults[0].missingSegments).toBe(0);
18431843
});
18441844

1845+
it('handles single-backtick code spans containing literal backticks', async () => {
1846+
// CommonMark allows ` ``` ` (single-backtick delimiters with triple-backtick
1847+
// content). The triple backtick inside is not a closer because it's a
1848+
// backtick string of length 3, not length 1. The regex must match this
1849+
// as a valid code span without cascading into distant backtick pairing.
1850+
const html = `<html><body>
1851+
<h1>Code Fence Syntax</h1>
1852+
<p>A backtick fence (<code>\`\`\`</code>) can only be closed by another backtick fence.</p>
1853+
<p>A tilde fence (<code>~~~</code>) closing a backtick-opened fence leaves it unclosed.</p>
1854+
<p>Run with <code>--verbose</code> to see which pages have unclosed code fences today.</p>
1855+
<p>Ensure every opening <code>\`\`\`</code> or <code>~~~</code> has a matching closing fence.</p>
1856+
<p>Nested code examples are the most common source of fence mismatches in practice.</p>
1857+
<p>The check validates all code fences across every page discovered during the scan.</p>
1858+
<p>CommonMark requires the closing delimiter to be the same type as the opening one.</p>
1859+
<p>Indented code blocks do not require explicit delimiters but are harder to maintain.</p>
1860+
<p>Some editors provide visual indicators for unclosed fences to help catch mistakes.</p>
1861+
<p>Always preview your rendered markdown to verify code blocks display correctly here.</p>
1862+
</body></html>`;
1863+
1864+
const markdown =
1865+
'# Code Fence Syntax\n\n' +
1866+
'A backtick fence (` ``` `) can only be closed by another backtick fence.\n\n' +
1867+
'A tilde fence (`~~~`) closing a backtick-opened fence leaves it unclosed.\n\n' +
1868+
'Run with `--verbose` to see which pages have unclosed code fences today.\n\n' +
1869+
'Ensure every opening ` ``` ` or `~~~` has a matching closing fence.\n\n' +
1870+
'Nested code examples are the most common source of fence mismatches in practice.\n\n' +
1871+
'The check validates all code fences across every page discovered during the scan.\n\n' +
1872+
'CommonMark requires the closing delimiter to be the same type as the opening one.\n\n' +
1873+
'Indented code blocks do not require explicit delimiters but are harder to maintain.\n\n' +
1874+
'Some editors provide visual indicators for unclosed fences to help catch mistakes.\n\n' +
1875+
'Always preview your rendered markdown to verify code blocks display correctly here.';
1876+
1877+
const url = 'http://mcp-singlebacktick.local/docs/fences';
1878+
1879+
server.use(
1880+
http.get(
1881+
url,
1882+
() =>
1883+
new HttpResponse(html, {
1884+
status: 200,
1885+
headers: { 'Content-Type': 'text/html' },
1886+
}),
1887+
),
1888+
);
1889+
1890+
const ctx = makeCtx([{ url, markdown, htmlBody: html }], 'mcp-singlebacktick.local');
1891+
const result = await check.run(ctx);
1892+
expect(result.status).toBe('pass');
1893+
const pageResults = result.details?.pageResults as Array<{ missingSegments: number }>;
1894+
expect(pageResults[0].missingSegments).toBe(0);
1895+
});
1896+
1897+
it('strips underscore emphasis at word boundaries without mangling identifiers', async () => {
1898+
// _emphasis_ in prose (underscores at word boundaries) should be stripped
1899+
// to match HTML extraction. But mongoc_client_get_database (underscores
1900+
// flanked by word characters on both sides) must be preserved.
1901+
const html = `<html><body><main>
1902+
<h1>Database Driver Guide</h1>
1903+
<p>In empirical testing, soft 404s performed <em>worse</em> than real 404s for agents.</p>
1904+
<p>Agents do <em>not</em> read your navigation structure when fetching documentation.</p>
1905+
<p>Use mongoc_client_get_database to obtain a database handle from the client object.</p>
1906+
<p>The mongoc_cursor_next function advances the cursor to the <em>next</em> result document.</p>
1907+
<p>This behavior is <em>intentional</em> and matches the CommonMark specification exactly.</p>
1908+
<p>Release resources with mongoc_client_destroy when the client is no longer needed here.</p>
1909+
<p>The response includes a <em>detailed</em> error message explaining what went wrong here.</p>
1910+
<p>Call mongoc_collection_find_with_opts to query documents with filter options today.</p>
1911+
<p>Documentation should be <em>clear</em> and concise for both humans and agents reading.</p>
1912+
<p>The mongoc_collection_insert_one function inserts a single document into collection.</p>
1913+
</main></body></html>`;
1914+
1915+
const markdown = `# Database Driver Guide
1916+
1917+
In empirical testing, soft 404s performed _worse_ than real 404s for agents.
1918+
1919+
Agents do _not_ read your navigation structure when fetching documentation.
1920+
1921+
Use mongoc_client_get_database to obtain a database handle from the client object.
1922+
1923+
The mongoc_cursor_next function advances the cursor to the _next_ result document.
1924+
1925+
This behavior is _intentional_ and matches the CommonMark specification exactly.
1926+
1927+
Release resources with mongoc_client_destroy when the client is no longer needed here.
1928+
1929+
The response includes a _detailed_ error message explaining what went wrong here.
1930+
1931+
Call mongoc_collection_find_with_opts to query documents with filter options today.
1932+
1933+
Documentation should be _clear_ and concise for both humans and agents reading.
1934+
1935+
The mongoc_collection_insert_one function inserts a single document into collection.`;
1936+
1937+
const url = 'http://mcp-underscore-emphasis.local/docs/driver';
1938+
1939+
server.use(
1940+
http.get(
1941+
url,
1942+
() =>
1943+
new HttpResponse(html, {
1944+
status: 200,
1945+
headers: { 'Content-Type': 'text/html' },
1946+
}),
1947+
),
1948+
);
1949+
1950+
const ctx = makeCtx([{ url, markdown, htmlBody: html }], 'mcp-underscore-emphasis.local');
1951+
const result = await check.run(ctx);
1952+
expect(result.status).toBe('pass');
1953+
const pageResults = result.details?.pageResults as Array<{ missingSegments: number }>;
1954+
expect(pageResults[0].missingSegments).toBe(0);
1955+
});
1956+
18451957
// --- Audience segmentation tests ---
18461958

18471959
it('strips data-markdown-ignore elements from HTML before comparison', async () => {

0 commit comments

Comments
 (0)