Skip to content

Commit 34b651b

Browse files
authored
Merge pull request #92 from agent-ecosystem/fix/markdown-content-party-dom-aware-rewrite
fix(parity): DOM-aware HTML extraction and heading-line protection
2 parents b3317cd + b680f66 commit 34b651b

2 files changed

Lines changed: 221 additions & 156 deletions

File tree

src/checks/observability/markdown-content-parity.ts

Lines changed: 100 additions & 156 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { parse } from 'node-html-parser';
1+
import { parse, NodeType, type HTMLElement, type Node } from 'node-html-parser';
22
import { registerCheck } from '../registry.js';
33
import { fetchPage } from '../../helpers/fetch-page.js';
44
import { toHtmlUrl } from '../../helpers/to-md-urls.js';
@@ -32,12 +32,9 @@ const STRIP_TAGS = [
3232
];
3333

3434
/**
35-
* Tags that were removed at the DOM level (STRIP_TAGS). If these tag names
36-
* appear in `.text` output, they came from entity-decoded content (e.g.,
37-
* `&lt;nav&gt;` → `<nav>` in prose discussing HTML elements), not from
38-
* actual DOM elements. The text-level tag stripping regex should keep their
39-
* content rather than deleting it, so both sides produce matching text
40-
* after normalize() strips the angle brackets.
35+
* Tag names corresponding to STRIP_TAGS, used by the DOM walker to skip
36+
* these elements if they reappear inside re-parsed <pre> content (e.g.,
37+
* a stray <style> block injected by a CSS-in-JS library).
4138
*/
4239
const DOM_STRIPPED_TAGS = new Set(STRIP_TAGS);
4340

@@ -91,122 +88,6 @@ interface PageParityResult {
9188
error?: string;
9289
}
9390

94-
/**
95-
* Known HTML tag names used to distinguish real tags from angle-bracket
96-
* placeholders like <YOUR_API_KEY> or <clusterName> in code examples.
97-
* Only needs to cover tags that appear in node-html-parser's .text output
98-
* (i.e., tags inside <pre> that survive as raw text).
99-
*/
100-
const HTML_TAG_NAMES = new Set([
101-
'a',
102-
'abbr',
103-
'address',
104-
'article',
105-
'aside',
106-
'audio',
107-
'b',
108-
'bdi',
109-
'bdo',
110-
'blockquote',
111-
'body',
112-
'br',
113-
'button',
114-
'canvas',
115-
'caption',
116-
'cite',
117-
'code',
118-
'col',
119-
'colgroup',
120-
'data',
121-
'dd',
122-
'del',
123-
'details',
124-
'dfn',
125-
'dialog',
126-
'div',
127-
'dl',
128-
'dt',
129-
'em',
130-
'embed',
131-
'fieldset',
132-
'figcaption',
133-
'figure',
134-
'footer',
135-
'form',
136-
'h1',
137-
'h2',
138-
'h3',
139-
'h4',
140-
'h5',
141-
'h6',
142-
'head',
143-
'header',
144-
'hr',
145-
'html',
146-
'i',
147-
'iframe',
148-
'img',
149-
'input',
150-
'ins',
151-
'kbd',
152-
'label',
153-
'legend',
154-
'li',
155-
'link',
156-
'main',
157-
'map',
158-
'mark',
159-
'meta',
160-
'meter',
161-
'nav',
162-
'noscript',
163-
'object',
164-
'ol',
165-
'optgroup',
166-
'option',
167-
'output',
168-
'p',
169-
'param',
170-
'picture',
171-
'pre',
172-
'progress',
173-
'q',
174-
'rp',
175-
'rt',
176-
'ruby',
177-
's',
178-
'samp',
179-
'script',
180-
'section',
181-
'select',
182-
'slot',
183-
'small',
184-
'source',
185-
'span',
186-
'strong',
187-
'style',
188-
'sub',
189-
'summary',
190-
'sup',
191-
'table',
192-
'tbody',
193-
'td',
194-
'template',
195-
'textarea',
196-
'tfoot',
197-
'th',
198-
'thead',
199-
'time',
200-
'title',
201-
'tr',
202-
'track',
203-
'u',
204-
'ul',
205-
'var',
206-
'video',
207-
'wbr',
208-
]);
209-
21091
/** Block-level HTML elements that should produce line breaks in extracted text. */
21192
const BLOCK_TAGS = new Set([
21293
'p',
@@ -359,34 +240,67 @@ function extractHtmlText(html: string, parityExclusions?: string[]): HtmlExtract
359240
}
360241
}
361242

362-
// Insert newlines before block-level elements so .text produces
363-
// separated lines instead of smashing paragraphs together
364-
for (const tag of BLOCK_TAGS) {
365-
for (const el of content.querySelectorAll(tag)) {
366-
el.insertAdjacentHTML('beforebegin', '\n');
367-
el.insertAdjacentHTML('afterend', '\n');
368-
}
243+
// Walk the DOM to produce text. Doing this ourselves (instead of relying
244+
// on .text) lets us handle two cases that flat-text + regex stripping
245+
// can't disambiguate:
246+
//
247+
// 1. node-html-parser treats <pre> content as a single raw-text node, so
248+
// syntax-highlighter markup inside (<span class="kw">, <div class="line">,
249+
// <code class="lang-js">) appears as literal text. We re-parse that
250+
// rawText as HTML and walk the resulting subtree, which yields just the
251+
// code's textContent without any markup leaking through.
252+
//
253+
// 2. Inline `<code>` mentions in prose (rendered as <code>&lt;code&gt;</code>
254+
// from a `\`<code>\`` markdown span) decode to literal `<code>` text. The
255+
// DOM walk preserves that as text; normalize() then strips the angle
256+
// brackets so it matches the markdown side. Previously the text-level
257+
// tag-stripping regex deleted these as if they were tags.
258+
const text = walkContent(content);
259+
return { text, segmentationStripped };
260+
}
261+
262+
/**
 * Produce the text for everything beneath `node` by converting each direct
 * child with walkNode and joining the results in document order.
 *
 * The node passed in contributes nothing of its own here; any newline
 * padding around block elements is decided per child inside walkNode.
 * Called by extractHtmlText on the content root and recursively by walkNode.
 *
 * @param node - Element whose childNodes are walked.
 * @returns Concatenated text of all children ('' when there are none).
 */
function walkContent(node: HTMLElement): string {
  return node.childNodes.map((child) => walkNode(child)).join('');
}
370273

371-
// node-html-parser treats <pre> content as raw text, so <style> tags
372-
// injected inside code blocks (e.g., Emotion CSS-in-JS / Leafygreen)
373-
// survive DOM-level stripping. Remove <style>...</style> blocks first,
374-
// inject newlines before <div tags to separate code lines (e.g.,
375-
// Expressive Code / Shiki use <div class="ec-line"> inside <pre>),
376-
// then strip HTML tags while preserving angle-bracket placeholders
377-
// like <YOUR_API_KEY> or <clusterName> (decoded from &lt;...&gt; entities).
378-
const text = content.text
379-
.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
380-
.replace(/<!--[\s\S]*?-->/g, '')
381-
.replace(/<div[\s>]/gi, '\n<div ')
382-
.replace(/<\/[^>\s]+>/g, '')
383-
.replace(/<([a-zA-Z][a-zA-Z0-9-]*)([^>]*)>/g, (_match, tag, rest) => {
384-
const lower = tag.toLowerCase();
385-
if (DOM_STRIPPED_TAGS.has(lower)) return tag;
386-
if (HTML_TAG_NAMES.has(lower)) return '';
387-
return tag + rest;
388-
});
389-
return { text, segmentationStripped };
274+
/**
 * Convert a single DOM node into its text contribution.
 *
 * - Text nodes return their `text` (the getter decodes entities such as
 *   `&lt;` and `&amp;`).
 * - Anything that is neither text nor element (comments, etc.) returns ''.
 * - Elements without a tag name are treated as transparent wrappers.
 * - Elements whose tag is in DOM_STRIPPED_TAGS return ''. They are removed
 *   from the main DOM earlier, but can reappear once <pre> content is
 *   re-parsed, so they are filtered here as well.
 * - <pre> content is held by node-html-parser as one raw text node, so its
 *   rawText is re-parsed to expose inner markup as real nodes before walking;
 *   the result is wrapped in newlines.
 * - Tags in BLOCK_TAGS have their walked content wrapped in newlines.
 * - Every other element simply yields its walked content.
 *
 * NOTE(review): an element with no children (e.g. <br>) produces output only
 * through the BLOCK_TAGS newline padding — confirm 'br' is listed there if
 * line breaks need to survive extraction.
 *
 * @param node - Node to convert.
 * @returns The text for this node and its descendants.
 */
function walkNode(node: Node): string {
  switch (node.nodeType) {
    case NodeType.TEXT_NODE:
      return node.text;
    case NodeType.ELEMENT_NODE:
      break;
    default:
      // Comments and any other node kinds carry no visible text.
      return '';
  }

  const element = node as HTMLElement;
  const tagName = element.tagName?.toLowerCase();

  // No tag name: nothing to classify, just descend.
  if (!tagName) return walkContent(element);

  // Stripped tags can resurface inside re-parsed <pre> content; drop them.
  if (DOM_STRIPPED_TAGS.has(tagName)) return '';

  const isPre = tagName === 'pre';
  if (!isPre && !BLOCK_TAGS.has(tagName)) {
    // Inline element: contributes only its children's text.
    return walkContent(element);
  }

  // <pre> holds its markup as opaque raw text, so re-parse it into a subtree
  // first; ordinary block elements are walked directly.
  const inner = walkContent(isPre ? parse(element.rawText) : element);
  return `\n${inner}\n`;
}
391305

392306
/**
@@ -399,6 +313,12 @@ function extractHtmlText(html: string, parityExclusions?: string[]): HtmlExtract
399313
* preserves the literal text inside <pre><code> and <code> tags. The
400314
* placeholder approach hides code content from the stripping regexes,
401315
* then restores it after all stripping is done.
316+
*
317+
* Heading lines are also placeholder-protected: a heading like
318+
* "### 1. How well..." has the "1. " stripped by the numbered-list regex
319+
* if processed normally, even though that "1. " is part of the heading
320+
* text on the HTML side. Protecting heading content keeps the bullet/
321+
* numbered-list passes from touching it.
402322
*/
403323
function extractMarkdownText(markdown: string): string {
404324
let text = markdown;
@@ -457,20 +377,44 @@ function extractMarkdownText(markdown: string): string {
457377
return `\x00CODE${idx}\x00`;
458378
});
459379

460-
// Step 3: Strip markdown formatting on non-code text
380+
// Step 3: Protect heading lines from list-marker stripping. Headings
381+
// like "### 1. How well are X supported?" survive into the HTML as
382+
// "<h3>1. How well are X supported?</h3>", so the leading "1. " is
383+
// part of the heading text — not a list marker. Without this, the
384+
// numbered-list regex would strip it and the markdown side wouldn't
385+
// contain the HTML segment.
386+
const headings: string[] = [];
387+
text = text.replace(/^#{1,6}\s+(.*)$/gm, (_match, content) => {
388+
const idx = headings.length;
389+
headings.push(content);
390+
return `\x00HEAD${idx}\x00`;
391+
});
392+
393+
// Step 4: Strip list markers and setext underlines while heading lines
394+
// are still placeholder-protected. These are the passes that would
395+
// misinterpret heading text — e.g., the numbered-list regex stripping
396+
// "1. " from "### 1. How well..." (issue #91).
461397
text = text
462-
// Remove heading markers
463-
.replace(/^#{1,6}\s+/gm, '')
464398
// Remove setext-style heading underlines
465399
.replace(/^[=-]+$/gm, '')
466-
// Remove link/image URLs, keep text: [text](url) → text
467-
.replace(/!?\[([^\]]*)\]\([^)]*\)/g, '$1')
468400
// Remove reference-style link definitions
469401
.replace(/^\[.*?\]:\s+.*$/gm, '')
470402
// Remove list bullets/numbers (before emphasis, so leading * isn't
471403
// misinterpreted as an emphasis marker)
472404
.replace(/^[\s]*[-*+]\s+/gm, '')
473-
.replace(/^[\s]*\d+\.\s+/gm, '')
405+
.replace(/^[\s]*\d+\.\s+/gm, '');
406+
407+
// Step 5: Restore heading text. From here on, heading content is
408+
// processed like any other body text — emphasis, links, etc. inside
409+
// heading text gets the same treatment so it matches the HTML side
410+
// (where <h1><em>Foo</em></h1> renders as "Foo").
411+
// eslint-disable-next-line no-control-regex
412+
text = text.replace(/\x00HEAD(\d+)\x00/g, (_match, idxStr) => headings[parseInt(idxStr, 10)]);
413+
414+
// Step 6: Strip remaining markdown formatting on body and heading text.
415+
text = text
416+
// Remove link/image URLs, keep text: [text](url) → text
417+
.replace(/!?\[([^\]]*)\]\([^)]*\)/g, '$1')
474418
// Remove emphasis markers. * emphasis is stripped unconditionally.
475419
// _ emphasis is stripped only at word boundaries (per CommonMark,
476420
// _text_ is emphasis only when _ is not adjacent to an alphanumeric).
@@ -483,7 +427,7 @@ function extractMarkdownText(markdown: string): string {
483427
// Remove horizontal rules
484428
.replace(/^[-*_]{3,}$/gm, '');
485429

486-
// Step 4: Restore code content (without backticks/fence markers)
430+
// Step 7: Restore code content (without backticks/fence markers).
487431
// eslint-disable-next-line no-control-regex
488432
text = text.replace(/\x00CODE(\d+)\x00/g, (_match, idxStr) => codeSpans[parseInt(idxStr, 10)]);
489433
// eslint-disable-next-line no-control-regex

0 commit comments

Comments
 (0)