1- import { parse } from 'node-html-parser' ;
1+ import { parse , NodeType , type HTMLElement , type Node } from 'node-html-parser' ;
22import { registerCheck } from '../registry.js' ;
33import { fetchPage } from '../../helpers/fetch-page.js' ;
44import { toHtmlUrl } from '../../helpers/to-md-urls.js' ;
@@ -32,12 +32,9 @@ const STRIP_TAGS = [
3232] ;
3333
3434/**
35- * Tags that were removed at the DOM level (STRIP_TAGS). If these tag names
36- * appear in `.text` output, they came from entity-decoded content (e.g.,
37- * `&lt;nav&gt;` → `<nav>` in prose discussing HTML elements), not from
38- * actual DOM elements. The text-level tag stripping regex should keep their
39- * content rather than deleting it, so both sides produce matching text
40- * after normalize() strips the angle brackets.
35+ * Tag names corresponding to STRIP_TAGS, used by the DOM walker to skip
36+ * these elements if they reappear inside re-parsed <pre> content (e.g.,
37+ * a stray <style> block injected by a CSS-in-JS library).
4138 */
4239const DOM_STRIPPED_TAGS = new Set ( STRIP_TAGS ) ;
4340
@@ -91,122 +88,6 @@ interface PageParityResult {
9188 error ?: string ;
9289}
9390
94- /**
95- * Known HTML tag names used to distinguish real tags from angle-bracket
96- * placeholders like <YOUR_API_KEY> or <clusterName> in code examples.
97- * Only needs to cover tags that appear in node-html-parser's .text output
98- * (i.e., tags inside <pre> that survive as raw text).
99- */
100- const HTML_TAG_NAMES = new Set ( [
101- 'a' ,
102- 'abbr' ,
103- 'address' ,
104- 'article' ,
105- 'aside' ,
106- 'audio' ,
107- 'b' ,
108- 'bdi' ,
109- 'bdo' ,
110- 'blockquote' ,
111- 'body' ,
112- 'br' ,
113- 'button' ,
114- 'canvas' ,
115- 'caption' ,
116- 'cite' ,
117- 'code' ,
118- 'col' ,
119- 'colgroup' ,
120- 'data' ,
121- 'dd' ,
122- 'del' ,
123- 'details' ,
124- 'dfn' ,
125- 'dialog' ,
126- 'div' ,
127- 'dl' ,
128- 'dt' ,
129- 'em' ,
130- 'embed' ,
131- 'fieldset' ,
132- 'figcaption' ,
133- 'figure' ,
134- 'footer' ,
135- 'form' ,
136- 'h1' ,
137- 'h2' ,
138- 'h3' ,
139- 'h4' ,
140- 'h5' ,
141- 'h6' ,
142- 'head' ,
143- 'header' ,
144- 'hr' ,
145- 'html' ,
146- 'i' ,
147- 'iframe' ,
148- 'img' ,
149- 'input' ,
150- 'ins' ,
151- 'kbd' ,
152- 'label' ,
153- 'legend' ,
154- 'li' ,
155- 'link' ,
156- 'main' ,
157- 'map' ,
158- 'mark' ,
159- 'meta' ,
160- 'meter' ,
161- 'nav' ,
162- 'noscript' ,
163- 'object' ,
164- 'ol' ,
165- 'optgroup' ,
166- 'option' ,
167- 'output' ,
168- 'p' ,
169- 'param' ,
170- 'picture' ,
171- 'pre' ,
172- 'progress' ,
173- 'q' ,
174- 'rp' ,
175- 'rt' ,
176- 'ruby' ,
177- 's' ,
178- 'samp' ,
179- 'script' ,
180- 'section' ,
181- 'select' ,
182- 'slot' ,
183- 'small' ,
184- 'source' ,
185- 'span' ,
186- 'strong' ,
187- 'style' ,
188- 'sub' ,
189- 'summary' ,
190- 'sup' ,
191- 'table' ,
192- 'tbody' ,
193- 'td' ,
194- 'template' ,
195- 'textarea' ,
196- 'tfoot' ,
197- 'th' ,
198- 'thead' ,
199- 'time' ,
200- 'title' ,
201- 'tr' ,
202- 'track' ,
203- 'u' ,
204- 'ul' ,
205- 'var' ,
206- 'video' ,
207- 'wbr' ,
208- ] ) ;
209-
21091/** Block-level HTML elements that should produce line breaks in extracted text. */
21192const BLOCK_TAGS = new Set ( [
21293 'p' ,
@@ -359,34 +240,67 @@ function extractHtmlText(html: string, parityExclusions?: string[]): HtmlExtract
359240 }
360241 }
361242
362- // Insert newlines before block-level elements so .text produces
363- // separated lines instead of smashing paragraphs together
364- for ( const tag of BLOCK_TAGS ) {
365- for ( const el of content . querySelectorAll ( tag ) ) {
366- el . insertAdjacentHTML ( 'beforebegin' , '\n' ) ;
367- el . insertAdjacentHTML ( 'afterend' , '\n' ) ;
368- }
243+ // Walk the DOM to produce text. Doing this ourselves (instead of relying
244+ // on .text) lets us handle two cases that flat-text + regex stripping
245+ // can't disambiguate:
246+ //
247+ // 1. node-html-parser treats <pre> content as a single raw-text node, so
248+ // syntax-highlighter markup inside (<span class="kw">, <div class="line">,
249+ // <code class="lang-js">) appears as literal text. We re-parse that
250+ // rawText as HTML and walk the resulting subtree, which yields just the
251+ // code's textContent without any markup leaking through.
252+ //
253+ // 2. Inline `<code>` mentions in prose (rendered as <code>&lt;code&gt;</code>
254+ // from a `\`<code>\`` markdown span) decode to literal `<code>` text. The
255+ // DOM walk preserves that as text; normalize() then strips the angle
256+ // brackets so it matches the markdown side. Previously the text-level
257+ // tag-stripping regex deleted these as if they were tags.
258+ const text = walkContent ( content ) ;
259+ return { text, segmentationStripped } ;
260+ }
261+
262+ /**
263+ * Walk a DOM subtree and emit text content with newlines around block
264+ * elements. Used by extractHtmlText.
265+ */
266+ function walkContent ( node : HTMLElement ) : string {
267+ let out = '' ;
268+ for ( const child of node . childNodes ) {
269+ out += walkNode ( child ) ;
369270 }
271+ return out ;
272+ }
370273
371- // node-html-parser treats <pre> content as raw text, so <style> tags
372- // injected inside code blocks (e.g., Emotion CSS-in-JS / Leafygreen)
373- // survive DOM-level stripping. Remove <style>...</style> blocks first,
374- // inject newlines before <div tags to separate code lines (e.g.,
375- // Expressive Code / Shiki use <div class="ec-line"> inside <pre>),
376- // then strip HTML tags while preserving angle-bracket placeholders
377- // like <YOUR_API_KEY> or <clusterName> (decoded from &lt;...&gt; entities).
378- const text = content . text
379- . replace ( / < s t y l e [ ^ > ] * > [ \s \S ] * ?< \/ s t y l e > / gi, '' )
380- . replace ( / < ! - - [ \s \S ] * ?- - > / g, '' )
381- . replace ( / < d i v [ \s > ] / gi, '\n<div ' )
382- . replace ( / < \/ [ ^ > \s ] + > / g, '' )
383- . replace ( / < ( [ a - z A - Z ] [ a - z A - Z 0 - 9 - ] * ) ( [ ^ > ] * ) > / g, ( _match , tag , rest ) => {
384- const lower = tag . toLowerCase ( ) ;
385- if ( DOM_STRIPPED_TAGS . has ( lower ) ) return tag ;
386- if ( HTML_TAG_NAMES . has ( lower ) ) return '' ;
387- return tag + rest ;
388- } ) ;
389- return { text, segmentationStripped } ;
274+ function walkNode ( node : Node ) : string {
275+ if ( node . nodeType === NodeType . TEXT_NODE ) {
276+ // text getter decodes entities (&lt; -> <, &amp; -> &)
277+ return node . text ;
278+ }
279+ if ( node . nodeType !== NodeType . ELEMENT_NODE ) {
280+ // Skip comments and anything else
281+ return '' ;
282+ }
283+ const el = node as HTMLElement ;
284+ const tag = el . tagName ?. toLowerCase ( ) ;
285+ if ( ! tag ) return walkContent ( el ) ;
286+
287+ // Defensive: even though STRIP_TAGS removes these at DOM level above,
288+ // re-parsed <pre> content can re-introduce script/style/etc. as elements,
289+ // so skip them here too.
290+ if ( DOM_STRIPPED_TAGS . has ( tag ) ) return '' ;
291+
292+ if ( tag === 'pre' ) {
293+ // node-html-parser parses <pre> content as a single raw text node, so
294+ // any inner markup (syntax-highlighter spans/divs/code) is opaque.
295+ // Re-parse the rawText to expose that markup as DOM nodes, then walk.
296+ const reparsed = parse ( el . rawText ) ;
297+ return '\n' + walkContent ( reparsed ) + '\n' ;
298+ }
299+
300+ if ( BLOCK_TAGS . has ( tag ) ) {
301+ return '\n' + walkContent ( el ) + '\n' ;
302+ }
303+ return walkContent ( el ) ;
390304}
391305
392306/**
@@ -399,6 +313,12 @@ function extractHtmlText(html: string, parityExclusions?: string[]): HtmlExtract
399313 * preserves the literal text inside <pre><code> and <code> tags. The
400314 * placeholder approach hides code content from the stripping regexes,
401315 * then restores it after all stripping is done.
316+ *
317+ * Heading lines are also placeholder-protected: a heading like
318+ * "### 1. How well..." has the "1. " stripped by the numbered-list regex
319+ * if processed normally, even though that "1. " is part of the heading
320+ * text on the HTML side. Protecting heading content keeps the bullet/
321+ * numbered-list passes from touching it.
402322 */
403323function extractMarkdownText ( markdown : string ) : string {
404324 let text = markdown ;
@@ -457,20 +377,44 @@ function extractMarkdownText(markdown: string): string {
457377 return `\x00CODE${ idx } \x00` ;
458378 } ) ;
459379
460- // Step 3: Strip markdown formatting on non-code text
380+ // Step 3: Protect heading lines from list-marker stripping. Headings
381+ // like "### 1. How well are X supported?" survive into the HTML as
382+ // "<h3>1. How well are X supported?</h3>", so the leading "1. " is
383+ // part of the heading text — not a list marker. Without this, the
384+ // numbered-list regex would strip it and the markdown side wouldn't
385+ // contain the HTML segment.
386+ const headings : string [ ] = [ ] ;
387+ text = text . replace ( / ^ # { 1 , 6 } \s + ( .* ) $ / gm, ( _match , content ) => {
388+ const idx = headings . length ;
389+ headings . push ( content ) ;
390+ return `\x00HEAD${ idx } \x00` ;
391+ } ) ;
392+
393+ // Step 4: Strip list markers and setext underlines while heading lines
394+ // are still placeholder-protected. These are the passes that would
395+ // misinterpret heading text — e.g., the numbered-list regex stripping
396+ // "1. " from "### 1. How well..." (issue #91).
461397 text = text
462- // Remove heading markers
463- . replace ( / ^ # { 1 , 6 } \s + / gm, '' )
464398 // Remove setext-style heading underlines
465399 . replace ( / ^ [ = - ] + $ / gm, '' )
466- // Remove link/image URLs, keep text: [text](url) → text
467- . replace ( / ! ? \[ ( [ ^ \] ] * ) \] \( [ ^ ) ] * \) / g, '$1' )
468400 // Remove reference-style link definitions
469401 . replace ( / ^ \[ .* ?\] : \s + .* $ / gm, '' )
470402 // Remove list bullets/numbers (before emphasis, so leading * isn't
471403 // misinterpreted as an emphasis marker)
472404 . replace ( / ^ [ \s ] * [ - * + ] \s + / gm, '' )
473- . replace ( / ^ [ \s ] * \d + \. \s + / gm, '' )
405+ . replace ( / ^ [ \s ] * \d + \. \s + / gm, '' ) ;
406+
407+ // Step 5: Restore heading text. From here on, heading content is
408+ // processed like any other body text — emphasis, links, etc. inside
409+ // heading text gets the same treatment so it matches the HTML side
410+ // (where <h1><em>Foo</em></h1> renders as "Foo").
411+ // eslint-disable-next-line no-control-regex
412+ text = text . replace ( / \x00 H E A D ( \d + ) \x00 / g, ( _match , idxStr ) => headings [ parseInt ( idxStr , 10 ) ] ) ;
413+
414+ // Step 6: Strip remaining markdown formatting on body and heading text.
415+ text = text
416+ // Remove link/image URLs, keep text: [text](url) → text
417+ . replace ( / ! ? \[ ( [ ^ \] ] * ) \] \( [ ^ ) ] * \) / g, '$1' )
474418 // Remove emphasis markers. * emphasis is stripped unconditionally.
475419 // _ emphasis is stripped only at word boundaries (per CommonMark,
476420 // _text_ is emphasis only when _ is not adjacent to an alphanumeric).
@@ -483,7 +427,7 @@ function extractMarkdownText(markdown: string): string {
483427 // Remove horizontal rules
484428 . replace ( / ^ [ - * _ ] { 3 , } $ / gm, '' ) ;
485429
486- // Step 4 : Restore code content (without backticks/fence markers)
430+ // Step 7 : Restore code content (without backticks/fence markers).
487431 // eslint-disable-next-line no-control-regex
488432 text = text . replace ( / \x00 C O D E ( \d + ) \x00 / g, ( _match , idxStr ) => codeSpans [ parseInt ( idxStr , 10 ) ] ) ;
489433 // eslint-disable-next-line no-control-regex
0 commit comments