Skip to content

Commit facd16d

Browse files
committed
Fix false positives, improve verbose output, and cap HTTP retries

- Skip non-page file types (.json, .xml) in content-negotiation check
- Add alreadyMd flag to markdown-url-support so .md URLs that serve HTML get a distinct message instead of 'no .md URL found'
- Improve content-start-position heuristic: add linkDensity and headingFollowedByContent look-ahead to skip nav/sidebar headings, tighten prose fallback thresholds, fix CSS pattern to match custom properties with digits (fixes Stripe 0% false positive)
- Ignore code fences inside markdown table cells (vendor extension)
- Make all verbose formatter messages descriptive: content-negotiation shows actual content-type, markdown-url-support distinguishes .md URLs serving HTML, add per-page detail for markdown-code-fence-validity
- Cap HTTP 429 retries at 2 to prevent infinite retry loops that caused the CLI to hang on rate-limited sites like Stripe
- Add tests for all new behavior
1 parent eb1e5fc commit facd16d

12 files changed

Lines changed: 489 additions & 88 deletions

File tree

src/checks/content-structure/markdown-code-fence-validity.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@ function analyzeFences(content: string): { fenceCount: number; issues: FenceIssu
4040
const match = FENCE_RE.exec(stripped);
4141
if (!match) continue;
4242

43+
// Skip fences inside markdown table cells (e.g. "``` | ```" or "| ```")
44+
// These aren't real CommonMark fences — multi-line table cells are a vendor extension
45+
if (stripped.includes('|')) continue;
46+
4347
const char = match[3] ? '`' : '~';
4448
const length = (match[3] || match[4]).length;
4549

src/checks/markdown-availability/content-negotiation.ts

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,15 @@
11
import { registerCheck } from '../registry.js';
22
import { looksLikeMarkdown } from '../../helpers/detect-markdown.js';
33
import { discoverAndSamplePages } from '../../helpers/get-page-urls.js';
4+
import { isNonPageUrl } from '../../helpers/to-md-urls.js';
45
import type { CheckContext, CheckResult } from '../../types.js';
56

67
type Classification = 'markdown-with-correct-type' | 'markdown-with-wrong-type' | 'html';
78

89
interface PageResult {
910
url: string;
1011
classification: Classification;
12+
skipped?: boolean;
1113
contentType: string;
1214
status: number;
1315
error?: string;
@@ -31,6 +33,10 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
3133
const batch = pageUrls.slice(i, i + concurrency);
3234
const batchResults = await Promise.all(
3335
batch.map(async (url): Promise<PageResult> => {
36+
// Non-page file types (e.g. .json, .xml) are already in a machine-readable format
37+
if (isNonPageUrl(url)) {
38+
return { url, classification: 'html', skipped: true, contentType: '', status: 0 };
39+
}
3440
try {
3541
const response = await ctx.http.fetch(url, {
3642
headers: { Accept: 'text/markdown' },
@@ -77,16 +83,21 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
7783
results.push(...batchResults);
7884
}
7985

80-
const markdownWithCorrectType = results.filter(
86+
const testedResults = results.filter((r) => !r.skipped);
87+
const skippedCount = results.length - testedResults.length;
88+
const markdownWithCorrectType = testedResults.filter(
8189
(r) => r.classification === 'markdown-with-correct-type',
8290
).length;
83-
const markdownWithWrongType = results.filter(
91+
const markdownWithWrongType = testedResults.filter(
8492
(r) => r.classification === 'markdown-with-wrong-type',
8593
).length;
86-
const htmlOnly = results.filter((r) => r.classification === 'html').length;
87-
const negotiationRate = Math.round((markdownWithCorrectType / results.length) * 100);
88-
const fetchErrors = results.filter((r) => r.error).length;
89-
const rateLimited = results.filter((r) => r.status === 429).length;
94+
const htmlOnly = testedResults.filter((r) => r.classification === 'html').length;
95+
const negotiationRate =
96+
testedResults.length > 0
97+
? Math.round((markdownWithCorrectType / testedResults.length) * 100)
98+
: 0;
99+
const fetchErrors = testedResults.filter((r) => r.error).length;
100+
const rateLimited = testedResults.filter((r) => r.status === 429).length;
90101

91102
const pageLabel = wasSampled ? 'sampled pages' : 'pages';
92103
const suffix =
@@ -95,7 +106,8 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
95106

96107
const details: Record<string, unknown> = {
97108
totalPages,
98-
testedPages: results.length,
109+
testedPages: testedResults.length,
110+
skippedPages: skippedCount,
99111
sampled: wasSampled,
100112
markdownWithCorrectType,
101113
markdownWithWrongType,
@@ -112,7 +124,7 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
112124
id,
113125
category,
114126
status: 'pass',
115-
message: `${markdownWithCorrectType}/${results.length} ${pageLabel} support content negotiation (${negotiationRate}%)${suffix}`,
127+
message: `${markdownWithCorrectType}/${testedResults.length} ${pageLabel} support content negotiation (${negotiationRate}%)${suffix}`,
116128
details,
117129
};
118130
}
@@ -131,7 +143,7 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
131143
id,
132144
category,
133145
status: 'fail',
134-
message: `Server ignores Accept: text/markdown header (0/${results.length} ${pageLabel} return markdown)${suffix}`,
146+
message: `Server ignores Accept: text/markdown header (0/${testedResults.length} ${pageLabel} return markdown)${suffix}`,
135147
details,
136148
};
137149
}

src/checks/markdown-availability/markdown-url-support.ts

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ interface PageResult {
88
url: string;
99
mdUrl: string;
1010
supported: boolean;
11+
skipped?: boolean;
12+
alreadyMd?: boolean;
1113
status: number;
1214
error?: string;
1315
}
@@ -31,6 +33,11 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
3133
const batchResults = await Promise.all(
3234
batch.map(async (url): Promise<PageResult> => {
3335
const candidates = toMdUrls(url);
36+
// Non-markdown file types (e.g. .json, .xml) have no .md equivalent — skip them
37+
if (candidates.length === 0) {
38+
return { url, mdUrl: url, supported: false, skipped: true, status: 0 };
39+
}
40+
const alreadyMd = /\.mdx?$/i.test(new URL(url).pathname);
3441
let lastError: string | undefined;
3542
for (const mdUrl of candidates) {
3643
try {
@@ -46,24 +53,34 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
4653
url,
4754
markdown: { content: body, source: 'md-url' },
4855
});
49-
return { url, mdUrl, supported: true, status: response.status };
56+
return { url, mdUrl, supported: true, alreadyMd, status: response.status };
5057
}
5158
lastError = undefined; // Got a response, not a fetch error
5259
} catch (err) {
5360
lastError = err instanceof Error ? err.message : String(err);
5461
}
5562
}
56-
return { url, mdUrl: candidates[0], supported: false, status: 0, error: lastError };
63+
return {
64+
url,
65+
mdUrl: candidates[0],
66+
supported: false,
67+
alreadyMd,
68+
status: 0,
69+
error: lastError,
70+
};
5771
}),
5872
);
5973
results.push(...batchResults);
6074
}
6175

62-
const mdSupported = results.filter((r) => r.supported).length;
63-
const mdUnsupported = results.length - mdSupported;
64-
const supportRate = Math.round((mdSupported / results.length) * 100);
65-
const fetchErrors = results.filter((r) => r.error).length;
66-
const rateLimited = results.filter((r) => r.status === 429).length;
76+
const testedResults = results.filter((r) => !r.skipped);
77+
const skippedCount = results.length - testedResults.length;
78+
const mdSupported = testedResults.filter((r) => r.supported).length;
79+
const mdUnsupported = testedResults.length - mdSupported;
80+
const supportRate =
81+
testedResults.length > 0 ? Math.round((mdSupported / testedResults.length) * 100) : 0;
82+
const fetchErrors = testedResults.filter((r) => r.error).length;
83+
const rateLimited = testedResults.filter((r) => r.status === 429).length;
6784

6885
const pageLabel = wasSampled ? 'sampled pages' : 'pages';
6986
const suffix =
@@ -72,7 +89,8 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
7289

7390
const details: Record<string, unknown> = {
7491
totalPages,
75-
testedPages: results.length,
92+
testedPages: testedResults.length,
93+
skippedPages: skippedCount,
7694
sampled: wasSampled,
7795
mdSupported,
7896
mdUnsupported,
@@ -88,7 +106,7 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
88106
id,
89107
category,
90108
status: 'pass',
91-
message: `${mdSupported}/${results.length} ${pageLabel} support .md URLs (${supportRate}%)${suffix}`,
109+
message: `${mdSupported}/${testedResults.length} ${pageLabel} support .md URLs (${supportRate}%)${suffix}`,
92110
details,
93111
};
94112
}
@@ -98,7 +116,7 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
98116
id,
99117
category,
100118
status: 'warn',
101-
message: `${mdSupported}/${results.length} ${pageLabel} support .md URLs (${supportRate}%); inconsistent support${suffix}`,
119+
message: `${mdSupported}/${testedResults.length} ${pageLabel} support .md URLs (${supportRate}%); inconsistent support${suffix}`,
102120
details,
103121
};
104122
}
@@ -107,7 +125,7 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
107125
id,
108126
category,
109127
status: 'fail',
110-
message: `No ${pageLabel} support .md URLs (0/${results.length} tested)${suffix}`,
128+
message: `No ${pageLabel} support .md URLs (0/${testedResults.length} tested)${suffix}`,
111129
details,
112130
};
113131
}

src/checks/page-size/content-start-position.ts

Lines changed: 93 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,76 @@ interface PagePositionResult {
1313
error?: string;
1414
}
1515

16-
const CSS_PATTERN = /[{}\s]*[a-z-]+\s*:\s*[^;]+;/;
16+
const CSS_PATTERN = /[{}\s]*[a-z0-9_-]+\s*:\s*[^;]+;/;
1717
const JS_PATTERNS = [/^\s*(function|var|const|let|import|export)\b/, /^\s*\/\//, /[{};]\s*$/];
18+
const INLINE_SCRIPT_MIN_LENGTH = 200;
19+
const INLINE_SCRIPT_TOKENS =
20+
/function\s*\(|=>\s*\{|document\.|window\.|localStorage|\.addEventListener|\.getElementById|\.querySelector|\.setAttribute|self\.\\/;
1821
const NAV_MAX_LENGTH = 40;
1922

23+
/**
 * Measure how much of a line is markdown link syntax: `[text](url)` or `[![img](src)](url)`.
 *
 * @param line - A single line of markdown (the visible callers pass trimmed lines).
 * @returns Combined length of all matched links divided by the line length;
 *   0 when the line contains no complete link.
 */
function linkDensity(line: string): number {
  // Match plain links [text](url) and image links [![alt](src)](url).
  // The link-text group consumes ONE non-bracket character per iteration. Putting a
  // `*` on that character class inside the already-repeated group matches the same
  // strings but lets the engine split a run of text in exponentially many ways when
  // no closing `](` follows (e.g. a line holding only the first half of a link that
  // was split across lines), which can hang the check on a single long line.
  const links = line.match(/\[(?:[^[\]]|!\[[^\]]*\]\([^)]*\))*\]\([^)]*\)/g);
  if (!links) return 0;
  return links.join('').length / line.length;
}
30+
31+
/** Returns true if the line is script or CSS content that should be ignored. */
32+
function isBoilerplateLine(line: string): boolean {
33+
if (CSS_PATTERN.test(line)) return true;
34+
if (JS_PATTERNS.some((p) => p.test(line))) return true;
35+
if (line.length >= INLINE_SCRIPT_MIN_LENGTH && INLINE_SCRIPT_TOKENS.test(line)) return true;
36+
return false;
37+
}
38+
39+
/**
 * Decide whether a heading introduces real content (prose follows it) rather than
 * a sidebar/nav section (links or short items follow it).
 *
 * Scans forward from the heading and inspects at most 6 lines that are neither
 * empty nor script/CSS boilerplate.
 *
 * @param lines - All lines of the markdown document.
 * @param headingIdx - Index in `lines` of the heading line (ATX heading, or the text line of a setext heading).
 * @returns `true` as soon as a prose-like line is found; `false` if another heading
 *   is reached first or the look-ahead window ends without finding prose.
 */
function headingFollowedByContent(lines: string[], headingIdx: number): boolean {
  const isSetextUnderline = (text: string): boolean => /^[=-]+$/.test(text);

  // Begin after the heading, stepping over a setext underline if one is present.
  let cursor = headingIdx + 1;
  if (cursor < lines.length && isSetextUnderline(lines[cursor].trim())) {
    cursor += 1;
  }

  let inspected = 0;
  while (cursor < lines.length && inspected < 6) {
    const idx = cursor;
    cursor += 1;

    const text = lines[idx].trim();
    // Blank lines and script/CSS boilerplate do not use up the look-ahead budget.
    if (text.length === 0 || isBoilerplateLine(text)) continue;
    inspected += 1;

    // Link-heavy lines, `* [` list items, and dangling `](url)` fragments look
    // like navigation: keep scanning.
    const density = linkDensity(text);
    const looksLikeNav = density > 0.5 || /^\*\s+\[/.test(text) || /^\]\(/.test(text);
    if (looksLikeNav) continue;

    // Reaching another heading (ATX or setext) means this one had no prose body.
    if (/^#{1,6}\s/.test(text)) return false;
    const following = idx + 1 < lines.length ? lines[idx + 1].trim() : '';
    if (following.length >= 2 && isSetextUnderline(following)) return false;

    const notLinkHeavy = density < 0.5;
    // Longer than a nav token and not link-heavy: treat as prose.
    if (notLinkHeavy && text.length > NAV_MAX_LENGTH) return true;
    // A shorter line that ends a sentence also counts as prose.
    if (notLinkHeavy && text.length >= 10 && /[.!?]$/.test(text)) return true;

    // Anything else is a short item, typically navigation: keep scanning.
  }
  return false;
}
81+
2082
/**
2183
* Find the character position where meaningful content begins in converted markdown.
22-
* Meaningful content is a heading or a prose paragraph (not CSS, JS, or short nav text).
84+
* Meaningful content is a heading followed by prose, or a standalone prose paragraph
85+
* that isn't navigation, scripts, CSS, or link-heavy boilerplate.
2386
*/
2487
function findContentStart(markdown: string): number {
2588
const lines = markdown.split('\n');
@@ -34,37 +97,52 @@ function findContentStart(markdown: string): number {
3497
continue;
3598
}
3699

37-
// ATX heading: starts with # at beginning of line
38-
if (/^#{1,6}\s+\S/.test(trimmed)) {
39-
return charPos;
100+
// ATX heading h1-h4 followed by prose content (not a nav/sidebar heading)
101+
if (/^#{1,4}\s+\S/.test(trimmed) && !/^#{5,6}\s/.test(trimmed)) {
102+
if (headingFollowedByContent(lines, idx)) {
103+
return charPos;
104+
}
105+
// Otherwise skip it as a sidebar/nav heading
106+
charPos += line.length + 1;
107+
continue;
40108
}
41109

42-
// Setext heading: current line is text, next line is === or ---
110+
// Setext heading followed by prose content
43111
const nextLine = idx + 1 < lines.length ? lines[idx + 1].trim() : '';
44112
if (/^[=-]+$/.test(nextLine) && nextLine.length >= 2 && trimmed.length > 0) {
45-
return charPos;
113+
if (headingFollowedByContent(lines, idx)) {
114+
return charPos;
115+
}
116+
charPos += line.length + 1;
117+
continue;
46118
}
47119

48-
// Skip CSS-like lines
49-
if (CSS_PATTERN.test(trimmed)) {
120+
// Skip CSS, JS, and inline script boilerplate
121+
if (isBoilerplateLine(trimmed)) {
50122
charPos += line.length + 1;
51123
continue;
52124
}
53125

54-
// Skip JS-like lines
55-
if (JS_PATTERNS.some((p) => p.test(trimmed))) {
126+
// Skip lines dominated by markdown link syntax (nav bars, TOC, link lists)
127+
if (linkDensity(trimmed) > 0.5) {
56128
charPos += line.length + 1;
57129
continue;
58130
}
59131

60-
// Skip very short nav-like tokens (e.g., "Home", "Docs", "API")
61-
if (trimmed.length <= NAV_MAX_LENGTH && !/[.!?]/.test(trimmed) && !trimmed.includes(' ')) {
132+
// Skip bare link fragments from Turndown splitting links across lines: `](/path)`
133+
if (/^\]\(/.test(trimmed)) {
62134
charPos += line.length + 1;
63135
continue;
64136
}
65137

66-
// Prose-like paragraph: contains spaces (multiple words) and is reasonably long
67-
if (trimmed.length > NAV_MAX_LENGTH || (trimmed.includes(' ') && trimmed.length > 20)) {
138+
// Standalone prose paragraph (not preceded by a heading we recognized).
139+
// Must be a strong signal of real content to avoid matching UI chrome like
140+
// "Press Enter to activate dropdown" or "Select language: English".
141+
// Require sentence punctuation + substantial length, or very long text.
142+
if (trimmed.length >= 80 && linkDensity(trimmed) < 0.5) {
143+
return charPos;
144+
}
145+
if (/[.!?]$/.test(trimmed) && trimmed.length >= 40 && linkDensity(trimmed) < 0.5) {
68146
return charPos;
69147
}
70148

src/cli/formatters/text.ts

Lines changed: 37 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -69,18 +69,50 @@ const DETAIL_FORMATTERS: Record<string, DetailFormatter> = {
6969
const pages = details.pageResults as PageResult[] | undefined;
7070
if (!pages) return [];
7171
return pages
72-
.filter((p) => !p.supported)
73-
.map((p) => formatDetailLine('warn', p.url, 'no .md URL found'));
72+
.filter((p) => !p.supported && !p.skipped)
73+
.map((p) => {
74+
const msg = p.alreadyMd ? '.md URL serves HTML, not markdown' : 'no .md URL found';
75+
return formatDetailLine('warn', p.url, msg);
76+
});
7477
},
7578

7679
'content-negotiation': (details) => {
7780
const pages = details.pageResults as PageResult[] | undefined;
7881
if (!pages) return [];
7982
return pages
80-
.filter((p) => p.status !== 'pass')
83+
.filter((p) => p.classification !== 'markdown-with-correct-type' && !p.skipped)
8184
.map((p) => {
82-
const classification = (p.classification as string) ?? '';
83-
return formatDetailLine(p.status, p.url, classification);
85+
const status = p.classification === 'markdown-with-wrong-type' ? 'warn' : 'fail';
86+
const urlIsMd = /\.mdx?$/i.test(new URL(p.url).pathname);
87+
let label: string;
88+
if (p.classification === 'markdown-with-wrong-type') {
89+
const ct = (p.contentType as string) || 'unknown';
90+
label = `returns markdown but content-type is ${ct}`;
91+
} else if (urlIsMd) {
92+
label = '.md URL serves HTML, not markdown';
93+
} else {
94+
label = 'returns HTML, ignores Accept header';
95+
}
96+
return formatDetailLine(status, p.url, label);
97+
});
98+
},
99+
100+
'markdown-code-fence-validity': (details) => {
101+
const pages = details.pageResults as PageResult[] | undefined;
102+
if (!pages) return [];
103+
return pages
104+
.filter((p) => p.status !== 'pass')
105+
.flatMap((p) => {
106+
const issues =
107+
(p.issues as Array<{ line: number; type: string; opener: string; closer?: string }>) ??
108+
[];
109+
return issues.map((issue) => {
110+
const info =
111+
issue.type === 'unclosed'
112+
? `unclosed ${issue.opener} at line ${issue.line}`
113+
: `${issue.opener} closed with ${issue.closer} at line ${issue.line}`;
114+
return formatDetailLine(issue.type === 'unclosed' ? 'fail' : 'warn', p.url, info);
115+
});
84116
});
85117
},
86118

0 commit comments

Comments
 (0)