Skip to content

Commit 0c5cff4

Browse files
authored
Merge pull request #37 from agent-ecosystem/content-negotiation-markdown-avail
Fix false positives in content-negotiation check
2 parents 43b5c79 + 2b28d2b commit 0c5cff4

9 files changed

Lines changed: 410 additions & 49 deletions

File tree

src/checks/content-discoverability/llms-txt-directive.ts

Lines changed: 1 addition & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { registerCheck } from '../registry.js';
22
import { discoverAndSamplePages } from '../../helpers/get-page-urls.js';
3+
import { toHtmlUrl } from '../../helpers/to-md-urls.js';
34
import type { CheckContext, CheckResult } from '../../types.js';
45

56
interface DirectiveResult {
@@ -52,27 +53,6 @@ function extractBody(html: string): { body: string; offset: number } {
5253
return { body: html.slice(bodyStart, bodyEnd), offset: bodyStart };
5354
}
5455

55-
/**
56-
* Convert a markdown URL back to its HTML equivalent.
57-
* Strips trailing `.md` extension or `/index.md` suffix.
58-
*/
59-
function toHtmlUrl(url: string): string {
60-
try {
61-
const u = new URL(url);
62-
if (u.pathname.endsWith('.md')) {
63-
u.pathname = u.pathname.replace(/(?:\/index)?\.md$/, '') || '/';
64-
// Ensure trailing slash for directory-style URLs
65-
if (u.pathname !== '/' && !u.pathname.includes('.')) {
66-
u.pathname = u.pathname.replace(/\/?$/, '/');
67-
}
68-
return u.toString();
69-
}
70-
} catch {
71-
// Fall through to return original
72-
}
73-
return url;
74-
}
75-
7656
function searchContent(
7757
content: string,
7858
pattern: RegExp,

src/checks/markdown-availability/content-negotiation.ts

Lines changed: 37 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,19 @@
11
import { registerCheck } from '../registry.js';
22
import { looksLikeMarkdown, looksLikeHtml } from '../../helpers/detect-markdown.js';
3+
import { isSoft404Body } from '../../helpers/detect-soft-404.js';
34
import { discoverAndSamplePages } from '../../helpers/get-page-urls.js';
4-
import { isNonPageUrl } from '../../helpers/to-md-urls.js';
5+
import { isNonPageUrl, isMdUrl, toHtmlUrl } from '../../helpers/to-md-urls.js';
56
import type { CheckContext, CheckResult } from '../../types.js';
67

78
type Classification = 'markdown-with-correct-type' | 'markdown-with-wrong-type' | 'html';
89

910
interface PageResult {
1011
url: string;
12+
/** The URL actually fetched (may differ from url if .md was normalized). */
13+
testedUrl?: string;
1114
classification: Classification;
1215
skipped?: boolean;
16+
softError?: boolean;
1317
contentType: string;
1418
status: number;
1519
error?: string;
@@ -37,12 +41,34 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
3741
if (isNonPageUrl(url)) {
3842
return { url, classification: 'html', skipped: true, contentType: '', status: 0 };
3943
}
44+
45+
// Pre-request: normalize .md/.mdx URLs to their canonical HTML form (#33).
46+
// Testing content negotiation against a .md URL is meaningless because the
47+
// server already serves markdown at that path by definition.
48+
const fetchUrl = isMdUrl(url) ? toHtmlUrl(url) : url;
49+
const testedUrl = fetchUrl !== url ? fetchUrl : undefined;
50+
4051
try {
41-
const response = await ctx.http.fetch(url, {
52+
const response = await ctx.http.fetch(fetchUrl, {
4253
headers: { Accept: 'text/markdown' },
4354
});
4455
const body = await response.text();
4556
const contentType = response.headers.get('content-type') ?? '';
57+
58+
// Post-response: reject soft-404 error pages (#29).
59+
// Some servers return 200 with text/markdown for error pages
60+
// (e.g. "# Page Not Found"), which would inflate scores.
61+
if (isSoft404Body(body)) {
62+
return {
63+
url,
64+
testedUrl,
65+
classification: 'html',
66+
softError: true,
67+
contentType,
68+
status: response.status,
69+
};
70+
}
71+
4672
const isMarkdownType = contentType.includes('text/markdown');
4773
const isMarkdownBody = looksLikeMarkdown(body);
4874

@@ -68,10 +94,11 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
6894
classification = 'html';
6995
}
7096

71-
return { url, classification, contentType, status: response.status };
97+
return { url, testedUrl, classification, contentType, status: response.status };
7298
} catch (err) {
7399
return {
74100
url,
101+
testedUrl,
75102
classification: 'html',
76103
contentType: '',
77104
status: 0,
@@ -85,6 +112,8 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
85112

86113
const testedResults = results.filter((r) => !r.skipped);
87114
const skippedCount = results.length - testedResults.length;
115+
const normalizedCount = testedResults.filter((r) => r.testedUrl).length;
116+
const softErrorCount = testedResults.filter((r) => r.softError).length;
88117
const markdownWithCorrectType = testedResults.filter(
89118
(r) => r.classification === 'markdown-with-correct-type',
90119
).length;
@@ -102,12 +131,16 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
102131
const pageLabel = wasSampled ? 'sampled pages' : 'pages';
103132
const suffix =
104133
(fetchErrors > 0 ? `; ${fetchErrors} failed to fetch` : '') +
105-
(rateLimited > 0 ? `; ${rateLimited} rate-limited (HTTP 429)` : '');
134+
(rateLimited > 0 ? `; ${rateLimited} rate-limited (HTTP 429)` : '') +
135+
(softErrorCount > 0 ? `; ${softErrorCount} returned error pages` : '') +
136+
(normalizedCount > 0 ? `; ${normalizedCount} .md URLs normalized` : '');
106137

107138
const details: Record<string, unknown> = {
108139
totalPages,
109140
testedPages: testedResults.length,
110141
skippedPages: skippedCount,
142+
normalizedMdUrls: normalizedCount,
143+
softErrorPages: softErrorCount,
111144
sampled: wasSampled,
112145
markdownWithCorrectType,
113146
markdownWithWrongType,

src/checks/observability/markdown-content-parity.ts

Lines changed: 1 addition & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import { parse } from 'node-html-parser';
22
import { registerCheck } from '../registry.js';
33
import { fetchPage } from '../../helpers/fetch-page.js';
4+
import { toHtmlUrl } from '../../helpers/to-md-urls.js';
45
import type { CheckContext, CheckResult, CheckStatus } from '../../types.js';
56

67
/** Thresholds for the percentage of HTML segments not found in markdown. */
@@ -582,27 +583,6 @@ function computeParity(
582583
};
583584
}
584585

585-
/**
586-
* Derive the HTML page URL from a cached page URL.
587-
* Inverts the transforms from toMdUrls():
588-
* /docs/guide.md → /docs/guide
589-
* /docs/guide/index.md → /docs/guide/
590-
* /docs/guide.mdx → /docs/guide
591-
* If the URL doesn't end in .md/.mdx, return it unchanged.
592-
*/
593-
function toHtmlUrl(url: string): string {
594-
const parsed = new URL(url);
595-
if (parsed.pathname.endsWith('/index.md') || parsed.pathname.endsWith('/index.mdx')) {
596-
parsed.pathname = parsed.pathname.replace(/\/index\.mdx?$/, '/');
597-
return parsed.toString();
598-
}
599-
if (/\.mdx?$/i.test(parsed.pathname)) {
600-
parsed.pathname = parsed.pathname.replace(/\.mdx?$/i, '');
601-
return parsed.toString();
602-
}
603-
return url;
604-
}
605-
606586
function worstStatus(statuses: CheckStatus[]): CheckStatus {
607587
if (statuses.includes('fail')) return 'fail';
608588
if (statuses.includes('warn')) return 'warn';

src/checks/url-stability/http-status-codes.ts

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { registerCheck } from '../registry.js';
22
import { discoverAndSamplePages } from '../../helpers/get-page-urls.js';
3+
import { SOFT_404_PATTERNS } from '../../helpers/detect-soft-404.js';
34
import type { CheckContext, CheckResult } from '../../types.js';
45

56
interface StatusCodeResult {
@@ -21,8 +22,6 @@ function makeBadUrl(pageUrl: string): string {
2122
return u.toString();
2223
}
2324

24-
const SOFT_404_PATTERNS = /not\s*found|page\s*not\s*found|404|does\s*not\s*exist/i;
25-
2625
async function check(ctx: CheckContext): Promise<CheckResult> {
2726
const id = 'http-status-codes';
2827
const category = 'url-stability';

src/helpers/detect-soft-404.ts

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
/**
 * Broad soft-404 detection pattern.
 *
 * Matches common "not found" text in response bodies. Used by http-status-codes
 * as a hint on pages already suspected of being soft-404s (fabricated bad URLs
 * that returned 200).
 */
export const SOFT_404_PATTERNS = /not\s*found|page\s*not\s*found|404|does\s*not\s*exist/i;

/**
 * Returns true if a markdown response body looks like an error page rather than
 * real content. This is stricter than SOFT_404_PATTERNS because it runs on
 * legitimate page URLs where documentation might naturally mention "404".
 *
 * Detection strategy (both steps operate on the whitespace-trimmed body, so
 * leading/trailing padding cannot hide an error page):
 *   1. If the first markdown heading found in the first 500 characters contains
 *      an error pattern, it's an error page. Real error pages say
 *      "# Page Not Found"; real docs don't lead with that.
 *   2. If the trimmed body is very short (< 500 chars), scan it entirely. Terse
 *      error responses like "Not found" should still be caught.
 *
 * @param body - Raw response body text.
 * @returns `true` when the body looks like a soft-404 error page.
 */
export function isSoft404Body(body: string): boolean {
  // Measure and scan the meaningful text only; padding whitespace would
  // otherwise push a terse error message past the short-body threshold or
  // push the first heading out of the inspected window.
  const text = body.trim();

  // Check the first markdown heading (e.g. "# Page Not Found").
  const headingText = /^#{1,6}\s+(.+)/m.exec(text.slice(0, 500))?.[1];
  if (headingText !== undefined && SOFT_404_PATTERNS.test(headingText)) {
    return true;
  }

  // For very short bodies, scan the whole thing. A real page has substantial
  // content; a terse error message like "Not found" or "404" is short.
  return text.length < 500 && SOFT_404_PATTERNS.test(text);
}

src/helpers/to-md-urls.ts

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,43 @@ export function isNonPageUrl(url: string): boolean {
3838
);
3939
}
4040

41+
/**
 * Convert a .md or .mdx URL back to its canonical HTML equivalent.
 * Inverts the transforms from toMdUrls():
 *   /docs/guide.md        -> /docs/guide
 *   /docs/guide/index.md  -> /docs/guide/
 *   /docs/guide.mdx       -> /docs/guide
 * Extension and `index` matching are case-insensitive, in line with isMdUrl(),
 * so every URL isMdUrl() accepts is normalized the same way.
 * If the URL doesn't end in .md/.mdx, or cannot be parsed, return it unchanged.
 *
 * @param url - Absolute URL to normalize.
 * @returns The HTML-equivalent URL, or `url` itself when nothing applies.
 */
export function toHtmlUrl(url: string): string {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    // Not a parseable absolute URL: hand the input back untouched.
    return url;
  }

  const { pathname } = parsed;

  // Directory index file: keep the trailing slash of the directory.
  if (/\/index\.mdx?$/i.test(pathname)) {
    parsed.pathname = pathname.replace(/\/index\.mdx?$/i, '/');
    return parsed.toString();
  }

  // Plain markdown file: drop only the extension.
  if (/\.mdx?$/i.test(pathname)) {
    parsed.pathname = pathname.replace(/\.mdx?$/i, '');
    return parsed.toString();
  }

  return url;
}

/**
 * Returns true if the URL points to a .md or .mdx file.
 * Only the path is inspected (query string and fragment are ignored) and the
 * extension match is case-insensitive. Unparseable URLs yield false.
 *
 * @param url - Absolute URL to test.
 */
export function isMdUrl(url: string): boolean {
  try {
    return /\.mdx?$/i.test(new URL(url).pathname);
  } catch {
    return false;
  }
}
77+
4178
/**
4279
* Generate candidate .md URLs for a page URL.
4380
* If the URL already ends in .md, return it as-is.

0 commit comments

Comments
 (0)