Skip to content

Commit 4edcc3e

Browse files
committed
fix: preserve original .md URL from llms.txt for markdown-availability checks
When llms.txt linked to a .md/.mdx URL (notably Plaid's /index.html.md form), normalizePageUrl rewrote it to its HTML equivalent for sitemap dedup, then toMdUrls regenerated candidates from the HTML form that missed the URL the site actually published. markdown-url-support scored 0% on otherwise-compliant sites. Carry the original .md URL alongside the normalized URL through discovery as originalMdUrls. markdown-url-support tries it first, then falls through to toMdUrls() candidates, then a parent-clean fallback (gated to /index.html.md sources). toMdUrls itself is unchanged so other checks (llms-txt-directive-md, llms-txt-links-markdown) cannot regress to the prior false-positive class. Closes #77
1 parent 5692a52 commit 4edcc3e

7 files changed

Lines changed: 947 additions & 42 deletions

File tree

src/checks/markdown-availability/markdown-url-support.ts

Lines changed: 68 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,19 +12,25 @@ interface PageResult {
1212
alreadyMd?: boolean;
1313
status: number;
1414
error?: string;
15+
/** True when the original llms.txt-published URL served the content. */
16+
originalUrlServed?: boolean;
1517
}
1618

1719
/**
1820
* Detect whether the site prefers `page.md` (direct) or `page/index.md` (index)
1921
* based on which candidate succeeded in previous results.
2022
* Returns 'index' if `page/index.md` wins, 'direct' if `page.md` wins, or null if
2123
* there's no clear winner yet.
24+
*
25+
* Wins served via the `originalMdUrl` from llms.txt are NOT counted: those
26+
* URLs reflect the site's published form, not a `toMdUrls()` candidate, and
27+
* counting them would skew the heuristic for unrelated pages.
2228
*/
2329
function detectPreferredMdForm(results: PageResult[]): 'direct' | 'index' | null {
2430
let directWins = 0;
2531
let indexWins = 0;
2632
for (const r of results) {
27-
if (!r.supported || !r.mdUrl) continue;
33+
if (!r.supported || !r.mdUrl || r.originalUrlServed) continue;
2834
if (r.mdUrl.endsWith('/index.md') || r.mdUrl.endsWith('/index.mdx')) {
2935
indexWins++;
3036
} else {
@@ -38,6 +44,33 @@ function detectPreferredMdForm(results: PageResult[]): 'direct' | 'index' | null
3844
return null;
3945
}
4046

47+
/**
 * Issue #77: when llms.txt published a `/foo/index.html.md` URL but it 404s
 * (and the regenerated `/foo/index.md` and `/foo/index.html/index.md` also
 * 404), some sites still serve the markdown at the parent-clean path
 * `/foo.md` (Plaid's pattern). This is gated to URLs whose llms.txt original
 * matched `/index.html?\.md$` — strong evidence the site uses this convention.
 *
 * Do NOT move this into `toMdUrls()`. Other checks (`llms-txt-directive-md`,
 * `llms-txt-links-markdown`, etc.) call `toMdUrls()` directly and would
 * regress to the old false-positive class where unrelated sibling .md files
 * pass validation. See issue #77 discussion.
 *
 * @param pageUrl - The page URL being checked; it must end in `/index.html`
 *   or `/index.htm` (an optional trailing slash is ignored) for a fallback to
 *   be derived.
 * @param originalMdUrl - The markdown URL that llms.txt published for this page.
 * @returns The parent-clean `.md` URL (query string and fragment of `pageUrl`
 *   are kept), or `null` when the gate does not match, when the page URL has
 *   no `/index.html?` suffix to strip, when stripping would leave an empty
 *   path, or when either argument is not a parseable absolute URL.
 */
function deriveParentCleanMd(pageUrl: string, originalMdUrl: string): string | null {
  try {
    // Parse the llms.txt URL inside the try: a malformed entry must mean
    // "no fallback available" rather than an exception escaping to the caller.
    const originalPath = new URL(originalMdUrl).pathname;
    if (!/\/index\.html?\.md$/i.test(originalPath)) return null;

    const u = new URL(pageUrl);
    const pathname = u.pathname.replace(/\/$/, '');
    // Strip /index.html or /index.htm from the page URL and append .md
    const stripped = pathname.replace(/\/index\.html?$/i, '');
    // Empty result means a site-root index page; unchanged means the page URL
    // had no index suffix. Neither has a parent-clean form.
    if (!stripped || stripped === pathname) return null;
    u.pathname = `${stripped}.md`;
    return u.toString();
  } catch {
    return null;
  }
}
73+
4174
/**
4275
* Reorder toMdUrls() candidates based on the detected site preference.
4376
* 'index' puts `page/index.md` first; 'direct' keeps the default order (`page.md` first).
@@ -58,6 +91,7 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
5891
totalPages,
5992
sampled: wasSampled,
6093
warnings,
94+
originalMdUrls,
6195
} = await discoverAndSamplePages(ctx);
6296

6397
const results: PageResult[] = [];
@@ -68,15 +102,35 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
68102
const batch = pageUrls.slice(i, i + concurrency);
69103
const batchResults = await Promise.all(
70104
batch.map(async (url): Promise<PageResult> => {
71-
const candidates = toMdUrls(url);
105+
const baseCandidates = toMdUrls(url);
72106
// Non-markdown file types (e.g. .json, .xml) have no .md equivalent — skip them
73-
if (candidates.length === 0) {
107+
if (baseCandidates.length === 0) {
74108
return { url, mdUrl: url, supported: false, skipped: true, status: 0 };
75109
}
76110
const alreadyMd = /\.mdx?$/i.test(new URL(url).pathname);
77-
const ordered = orderCandidates(candidates, mdFormPreference);
111+
const original = originalMdUrls?.[url];
112+
const parentClean = original ? deriveParentCleanMd(url, original) : null;
113+
114+
// Build candidate list:
115+
// 1. originalMdUrl (the URL llms.txt published) — first, when present.
116+
// 2. toMdUrls() candidates, reordered by detected site preference.
117+
// 3. parent-clean fallback (issue #77) — last, only when llms.txt
118+
// published a /foo/index.html.md form. Tried only if 1+2 fail.
119+
const ordered = orderCandidates(baseCandidates, mdFormPreference);
120+
const candidateList: string[] = [];
121+
const seen = new Set<string>();
122+
const addCandidate = (c: string | null | undefined) => {
123+
if (c && !seen.has(c)) {
124+
seen.add(c);
125+
candidateList.push(c);
126+
}
127+
};
128+
addCandidate(original);
129+
for (const c of ordered) addCandidate(c);
130+
addCandidate(parentClean);
131+
78132
let lastError: string | undefined;
79-
for (const mdUrl of ordered) {
133+
for (const mdUrl of candidateList) {
80134
try {
81135
const response = await ctx.http.fetch(mdUrl);
82136
const body = await response.text();
@@ -90,7 +144,14 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
90144
url,
91145
markdown: { content: body, source: 'md-url' },
92146
});
93-
return { url, mdUrl, supported: true, alreadyMd, status: response.status };
147+
return {
148+
url,
149+
mdUrl,
150+
supported: true,
151+
alreadyMd,
152+
status: response.status,
153+
originalUrlServed: mdUrl === original,
154+
};
94155
}
95156
lastError = undefined; // Got a response, not a fetch error
96157
} catch (err) {
@@ -99,7 +160,7 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
99160
}
100161
return {
101162
url,
102-
mdUrl: ordered[0],
163+
mdUrl: candidateList[0],
103164
supported: false,
104165
alreadyMd,
105166
status: 0,

0 commit comments

Comments
 (0)