@@ -12,19 +12,25 @@ interface PageResult {
1212 alreadyMd ?: boolean ;
1313 status : number ;
1414 error ?: string ;
15+ /** True when the original llms.txt-published URL served the content. */
16+ originalUrlServed ?: boolean ;
1517}
1618
1719/**
1820 * Detect whether the site prefers `page.md` (direct) or `page/index.md` (index)
1921 * based on which candidate succeeded in previous results.
2022 * Returns 'index' if `page/index.md` wins, 'direct' if `page.md` wins, or null if
2123 * there's no clear winner yet.
24+ *
25+ * Wins served via the `originalMdUrl` from llms.txt are NOT counted: those
26+ * URLs reflect the site's published form, not a `toMdUrls()` candidate, and
27+ * counting them would skew the heuristic for unrelated pages.
2228 */
2329function detectPreferredMdForm ( results : PageResult [ ] ) : 'direct' | 'index' | null {
2430 let directWins = 0 ;
2531 let indexWins = 0 ;
2632 for ( const r of results ) {
27- if ( ! r . supported || ! r . mdUrl ) continue ;
33+ if ( ! r . supported || ! r . mdUrl || r . originalUrlServed ) continue ;
2834 if ( r . mdUrl . endsWith ( '/index.md' ) || r . mdUrl . endsWith ( '/index.mdx' ) ) {
2935 indexWins ++ ;
3036 } else {
@@ -38,6 +44,33 @@ function detectPreferredMdForm(results: PageResult[]): 'direct' | 'index' | null
3844 return null ;
3945}
4046
/**
 * Issue #77: when llms.txt published a `/foo/index.html.md` URL but it 404s
 * (and the regenerated `/foo/index.md` and `/foo/index.html/index.md` also
 * 404), some sites still serve the markdown at the parent-clean path
 * `/foo.md` (Plaid's pattern). This is gated to URLs whose llms.txt original
 * matched `/index.html?\.md$` — strong evidence the site uses this convention.
 *
 * Do NOT move this into `toMdUrls()`. Other checks (`llms-txt-directive-md`,
 * `llms-txt-links-markdown`, etc.) call `toMdUrls()` directly and would
 * regress to the old false-positive class where unrelated sibling .md files
 * pass validation. See issue #77 discussion.
 *
 * @param pageUrl - Absolute URL of the page being checked; its path must end
 *   in `/index.html` or `/index.htm` (an optional trailing slash is ignored)
 *   for a candidate to be produced.
 * @param originalMdUrl - Absolute markdown URL published in llms.txt for this
 *   page; its path must end in `/index.html.md` or `/index.htm.md`
 *   (case-insensitive).
 * @returns The parent-clean candidate (e.g. `https://host/foo.md`, with the
 *   page URL's query string and fragment preserved), or `null` when the gate
 *   does not match, the page is the site root index, or either URL cannot be
 *   parsed.
 */
function deriveParentCleanMd(pageUrl: string, originalMdUrl: string): string | null {
  try {
    // Parse the llms.txt URL inside the try block: `new URL()` throws on
    // malformed input, and this helper's contract is to return null for any
    // input it cannot use rather than propagate an exception to the caller.
    if (!/\/index\.html?\.md$/i.test(new URL(originalMdUrl).pathname)) return null;

    const u = new URL(pageUrl);
    const pathname = u.pathname.replace(/\/$/, '');
    // Strip /index.html or /index.htm from the page URL and append .md
    const stripped = pathname.replace(/\/index\.html?$/i, '');
    // Empty => the page was the root index (no parent segment to attach .md to);
    // unchanged => the page path did not end in /index.html, so nothing to derive.
    if (!stripped || stripped === pathname) return null;
    u.pathname = `${stripped}.md`;
    return u.toString();
  } catch {
    return null;
  }
}
73+
4174/**
4275 * Reorder toMdUrls() candidates based on the detected site preference.
4376 * 'index' puts `page/index.md` first; 'direct' keeps the default order (`page.md` first).
@@ -58,6 +91,7 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
5891 totalPages,
5992 sampled : wasSampled ,
6093 warnings,
94+ originalMdUrls,
6195 } = await discoverAndSamplePages ( ctx ) ;
6296
6397 const results : PageResult [ ] = [ ] ;
@@ -68,15 +102,35 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
68102 const batch = pageUrls . slice ( i , i + concurrency ) ;
69103 const batchResults = await Promise . all (
70104 batch . map ( async ( url ) : Promise < PageResult > => {
71- const candidates = toMdUrls ( url ) ;
105+ const baseCandidates = toMdUrls ( url ) ;
72106 // Non-markdown file types (e.g. .json, .xml) have no .md equivalent — skip them
73- if ( candidates . length === 0 ) {
107+ if ( baseCandidates . length === 0 ) {
74108 return { url, mdUrl : url , supported : false , skipped : true , status : 0 } ;
75109 }
76110 const alreadyMd = / \. m d x ? $ / i. test ( new URL ( url ) . pathname ) ;
77- const ordered = orderCandidates ( candidates , mdFormPreference ) ;
111+ const original = originalMdUrls ?. [ url ] ;
112+ const parentClean = original ? deriveParentCleanMd ( url , original ) : null ;
113+
114+ // Build candidate list:
115+ // 1. originalMdUrl (the URL llms.txt published) — first, when present.
116+ // 2. toMdUrls() candidates, reordered by detected site preference.
117+ // 3. parent-clean fallback (issue #77) — last, only when llms.txt
118+ // published a /foo/index.html.md form. Tried only if 1+2 fail.
119+ const ordered = orderCandidates ( baseCandidates , mdFormPreference ) ;
120+ const candidateList : string [ ] = [ ] ;
121+ const seen = new Set < string > ( ) ;
122+ const addCandidate = ( c : string | null | undefined ) => {
123+ if ( c && ! seen . has ( c ) ) {
124+ seen . add ( c ) ;
125+ candidateList . push ( c ) ;
126+ }
127+ } ;
128+ addCandidate ( original ) ;
129+ for ( const c of ordered ) addCandidate ( c ) ;
130+ addCandidate ( parentClean ) ;
131+
78132 let lastError : string | undefined ;
79- for ( const mdUrl of ordered ) {
133+ for ( const mdUrl of candidateList ) {
80134 try {
81135 const response = await ctx . http . fetch ( mdUrl ) ;
82136 const body = await response . text ( ) ;
@@ -90,7 +144,14 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
90144 url,
91145 markdown : { content : body , source : 'md-url' } ,
92146 } ) ;
93- return { url, mdUrl, supported : true , alreadyMd, status : response . status } ;
147+ return {
148+ url,
149+ mdUrl,
150+ supported : true ,
151+ alreadyMd,
152+ status : response . status ,
153+ originalUrlServed : mdUrl === original ,
154+ } ;
94155 }
95156 lastError = undefined ; // Got a response, not a fetch error
96157 } catch ( err ) {
@@ -99,7 +160,7 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
99160 }
100161 return {
101162 url,
102- mdUrl : ordered [ 0 ] ,
163+ mdUrl : candidateList [ 0 ] ,
103164 supported : false ,
104165 alreadyMd,
105166 status : 0 ,
0 commit comments