@@ -1191,9 +1191,10 @@ describe('getPageUrls', () => {
11911191
11921192 const ctx = makeCtx ( 'http://walk-test.local' , rootContent ) ;
11931193 const result = await getPageUrls ( ctx ) ;
1194- expect ( result . urls ) . toContain ( 'http://walk-test.local/workers/guide/index.md' ) ;
1195- expect ( result . urls ) . toContain ( 'http://walk-test.local/workers/api/index.md' ) ;
1196- expect ( result . urls ) . toContain ( 'http://walk-test.local/cache/overview/index.md' ) ;
1194+ // .md URLs from llms.txt are normalized to their HTML equivalents
1195+ expect ( result . urls ) . toContain ( 'http://walk-test.local/workers/guide/' ) ;
1196+ expect ( result . urls ) . toContain ( 'http://walk-test.local/workers/api/' ) ;
1197+ expect ( result . urls ) . toContain ( 'http://walk-test.local/cache/overview/' ) ;
11971198 expect ( result . urls ) . toHaveLength ( 3 ) ;
11981199 } ) ;
11991200
@@ -1296,6 +1297,43 @@ describe('getPageUrls', () => {
12961297 expect ( result . urls ) . toEqual ( [ 'http://walk-empty.local/docs/page' ] ) ;
12971298 } ) ;
12981299
1300+ // ── .md URL normalization ──
1301+
1302+ it ( 'normalizes .md URLs from llms.txt to HTML equivalents' , async ( ) => { // the fixture below links one ".../index.md" URL and one bare ".md" URL
1303+ const content = `# Docs\n- [Guide](http://md-norm.local/docs/guide/index.md): Guide\n- [API](http://md-norm.local/docs/api.md): API\n` ;
1304+ const ctx = makeCtx ( 'http://md-norm.local' , content ) ; // NOTE(review): presumably seeds the context with `content` as the llms.txt body; confirm against makeCtx
1305+ const result = await getPageUrls ( ctx ) ;
1306+ expect ( result . urls ) . toContain ( 'http://md-norm.local/docs/guide/' ) ; // ".../guide/index.md" is expected as the directory URL, trailing slash kept
1307+ expect ( result . urls ) . toContain ( 'http://md-norm.local/docs/api' ) ; // ".../api.md" is expected with the ".md" suffix dropped and no trailing slash added
1308+ expect ( result . urls ) . not . toContain ( 'http://md-norm.local/docs/guide/index.md' ) ; // the raw .md forms from the fixture must not be returned
1309+ expect ( result . urls ) . not . toContain ( 'http://md-norm.local/docs/api.md' ) ;
1310+ } ) ;
1311+
1312+ it ( 'deduplicates .md and HTML URLs for the same page' , async ( ) => {
1313+ // The llms.txt fixture lists /docs/guide/index.md, while the mocked sitemap below lists /docs/guide/: the same page in two forms
1314+ const content = `# Docs\n- [Guide](http://md-dedup.local/docs/guide/index.md): Guide\n` ;
1315+ const ctx = makeCtx ( 'http://md-dedup.local' , content ) ;
1316+
1317+ server . use (
1318+ http . get ( 'http://md-dedup.local/robots.txt' , ( ) => new HttpResponse ( '' , { status : 404 } ) ) , // NOTE(review): robots.txt answers 404, presumably so URL discovery relies on /sitemap.xml; confirm against getPageUrls
1319+ http . get (
1320+ 'http://md-dedup.local/sitemap.xml' ,
1321+ ( ) =>
1322+ new HttpResponse (
1323+ `<?xml version="1.0"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"><url><loc>http://md-dedup.local/docs/guide/</loc></url><url><loc>http://md-dedup.local/docs/other/</loc></url></urlset>` ,
1324+ { status : 200 , headers : { 'Content-Type' : 'application/xml' } } ,
1325+ ) ,
1326+ ) ,
1327+ ) ;
1328+
1329+ const result = await getPageUrls ( ctx ) ;
1330+ // /docs/guide/ must appear exactly once: the .md link and the sitemap entry are expected to collapse into a single URL
1331+ const guideCount = result . urls . filter ( ( u ) => u === 'http://md-dedup.local/docs/guide/' ) . length ;
1332+ expect ( guideCount ) . toBe ( 1 ) ;
1333+ // /docs/other/ is listed only in the sitemap and must still be returned
1334+ expect ( result . urls ) . toContain ( 'http://md-dedup.local/docs/other/' ) ;
1335+ } ) ;
1336+
12991337 // ── Direct llms.txt fetch (standalone mode) ──
13001338
13011339 it ( 'fetches llms.txt directly when llms-txt-exists has not run' , async ( ) => {
0 commit comments