Skip to content

Commit f760c3a

Browse files
authored
Merge pull request #82 from agent-ecosystem/fix/sitemap-discovery-regression
Fix sitemap discovery regression
2 parents 873ef86 + 209f0d9 commit f760c3a

13 files changed

Lines changed: 154 additions & 17 deletions

src/helpers/get-page-urls.ts

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -328,14 +328,21 @@ async function discoverSitemapUrls(ctx: CheckContext, originOverride?: string):
328328

329329
// Build fallback candidates: origin-level sitemap first, then subpath sitemaps
330330
// when the base URL has a non-root path (e.g. swagger.io/docs/).
331+
// Both `sitemap-index.xml` (hyphen) and `sitemap_index.xml` (underscore) are
332+
// observed in the wild; e.g. Document360's CMS emits the underscore form.
331333
const fallbackOrigin = originOverride ?? ctx.origin;
332-
const candidates = [`${fallbackOrigin}/sitemap.xml`];
334+
const candidates = [
335+
`${fallbackOrigin}/sitemap.xml`,
336+
`${fallbackOrigin}/sitemap-index.xml`,
337+
`${fallbackOrigin}/sitemap_index.xml`,
338+
];
333339

334340
const baseUrlPath = new URL(ctx.baseUrl).pathname.replace(/\/$/, '');
335341
if (baseUrlPath && baseUrlPath !== '') {
336342
const subpathBase = `${fallbackOrigin}${baseUrlPath}`;
337343
candidates.push(`${subpathBase}/sitemap.xml`);
338344
candidates.push(`${subpathBase}/sitemap-index.xml`);
345+
candidates.push(`${subpathBase}/sitemap_index.xml`);
339346
}
340347

341348
return candidates;

test/helpers/mock-sitemap-not-found.ts

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,8 @@ export function mockSitemapNotFound(server: SetupServerApi, baseUrl: string): vo
1414
const handlers = [
1515
http.get(`${parsed.origin}/robots.txt`, () => new HttpResponse('', { status: 404 })),
1616
http.get(`${parsed.origin}/sitemap.xml`, () => new HttpResponse('', { status: 404 })),
17+
http.get(`${parsed.origin}/sitemap-index.xml`, () => new HttpResponse('', { status: 404 })),
18+
http.get(`${parsed.origin}/sitemap_index.xml`, () => new HttpResponse('', { status: 404 })),
1719
];
1820
const subpath = parsed.pathname.replace(/\/$/, '');
1921
if (subpath && subpath !== '') {
@@ -26,6 +28,10 @@ export function mockSitemapNotFound(server: SetupServerApi, baseUrl: string): vo
2628
`${parsed.origin}${subpath}/sitemap-index.xml`,
2729
() => new HttpResponse('', { status: 404 }),
2830
),
31+
http.get(
32+
`${parsed.origin}${subpath}/sitemap_index.xml`,
33+
() => new HttpResponse('', { status: 404 }),
34+
),
2935
);
3036
}
3137
server.use(...handlers);

test/integration/check-pipeline.test.ts

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -56,6 +56,8 @@ function setupSite(
5656
handlers.push(
5757
http.get(`http://${host}/robots.txt`, () => new HttpResponse('', { status: 404 })),
5858
http.get(`http://${host}/sitemap.xml`, () => new HttpResponse('', { status: 404 })),
59+
http.get(`http://${host}/sitemap-index.xml`, () => new HttpResponse('', { status: 404 })),
60+
http.get(`http://${host}/sitemap_index.xml`, () => new HttpResponse('', { status: 404 })),
5961
);
6062

6163
const defaultCacheHeaders = opts.cacheControl ? { 'Cache-Control': opts.cacheControl } : {};

test/integration/cross-check-contracts.test.ts

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -150,6 +150,7 @@ describe('previousResults safety: checks handle missing dependencies gracefully'
150150
const ctx = createContext(`http://${host}`, { requestDelay: 0 });
151151
// No llms-txt-exists in previousResults, no llms.txt
152152

153+
mockSitemapNotFound(server, `http://${host}`);
153154
server.use(
154155
http.get(`http://${host}/llms.txt`, () => new HttpResponse(null, { status: 404 })),
155156
http.get(`http://${host}/docs/llms.txt`, () => new HttpResponse(null, { status: 404 })),
@@ -368,6 +369,7 @@ describe('cross-check field contracts: empty/missing upstream details', () => {
368369
details: { discoveredFiles: [] },
369370
});
370371

372+
mockSitemapNotFound(server, `http://${host}`);
371373
server.use(
372374
http.get(`http://${host}/llms.txt`, () => new HttpResponse(null, { status: 404 })),
373375
http.get(`http://${host}/docs/llms.txt`, () => new HttpResponse(null, { status: 404 })),

test/integration/dependency-chains.test.ts

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -48,6 +48,8 @@ function setupSite(
4848
http.get(`http://${host}/docs/llms.txt`, () => new HttpResponse(null, { status: 404 })),
4949
http.get(`http://${host}/robots.txt`, () => new HttpResponse('', { status: 404 })),
5050
http.get(`http://${host}/sitemap.xml`, () => new HttpResponse('', { status: 404 })),
51+
http.get(`http://${host}/sitemap-index.xml`, () => new HttpResponse('', { status: 404 })),
52+
http.get(`http://${host}/sitemap_index.xml`, () => new HttpResponse('', { status: 404 })),
5153
);
5254

5355
for (const page of opts.pages) {

test/integration/scoring-pipeline.test.ts

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -90,6 +90,10 @@ function setupSite(
9090
http.get(`http://${host}/sitemap.xml`, () => new HttpResponse('', { status: 404 })),
9191
);
9292
}
93+
handlers.push(
94+
http.get(`http://${host}/sitemap-index.xml`, () => new HttpResponse('', { status: 404 })),
95+
http.get(`http://${host}/sitemap_index.xml`, () => new HttpResponse('', { status: 404 })),
96+
);
9397

9498
// Root URL for homepage-based discovery
9599
const pageLinks = opts.pages
@@ -415,6 +419,12 @@ describe('scoring pipeline: resolutions populated for real check failures', () =
415419
it('each failing check produces a resolution string', async () => {
416420
const { pages } = makePages(host, 6);
417421
setupSite(host, { pages, cacheControl: 'max-age=300' });
422+
// No llms.txt or sitemap → discovery falls back to baseUrl, and
423+
// markdown-url-support probes baseUrl's .md candidates.
424+
server.use(
425+
http.get(`http://${host}/.md`, () => new HttpResponse(null, { status: 404 })),
426+
http.get(`http://${host}/index.md`, () => new HttpResponse(null, { status: 404 })),
427+
);
418428

419429
const report = await runChecks(`http://${host}`, {
420430
requestDelay: 0,

test/unit/checks/content-start-position.test.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -480,6 +480,7 @@ describe('content-start-position', () => {
480480
// ── Fallback to baseUrl ──
481481

482482
it('falls back to baseUrl when no llms.txt', async () => {
483+
mockSitemapNotFound(server, 'http://csp-fb.local');
483484
server.use(
484485
http.get(
485486
'http://csp-fb.local/llms.txt',

test/unit/checks/llms-txt-coverage.test.ts

Lines changed: 3 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@ import { setupServer } from 'msw/node';
44
import { getCheck } from '../../../src/checks/registry.js';
55
import { createContext } from '../../../src/runner.js';
66
import type { DiscoveredFile } from '../../../src/types.js';
7+
import { mockSitemapNotFound } from '../../helpers/mock-sitemap-not-found.js';
78
import {
89
hasLocaleCodeAt,
910
filterToUnprefixedLocale,
@@ -288,15 +289,7 @@ describe('llms-txt-coverage', () => {
288289
const host = 'cov-no-sitemap.local';
289290
const ctx = makeCtx(host, [`http://${host}/docs/page`], '/docs');
290291

291-
server.use(
292-
http.get(`http://${host}/robots.txt`, () => new HttpResponse('', { status: 404 })),
293-
http.get(`http://${host}/sitemap.xml`, () => new HttpResponse('', { status: 404 })),
294-
http.get(`http://${host}/docs/sitemap.xml`, () => new HttpResponse('', { status: 404 })),
295-
http.get(
296-
`http://${host}/docs/sitemap-index.xml`,
297-
() => new HttpResponse('', { status: 404 }),
298-
),
299-
);
292+
mockSitemapNotFound(server, `http://${host}/docs`);
300293

301294
const result = await check.run(ctx);
302295
expect(result.status).toBe('skip');
@@ -527,10 +520,8 @@ describe('llms-txt-coverage', () => {
527520

528521
const ctx = makeCtx(host, docPages, '/docs');
529522

523+
mockSitemapNotFound(server, `http://${host}/docs`);
530524
server.use(
531-
// No main sitemap
532-
http.get(`http://${host}/robots.txt`, () => new HttpResponse('', { status: 404 })),
533-
http.get(`http://${host}/sitemap.xml`, () => new HttpResponse('', { status: 404 })),
534525
// Docs sitemap is an index
535526
http.get(
536527
`http://${host}/docs/sitemap.xml`,
@@ -548,10 +539,6 @@ describe('llms-txt-coverage', () => {
548539
headers: { 'content-type': 'application/xml' },
549540
}),
550541
),
551-
http.get(
552-
`http://${host}/docs/sitemap-index.xml`,
553-
() => new HttpResponse('', { status: 404 }),
554-
),
555542
);
556543

557544
const result = await check.run(ctx);

test/unit/checks/markdown-url-support.test.ts

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -603,6 +603,7 @@ describe('markdown-url-support', () => {
603603
// false-positive the check for a /auth/index.html page.
604604
it('does not test /foo.md when /foo/index.html came from sitemap (issue #77 isolation)', async () => {
605605
const requestLog: string[] = [];
606+
mockSitemapNotFound(server, 'http://parentclean.local');
606607
server.use(
607608
http.get('http://parentclean.local/robots.txt', () => new HttpResponse('', { status: 404 })),
608609
http.get(

test/unit/checks/page-size-html.test.ts

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -308,6 +308,7 @@ describe('page-size-html', () => {
308308
});
309309

310310
it('falls back to baseUrl when no llms.txt', async () => {
311+
mockSitemapNotFound(server, 'http://ps-html-fb.local');
311312
server.use(
312313
http.get(
313314
'http://ps-html-fb.local/llms.txt',

0 commit comments

Comments
 (0)