Skip to content

Commit f00cc4f

Browse files
committed
Treat www and bare-host as same origin in sitemap filtering
Sitemap entries are commonly published on the bare-host canonical (e.g. https://swift.org/...) even when the served site is www.swift.org. The strict `origin !==` comparison in shouldInclude() and scopeUrls() discarded every such URL, causing afdocs to fall back to single-page sampling and trigger the single-page-sample diagnostic. PR #82 already added the right sitemap discovery candidates, so the root sitemap was being fetched; its URLs were simply filtered out before they could be used. Fix: introduce isSameOriginIgnoringWww() (built on the existing isWwwVariant helper), which still requires the protocol and port to match, and use it at both filter call sites. Adds tests covering both directions of the www mismatch and a regression test confirming that truly cross-host URLs are still rejected. Fixes #83.
1 parent c9b6b55 commit f00cc4f

4 files changed

Lines changed: 175 additions & 2 deletions

File tree

src/checks/observability/llms-txt-coverage.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ import { registerCheck } from '../registry.js';
22
import {
33
getUrlsFromCachedLlmsTxtWithOmitted,
44
getUrlsFromSitemap,
5+
isSameOriginIgnoringWww,
56
parseSitemapUrls,
67
} from '../../helpers/get-page-urls.js';
78
import { isNonPageUrl } from '../../helpers/to-md-urls.js';
@@ -297,7 +298,7 @@ function scopeUrls(urls: string[], origin: string, baseUrlPath: string): string[
297298
return urls.filter((url) => {
298299
try {
299300
const parsed = new URL(url);
300-
if (parsed.origin !== origin) return false;
301+
if (!isSameOriginIgnoringWww(parsed.origin, origin)) return false;
301302
if (baseUrlPath && baseUrlPath !== '/') {
302303
if (!parsed.pathname.startsWith(baseUrlPath + '/') && parsed.pathname !== baseUrlPath) {
303304
return false;

src/helpers/get-page-urls.ts

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -740,7 +740,7 @@ export async function getUrlsFromSitemap(
740740
function shouldInclude(url: string): boolean {
741741
try {
742742
const u = new URL(url);
743-
if (u.origin !== matchOrigin) return false;
743+
if (!isSameOriginIgnoringWww(u.origin, matchOrigin)) return false;
744744
if (prefixPath) return matchesPathPrefix(url, prefixPath);
745745
return true;
746746
} catch {
@@ -804,6 +804,25 @@ function isWwwVariant(hostname1: string, hostname2: string): boolean {
804804
return hostname1 === `www.${hostname2}` || hostname2 === `www.${hostname1}`;
805805
}
806806

807+
/**
808+
* Compare two origins, treating `www.host` and `host` as equivalent.
809+
*
810+
* Sitemap entries are commonly published on the bare-host canonical
811+
* (e.g. `https://swift.org/...`) even when the served site is `www.swift.org`.
812+
* Strict origin equality would discard every such URL.
*
* Only the hostname comparison is relaxed (delegated to `isWwwVariant`);
* the protocol and port of the two origins must still match exactly.
*
* @param origin1 - First origin string to compare.
* @param origin2 - Second origin string to compare.
* @returns `true` when the two strings are identical, or when both parse as URLs
* with the same protocol and port and `isWwwVariant` accepts their hostnames;
* `false` otherwise, including when the strings differ and either one fails
* to parse as a URL.
813+
*/
814+
export function isSameOriginIgnoringWww(origin1: string, origin2: string): boolean {
815+
if (origin1 === origin2) return true; // identical strings are accepted without parsing
816+
try {
817+
const a = new URL(origin1);
818+
const b = new URL(origin2);
819+
if (a.protocol !== b.protocol || a.port !== b.port) return false; // only the hostname is allowed to differ
820+
return isWwwVariant(a.hostname, b.hostname);
821+
} catch {
822+
return false; // reached when `new URL` rejects one of the (non-identical) inputs
823+
}
824+
}
825+
807826
/**
808827
* Get the base URL for path-prefix filtering, accounting for cross-host redirects.
809828
*

test/unit/checks/llms-txt-coverage.test.ts

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -708,6 +708,38 @@ describe('llms-txt-coverage', () => {
708708
expect(result.details?.sitemapDocPages).toBe(2);
709709
});
710710

711+
test('scopes sitemap URLs across www vs bare-host (issue #83)', async () => {
712+
// swift.org-style: scored URL is `www.host`, but sitemap entries are on bare host.
713+
// Coverage scoping must treat these as same-origin or coverage = 0%.
714+
const wwwHost = 'www.www-cov.local';
715+
const bareHost = 'www-cov.local';
716+
const llmsTxtPages = [`http://${wwwHost}/docs/intro`, `http://${wwwHost}/docs/guide`];
717+
const sitemapPages = [
718+
`http://${bareHost}/docs/intro`,
719+
`http://${bareHost}/docs/guide`,
720+
`http://${bareHost}/docs/extra`,
721+
];
722+
723+
const ctx = makeCtx(wwwHost, llmsTxtPages, '/docs');
724+
725+
server.use(
726+
http.get(
727+
`http://${wwwHost}/robots.txt`,
728+
() => new HttpResponse(`Sitemap: http://${wwwHost}/sitemap.xml`, { status: 200 }), // robots.txt advertises the sitemap on the www host
729+
),
730+
http.get(
731+
`http://${wwwHost}/sitemap.xml`,
732+
() =>
733+
new HttpResponse(makeSitemap(sitemapPages), { // sitemap served from the www host lists only bare-host URLs
734+
headers: { 'content-type': 'application/xml' },
735+
}),
736+
),
737+
);
738+
739+
const result = await check.run(ctx);
740+
expect(result.details?.sitemapDocPages).toBe(3); // all three bare-host entries must survive scoping
741+
});
742+
711743
test('excludes paths relative to base URL prefix', async () => {
712744
const host = 'basepath-exclude.local';
713745
const pages = [`http://${host}/docs/getting-started`, `http://${host}/docs/api-reference`];

test/unit/helpers/get-page-urls.test.ts

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import {
1414
deduplicateVersionedUrls,
1515
extractVersionFromUrl,
1616
extractLocaleFromUrl,
17+
isSameOriginIgnoringWww,
1718
} from '../../../src/helpers/get-page-urls.js';
1819
import { MAX_SITEMAP_URLS } from '../../../src/constants.js';
1920
import { createContext } from '../../../src/runner.js';
@@ -156,6 +157,33 @@ describe('filterByPathPrefix', () => {
156157
});
157158
});
158159

160+
describe('isSameOriginIgnoringWww', () => { // unit tests for the www-insensitive origin comparison
161+
it('returns true for identical origins', () => {
162+
expect(isSameOriginIgnoringWww('https://example.com', 'https://example.com')).toBe(true);
163+
});
164+
165+
it('returns true for www vs bare-host (issue #83)', () => {
166+
expect(isSameOriginIgnoringWww('https://swift.org', 'https://www.swift.org')).toBe(true);
167+
expect(isSameOriginIgnoringWww('https://www.swift.org', 'https://swift.org')).toBe(true); // same pair with arguments swapped
168+
});
169+
170+
it('returns false for different protocols', () => {
171+
expect(isSameOriginIgnoringWww('http://example.com', 'https://example.com')).toBe(false);
172+
});
173+
174+
it('returns false for different ports', () => {
175+
expect(isSameOriginIgnoringWww('https://example.com:8443', 'https://example.com')).toBe(false); // explicit port vs. no port
176+
});
177+
178+
it('returns false for unrelated hosts', () => {
179+
expect(isSameOriginIgnoringWww('https://example.com', 'https://other.com')).toBe(false);
180+
});
181+
182+
it('returns false for subdomains that are not www (e.g. docs)', () => {
183+
expect(isSameOriginIgnoringWww('https://docs.example.com', 'https://example.com')).toBe(false); // only the `www.` prefix is treated as equivalent
184+
});
185+
});
186+
159187
describe('getPathFilterBase', () => {
160188
it('returns baseUrl when no effectiveOrigin is set', () => {
161189
const ctx = createContext('https://example.com/docs', { requestDelay: 0 });
@@ -1207,6 +1235,99 @@ describe('getPageUrls', () => {
12071235
]);
12081236
});
12091237

1238+
it('accepts sitemap URLs published on bare-host when scored URL has www (issue #83)', async () => {
1239+
// swift.org-style: scored URL is www.host.local, but the sitemap lists URLs
1240+
// on the bare host. Without www-equivalence in the origin filter, every URL
1241+
// is discarded and afdocs falls back to single-page sampling.
1242+
mockSitemapNotFound(server, 'http://www.www-bare.local/documentation/');
1243+
server.use(
1244+
http.get(
1245+
'http://www.www-bare.local/robots.txt',
1246+
() => new HttpResponse('User-agent: *\n', { status: 200 }), // robots.txt carries no Sitemap: directive
1247+
),
1248+
http.get(
1249+
'http://www.www-bare.local/sitemap.xml',
1250+
() =>
1251+
new HttpResponse(
1252+
`<?xml version="1.0"?>
1253+
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
1254+
<url><loc>http://www-bare.local/documentation/intro</loc></url>
1255+
<url><loc>http://www-bare.local/documentation/guide</loc></url>
1256+
</urlset>`,
1257+
{ status: 200, headers: { 'Content-Type': 'application/xml' } },
1258+
),
1259+
),
1260+
);
1261+
1262+
const ctx = createContext('http://www.www-bare.local/documentation/', { requestDelay: 0 });
1263+
const warnings: string[] = [];
1264+
const result = await getUrlsFromSitemap(ctx, warnings, { skipRefinement: true });
1265+
expect(result).toEqual([ // bare-host URLs are returned as listed, not rewritten to the www host
1266+
'http://www-bare.local/documentation/intro',
1267+
'http://www-bare.local/documentation/guide',
1268+
]);
1269+
});
1270+
1271+
it('accepts sitemap URLs published on www-host when scored URL is bare (issue #83)', async () => {
1272+
// Inverse scenario: scored URL is bare host, sitemap entries are www-prefixed.
1273+
mockSitemapNotFound(server, 'http://bare-www.local');
1274+
server.use(
1275+
http.get(
1276+
'http://bare-www.local/robots.txt',
1277+
() => new HttpResponse('User-agent: *\n', { status: 200 }), // robots.txt carries no Sitemap: directive
1278+
),
1279+
http.get(
1280+
'http://bare-www.local/sitemap.xml',
1281+
() =>
1282+
new HttpResponse(
1283+
`<?xml version="1.0"?>
1284+
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
1285+
<url><loc>http://www.bare-www.local/page-1</loc></url>
1286+
<url><loc>http://www.bare-www.local/page-2</loc></url>
1287+
</urlset>`,
1288+
{ status: 200, headers: { 'Content-Type': 'application/xml' } },
1289+
),
1290+
),
1291+
);
1292+
1293+
const ctx = createContext('http://bare-www.local', { requestDelay: 0 });
1294+
const warnings: string[] = [];
1295+
const result = await getUrlsFromSitemap(ctx, warnings, { skipRefinement: true });
1296+
expect(result).toEqual([ // www-prefixed URLs are returned as listed, not rewritten to the bare host
1297+
'http://www.bare-www.local/page-1',
1298+
'http://www.bare-www.local/page-2',
1299+
]);
1300+
});
1301+
1302+
it('still rejects truly cross-host sitemap URLs', async () => {
1303+
// Sanity check: www-equivalence does not relax filtering for unrelated hosts.
1304+
mockSitemapNotFound(server, 'http://strict-host.local');
1305+
server.use(
1306+
http.get(
1307+
'http://strict-host.local/robots.txt',
1308+
() => new HttpResponse('User-agent: *\n', { status: 200 }), // robots.txt carries no Sitemap: directive
1309+
),
1310+
http.get(
1311+
'http://strict-host.local/sitemap.xml',
1312+
() =>
1313+
new HttpResponse(
1314+
`<?xml version="1.0"?>
1315+
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
1316+
<url><loc>http://strict-host.local/keep</loc></url>
1317+
<url><loc>http://other-host.local/drop</loc></url>
1318+
<url><loc>https://strict-host.local/drop-scheme</loc></url>
1319+
</urlset>`,
1320+
{ status: 200, headers: { 'Content-Type': 'application/xml' } },
1321+
),
1322+
),
1323+
);
1324+
1325+
const ctx = createContext('http://strict-host.local', { requestDelay: 0 });
1326+
const warnings: string[] = [];
1327+
const result = await getUrlsFromSitemap(ctx, warnings, { skipRefinement: true });
1328+
expect(result).toEqual(['http://strict-host.local/keep']); // the other-host entry and the https-scheme entry are both dropped
1329+
});
1330+
12101331
it('warns and skips gzipped sitemap from robots.txt', async () => {
12111332
server.use(
12121333
http.get(

0 commit comments

Comments
 (0)