Skip to content

Commit 747cb70

Browse files
authored
Merge pull request #84 from agent-ecosystem/fix/sitemap-www-host-equivalence
Consolidate www-equivalence; fix sitemap filtering for swift.org-style hosts
2 parents c9b6b55 + 639c1cb commit 747cb70

7 files changed

Lines changed: 239 additions & 33 deletions

File tree

src/checks/observability/llms-txt-coverage.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import {
44
getUrlsFromSitemap,
55
parseSitemapUrls,
66
} from '../../helpers/get-page-urls.js';
7+
import { isSameSite } from '../../helpers/host-equivalence.js';
78
import { isNonPageUrl } from '../../helpers/to-md-urls.js';
89
import { isLocaleSegment, hasStructuralDuplication } from '../../helpers/locale-codes.js';
910
import {
@@ -297,7 +298,7 @@ function scopeUrls(urls: string[], origin: string, baseUrlPath: string): string[
297298
return urls.filter((url) => {
298299
try {
299300
const parsed = new URL(url);
300-
if (parsed.origin !== origin) return false;
301+
if (!isSameSite(url, origin)) return false;
301302
if (baseUrlPath && baseUrlPath !== '/') {
302303
if (!parsed.pathname.startsWith(baseUrlPath + '/') && parsed.pathname !== baseUrlPath) {
303304
return false;

src/helpers/get-page-urls.ts

Lines changed: 17 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ import { extractMarkdownLinks } from '../checks/content-discoverability/llms-txt
22
import { MAX_SITEMAP_URLS } from '../constants.js';
33
import { getLlmsTxtFilesForAnalysis, selectCanonicalLlmsTxt } from './llms-txt.js';
44
import { isNonPageUrl, isMdUrl, toHtmlUrl } from './to-md-urls.js';
5+
import { isSameSite } from './host-equivalence.js';
56
import { isLocaleSegment, hasStructuralDuplication } from './locale-codes.js';
67
import type { CheckContext, DiscoveredFile } from '../types.js';
78

@@ -162,17 +163,19 @@ async function walkAggregateLinksWithOriginals(
162163
const omittedTxtUrls: string[] = [];
163164

164165
const siteOrigin = ctx.effectiveOrigin ?? ctx.origin;
166+
const isAcceptedOrigin = (url: string): boolean =>
167+
isSameSite(url, ctx.origin) || isSameSite(url, siteOrigin);
165168

166169
for (const entry of entries) {
167170
try {
168171
const parsed = new URL(entry.url);
169172
if (/\.txt$/i.test(parsed.pathname)) {
170173
// .txt files are either aggregate indexes to walk (same site)
171174
// or external resources to skip — never page URLs themselves
172-
if (parsed.origin === ctx.origin || parsed.origin === siteOrigin) {
175+
if (isAcceptedOrigin(entry.url)) {
173176
aggregateUrls.push(entry.url);
174177
}
175-
} else if (parsed.origin === ctx.origin || parsed.origin === siteOrigin) {
178+
} else if (isAcceptedOrigin(entry.url)) {
176179
// Only include same-site page URLs; links to other sites are
177180
// external resources the site owner doesn't control.
178181
pageUrls.push(entry);
@@ -207,8 +210,7 @@ async function walkAggregateLinksWithOriginals(
207210
for (const subEntry of subEntries) {
208211
try {
209212
const parsed = new URL(subEntry.url);
210-
const isSameOrigin = parsed.origin === ctx.origin || parsed.origin === siteOrigin;
211-
if (!isSameOrigin) continue;
213+
if (!isAcceptedOrigin(subEntry.url)) continue;
212214

213215
if (/\.txt$/i.test(parsed.pathname)) {
214216
// Depth-1 .txt link: record as omitted rather than descending
@@ -739,13 +741,13 @@ export async function getUrlsFromSitemap(
739741

740742
function shouldInclude(url: string): boolean {
741743
try {
742-
const u = new URL(url);
743-
if (u.origin !== matchOrigin) return false;
744-
if (prefixPath) return matchesPathPrefix(url, prefixPath);
745-
return true;
744+
new URL(url);
746745
} catch {
747746
return false;
748747
}
748+
if (!isSameSite(url, matchOrigin)) return false;
749+
if (prefixPath) return matchesPathPrefix(url, prefixPath);
750+
return true;
749751
}
750752

751753
// Collect up to collectLimit URLs before refinement. The cap is applied
@@ -800,35 +802,29 @@ export async function getUrlsFromSitemap(
800802
return deduplicated.slice(0, maxUrls);
801803
}
802804

803-
function isWwwVariant(hostname1: string, hostname2: string): boolean {
804-
return hostname1 === `www.${hostname2}` || hostname2 === `www.${hostname1}`;
805-
}
806-
807805
/**
808806
* Get the base URL for path-prefix filtering, accounting for cross-host redirects.
809807
*
810808
* When a true cross-host redirect is in play (e.g. example.com/docs → docs.example.com),
811809
* the original baseUrl path doesn't apply to the redirected host, so we return the
812810
* effectiveOrigin (a root URL) which makes path filtering a no-op.
813811
*
814-
* When the redirect is www-canonicalization (e.g. alchemy.com → www.alchemy.com),
815-
* the path structure is preserved, so we transfer the baseUrl's path to the
816-
* effective origin to keep path-prefix filtering active.
812+
* When the redirect stays on the same site (e.g. www-canonicalization or an
813+
* http→https upgrade), the path structure is preserved, so we transfer the
814+
* baseUrl's path to the effective origin to keep path-prefix filtering active.
817815
*/
818816
export function getPathFilterBase(ctx: CheckContext): string {
819817
if (!ctx.effectiveOrigin || ctx.effectiveOrigin === ctx.origin) {
820818
return ctx.baseUrl;
821819
}
822820

823-
try {
824-
const originalHost = new URL(ctx.origin).hostname;
825-
const effectiveHost = new URL(ctx.effectiveOrigin).hostname;
826-
if (isWwwVariant(originalHost, effectiveHost)) {
821+
if (isSameSite(ctx.origin, ctx.effectiveOrigin)) {
822+
try {
827823
const basePath = new URL(ctx.baseUrl).pathname.replace(/\/$/, '');
828824
return basePath ? `${ctx.effectiveOrigin}${basePath}` : ctx.effectiveOrigin;
825+
} catch {
826+
// fall through
829827
}
830-
} catch {
831-
// fall through
832828
}
833829

834830
return ctx.effectiveOrigin;

src/helpers/host-equivalence.ts

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
/**
2+
* Host equivalence: treat `www.host` and `host` as the same site.
3+
*
4+
* Documentation sites mix the two forms in several ways that all need the
5+
* same treatment: redirect classification, sitemap URL filtering, path-filter
6+
* base derivation, and aggregate-link walking. Keeping the rule in one place
7+
* means future tweaks (e.g. recognizing additional canonical prefixes)
8+
* propagate to every call site automatically.
9+
*/
10+
11+
/**
12+
* Strip a leading `www.` from a hostname, if present.
13+
*/
14+
export function canonicalHost(host: string): string {
15+
return host.startsWith('www.') ? host.slice(4) : host;
16+
}
17+
18+
/**
19+
* True when two URLs (or origins) represent the same site: same hostname after
20+
* stripping `www.`, and same port. Schemes are deliberately ignored so that
21+
* the canonical http→https upgrade on the same host is not classified as a
22+
* different site.
23+
*/
24+
export function isSameSite(url1: string, url2: string): boolean {
25+
try {
26+
const a = new URL(url1);
27+
const b = new URL(url2);
28+
return a.port === b.port && canonicalHost(a.hostname) === canonicalHost(b.hostname);
29+
} catch {
30+
return false;
31+
}
32+
}

src/helpers/to-md-urls.ts

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,21 @@
1-
/**
2-
* Strip the leading "www." from a hostname, if present.
3-
*/
4-
function stripWww(host: string): string {
5-
return host.startsWith('www.') ? host.slice(4) : host;
6-
}
1+
import { isSameSite } from './host-equivalence.js';
72

83
/**
94
* Returns true if the two URLs are on different sites, i.e. their hostnames (ignoring a leading `www.`) or ports differ (a cross-host redirect).
105
* A www ↔ bare-domain redirect (e.g. mongodb.com → www.mongodb.com) is NOT
116
* considered cross-host because every HTTP client and agent follows it.
7+
*
8+
* Returns false for malformed URLs — when we can't classify, default to
9+
* "not cross-host" so we don't penalize on bad inputs.
1210
*/
1311
export function isCrossHostRedirect(originalUrl: string, finalUrl: string): boolean {
1412
try {
15-
const original = new URL(originalUrl);
16-
const final_ = new URL(finalUrl);
17-
if (original.host === final_.host) return false;
18-
// www ↔ bare-domain is same-site, not cross-host
19-
return stripWww(original.host) !== stripWww(final_.host);
13+
new URL(originalUrl);
14+
new URL(finalUrl);
2015
} catch {
2116
return false;
2217
}
18+
return !isSameSite(originalUrl, finalUrl);
2319
}
2420

2521
/**

test/unit/checks/llms-txt-coverage.test.ts

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -708,6 +708,38 @@ describe('llms-txt-coverage', () => {
708708
expect(result.details?.sitemapDocPages).toBe(2);
709709
});
710710

711+
test('scopes sitemap URLs across www vs bare-host (issue #83)', async () => {
712+
// swift.org-style: scored URL is `www.host`, but sitemap entries are on bare host.
713+
// Coverage scoping must treat these as the same site; otherwise coverage is 0%.
714+
const wwwHost = 'www.www-cov.local';
715+
const bareHost = 'www-cov.local';
716+
const llmsTxtPages = [`http://${wwwHost}/docs/intro`, `http://${wwwHost}/docs/guide`];
717+
const sitemapPages = [
718+
`http://${bareHost}/docs/intro`,
719+
`http://${bareHost}/docs/guide`,
720+
`http://${bareHost}/docs/extra`,
721+
];
722+
723+
const ctx = makeCtx(wwwHost, llmsTxtPages, '/docs');
724+
725+
server.use(
726+
http.get(
727+
`http://${wwwHost}/robots.txt`,
728+
() => new HttpResponse(`Sitemap: http://${wwwHost}/sitemap.xml`, { status: 200 }),
729+
),
730+
http.get(
731+
`http://${wwwHost}/sitemap.xml`,
732+
() =>
733+
new HttpResponse(makeSitemap(sitemapPages), {
734+
headers: { 'content-type': 'application/xml' },
735+
}),
736+
),
737+
);
738+
739+
const result = await check.run(ctx);
740+
expect(result.details?.sitemapDocPages).toBe(3);
741+
});
742+
711743
test('excludes paths relative to base URL prefix', async () => {
712744
const host = 'basepath-exclude.local';
713745
const pages = [`http://${host}/docs/getting-started`, `http://${host}/docs/api-reference`];

test/unit/helpers/get-page-urls.test.ts

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1207,6 +1207,104 @@ describe('getPageUrls', () => {
12071207
]);
12081208
});
12091209

1210+
it('accepts sitemap URLs published on bare-host when scored URL has www (issue #83)', async () => {
1211+
// swift.org-style: scored URL is www.host.local, but the sitemap lists URLs
1212+
// on the bare host. Without www-equivalence in the origin filter, every URL
1213+
// is discarded and afdocs falls back to single-page sampling.
1214+
mockSitemapNotFound(server, 'http://www.www-bare.local/documentation/');
1215+
server.use(
1216+
http.get(
1217+
'http://www.www-bare.local/robots.txt',
1218+
() => new HttpResponse('User-agent: *\n', { status: 200 }),
1219+
),
1220+
http.get(
1221+
'http://www.www-bare.local/sitemap.xml',
1222+
() =>
1223+
new HttpResponse(
1224+
`<?xml version="1.0"?>
1225+
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
1226+
<url><loc>http://www-bare.local/documentation/intro</loc></url>
1227+
<url><loc>http://www-bare.local/documentation/guide</loc></url>
1228+
</urlset>`,
1229+
{ status: 200, headers: { 'Content-Type': 'application/xml' } },
1230+
),
1231+
),
1232+
);
1233+
1234+
const ctx = createContext('http://www.www-bare.local/documentation/', { requestDelay: 0 });
1235+
const warnings: string[] = [];
1236+
const result = await getUrlsFromSitemap(ctx, warnings, { skipRefinement: true });
1237+
expect(result).toEqual([
1238+
'http://www-bare.local/documentation/intro',
1239+
'http://www-bare.local/documentation/guide',
1240+
]);
1241+
});
1242+
1243+
it('accepts sitemap URLs published on www-host when scored URL is bare (issue #83)', async () => {
1244+
// Inverse scenario: scored URL is bare host, sitemap entries are www-prefixed.
1245+
mockSitemapNotFound(server, 'http://bare-www.local');
1246+
server.use(
1247+
http.get(
1248+
'http://bare-www.local/robots.txt',
1249+
() => new HttpResponse('User-agent: *\n', { status: 200 }),
1250+
),
1251+
http.get(
1252+
'http://bare-www.local/sitemap.xml',
1253+
() =>
1254+
new HttpResponse(
1255+
`<?xml version="1.0"?>
1256+
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
1257+
<url><loc>http://www.bare-www.local/page-1</loc></url>
1258+
<url><loc>http://www.bare-www.local/page-2</loc></url>
1259+
</urlset>`,
1260+
{ status: 200, headers: { 'Content-Type': 'application/xml' } },
1261+
),
1262+
),
1263+
);
1264+
1265+
const ctx = createContext('http://bare-www.local', { requestDelay: 0 });
1266+
const warnings: string[] = [];
1267+
const result = await getUrlsFromSitemap(ctx, warnings, { skipRefinement: true });
1268+
expect(result).toEqual([
1269+
'http://www.bare-www.local/page-1',
1270+
'http://www.bare-www.local/page-2',
1271+
]);
1272+
});
1273+
1274+
it('still rejects truly cross-host sitemap URLs (but allows scheme upgrade)', async () => {
1275+
// www-equivalence does not relax filtering for unrelated hosts. Scheme
1276+
// is intentionally ignored: an http→https sitemap entry resolves to the
1277+
// same site after the canonical scheme upgrade.
1278+
mockSitemapNotFound(server, 'http://strict-host.local');
1279+
server.use(
1280+
http.get(
1281+
'http://strict-host.local/robots.txt',
1282+
() => new HttpResponse('User-agent: *\n', { status: 200 }),
1283+
),
1284+
http.get(
1285+
'http://strict-host.local/sitemap.xml',
1286+
() =>
1287+
new HttpResponse(
1288+
`<?xml version="1.0"?>
1289+
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
1290+
<url><loc>http://strict-host.local/keep</loc></url>
1291+
<url><loc>http://other-host.local/drop</loc></url>
1292+
<url><loc>https://strict-host.local/keep-scheme</loc></url>
1293+
</urlset>`,
1294+
{ status: 200, headers: { 'Content-Type': 'application/xml' } },
1295+
),
1296+
),
1297+
);
1298+
1299+
const ctx = createContext('http://strict-host.local', { requestDelay: 0 });
1300+
const warnings: string[] = [];
1301+
const result = await getUrlsFromSitemap(ctx, warnings, { skipRefinement: true });
1302+
expect(result).toEqual([
1303+
'http://strict-host.local/keep',
1304+
'https://strict-host.local/keep-scheme',
1305+
]);
1306+
});
1307+
12101308
it('warns and skips gzipped sitemap from robots.txt', async () => {
12111309
server.use(
12121310
http.get(
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
import { describe, it, expect } from 'vitest';
2+
import { canonicalHost, isSameSite } from '../../../src/helpers/host-equivalence.js';
3+
4+
describe('canonicalHost', () => {
5+
it('strips a leading www.', () => {
6+
expect(canonicalHost('www.swift.org')).toBe('swift.org');
7+
});
8+
9+
it('leaves bare hosts unchanged', () => {
10+
expect(canonicalHost('swift.org')).toBe('swift.org');
11+
});
12+
13+
it('only strips a leading www., not interior', () => {
14+
expect(canonicalHost('docs.www.example.com')).toBe('docs.www.example.com');
15+
});
16+
});
17+
18+
describe('isSameSite', () => {
19+
it('returns true for identical URLs', () => {
20+
expect(isSameSite('https://example.com/', 'https://example.com/')).toBe(true);
21+
});
22+
23+
it('returns true for www vs bare-host (issue #83)', () => {
24+
expect(isSameSite('https://swift.org/x', 'https://www.swift.org/y')).toBe(true);
25+
expect(isSameSite('https://www.swift.org/x', 'https://swift.org/y')).toBe(true);
26+
});
27+
28+
it('ignores scheme — http→https on the same host is same site', () => {
29+
expect(isSameSite('http://example.com/x', 'https://example.com/x')).toBe(true);
30+
});
31+
32+
it('ignores path, query, and fragment', () => {
33+
expect(isSameSite('https://example.com/a?q=1#x', 'https://example.com/b')).toBe(true);
34+
});
35+
36+
it('returns false for different ports', () => {
37+
expect(isSameSite('https://example.com:8443/', 'https://example.com/')).toBe(false);
38+
});
39+
40+
it('returns false for unrelated hosts', () => {
41+
expect(isSameSite('https://example.com/', 'https://other.com/')).toBe(false);
42+
});
43+
44+
it('returns false for non-www subdomains (e.g. docs)', () => {
45+
expect(isSameSite('https://docs.example.com/', 'https://example.com/')).toBe(false);
46+
});
47+
48+
it('returns false for malformed URLs', () => {
49+
expect(isSameSite('not-a-url', 'https://example.com/')).toBe(false);
50+
});
51+
});

0 commit comments

Comments
 (0)