Skip to content

Commit 639c1cb

Browse files
committed
Consolidate www-equivalence into a single isSameSite predicate
Four call sites had grown independent www-handling implementations (stripWww in to-md-urls, isWwwVariant + isSameOriginIgnoringWww in get-page-urls, ad-hoc two-origin checks in walkAggregateLinks). Each inlined its own scheme/port strictness, leaving the rule split across files with no single source of truth — adding a new "same site" tweak required remembering to update every call site.

Replace all four with one predicate: isSameSite(url1, url2). Same canonical-host comparison everywhere, scheme deliberately ignored (http→https on the same host is a canonical upgrade), port-strict.

Behavior changes (both correctness improvements):

- getPathFilterBase now preserves the base path when origins differ only by scheme, not just www. Previously dropped to root.
- shouldInclude / scopeUrls now accept sitemap URLs with mismatched scheme. Real sitemaps occasionally have stale http entries; they resolve fine after the redirect.

walkAggregateLinks still applies isSameSite twice — once against ctx.origin and once against the effective origin — because true cross-host redirects (e.g. example.com → docs.example.com) leave content discoverable at two genuinely-different origins.

Net: 50 lines removed, one shared module, one rule to update.
1 parent f00cc4f commit 639c1cb

6 files changed

Lines changed: 118 additions & 85 deletions

File tree

src/checks/observability/llms-txt-coverage.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@ import { registerCheck } from '../registry.js';
22
import {
33
getUrlsFromCachedLlmsTxtWithOmitted,
44
getUrlsFromSitemap,
5-
isSameOriginIgnoringWww,
65
parseSitemapUrls,
76
} from '../../helpers/get-page-urls.js';
7+
import { isSameSite } from '../../helpers/host-equivalence.js';
88
import { isNonPageUrl } from '../../helpers/to-md-urls.js';
99
import { isLocaleSegment, hasStructuralDuplication } from '../../helpers/locale-codes.js';
1010
import {
@@ -298,7 +298,7 @@ function scopeUrls(urls: string[], origin: string, baseUrlPath: string): string[
298298
return urls.filter((url) => {
299299
try {
300300
const parsed = new URL(url);
301-
if (!isSameOriginIgnoringWww(parsed.origin, origin)) return false;
301+
if (!isSameSite(url, origin)) return false;
302302
if (baseUrlPath && baseUrlPath !== '/') {
303303
if (!parsed.pathname.startsWith(baseUrlPath + '/') && parsed.pathname !== baseUrlPath) {
304304
return false;

src/helpers/get-page-urls.ts

Lines changed: 17 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ import { extractMarkdownLinks } from '../checks/content-discoverability/llms-txt
22
import { MAX_SITEMAP_URLS } from '../constants.js';
33
import { getLlmsTxtFilesForAnalysis, selectCanonicalLlmsTxt } from './llms-txt.js';
44
import { isNonPageUrl, isMdUrl, toHtmlUrl } from './to-md-urls.js';
5+
import { isSameSite } from './host-equivalence.js';
56
import { isLocaleSegment, hasStructuralDuplication } from './locale-codes.js';
67
import type { CheckContext, DiscoveredFile } from '../types.js';
78

@@ -162,17 +163,19 @@ async function walkAggregateLinksWithOriginals(
162163
const omittedTxtUrls: string[] = [];
163164

164165
const siteOrigin = ctx.effectiveOrigin ?? ctx.origin;
166+
const isAcceptedOrigin = (url: string): boolean =>
167+
isSameSite(url, ctx.origin) || isSameSite(url, siteOrigin);
165168

166169
for (const entry of entries) {
167170
try {
168171
const parsed = new URL(entry.url);
169172
if (/\.txt$/i.test(parsed.pathname)) {
170173
// .txt files are either aggregate indexes to walk (same origin)
171174
// or external resources to skip — never page URLs themselves
172-
if (parsed.origin === ctx.origin || parsed.origin === siteOrigin) {
175+
if (isAcceptedOrigin(entry.url)) {
173176
aggregateUrls.push(entry.url);
174177
}
175-
} else if (parsed.origin === ctx.origin || parsed.origin === siteOrigin) {
178+
} else if (isAcceptedOrigin(entry.url)) {
176179
// Only include same-origin page URLs; cross-origin links are
177180
// external resources the site owner doesn't control.
178181
pageUrls.push(entry);
@@ -207,8 +210,7 @@ async function walkAggregateLinksWithOriginals(
207210
for (const subEntry of subEntries) {
208211
try {
209212
const parsed = new URL(subEntry.url);
210-
const isSameOrigin = parsed.origin === ctx.origin || parsed.origin === siteOrigin;
211-
if (!isSameOrigin) continue;
213+
if (!isAcceptedOrigin(subEntry.url)) continue;
212214

213215
if (/\.txt$/i.test(parsed.pathname)) {
214216
// Depth-1 .txt link: record as omitted rather than descending
@@ -739,13 +741,13 @@ export async function getUrlsFromSitemap(
739741

740742
function shouldInclude(url: string): boolean {
741743
try {
742-
const u = new URL(url);
743-
if (!isSameOriginIgnoringWww(u.origin, matchOrigin)) return false;
744-
if (prefixPath) return matchesPathPrefix(url, prefixPath);
745-
return true;
744+
new URL(url);
746745
} catch {
747746
return false;
748747
}
748+
if (!isSameSite(url, matchOrigin)) return false;
749+
if (prefixPath) return matchesPathPrefix(url, prefixPath);
750+
return true;
749751
}
750752

751753
// Collect up to collectLimit URLs before refinement. The cap is applied
@@ -800,54 +802,29 @@ export async function getUrlsFromSitemap(
800802
return deduplicated.slice(0, maxUrls);
801803
}
802804

803-
function isWwwVariant(hostname1: string, hostname2: string): boolean {
804-
return hostname1 === `www.${hostname2}` || hostname2 === `www.${hostname1}`;
805-
}
806-
807-
/**
808-
* Compare two origins, treating `www.host` and `host` as equivalent.
809-
*
810-
* Sitemap entries are commonly published on the bare-host canonical
811-
* (e.g. `https://swift.org/...`) even when the served site is `www.swift.org`.
812-
* Strict origin equality would discard every such URL.
813-
*/
814-
export function isSameOriginIgnoringWww(origin1: string, origin2: string): boolean {
815-
if (origin1 === origin2) return true;
816-
try {
817-
const a = new URL(origin1);
818-
const b = new URL(origin2);
819-
if (a.protocol !== b.protocol || a.port !== b.port) return false;
820-
return isWwwVariant(a.hostname, b.hostname);
821-
} catch {
822-
return false;
823-
}
824-
}
825-
826805
/**
827806
* Get the base URL for path-prefix filtering, accounting for cross-host redirects.
828807
*
829808
* When a true cross-host redirect is in play (e.g. example.com/docs → docs.example.com),
830809
* the original baseUrl path doesn't apply to the redirected host, so we return the
831810
* effectiveOrigin (a root URL) which makes path filtering a no-op.
832811
*
833-
* When the redirect is www-canonicalization (e.g. alchemy.com → www.alchemy.com),
834-
* the path structure is preserved, so we transfer the baseUrl's path to the
835-
* effective origin to keep path-prefix filtering active.
812+
* When the redirect stays on the same site (e.g. www-canonicalization or an
813+
* http→https upgrade), the path structure is preserved, so we transfer the
814+
* baseUrl's path to the effective origin to keep path-prefix filtering active.
836815
*/
837816
export function getPathFilterBase(ctx: CheckContext): string {
838817
if (!ctx.effectiveOrigin || ctx.effectiveOrigin === ctx.origin) {
839818
return ctx.baseUrl;
840819
}
841820

842-
try {
843-
const originalHost = new URL(ctx.origin).hostname;
844-
const effectiveHost = new URL(ctx.effectiveOrigin).hostname;
845-
if (isWwwVariant(originalHost, effectiveHost)) {
821+
if (isSameSite(ctx.origin, ctx.effectiveOrigin)) {
822+
try {
846823
const basePath = new URL(ctx.baseUrl).pathname.replace(/\/$/, '');
847824
return basePath ? `${ctx.effectiveOrigin}${basePath}` : ctx.effectiveOrigin;
825+
} catch {
826+
// fall through
848827
}
849-
} catch {
850-
// fall through
851828
}
852829

853830
return ctx.effectiveOrigin;

src/helpers/host-equivalence.ts

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
/**
2+
* Host equivalence: treat `www.host` and `host` as the same site.
3+
*
4+
* Documentation sites mix the two forms in several ways that all need the
5+
* same treatment: redirect classification, sitemap URL filtering, path-filter
6+
* base derivation, and aggregate-link walking. Keeping the rule in one place
7+
* means future tweaks (e.g. recognizing additional canonical prefixes)
8+
* propagate to every site automatically.
9+
*/
10+
11+
/**
12+
* Strip a leading `www.` from a hostname, if present.
13+
*/
14+
export function canonicalHost(host: string): string {
15+
return host.startsWith('www.') ? host.slice(4) : host;
16+
}
17+
18+
/**
19+
* True when two URLs (or origins) represent the same site: same hostname after
20+
* stripping `www.`, and same port. Schemes are deliberately ignored so that
21+
* the canonical http→https upgrade on the same host is not classified as a
22+
* different site.
23+
*/
24+
export function isSameSite(url1: string, url2: string): boolean {
25+
try {
26+
const a = new URL(url1);
27+
const b = new URL(url2);
28+
return a.port === b.port && canonicalHost(a.hostname) === canonicalHost(b.hostname);
29+
} catch {
30+
return false;
31+
}
32+
}

src/helpers/to-md-urls.ts

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,21 @@
1-
/**
2-
* Strip the leading "www." from a hostname, if present.
3-
*/
4-
function stripWww(host: string): string {
5-
return host.startsWith('www.') ? host.slice(4) : host;
6-
}
1+
import { isSameSite } from './host-equivalence.js';
72

83
/**
94
* Returns true if the two URLs have different hosts (i.e. a cross-host redirect).
105
* A www ↔ bare-domain redirect (e.g. mongodb.com → www.mongodb.com) is NOT
116
* considered cross-host because every HTTP client and agent follows it.
7+
*
8+
* Returns false for malformed URLs — when we can't classify, default to
9+
* "not cross-host" so we don't penalize on bad inputs.
1210
*/
1311
export function isCrossHostRedirect(originalUrl: string, finalUrl: string): boolean {
1412
try {
15-
const original = new URL(originalUrl);
16-
const final_ = new URL(finalUrl);
17-
if (original.host === final_.host) return false;
18-
// www ↔ bare-domain is same-site, not cross-host
19-
return stripWww(original.host) !== stripWww(final_.host);
13+
new URL(originalUrl);
14+
new URL(finalUrl);
2015
} catch {
2116
return false;
2217
}
18+
return !isSameSite(originalUrl, finalUrl);
2319
}
2420

2521
/**

test/unit/helpers/get-page-urls.test.ts

Lines changed: 9 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ import {
1414
deduplicateVersionedUrls,
1515
extractVersionFromUrl,
1616
extractLocaleFromUrl,
17-
isSameOriginIgnoringWww,
1817
} from '../../../src/helpers/get-page-urls.js';
1918
import { MAX_SITEMAP_URLS } from '../../../src/constants.js';
2019
import { createContext } from '../../../src/runner.js';
@@ -157,33 +156,6 @@ describe('filterByPathPrefix', () => {
157156
});
158157
});
159158

160-
describe('isSameOriginIgnoringWww', () => {
161-
it('returns true for identical origins', () => {
162-
expect(isSameOriginIgnoringWww('https://example.com', 'https://example.com')).toBe(true);
163-
});
164-
165-
it('returns true for www vs bare-host (issue #83)', () => {
166-
expect(isSameOriginIgnoringWww('https://swift.org', 'https://www.swift.org')).toBe(true);
167-
expect(isSameOriginIgnoringWww('https://www.swift.org', 'https://swift.org')).toBe(true);
168-
});
169-
170-
it('returns false for different protocols', () => {
171-
expect(isSameOriginIgnoringWww('http://example.com', 'https://example.com')).toBe(false);
172-
});
173-
174-
it('returns false for different ports', () => {
175-
expect(isSameOriginIgnoringWww('https://example.com:8443', 'https://example.com')).toBe(false);
176-
});
177-
178-
it('returns false for unrelated hosts', () => {
179-
expect(isSameOriginIgnoringWww('https://example.com', 'https://other.com')).toBe(false);
180-
});
181-
182-
it('returns false for subdomains that are not www (e.g. docs)', () => {
183-
expect(isSameOriginIgnoringWww('https://docs.example.com', 'https://example.com')).toBe(false);
184-
});
185-
});
186-
187159
describe('getPathFilterBase', () => {
188160
it('returns baseUrl when no effectiveOrigin is set', () => {
189161
const ctx = createContext('https://example.com/docs', { requestDelay: 0 });
@@ -1299,8 +1271,10 @@ describe('getPageUrls', () => {
12991271
]);
13001272
});
13011273

1302-
it('still rejects truly cross-host sitemap URLs', async () => {
1303-
// Sanity check: www-equivalence does not relax filtering for unrelated hosts.
1274+
it('still rejects truly cross-host sitemap URLs (but allows scheme upgrade)', async () => {
1275+
// www-equivalence does not relax filtering for unrelated hosts. Scheme
1276+
// is intentionally ignored: an http→https sitemap entry resolves to the
1277+
// same site after the canonical scheme upgrade.
13041278
mockSitemapNotFound(server, 'http://strict-host.local');
13051279
server.use(
13061280
http.get(
@@ -1315,7 +1289,7 @@ describe('getPageUrls', () => {
13151289
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
13161290
<url><loc>http://strict-host.local/keep</loc></url>
13171291
<url><loc>http://other-host.local/drop</loc></url>
1318-
<url><loc>https://strict-host.local/drop-scheme</loc></url>
1292+
<url><loc>https://strict-host.local/keep-scheme</loc></url>
13191293
</urlset>`,
13201294
{ status: 200, headers: { 'Content-Type': 'application/xml' } },
13211295
),
@@ -1325,7 +1299,10 @@ describe('getPageUrls', () => {
13251299
const ctx = createContext('http://strict-host.local', { requestDelay: 0 });
13261300
const warnings: string[] = [];
13271301
const result = await getUrlsFromSitemap(ctx, warnings, { skipRefinement: true });
1328-
expect(result).toEqual(['http://strict-host.local/keep']);
1302+
expect(result).toEqual([
1303+
'http://strict-host.local/keep',
1304+
'https://strict-host.local/keep-scheme',
1305+
]);
13291306
});
13301307

13311308
it('warns and skips gzipped sitemap from robots.txt', async () => {
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
import { describe, it, expect } from 'vitest';
2+
import { canonicalHost, isSameSite } from '../../../src/helpers/host-equivalence.js';
3+
4+
describe('canonicalHost', () => {
5+
it('strips a leading www.', () => {
6+
expect(canonicalHost('www.swift.org')).toBe('swift.org');
7+
});
8+
9+
it('leaves bare hosts unchanged', () => {
10+
expect(canonicalHost('swift.org')).toBe('swift.org');
11+
});
12+
13+
it('only strips a leading www., not interior', () => {
14+
expect(canonicalHost('docs.www.example.com')).toBe('docs.www.example.com');
15+
});
16+
});
17+
18+
describe('isSameSite', () => {
19+
it('returns true for identical URLs', () => {
20+
expect(isSameSite('https://example.com/', 'https://example.com/')).toBe(true);
21+
});
22+
23+
it('returns true for www vs bare-host (issue #83)', () => {
24+
expect(isSameSite('https://swift.org/x', 'https://www.swift.org/y')).toBe(true);
25+
expect(isSameSite('https://www.swift.org/x', 'https://swift.org/y')).toBe(true);
26+
});
27+
28+
it('ignores scheme — http→https on the same host is same site', () => {
29+
expect(isSameSite('http://example.com/x', 'https://example.com/x')).toBe(true);
30+
});
31+
32+
it('ignores path, query, and fragment', () => {
33+
expect(isSameSite('https://example.com/a?q=1#x', 'https://example.com/b')).toBe(true);
34+
});
35+
36+
it('returns false for different ports', () => {
37+
expect(isSameSite('https://example.com:8443/', 'https://example.com/')).toBe(false);
38+
});
39+
40+
it('returns false for unrelated hosts', () => {
41+
expect(isSameSite('https://example.com/', 'https://other.com/')).toBe(false);
42+
});
43+
44+
it('returns false for non-www subdomains (e.g. docs)', () => {
45+
expect(isSameSite('https://docs.example.com/', 'https://example.com/')).toBe(false);
46+
});
47+
48+
it('returns false for malformed URLs', () => {
49+
expect(isSameSite('not-a-url', 'https://example.com/')).toBe(false);
50+
});
51+
});

0 commit comments

Comments
 (0)