@@ -14,6 +14,7 @@ import {
1414 deduplicateVersionedUrls ,
1515 extractVersionFromUrl ,
1616 extractLocaleFromUrl ,
17+ isSameOriginIgnoringWww ,
1718} from '../../../src/helpers/get-page-urls.js' ;
1819import { MAX_SITEMAP_URLS } from '../../../src/constants.js' ;
1920import { createContext } from '../../../src/runner.js' ;
@@ -156,6 +157,33 @@ describe('filterByPathPrefix', () => {
156157 } ) ;
157158} ) ;
158159
describe('isSameOriginIgnoringWww', () => {
  /** One named scenario: every URL pair listed must yield `expected`. */
  type OriginCase = {
    title: string;
    expected: boolean;
    pairs: ReadonlyArray<readonly [string, string]>;
  };

  // Table of scenarios; each entry becomes exactly one test, in this order.
  const cases: readonly OriginCase[] = [
    {
      title: 'returns true for identical origins',
      expected: true,
      pairs: [['https://example.com', 'https://example.com']],
    },
    {
      // Checked in both argument orders so the comparison is shown to be symmetric.
      title: 'returns true for www vs bare-host (issue #83)',
      expected: true,
      pairs: [
        ['https://swift.org', 'https://www.swift.org'],
        ['https://www.swift.org', 'https://swift.org'],
      ],
    },
    {
      title: 'returns false for different protocols',
      expected: false,
      pairs: [['http://example.com', 'https://example.com']],
    },
    {
      title: 'returns false for different ports',
      expected: false,
      pairs: [['https://example.com:8443', 'https://example.com']],
    },
    {
      title: 'returns false for unrelated hosts',
      expected: false,
      pairs: [['https://example.com', 'https://other.com']],
    },
    {
      title: 'returns false for subdomains that are not www (e.g. docs)',
      expected: false,
      pairs: [['https://docs.example.com', 'https://example.com']],
    },
  ];

  for (const { title, expected, pairs } of cases) {
    it(title, () => {
      for (const [left, right] of pairs) {
        expect(isSameOriginIgnoringWww(left, right)).toBe(expected);
      }
    });
  }
});
186+
159187describe ( 'getPathFilterBase' , ( ) => {
160188 it ( 'returns baseUrl when no effectiveOrigin is set' , ( ) => {
161189 const ctx = createContext ( 'https://example.com/docs' , { requestDelay : 0 } ) ;
@@ -1207,6 +1235,99 @@ describe('getPageUrls', () => {
12071235 ] ) ;
12081236 } ) ;
12091237
it('accepts sitemap URLs published on bare-host when scored URL has www (issue #83)', async () => {
  // Issue #83, swift.org-style setup: the URL being scored lives on the
  // www-prefixed host while the sitemap only lists bare-host URLs. Unless the
  // origin filter treats www and bare hosts as equivalent, every sitemap entry
  // is discarded and afdocs falls back to single-page sampling.
  const scoredOrigin = 'http://www.www-bare.local';
  const scoredUrl = `${scoredOrigin}/documentation/`;
  const expectedUrls = [
    'http://www-bare.local/documentation/intro',
    'http://www-bare.local/documentation/guide',
  ];
  const sitemapXml = `<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>${expectedUrls[0]}</loc></url>
  <url><loc>${expectedUrls[1]}</loc></url>
</urlset>`;

  mockSitemapNotFound(server, scoredUrl);

  // robots.txt advertises no sitemap; the sitemap itself is served from the
  // www host's /sitemap.xml.
  const robotsHandler = http.get(
    `${scoredOrigin}/robots.txt`,
    () => new HttpResponse('User-agent: *\n', { status: 200 }),
  );
  const sitemapHandler = http.get(
    `${scoredOrigin}/sitemap.xml`,
    () =>
      new HttpResponse(sitemapXml, {
        status: 200,
        headers: { 'Content-Type': 'application/xml' },
      }),
  );
  server.use(robotsHandler, sitemapHandler);

  const ctx = createContext(scoredUrl, { requestDelay: 0 });
  const warnings: string[] = [];
  const result = await getUrlsFromSitemap(ctx, warnings, { skipRefinement: true });

  // The bare-host URLs must be returned as published, in sitemap order.
  expect(result).toEqual(expectedUrls);
});
1270+
it('accepts sitemap URLs published on www-host when scored URL is bare (issue #83)', async () => {
  // Mirror image of the bare-host case: the scored URL uses the bare host and
  // every sitemap entry carries the www prefix.
  const scoredUrl = 'http://bare-www.local';
  const expectedUrls = ['http://www.bare-www.local/page-1', 'http://www.bare-www.local/page-2'];
  const sitemapXml = `<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>${expectedUrls[0]}</loc></url>
  <url><loc>${expectedUrls[1]}</loc></url>
</urlset>`;

  mockSitemapNotFound(server, scoredUrl);

  const robotsHandler = http.get(
    `${scoredUrl}/robots.txt`,
    () => new HttpResponse('User-agent: *\n', { status: 200 }),
  );
  const sitemapHandler = http.get(
    `${scoredUrl}/sitemap.xml`,
    () =>
      new HttpResponse(sitemapXml, {
        status: 200,
        headers: { 'Content-Type': 'application/xml' },
      }),
  );
  server.use(robotsHandler, sitemapHandler);

  const ctx = createContext(scoredUrl, { requestDelay: 0 });
  const warnings: string[] = [];
  const result = await getUrlsFromSitemap(ctx, warnings, { skipRefinement: true });

  // The www-prefixed URLs must be returned as published, in sitemap order.
  expect(result).toEqual(expectedUrls);
});
1301+
it('still rejects truly cross-host sitemap URLs', async () => {
  // Guard rail: www-equivalence must not loosen the filter for an unrelated
  // host or for the same host under a different scheme.
  const scoredUrl = 'http://strict-host.local';
  const keptUrl = 'http://strict-host.local/keep';
  const otherHostUrl = 'http://other-host.local/drop';
  const otherSchemeUrl = 'https://strict-host.local/drop-scheme';
  const sitemapXml = `<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>${keptUrl}</loc></url>
  <url><loc>${otherHostUrl}</loc></url>
  <url><loc>${otherSchemeUrl}</loc></url>
</urlset>`;

  mockSitemapNotFound(server, scoredUrl);

  const robotsHandler = http.get(
    `${scoredUrl}/robots.txt`,
    () => new HttpResponse('User-agent: *\n', { status: 200 }),
  );
  const sitemapHandler = http.get(
    `${scoredUrl}/sitemap.xml`,
    () =>
      new HttpResponse(sitemapXml, {
        status: 200,
        headers: { 'Content-Type': 'application/xml' },
      }),
  );
  server.use(robotsHandler, sitemapHandler);

  const ctx = createContext(scoredUrl, { requestDelay: 0 });
  const warnings: string[] = [];
  const result = await getUrlsFromSitemap(ctx, warnings, { skipRefinement: true });

  // Only the entry on the scored origin survives.
  expect(result).toEqual([keptUrl]);
});
1330+
12101331 it ( 'warns and skips gzipped sitemap from robots.txt' , async ( ) => {
12111332 server . use (
12121333 http . get (
0 commit comments