@@ -2,6 +2,7 @@ import { extractMarkdownLinks } from '../checks/content-discoverability/llms-txt
22import { MAX_SITEMAP_URLS } from '../constants.js' ;
33import { getLlmsTxtFilesForAnalysis , selectCanonicalLlmsTxt } from './llms-txt.js' ;
44import { isNonPageUrl , isMdUrl , toHtmlUrl } from './to-md-urls.js' ;
5+ import { isSameSite } from './host-equivalence.js' ;
56import { isLocaleSegment , hasStructuralDuplication } from './locale-codes.js' ;
67import type { CheckContext , DiscoveredFile } from '../types.js' ;
78
@@ -162,17 +163,19 @@ async function walkAggregateLinksWithOriginals(
162163 const omittedTxtUrls : string [ ] = [ ] ;
163164
164165 const siteOrigin = ctx . effectiveOrigin ?? ctx . origin ;
166+ const isAcceptedOrigin = ( url : string ) : boolean =>
167+ isSameSite ( url , ctx . origin ) || isSameSite ( url , siteOrigin ) ;
165168
166169 for ( const entry of entries ) {
167170 try {
168171 const parsed = new URL ( entry . url ) ;
169172 if ( / \. t x t $ / i. test ( parsed . pathname ) ) {
170173 // .txt files are either aggregate indexes to walk (same origin)
171174 // or external resources to skip — never page URLs themselves
172- if ( parsed . origin === ctx . origin || parsed . origin === siteOrigin ) {
175+ if ( isAcceptedOrigin ( entry . url ) ) {
173176 aggregateUrls . push ( entry . url ) ;
174177 }
175- } else if ( parsed . origin === ctx . origin || parsed . origin === siteOrigin ) {
178+ } else if ( isAcceptedOrigin ( entry . url ) ) {
176179 // Only include same-origin page URLs; cross-origin links are
177180 // external resources the site owner doesn't control.
178181 pageUrls . push ( entry ) ;
@@ -207,8 +210,7 @@ async function walkAggregateLinksWithOriginals(
207210 for ( const subEntry of subEntries ) {
208211 try {
209212 const parsed = new URL ( subEntry . url ) ;
210- const isSameOrigin = parsed . origin === ctx . origin || parsed . origin === siteOrigin ;
211- if ( ! isSameOrigin ) continue ;
213+ if ( ! isAcceptedOrigin ( subEntry . url ) ) continue ;
212214
213215 if ( / \. t x t $ / i. test ( parsed . pathname ) ) {
214216 // Depth-1 .txt link: record as omitted rather than descending
@@ -739,13 +741,13 @@ export async function getUrlsFromSitemap(
739741
740742 function shouldInclude ( url : string ) : boolean {
741743 try {
742- const u = new URL ( url ) ;
743- if ( u . origin !== matchOrigin ) return false ;
744- if ( prefixPath ) return matchesPathPrefix ( url , prefixPath ) ;
745- return true ;
744+ new URL ( url ) ;
746745 } catch {
747746 return false ;
748747 }
748+ if ( ! isSameSite ( url , matchOrigin ) ) return false ;
749+ if ( prefixPath ) return matchesPathPrefix ( url , prefixPath ) ;
750+ return true ;
749751 }
750752
751753 // Collect up to collectLimit URLs before refinement. The cap is applied
@@ -800,35 +802,29 @@ export async function getUrlsFromSitemap(
800802 return deduplicated . slice ( 0 , maxUrls ) ;
801803}
802804
803- function isWwwVariant ( hostname1 : string , hostname2 : string ) : boolean {
804- return hostname1 === `www.${ hostname2 } ` || hostname2 === `www.${ hostname1 } ` ;
805- }
806-
807805/**
808806 * Get the base URL for path-prefix filtering, accounting for cross-host redirects.
809807 *
810808 * When a true cross-host redirect is in play (e.g. example.com/docs → docs.example.com),
811809 * the original baseUrl path doesn't apply to the redirected host, so we return the
812810 * effectiveOrigin (a root URL) which makes path filtering a no-op.
813811 *
814- * When the redirect is www-canonicalization (e.g. alchemy.com → www.alchemy.com),
815- * the path structure is preserved, so we transfer the baseUrl's path to the
816- * effective origin to keep path-prefix filtering active.
812+ * When the redirect stays on the same site (e.g. www-canonicalization or an
813+ * http→https upgrade), the path structure is preserved, so we transfer the
814+ * baseUrl's path to the effective origin to keep path-prefix filtering active.
817815 */
818816export function getPathFilterBase ( ctx : CheckContext ) : string {
819817 if ( ! ctx . effectiveOrigin || ctx . effectiveOrigin === ctx . origin ) {
820818 return ctx . baseUrl ;
821819 }
822820
823- try {
824- const originalHost = new URL ( ctx . origin ) . hostname ;
825- const effectiveHost = new URL ( ctx . effectiveOrigin ) . hostname ;
826- if ( isWwwVariant ( originalHost , effectiveHost ) ) {
821+ if ( isSameSite ( ctx . origin , ctx . effectiveOrigin ) ) {
822+ try {
827823 const basePath = new URL ( ctx . baseUrl ) . pathname . replace ( / \/ $ / , '' ) ;
828824 return basePath ? `${ ctx . effectiveOrigin } ${ basePath } ` : ctx . effectiveOrigin ;
825+ } catch {
826+ // fall through
829827 }
830- } catch {
831- // fall through
832828 }
833829
834830 return ctx . effectiveOrigin ;
0 commit comments