@@ -2,6 +2,7 @@ import { extractMarkdownLinks } from '../checks/content-discoverability/llms-txt
22import { MAX_SITEMAP_URLS } from '../constants.js' ;
33import { getLlmsTxtFilesForAnalysis , selectCanonicalLlmsTxt } from './llms-txt.js' ;
44import { isNonPageUrl , isMdUrl , toHtmlUrl } from './to-md-urls.js' ;
5+ import { isLocaleSegment , hasStructuralDuplication } from './locale-codes.js' ;
56import type { CheckContext , DiscoveredFile } from '../types.js' ;
67
78/**
@@ -286,7 +287,7 @@ export function extractLocaleFromUrl(url: string): string | null {
286287 const segments = new URL ( url ) . pathname . split ( '/' ) . filter ( Boolean ) ;
287288 // Only check the first 3 segments to avoid matching content paths
288289 for ( let i = 0 ; i < Math . min ( segments . length , 3 ) ; i ++ ) {
289- if ( / ^ [ a - z ] { 2 } ( - [ a - z ] { 2 } ) ? $ / i . test ( segments [ i ] ) ) {
290+ if ( isLocaleSegment ( segments [ i ] ) ) {
290291 return segments [ i ] . toLowerCase ( ) ;
291292 }
292293 }
@@ -325,7 +326,7 @@ export function filterLocaleSitemaps(
325326 const pathMatch = pathLocalePattern . exec ( url ) ;
326327 const match = filenameMatch ?? pathMatch ;
327328
328- if ( match ) {
329+ if ( match && isLocaleSegment ( match [ 1 ] ) ) {
329330 const locale = match [ 1 ] . toLowerCase ( ) ;
330331 if ( ! locales . has ( locale ) ) locales . set ( locale , [ ] ) ;
331332 locales . get ( locale ) ! . push ( url ) ;
@@ -366,7 +367,7 @@ export function filterLocalizedUrls(urls: string[], preferredLocale?: string | n
366367 const segments = new URL ( url ) . pathname . split ( '/' ) . filter ( Boolean ) ;
367368 for ( let i = 0 ; i < segments . length ; i ++ ) {
368369 const seg = segments [ i ] . toLowerCase ( ) ;
369- if ( / ^ [ a - z ] { 2 } ( - [ a - z ] { 2 } ) ? $ / . test ( seg ) ) {
370+ if ( isLocaleSegment ( seg ) ) {
370371 if ( ! positionCounts . has ( i ) ) positionCounts . set ( i , new Map ( ) ) ;
371372 const counts = positionCounts . get ( i ) ! ;
372373 counts . set ( seg , ( counts . get ( seg ) ?? 0 ) + 1 ) ;
@@ -380,6 +381,7 @@ export function filterLocalizedUrls(urls: string[], preferredLocale?: string | n
380381
381382 // Find the position that looks like a locale segment
382383 let localePosition : number | null = null ;
384+ // First pass: ≥2 distinct locale codes covering >50% of URLs
383385 for ( const [ pos , counts ] of positionCounts ) {
384386 if ( counts . size < 2 ) continue ;
385387 const total = positionTotals . get ( pos ) ?? 0 ;
@@ -388,6 +390,17 @@ export function filterLocalizedUrls(urls: string[], preferredLocale?: string | n
388390 break ;
389391 }
390392 }
393+ // Second pass: single locale code confirmed by structural duplication
394+ if ( localePosition === null ) {
395+ for ( const [ pos , counts ] of positionCounts ) {
396+ if ( counts . size !== 1 ) continue ;
397+ const [ code ] = counts . keys ( ) ;
398+ if ( hasStructuralDuplication ( urls , pos , code ) ) {
399+ localePosition = pos ;
400+ break ;
401+ }
402+ }
403+ }
391404
392405 if ( localePosition === null ) return urls ;
393406
@@ -403,8 +416,22 @@ export function filterLocalizedUrls(urls: string[], preferredLocale?: string | n
403416 }
404417 } ) ;
405418
406- // If filtering removed everything (target locale not present), return original
407- return filtered . length > 0 ? filtered : urls ;
419+ if ( filtered . length > 0 ) return filtered ;
420+
421+ // Target locale not found. The default language may use unprefixed paths
422+ // (e.g. /docs/intro instead of /docs/en/intro). Filter to URLs that don't
423+ // have any locale code at the detected position.
424+ const unprefixed = urls . filter ( ( url ) => {
425+ try {
426+ const segments = new URL ( url ) . pathname . split ( '/' ) . filter ( Boolean ) ;
427+ if ( segments . length <= localePosition ! ) return true ;
428+ return ! isLocaleSegment ( segments [ localePosition ! ] ) ;
429+ } catch {
430+ return true ;
431+ }
432+ } ) ;
433+
434+ return unprefixed . length > 0 ? unprefixed : urls ;
408435}
409436
410437/**
0 commit comments