Skip to content

Commit b0dce29

Browse files
authored
Merge pull request #60 from agent-ecosystem/fix/missing-locale-prefix-handling
Fix: missing locale, false locale regex matches, related bugs
2 parents 21b43e6 + 56e3e41 commit b0dce29

6 files changed

Lines changed: 625 additions & 9 deletions

File tree

src/checks/observability/llms-txt-freshness.ts

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import {
55
parseSitemapUrls,
66
} from '../../helpers/get-page-urls.js';
77
import { isNonPageUrl } from '../../helpers/to-md-urls.js';
8+
import { isLocaleSegment, hasStructuralDuplication } from '../../helpers/locale-codes.js';
89
import type { CheckContext, CheckResult } from '../../types.js';
910

1011
/**
@@ -95,7 +96,7 @@ export function detectLocalePosition(urls: string[]): number | null {
9596
const segments = new URL(url).pathname.split('/').filter(Boolean);
9697
for (let i = 0; i < segments.length; i++) {
9798
const seg = segments[i].toLowerCase();
98-
if (/^[a-z]{2}(-[a-z]{2})?$/.test(seg)) {
99+
if (isLocaleSegment(seg)) {
99100
if (!positionCounts.has(i)) positionCounts.set(i, new Map());
100101
const counts = positionCounts.get(i)!;
101102
counts.set(seg, (counts.get(seg) ?? 0) + 1);
@@ -107,6 +108,7 @@ export function detectLocalePosition(urls: string[]): number | null {
107108
}
108109
}
109110

111+
// First pass: ≥2 distinct locale codes covering >50% of URLs (strong signal)
110112
for (const [pos, counts] of positionCounts) {
111113
if (counts.size < 2) continue;
112114
const total = positionTotals.get(pos) ?? 0;
@@ -115,6 +117,17 @@ export function detectLocalePosition(urls: string[]): number | null {
115117
}
116118
}
117119

120+
// Second pass: single locale code confirmed by structural duplication.
121+
// With ISO 639-1 validation, a single code is meaningful when stripping it
122+
// produces paths that match unprefixed URLs in the same set.
123+
for (const [pos, counts] of positionCounts) {
124+
if (counts.size !== 1) continue;
125+
const [code] = counts.keys();
126+
if (hasStructuralDuplication(urls, pos, code)) {
127+
return pos;
128+
}
129+
}
130+
118131
return null;
119132
}
120133

@@ -163,6 +176,27 @@ function filterByLocale(urls: string[], locale: string, position: number): strin
163176
});
164177
}
165178

179+
/**
180+
* Test whether a URL has a locale code at the given path position.
181+
*/
182+
export function hasLocaleCodeAt(url: string, position: number): boolean {
183+
try {
184+
const segments = new URL(url).pathname.split('/').filter(Boolean);
185+
return segments.length > position && isLocaleSegment(segments[position]);
186+
} catch {
187+
return false;
188+
}
189+
}
190+
191+
/**
192+
* Filter URLs to only those that do NOT have a locale code at `position`.
193+
* Used when llms.txt covers the unprefixed default locale and we need to
194+
* exclude locale-prefixed sitemap variants from coverage comparison.
195+
*/
196+
export function filterToUnprefixedLocale(urls: string[], position: number): string[] {
197+
return urls.filter((url) => !hasLocaleCodeAt(url, position));
198+
}
199+
166200
/** Coverage thresholds */
167201
const COVERAGE_PASS = 0.95;
168202
const COVERAGE_WARN = 0.8;
@@ -316,6 +350,17 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
316350
const before = scopedSitemapUrls.length;
317351
scopedSitemapUrls = filterByLocale(scopedSitemapUrls, llmsLocale, localePosition);
318352
localeFiltered = scopedSitemapUrls.length < before;
353+
} else {
354+
// llms.txt may cover the unprefixed default locale (no /en/, /de/, etc.).
355+
// If most llms.txt URLs lack locale codes at the detected position,
356+
// filter the sitemap to only unprefixed URLs.
357+
const withLocale = llmsTxtUrls.filter((u) => hasLocaleCodeAt(u, localePosition!)).length;
358+
if (withLocale < llmsTxtUrls.length * 0.5) {
359+
const before = scopedSitemapUrls.length;
360+
scopedSitemapUrls = filterToUnprefixedLocale(scopedSitemapUrls, localePosition);
361+
localeFiltered = scopedSitemapUrls.length < before;
362+
if (localeFiltered) detectedLocale = 'default';
363+
}
319364
}
320365
}
321366

src/helpers/get-page-urls.ts

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ import { extractMarkdownLinks } from '../checks/content-discoverability/llms-txt
22
import { MAX_SITEMAP_URLS } from '../constants.js';
33
import { getLlmsTxtFilesForAnalysis, selectCanonicalLlmsTxt } from './llms-txt.js';
44
import { isNonPageUrl, isMdUrl, toHtmlUrl } from './to-md-urls.js';
5+
import { isLocaleSegment, hasStructuralDuplication } from './locale-codes.js';
56
import type { CheckContext, DiscoveredFile } from '../types.js';
67

78
/**
@@ -286,7 +287,7 @@ export function extractLocaleFromUrl(url: string): string | null {
286287
const segments = new URL(url).pathname.split('/').filter(Boolean);
287288
// Only check the first 3 segments to avoid matching content paths
288289
for (let i = 0; i < Math.min(segments.length, 3); i++) {
289-
if (/^[a-z]{2}(-[a-z]{2})?$/i.test(segments[i])) {
290+
if (isLocaleSegment(segments[i])) {
290291
return segments[i].toLowerCase();
291292
}
292293
}
@@ -325,7 +326,7 @@ export function filterLocaleSitemaps(
325326
const pathMatch = pathLocalePattern.exec(url);
326327
const match = filenameMatch ?? pathMatch;
327328

328-
if (match) {
329+
if (match && isLocaleSegment(match[1])) {
329330
const locale = match[1].toLowerCase();
330331
if (!locales.has(locale)) locales.set(locale, []);
331332
locales.get(locale)!.push(url);
@@ -366,7 +367,7 @@ export function filterLocalizedUrls(urls: string[], preferredLocale?: string | n
366367
const segments = new URL(url).pathname.split('/').filter(Boolean);
367368
for (let i = 0; i < segments.length; i++) {
368369
const seg = segments[i].toLowerCase();
369-
if (/^[a-z]{2}(-[a-z]{2})?$/.test(seg)) {
370+
if (isLocaleSegment(seg)) {
370371
if (!positionCounts.has(i)) positionCounts.set(i, new Map());
371372
const counts = positionCounts.get(i)!;
372373
counts.set(seg, (counts.get(seg) ?? 0) + 1);
@@ -380,6 +381,7 @@ export function filterLocalizedUrls(urls: string[], preferredLocale?: string | n
380381

381382
// Find the position that looks like a locale segment
382383
let localePosition: number | null = null;
384+
// First pass: ≥2 distinct locale codes covering >50% of URLs
383385
for (const [pos, counts] of positionCounts) {
384386
if (counts.size < 2) continue;
385387
const total = positionTotals.get(pos) ?? 0;
@@ -388,6 +390,17 @@ export function filterLocalizedUrls(urls: string[], preferredLocale?: string | n
388390
break;
389391
}
390392
}
393+
// Second pass: single locale code confirmed by structural duplication
394+
if (localePosition === null) {
395+
for (const [pos, counts] of positionCounts) {
396+
if (counts.size !== 1) continue;
397+
const [code] = counts.keys();
398+
if (hasStructuralDuplication(urls, pos, code)) {
399+
localePosition = pos;
400+
break;
401+
}
402+
}
403+
}
391404

392405
if (localePosition === null) return urls;
393406

@@ -403,8 +416,22 @@ export function filterLocalizedUrls(urls: string[], preferredLocale?: string | n
403416
}
404417
});
405418

406-
// If filtering removed everything (target locale not present), return original
407-
return filtered.length > 0 ? filtered : urls;
419+
if (filtered.length > 0) return filtered;
420+
421+
// Target locale not found. The default language may use unprefixed paths
422+
// (e.g. /docs/intro instead of /docs/en/intro). Filter to URLs that don't
423+
// have any locale code at the detected position.
424+
const unprefixed = urls.filter((url) => {
425+
try {
426+
const segments = new URL(url).pathname.split('/').filter(Boolean);
427+
if (segments.length <= localePosition!) return true;
428+
return !isLocaleSegment(segments[localePosition!]);
429+
} catch {
430+
return true;
431+
}
432+
});
433+
434+
return unprefixed.length > 0 ? unprefixed : urls;
408435
}
409436

410437
/**

0 commit comments

Comments
 (0)