Skip to content

Commit 6bb7d6b

Browse files
authored
Merge pull request #44 from mvvmm/fix/normalize-md-urls-in-discovery
fix: normalize .md URLs to HTML equivalents during page discovery
2 parents 4498afe + e5fdaaa commit 6bb7d6b

2 files changed

Lines changed: 56 additions & 8 deletions

File tree

src/helpers/get-page-urls.ts

Lines changed: 15 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
import { extractMarkdownLinks } from '../checks/content-discoverability/llms-txt-valid.js';
22
import { MAX_SITEMAP_URLS } from '../constants.js';
3-
import { isNonPageUrl } from './to-md-urls.js';
3+
import { isNonPageUrl, isMdUrl, toHtmlUrl } from './to-md-urls.js';
44
import type { CheckContext, DiscoveredFile } from '../types.js';
55

66
/**
@@ -44,18 +44,28 @@ export async function getUrlsFromCachedLlmsTxt(ctx: CheckContext): Promise<strin
4444
return walkAggregateLinks(ctx, urls);
4545
}
4646

47+
/**
48+
* Normalize a discovered page URL: convert .md/.mdx URLs to their HTML
49+
* equivalent so that llms.txt entries like `/docs/guide/index.md` deduplicate
50+
* against sitemap entries like `/docs/guide/`. Markdown-specific checks are
51+
* unaffected because they derive .md candidates from HTML URLs via toMdUrls().
52+
*/
53+
function normalizePageUrl(url: string): string {
54+
return isMdUrl(url) ? toHtmlUrl(url) : url;
55+
}
56+
4757
function extractLinksFromLlmsTxtFiles(files: DiscoveredFile[]): string[] {
4858
const urls = new Set<string>();
4959
for (const file of files) {
5060
const links = extractMarkdownLinks(file.content);
5161
for (const link of links) {
5262
if (link.url.startsWith('http://') || link.url.startsWith('https://')) {
53-
urls.add(link.url);
63+
urls.add(normalizePageUrl(link.url));
5464
} else if (link.url.startsWith('/')) {
5565
// Resolve root-relative URLs against the source file's origin
5666
try {
5767
const base = new URL(file.url);
58-
urls.add(new URL(link.url, base.origin).toString());
68+
urls.add(normalizePageUrl(new URL(link.url, base.origin).toString()));
5969
} catch {
6070
// Skip malformed URLs
6171
}
@@ -91,10 +101,10 @@ async function walkAggregateLinks(ctx: CheckContext, urls: string[]): Promise<st
91101
} else if (parsed.origin === ctx.origin || parsed.origin === siteOrigin) {
92102
// Only include same-origin page URLs; cross-origin links are
93103
// external resources the site owner doesn't control.
94-
pageUrls.push(url);
104+
pageUrls.push(normalizePageUrl(url));
95105
}
96106
} catch {
97-
pageUrls.push(url);
107+
pageUrls.push(normalizePageUrl(url));
98108
}
99109
}
100110

test/unit/helpers/get-page-urls.test.ts

Lines changed: 41 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1191,9 +1191,10 @@ describe('getPageUrls', () => {
11911191

11921192
const ctx = makeCtx('http://walk-test.local', rootContent);
11931193
const result = await getPageUrls(ctx);
1194-
expect(result.urls).toContain('http://walk-test.local/workers/guide/index.md');
1195-
expect(result.urls).toContain('http://walk-test.local/workers/api/index.md');
1196-
expect(result.urls).toContain('http://walk-test.local/cache/overview/index.md');
1194+
// .md URLs from llms.txt are normalized to their HTML equivalents
1195+
expect(result.urls).toContain('http://walk-test.local/workers/guide/');
1196+
expect(result.urls).toContain('http://walk-test.local/workers/api/');
1197+
expect(result.urls).toContain('http://walk-test.local/cache/overview/');
11971198
expect(result.urls).toHaveLength(3);
11981199
});
11991200

@@ -1296,6 +1297,43 @@ describe('getPageUrls', () => {
12961297
expect(result.urls).toEqual(['http://walk-empty.local/docs/page']);
12971298
});
12981299

1300+
// ── .md URL normalization ──
1301+
1302+
it('normalizes .md URLs from llms.txt to HTML equivalents', async () => {
1303+
const content = `# Docs\n- [Guide](http://md-norm.local/docs/guide/index.md): Guide\n- [API](http://md-norm.local/docs/api.md): API\n`;
1304+
const ctx = makeCtx('http://md-norm.local', content);
1305+
const result = await getPageUrls(ctx);
1306+
expect(result.urls).toContain('http://md-norm.local/docs/guide/');
1307+
expect(result.urls).toContain('http://md-norm.local/docs/api');
1308+
expect(result.urls).not.toContain('http://md-norm.local/docs/guide/index.md');
1309+
expect(result.urls).not.toContain('http://md-norm.local/docs/api.md');
1310+
});
1311+
1312+
it('deduplicates .md and HTML URLs for the same page', async () => {
1313+
// llms.txt has .md URL, sitemap has HTML URL for the same page
1314+
const content = `# Docs\n- [Guide](http://md-dedup.local/docs/guide/index.md): Guide\n`;
1315+
const ctx = makeCtx('http://md-dedup.local', content);
1316+
1317+
server.use(
1318+
http.get('http://md-dedup.local/robots.txt', () => new HttpResponse('', { status: 404 })),
1319+
http.get(
1320+
'http://md-dedup.local/sitemap.xml',
1321+
() =>
1322+
new HttpResponse(
1323+
`<?xml version="1.0"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"><url><loc>http://md-dedup.local/docs/guide/</loc></url><url><loc>http://md-dedup.local/docs/other/</loc></url></urlset>`,
1324+
{ status: 200, headers: { 'Content-Type': 'application/xml' } },
1325+
),
1326+
),
1327+
);
1328+
1329+
const result = await getPageUrls(ctx);
1330+
// /docs/guide/ should appear only once (not twice for .md + HTML)
1331+
const guideCount = result.urls.filter((u) => u === 'http://md-dedup.local/docs/guide/').length;
1332+
expect(guideCount).toBe(1);
1333+
// /docs/other/ from sitemap should still be present
1334+
expect(result.urls).toContain('http://md-dedup.local/docs/other/');
1335+
});
1336+
12991337
// ── Direct llms.txt fetch (standalone mode) ──
13001338

13011339
it('fetches llms.txt directly when llms-txt-exists has not run', async () => {

0 commit comments

Comments
 (0)