Merge pull request #44 from mvvmm/fix/normalize-md-urls-in-discovery

dacharyc · web-flow · commit 6bb7d6b4c065 · 2026-04-19T11:50:18.000-04:00
fix: normalize .md URLs to HTML equivalents during page discovery
diff --git a/src/helpers/get-page-urls.ts b/src/helpers/get-page-urls.ts
@@ -1,6 +1,6 @@
 import { extractMarkdownLinks } from '../checks/content-discoverability/llms-txt-valid.js';
 import { MAX_SITEMAP_URLS } from '../constants.js';
-import { isNonPageUrl } from './to-md-urls.js';
+import { isNonPageUrl, isMdUrl, toHtmlUrl } from './to-md-urls.js';
 import type { CheckContext, DiscoveredFile } from '../types.js';
 
 /**
@@ -44,18 +44,28 @@ export async function getUrlsFromCachedLlmsTxt(ctx: CheckContext): Promise<strin
   return walkAggregateLinks(ctx, urls);
 }
 
+/**
+ * Normalize a discovered page URL: convert .md/.mdx URLs to their HTML
+ * equivalents so that llms.txt entries like `/docs/guide/index.md` deduplicate
+ * against sitemap entries like `/docs/guide/`. Markdown-specific checks are
+ * unaffected because they derive .md candidates from HTML URLs via toMdUrls().
+ */
+function normalizePageUrl(url: string): string {
+  return isMdUrl(url) ? toHtmlUrl(url) : url;
+}
+
 function extractLinksFromLlmsTxtFiles(files: DiscoveredFile[]): string[] {
   const urls = new Set<string>();
   for (const file of files) {
     const links = extractMarkdownLinks(file.content);
     for (const link of links) {
       if (link.url.startsWith('http://') || link.url.startsWith('https://')) {
-        urls.add(link.url);
+        urls.add(normalizePageUrl(link.url));
       } else if (link.url.startsWith('/')) {
         // Resolve root-relative URLs against the source file's origin
         try {
           const base = new URL(file.url);
-          urls.add(new URL(link.url, base.origin).toString());
+          urls.add(normalizePageUrl(new URL(link.url, base.origin).toString()));
         } catch {
           // Skip malformed URLs
         }
@@ -91,10 +101,10 @@ async function walkAggregateLinks(ctx: CheckContext, urls: string[]): Promise<st
       } else if (parsed.origin === ctx.origin || parsed.origin === siteOrigin) {
         // Only include same-origin page URLs; cross-origin links are
         // external resources the site owner doesn't control.
-        pageUrls.push(url);
+        pageUrls.push(normalizePageUrl(url));
       }
     } catch {
-      pageUrls.push(url);
+      pageUrls.push(normalizePageUrl(url));
     }
   }
 
diff --git a/test/unit/helpers/get-page-urls.test.ts b/test/unit/helpers/get-page-urls.test.ts
@@ -1191,9 +1191,10 @@ describe('getPageUrls', () => {
 
     const ctx = makeCtx('http://walk-test.local', rootContent);
     const result = await getPageUrls(ctx);
-    expect(result.urls).toContain('http://walk-test.local/workers/guide/index.md');
-    expect(result.urls).toContain('http://walk-test.local/workers/api/index.md');
-    expect(result.urls).toContain('http://walk-test.local/cache/overview/index.md');
+    // .md URLs from llms.txt are normalized to their HTML equivalents
+    expect(result.urls).toContain('http://walk-test.local/workers/guide/');
+    expect(result.urls).toContain('http://walk-test.local/workers/api/');
+    expect(result.urls).toContain('http://walk-test.local/cache/overview/');
     expect(result.urls).toHaveLength(3);
   });
 
@@ -1296,6 +1297,43 @@ describe('getPageUrls', () => {
     expect(result.urls).toEqual(['http://walk-empty.local/docs/page']);
   });
 
+  // ── .md URL normalization ──
+
+  it('normalizes .md URLs from llms.txt to HTML equivalents', async () => {
+    const content = `# Docs\n- [Guide](http://md-norm.local/docs/guide/index.md): Guide\n- [API](http://md-norm.local/docs/api.md): API\n`;
+    const ctx = makeCtx('http://md-norm.local', content);
+    const result = await getPageUrls(ctx);
+    expect(result.urls).toContain('http://md-norm.local/docs/guide/');
+    expect(result.urls).toContain('http://md-norm.local/docs/api');
+    expect(result.urls).not.toContain('http://md-norm.local/docs/guide/index.md');
+    expect(result.urls).not.toContain('http://md-norm.local/docs/api.md');
+  });
+
+  it('deduplicates .md and HTML URLs for the same page', async () => {
+    // llms.txt has .md URL, sitemap has HTML URL for the same page
+    const content = `# Docs\n- [Guide](http://md-dedup.local/docs/guide/index.md): Guide\n`;
+    const ctx = makeCtx('http://md-dedup.local', content);
+
+    server.use(
+      http.get('http://md-dedup.local/robots.txt', () => new HttpResponse('', { status: 404 })),
+      http.get(
+        'http://md-dedup.local/sitemap.xml',
+        () =>
+          new HttpResponse(
+            `<?xml version="1.0"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"><url><loc>http://md-dedup.local/docs/guide/</loc></url><url><loc>http://md-dedup.local/docs/other/</loc></url></urlset>`,
+            { status: 200, headers: { 'Content-Type': 'application/xml' } },
+          ),
+      ),
+    );
+
+    const result = await getPageUrls(ctx);
+    // /docs/guide/ should appear only once (not twice for .md + HTML)
+    const guideCount = result.urls.filter((u) => u === 'http://md-dedup.local/docs/guide/').length;
+    expect(guideCount).toBe(1);
+    // /docs/other/ from sitemap should still be present
+    expect(result.urls).toContain('http://md-dedup.local/docs/other/');
+  });
+
   // ── Direct llms.txt fetch (standalone mode) ──
 
   it('fetches llms.txt directly when llms-txt-exists has not run', async () => {