Skip to content

Commit c003a99

Browse files
authored
Merge pull request #35 from agent-ecosystem/fix-content-start-position
fix: html-to-markdown flattening markdown tables
2 parents 1bb8afa + 6dbaef1 commit c003a99

6 files changed

Lines changed: 77 additions & 0 deletions

File tree

package-lock.json

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -80,6 +80,7 @@
8080
"commander": "^13.1.0",
8181
"node-html-parser": "^7.1.0",
8282
"turndown": "^7.2.2",
83+
"turndown-plugin-gfm": "^1.0.2",
8384
"yaml": "^2.7.0"
8485
},
8586
"devDependencies": {

src/checks/page-size/content-start-position.ts

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -74,6 +74,12 @@ function headingFollowedByContent(lines: string[], headingIdx: number): boolean
7474
const nextLine = i + 1 < lines.length ? lines[i + 1].trim() : '';
7575
if (/^[=-]+$/.test(nextLine) && nextLine.length >= 2) return false;
7676

77+
// Markdown table row (e.g. "| Col A | Col B |")
78+
if (/^\|.+\|/.test(t)) return true;
79+
80+
// HTML table tags inside .md files (not converted by htmlToMarkdown)
81+
if (/^<table[\s>]/i.test(t) || /^<tr[\s>]/i.test(t)) return true;
82+
7783
// A line > NAV_MAX_LENGTH that isn't a link is likely real prose
7884
if (t.length > NAV_MAX_LENGTH && linkDensity(t) < 0.5) return true;
7985

src/helpers/html-to-markdown.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,15 @@
11
import TurndownService from 'turndown';
2+
import { tables } from 'turndown-plugin-gfm';
23

34
/**
45
* Convert HTML to markdown using Turndown with default configuration.
56
* Matches real agent behavior per the Agent-Friendly Documentation Spec:
67
* no explicit <style>/<script> stripping, default options only.
8+
* The GFM tables plugin is enabled so HTML tables are preserved as markdown
9+
* tables rather than being flattened to plain text.
710
*/
811
export function htmlToMarkdown(html: string): string {
912
const turndown = new TurndownService();
13+
turndown.use(tables);
1014
return turndown.turndown(html);
1115
}

src/types/turndown-plugin-gfm.d.ts

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
declare module 'turndown-plugin-gfm' {
2+
import type TurndownService from 'turndown';
3+
4+
type TurndownPlugin = (service: TurndownService) => void;
5+
6+
export const gfm: TurndownPlugin;
7+
export const tables: TurndownPlugin;
8+
export const strikethrough: TurndownPlugin;
9+
export const taskListItems: TurndownPlugin;
10+
export const highlightedCodeBlock: TurndownPlugin;
11+
}

test/unit/checks/content-start-position.test.ts

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -507,6 +507,54 @@ describe('content-start-position', () => {
507507
expect(result.status).toBe('pass');
508508
});
509509

510+
// ── Table content after headings (issue #20) ──
511+
512+
it('detects markdown table as content after heading', async () => {
513+
const html = `<html><body>
514+
<h1>Limits for Scheduled Triggers</h1>
515+
<table><tr><th>Trigger interval</th><th>Max executions</th></tr>
516+
<tr><td>Every 5 minutes</td><td>50 per hour</td></tr></table>
517+
</body></html>`;
518+
519+
server.use(
520+
http.get(
521+
'http://test.local/docs/table-after-heading',
522+
() => new HttpResponse(html, { status: 200, headers: { 'Content-Type': 'text/html' } }),
523+
),
524+
);
525+
526+
const result = await check.run(singlePageCtx('/docs/table-after-heading'));
527+
expect(result.status).toBe('pass');
528+
});
529+
530+
it('detects markdown table in .md content as content after heading', async () => {
531+
const md = `# Limits\n\n| Trigger interval | Max executions |\n|-----------------|----------------|\n| Every 5 minutes | 50 per hour |\n`;
532+
533+
server.use(
534+
http.get(
535+
'http://test.local/docs/table-md',
536+
() => new HttpResponse(md, { status: 200, headers: { 'Content-Type': 'text/markdown' } }),
537+
),
538+
);
539+
540+
const result = await check.run(singlePageCtx('/docs/table-md'));
541+
expect(result.status).toBe('pass');
542+
});
543+
544+
it('detects HTML table in .md content as content after heading', async () => {
545+
const md = `# Limits\n\n<table>\n <tr><th>Col A</th><th>Col B</th></tr>\n <tr><td>val</td><td>val</td></tr>\n</table>\n`;
546+
547+
server.use(
548+
http.get(
549+
'http://test.local/docs/table-html-in-md',
550+
() => new HttpResponse(md, { status: 200, headers: { 'Content-Type': 'text/markdown' } }),
551+
),
552+
);
553+
554+
const result = await check.run(singlePageCtx('/docs/table-html-in-md'));
555+
expect(result.status).toBe('pass');
556+
});
557+
510558
// ── Lines that don't match any skip pattern and aren't prose ──
511559

512560
it('skips short multi-word non-prose lines (breadcrumbs)', async () => {

0 commit comments

Comments (0)