Skip to content

Commit 1e76de0

Browse files
authored
Merge pull request #42 from mvvmm/fix/strip-code-in-looks-like-html
fix: strip code blocks before HTML detection in looksLikeHtml()
2 parents abc5327 + 57f3d65 commit 1e76de0

2 files changed

Lines changed: 42 additions & 1 deletion

File tree

src/helpers/detect-markdown.ts

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,26 @@ const MD_HEADING = /^#{1,6}\s+\S/m;
44
const MD_LINK = /\[[^\]]+\]\([^)]+\)/;
55
const MD_CODE_FENCE = /^```/m;
66

7+
/**
 * Remove Markdown code from `text` so that HTML tags which are merely quoted
 * in code (e.g. `<body>` or a fenced HTML snippet) don't produce false
 * positives when checking for HTML patterns.
 *
 * Fenced blocks are recognised line by line, following CommonMark:
 * - an opening fence is a line that starts (in column 0) with three or more
 *   backticks or tildes; a backtick fence may not have another backtick later
 *   on the same line, because such a line is an inline code span instead;
 * - the block is closed by a line holding only a run of the same character,
 *   at least as long as the opening run, optionally followed by spaces/tabs;
 * - a fence that is never closed (typical when the caller passes a truncated
 *   sample) extends to the end of `text`.
 *
 * Every fenced block collapses to a single empty line and every inline code
 * span to an empty pair of backticks, so the surrounding text keeps its line
 * structure.
 *
 * @param text - Markdown (or arbitrary) text to clean.
 * @returns `text` with fenced code blocks and inline code spans removed.
 */
function stripCode(text: string): string {
  // Built once per call rather than once per line.
  const fenceOpen = /^(?:`{3,}|~{3,})/;
  const fenceClose = /^(`{3,}|~{3,})[ \t]*\r?$/;
  const inlineCode = /`[^`\n]+`/g;

  const kept: string[] = [];
  // Fence run that opened the block currently being skipped, if any.
  let openFence: string | undefined;

  for (const line of text.split('\n')) {
    if (openFence !== undefined) {
      // Inside a fenced block: drop every line, including the closing fence.
      const closer = fenceClose.exec(line)?.[1];
      if (
        closer !== undefined &&
        closer.charAt(0) === openFence.charAt(0) &&
        closer.length >= openFence.length
      ) {
        openFence = undefined;
      }
      continue;
    }

    const fence = fenceOpen.exec(line)?.[0];
    // "```code``` more text" is an inline span, not the start of a block.
    if (fence !== undefined && !(fence.startsWith('`') && line.includes('`', fence.length))) {
      openFence = fence;
      kept.push(''); // placeholder line standing in for the whole block
      continue;
    }

    kept.push(line.replace(inlineCode, '``'));
  }

  // If openFence is still set here the block was never closed; everything
  // after its opening line has already been dropped.
  return kept.join('\n');
}
19+
720
/**
 * Heuristically decides whether `body` is an HTML document.
 *
 * Only the first 2000 characters are examined. Markdown code (fenced blocks
 * and inline code spans) is removed from that sample via `stripCode` before
 * matching, so Markdown that merely shows HTML tags in code examples is not
 * mistaken for HTML.
 *
 * @param body - Raw response/document text to classify.
 * @returns `true` when any entry of `HTML_PATTERNS` matches the code-free
 *   sample (i.e. it contains DOCTYPE, <html>, <head>, or <body> tags),
 *   otherwise `false`.
 */
export function looksLikeHtml(body: string): boolean {
  const head = body.slice(0, 2000);
  const withoutCode = stripCode(head);
  for (const pattern of HTML_PATTERNS) {
    if (pattern.test(withoutCode)) {
      return true;
    }
  }
  return false;
}
1429

test/unit/helpers/detect-markdown.test.ts

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,26 @@ describe('looksLikeHtml', () => {
2626
it('returns false for markdown', () => {
2727
expect(looksLikeHtml('# Hello\n\nThis is **markdown**.')).toBe(false);
2828
});
29+
30+
it('ignores HTML tags inside fenced code blocks', () => {
31+
const md = '# Example\n\n```html\n<!DOCTYPE html>\n<html>\n<body>Hello</body>\n</html>\n```\n';
32+
expect(looksLikeHtml(md)).toBe(false);
33+
});
34+
35+
it('ignores HTML tags inside inline code spans', () => {
36+
const md = '# Setup\n\nAdd the script before the closing `</body>` tag.\n';
37+
expect(looksLikeHtml(md)).toBe(false);
38+
});
39+
40+
it('ignores HTML tags inside tilde fenced code blocks', () => {
41+
const md = '# Example\n\n~~~html\n<html>\n<head><title>Test</title></head>\n</html>\n~~~\n';
42+
expect(looksLikeHtml(md)).toBe(false);
43+
});
44+
45+
it('still detects real HTML outside of code blocks', () => {
46+
const html = '<!DOCTYPE html>\n<html>\n```not a code block\n```\n</html>';
47+
expect(looksLikeHtml(html)).toBe(true);
48+
});
2949
});
3050

3151
describe('looksLikeMarkdown', () => {
@@ -48,4 +68,10 @@ describe('looksLikeMarkdown', () => {
4868
it('returns false for plain text with no markdown signals', () => {
4969
expect(looksLikeMarkdown('Just some plain text without any formatting.')).toBe(false);
5070
});
71+
72+
it('returns true for markdown containing HTML examples in code', () => {
73+
const md =
74+
'# Web API\n\nAdd the script before `</body>`.\n\n```html\n<html><body>Hello</body></html>\n```\n';
75+
expect(looksLikeMarkdown(md)).toBe(true);
76+
});
5177
});

0 commit comments

Comments
 (0)