Skip to content

Commit dee350f

Browse files
committed
fix(scraper): use JSDOM title property for robust HTML title extraction
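Replaces the fragile regex-based title extraction in HtmlProcessor with the more robust window.document.title property provided by JSDOM. This correctly handles cases where the <title> tag contains attributes. The JSDOM instance is now also constructed from the decoded htmlContent string instead of the raw content.content value, so the title and the sanitized content are derived from the same input. Adds a test case to verify title extraction with attributes. Fixes #41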
1 parent 3ed31cc commit dee350f

File tree

2 files changed

+17
-4
lines changed

2 files changed

+17
-4
lines changed

src/scraper/processor/HtmlProcessor.test.ts

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,20 @@ describe("HtmlProcessor", () => {
2323
expect(result.links).toEqual([]); // No links in this example
2424
});
2525

26+
it("should process HTML with attributes in the title tag", async () => {
27+
const processor = new HtmlProcessor();
28+
const rawContent: RawContent = {
29+
content:
30+
'<html><head><title lang="en">Title With Attributes</title></head><body><h1>Hello</h1></body></html>',
31+
mimeType: "text/html",
32+
source: "https://example.com",
33+
};
34+
35+
const result = await processor.process(rawContent);
36+
expect(result.title).toBe("Title With Attributes");
37+
expect(result.content).toContain("# Hello");
38+
});
39+
2640
it("should extract links", async () => {
2741
const processor = new HtmlProcessor();
2842
const rawContent: RawContent = {

src/scraper/processor/HtmlProcessor.ts

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -162,11 +162,10 @@ export class HtmlProcessor implements ContentProcessor {
162162
? content.content
163163
: content.content.toString((content.encoding as BufferEncoding) || "utf-8");
164164

165-
// Find title
166-
const titleMatch = htmlContent.match(/<title>([^<]+)<\/title>/i);
167-
const title = titleMatch?.[1] || "Untitled";
165+
const window = new JSDOM(htmlContent, { url: content.source }).window;
168166

169-
const window = new JSDOM(content.content, { url: content.source }).window;
167+
// Extract title using JSDOM
168+
const title = window.document.title || "Untitled";
170169

171170
const purify = createDOMPurify(window as unknown as WindowLike);
172171
const purifiedContent = purify.sanitize(htmlContent, {

0 commit comments

Comments
 (0)