fix(scraper): use JSDOM title property for robust HTML title extraction

arabold · arabold · commit dee350f48242 · 2025-04-14T08:26:05.000-07:00
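Replaces the fragile regex-based title extraction in `HtmlProcessor` with the more robust `window.document.title` property provided by JSDOM. This correctly handles cases where the `<title>` tag contains attributes, which the previous `/<title>([^<]+)<\/title>/i` pattern did not match. JSDOM is now also constructed from the decoded `htmlContent` string rather than the raw `content.content`, so the title and the sanitized content are derived from the same input. Adds a test case to verify title extraction with attributes. Fixes #41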
diff --git a/src/scraper/processor/HtmlProcessor.test.ts b/src/scraper/processor/HtmlProcessor.test.ts
@@ -23,6 +23,20 @@ describe("HtmlProcessor", () => {
     expect(result.links).toEqual([]); // No links in this example
   });
 
+  it("should process HTML with attributes in the title tag", async () => {
+    const processor = new HtmlProcessor();
+    const rawContent: RawContent = {
+      content:
+        '<html><head><title lang="en">Title With Attributes</title></head><body><h1>Hello</h1></body></html>',
+      mimeType: "text/html",
+      source: "https://example.com",
+    };
+
+    const result = await processor.process(rawContent);
+    expect(result.title).toBe("Title With Attributes");
+    expect(result.content).toContain("# Hello");
+  });
+
   it("should extract links", async () => {
     const processor = new HtmlProcessor();
     const rawContent: RawContent = {
diff --git a/src/scraper/processor/HtmlProcessor.ts b/src/scraper/processor/HtmlProcessor.ts
@@ -162,11 +162,10 @@ export class HtmlProcessor implements ContentProcessor {
         ? content.content
         : content.content.toString((content.encoding as BufferEncoding) || "utf-8");
 
-    // Find title
-    const titleMatch = htmlContent.match(/<title>([^<]+)<\/title>/i);
-    const title = titleMatch?.[1] || "Untitled";
+    const window = new JSDOM(htmlContent, { url: content.source }).window;
 
-    const window = new JSDOM(content.content, { url: content.source }).window;
+    // Extract title using JSDOM
+    const title = window.document.title || "Untitled";
 
     const purify = createDOMPurify(window as unknown as WindowLike);
     const purifiedContent = purify.sanitize(htmlContent, {