Skip to content

Commit 5dd624a

Browse files
committed
feat(scraper): replace JSDOM with Cheerio for HTML parsing
1 parent afd9ff1 commit 5dd624a

25 files changed

+524
-693
lines changed

package-lock.json

Lines changed: 208 additions & 15 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,8 @@
4444
"axios": "^1.8.3",
4545
"axios-retry": "^4.5.0",
4646
"better-sqlite3": "^11.9.1",
47+
"cheerio": "^1.0.0",
4748
"commander": "^13.1.0",
48-
"dompurify": "^3.2.4",
4949
"dotenv": "^16.4.7",
5050
"env-paths": "^3.0.0",
5151
"fuse.js": "^7.1.0",

src/pipeline/PipelineWorker.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ export class PipelineWorker {
2929
const { id: jobId, library, version, options, abortController } = job;
3030
const signal = abortController.signal;
3131

32-
logger.info(`[${jobId}] Worker starting job for ${library}@${version}`);
32+
logger.debug(`[${jobId}] Worker starting job for ${library}@${version}`);
3333

3434
try {
3535
// --- Core Job Logic ---

src/scraper/middleware/ContentProcessorPipeline.test.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import { describe, expect, it, vi } from "vitest";
22
import type { ScraperOptions } from "../types";
33
import { ContentProcessingPipeline } from "./ContentProcessorPipeline";
44
import {
5-
HtmlDomParserMiddleware,
5+
HtmlCheerioParserMiddleware, // Updated import
66
HtmlLinkExtractorMiddleware,
77
HtmlMetadataExtractorMiddleware,
88
HtmlSanitizerMiddleware,
@@ -55,10 +55,10 @@ const createPipelineTestContext = (
5555

5656
// Define the standard HTML pipeline for tests
5757
const htmlPipeline = new ContentProcessingPipeline([
58-
new HtmlDomParserMiddleware(),
58+
new HtmlCheerioParserMiddleware(),
5959
new HtmlMetadataExtractorMiddleware(),
6060
new HtmlLinkExtractorMiddleware(),
61-
new HtmlSanitizerMiddleware(), // Uses default selectors if none provided in context.options
61+
new HtmlSanitizerMiddleware(),
6262
new HtmlToMarkdownMiddleware(),
6363
]);
6464

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
import * as cheerio from "cheerio";
2+
import { logger } from "../../../utils/logger";
3+
import type { ContentProcessingContext, ContentProcessorMiddleware } from "../types";
4+
5+
/**
 * Middleware that parses HTML content into a Cheerio API object and stores it
 * on `context.dom` for later middleware to use.
 *
 * The HTML in `context.content` is treated as the final markup to parse
 * (e.g. after potential rendering by Playwright or modification by JS execution).
 *
 * Behaviour:
 * - Content whose `contentType` does not start with `"text/html"` is passed
 *   through untouched.
 * - If parsing fails, the error is logged, recorded in `context.errors`, and
 *   the rest of the pipeline is NOT invoked.
 * - Errors thrown by downstream middleware are not caught here; they propagate
 *   to the caller exactly as they do on the non-HTML pass-through path, so they
 *   are never misreported as Cheerio parsing failures.
 */
export class HtmlCheerioParserMiddleware implements ContentProcessorMiddleware {
  /**
   * Parses HTML content and hands control to the next middleware.
   *
   * @param context - Processing context; `content` is read, `dom` is written on
   *   success, and `errors` receives an entry when parsing fails.
   * @param next - Continuation that runs the remainder of the pipeline.
   * @returns A promise that settles once this middleware (and, when invoked,
   *   the downstream chain) has finished.
   */
  async process(
    context: ContentProcessingContext,
    next: () => Promise<void>,
  ): Promise<void> {
    // Only process HTML content
    if (!context.contentType.startsWith("text/html")) {
      await next();
      return;
    }

    // Ensure content is a string for Cheerio
    const htmlString =
      typeof context.content === "string"
        ? context.content
        : Buffer.from(context.content).toString("utf-8");

    // Keep the try block scoped to parsing only, so that failures raised by
    // later middleware are not swallowed and mislabelled as parse errors.
    try {
      logger.debug(`Parsing HTML content with Cheerio from ${context.source}`);
      // Load the HTML string and expose the Cheerio API object on the context
      context.dom = cheerio.load(htmlString);
    } catch (error) {
      logger.error(`Failed to parse HTML with Cheerio for ${context.source}: ${error}`);
      context.errors.push(
        error instanceof Error
          ? error
          : new Error(`Cheerio HTML parsing failed: ${String(error)}`),
      );
      // Do not proceed further down the pipeline if parsing fails
      return;
    }

    // Parsing succeeded: proceed to the next middleware
    await next();
  }
}

0 commit comments

Comments
 (0)