Skip to content

Commit e8e4beb

Browse files
committed
feat(cli): add --scrape-mode option and update README
1 parent ee3118f commit e8e4beb

File tree

13 files changed

+227
-7
lines changed

13 files changed

+227
-7
lines changed

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -292,6 +292,9 @@ docs-cli scrape <library> <url> [options]
292292
- `-p, --max-pages <number>`: Maximum pages to scrape (default: 1000).
293293
- `-d, --max-depth <number>`: Maximum navigation depth (default: 3).
294294
- `-c, --max-concurrency <number>`: Maximum concurrent requests (default: 3).
295+
- `--scope <scope>`: Defines the crawling boundary: 'subpages' (default), 'hostname', or 'domain'.
296+
- `--no-follow-redirects`: Disable following HTTP redirects (default: follow redirects).
297+
- `--scrape-mode <mode>`: HTML processing strategy: 'fetch' (fast, less JS), 'playwright' (slow, full JS), 'auto' (default).
295298
- `--ignore-errors`: Ignore errors during scraping (default: true).
296299

297300
**Examples:**

src/cli.ts

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,21 @@ async function main() {
9292
"--no-follow-redirects",
9393
"Disable following HTTP redirects (default: follow redirects)",
9494
)
95+
.option(
96+
"--scrape-mode <mode>",
97+
"HTML processing strategy: 'fetch' (fast, less JS), 'playwright' (slow, full JS), 'auto' (default)",
98+
(value) => {
99+
const validModes = ["fetch", "playwright", "auto"];
100+
if (!validModes.includes(value)) {
101+
console.warn(
102+
`Warning: Invalid scrape mode '${value}'. Using default 'auto'.`,
103+
);
104+
return "auto";
105+
}
106+
return value;
107+
},
108+
"auto",
109+
)
95110
.action(async (library, url, options) => {
96111
// Update action parameters
97112
const result = await tools.scrape.execute({
@@ -105,6 +120,7 @@ async function main() {
105120
ignoreErrors: options.ignoreErrors,
106121
scope: options.scope,
107122
followRedirects: options.followRedirects, // This will be `true` by default, or `false` if --no-follow-redirects is used
123+
scrapeMode: options.scrapeMode, // Pass the new scrapeMode option
108124
},
109125
// CLI always waits for completion (default behavior)
110126
});
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
import { beforeEach, describe, expect, it, vi } from "vitest";
2+
import type { ContentProcessingContext } from "../types";
3+
import { HtmlDomParserMiddleware } from "./HtmlDomParserMiddleware";
4+
import { HtmlPlaywrightMiddleware } from "./HtmlPlaywrightMiddleware";
5+
import { HtmlSmartProcessorMiddleware } from "./HtmlSmartProcessorMiddleware";
6+
7+
// Mock the underlying processors
8+
vi.mock("./HtmlDomParserMiddleware");
9+
vi.mock("./HtmlPlaywrightMiddleware");
10+
11+
// Get typed mocks
12+
const MockedHtmlDomParserMiddleware = vi.mocked(HtmlDomParserMiddleware);
13+
const MockedHtmlPlaywrightMiddleware = vi.mocked(HtmlPlaywrightMiddleware);
14+
15+
describe("HtmlSmartProcessorMiddleware", () => {
16+
const mockNext = vi.fn().mockResolvedValue(undefined);
17+
18+
beforeEach(() => {
19+
// Clear mocks before each test to ensure isolation
20+
MockedHtmlDomParserMiddleware.mockClear();
21+
MockedHtmlPlaywrightMiddleware.mockClear();
22+
// Also clear any mocks on the process methods if they were added to prototypes (though we'll avoid that now)
23+
// It's safer to ensure the instances returned by the mock constructor have their method mocks cleared if necessary,
24+
// but vi.mock usually handles this. Let's rely on constructor mockClear for now.
25+
mockNext.mockClear();
26+
});
27+
28+
// Helper to create a basic context
29+
const createContext = (
30+
scrapeMode?: "fetch" | "playwright" | "auto",
31+
): ContentProcessingContext => ({
32+
content: "<html><body>Test</body></html>",
33+
contentType: "text/html",
34+
source: "http://example.com",
35+
metadata: {},
36+
links: [],
37+
errors: [],
38+
options: {
39+
url: "http://example.com",
40+
library: "test-lib",
41+
version: "1.0.0",
42+
scrapeMode, // Pass the mode here
43+
},
44+
});
45+
46+
it("should use HtmlDomParserMiddleware when scrapeMode is 'fetch'", async () => {
47+
const context = createContext("fetch");
48+
const middleware = new HtmlSmartProcessorMiddleware();
49+
50+
await middleware.process(context, mockNext);
51+
52+
// Verify constructor calls happened during middleware instantiation
53+
expect(MockedHtmlDomParserMiddleware).toHaveBeenCalledTimes(1);
54+
expect(MockedHtmlPlaywrightMiddleware).toHaveBeenCalledTimes(1);
55+
56+
// Get the instances created by the constructor
57+
const domInstance = MockedHtmlDomParserMiddleware.mock.instances[0];
58+
const playwrightInstance = MockedHtmlPlaywrightMiddleware.mock.instances[0];
59+
60+
// Check that the correct instance's process method was called
61+
expect(domInstance.process).toHaveBeenCalledTimes(1);
62+
expect(domInstance.process).toHaveBeenCalledWith(context, mockNext);
63+
expect(playwrightInstance.process).not.toHaveBeenCalled();
64+
});
65+
66+
it("should use HtmlPlaywrightMiddleware when scrapeMode is 'playwright'", async () => {
67+
const context = createContext("playwright");
68+
const middleware = new HtmlSmartProcessorMiddleware();
69+
70+
await middleware.process(context, mockNext);
71+
72+
// Verify constructor calls happened during middleware instantiation
73+
expect(MockedHtmlDomParserMiddleware).toHaveBeenCalledTimes(1);
74+
expect(MockedHtmlPlaywrightMiddleware).toHaveBeenCalledTimes(1);
75+
76+
// Get the instances created by the constructor
77+
const domInstance = MockedHtmlDomParserMiddleware.mock.instances[0];
78+
const playwrightInstance = MockedHtmlPlaywrightMiddleware.mock.instances[0];
79+
80+
// Check that the correct instance's process method was called
81+
expect(playwrightInstance.process).toHaveBeenCalledTimes(1);
82+
expect(playwrightInstance.process).toHaveBeenCalledWith(context, mockNext);
83+
expect(domInstance.process).not.toHaveBeenCalled();
84+
});
85+
86+
it("should use HtmlPlaywrightMiddleware when scrapeMode is 'auto'", async () => {
87+
const context = createContext("auto");
88+
const middleware = new HtmlSmartProcessorMiddleware();
89+
90+
await middleware.process(context, mockNext);
91+
92+
// Verify constructor calls happened during middleware instantiation
93+
expect(MockedHtmlDomParserMiddleware).toHaveBeenCalledTimes(1);
94+
expect(MockedHtmlPlaywrightMiddleware).toHaveBeenCalledTimes(1);
95+
96+
// Get the instances created by the constructor
97+
const domInstance = MockedHtmlDomParserMiddleware.mock.instances[0];
98+
const playwrightInstance = MockedHtmlPlaywrightMiddleware.mock.instances[0];
99+
100+
// Check that the correct instance's process method was called
101+
expect(playwrightInstance.process).toHaveBeenCalledTimes(1);
102+
expect(playwrightInstance.process).toHaveBeenCalledWith(context, mockNext);
103+
expect(domInstance.process).not.toHaveBeenCalled();
104+
});
105+
106+
it("should default to 'auto' (Playwright) when scrapeMode is undefined", async () => {
107+
const context = createContext(undefined); // Explicitly undefined
108+
const middleware = new HtmlSmartProcessorMiddleware();
109+
110+
await middleware.process(context, mockNext);
111+
112+
// Verify constructor calls happened during middleware instantiation
113+
expect(MockedHtmlDomParserMiddleware).toHaveBeenCalledTimes(1);
114+
expect(MockedHtmlPlaywrightMiddleware).toHaveBeenCalledTimes(1);
115+
116+
// Get the instances created by the constructor
117+
const domInstance = MockedHtmlDomParserMiddleware.mock.instances[0];
118+
const playwrightInstance = MockedHtmlPlaywrightMiddleware.mock.instances[0];
119+
120+
// Check that the correct instance's process method was called
121+
expect(playwrightInstance.process).toHaveBeenCalledTimes(1);
122+
expect(playwrightInstance.process).toHaveBeenCalledWith(context, mockNext);
123+
expect(domInstance.process).not.toHaveBeenCalled();
124+
});
125+
});
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
import type { ContentProcessingContext, ContentProcessorMiddleware } from "../types";
2+
import { HtmlDomParserMiddleware } from "./HtmlDomParserMiddleware";
3+
import { HtmlPlaywrightMiddleware } from "./HtmlPlaywrightMiddleware";
4+
5+
/**
6+
* A content processing middleware that intelligently selects an HTML processing
7+
* strategy (DOM parsing or Playwright) based on the `scrapeMode` option.
8+
*
9+
* - 'fetch': Uses the lightweight `HtmlDomParserMiddleware`.
10+
* - 'playwright': Uses the full-featured `HtmlPlaywrightMiddleware`.
11+
* - 'auto': Currently defaults to using `HtmlPlaywrightMiddleware`. Future implementations
12+
* might add more sophisticated logic for 'auto' mode.
13+
*/
14+
export class HtmlSmartProcessorMiddleware implements ContentProcessorMiddleware {
15+
private readonly domProcessor: HtmlDomParserMiddleware;
16+
private readonly playwrightProcessor: HtmlPlaywrightMiddleware;
17+
18+
constructor() {
19+
this.domProcessor = new HtmlDomParserMiddleware();
20+
this.playwrightProcessor = new HtmlPlaywrightMiddleware();
21+
}
22+
23+
/**
24+
* Processes the content using the pre-instantiated HtmlDomParserMiddleware or HtmlPlaywrightMiddleware
25+
* based on the scrapeMode specified in the context options, then calls the next middleware.
26+
* @param context - The content processing context.
27+
* @param next - A function to call to pass control to the next middleware.
28+
*/
29+
async process(
30+
context: ContentProcessingContext,
31+
next: () => Promise<void>,
32+
): Promise<void> {
33+
// Default to 'auto' if scrapeMode is not provided
34+
const mode = context.options?.scrapeMode ?? "auto";
35+
36+
let selectedProcessor: ContentProcessorMiddleware;
37+
38+
if (mode === "fetch") {
39+
selectedProcessor = this.domProcessor;
40+
} else {
41+
// Default to Playwright for 'playwright' and 'auto' modes
42+
selectedProcessor = this.playwrightProcessor;
43+
}
44+
45+
// Run the selected pre-instantiated processor
46+
await selectedProcessor.process(context, next);
47+
}
48+
}

src/scraper/middleware/components/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ export * from "./HtmlLinkExtractorMiddleware";
44
export * from "./HtmlMetadataExtractorMiddleware";
55
export * from "./HtmlPlaywrightMiddleware";
66
export * from "./HtmlSanitizerMiddleware";
7+
export * from "./HtmlSmartProcessorMiddleware";
78
export * from "./HtmlToMarkdownMiddleware";
89
export * from "./MarkdownLinkExtractorMiddleware";
910
export * from "./MarkdownMetadataExtractorMiddleware";

src/scraper/strategies/BaseScraperStrategy.test.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ describe("BaseScraperStrategy", () => {
183183
"https://example.com/path", // Without trailing slash
184184
"https://example.com/path?q=1",
185185
"https://example.com/path?q=1#anchor", // With anchor
186-
"https://EXAMPLE.com/path", // Different case
186+
"https://example.com/path", // Different case
187187
],
188188
};
189189
}

src/scraper/strategies/WebScraperStrategy.test.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ describe("WebScraperStrategy", () => {
4848
scope: "subpages",
4949
// Ensure followRedirects has a default for tests if needed by fetch mock checks
5050
followRedirects: true,
51+
scrapeMode: "fetch", // fastest mode for testing
5152
};
5253

5354
// No need to mock prototype anymore

src/scraper/strategies/WebScraperStrategy.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,10 @@ import type { RawContent } from "../fetcher/types";
77
import { ContentProcessingPipeline } from "../middleware/ContentProcessorPipeline";
88
// Import new and updated middleware from index
99
import {
10-
HtmlDomParserMiddleware,
1110
HtmlLinkExtractorMiddleware,
1211
HtmlMetadataExtractorMiddleware,
1312
HtmlSanitizerMiddleware,
13+
HtmlSmartProcessorMiddleware, // Import the new middleware
1414
HtmlToMarkdownMiddleware,
1515
MarkdownLinkExtractorMiddleware,
1616
MarkdownMetadataExtractorMiddleware,
@@ -99,7 +99,7 @@ export class WebScraperStrategy extends BaseScraperStrategy {
9999
if (initialContext.contentType.startsWith("text/html")) {
100100
// Updated HTML pipeline order
101101
pipeline = new ContentProcessingPipeline([
102-
new HtmlDomParserMiddleware(),
102+
new HtmlSmartProcessorMiddleware(), // Use the smart processor
103103
new HtmlMetadataExtractorMiddleware(),
104104
new HtmlLinkExtractorMiddleware(), // Extract links before cleaning
105105
new HtmlSanitizerMiddleware(),

src/scraper/types.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,14 @@ export interface ScraperOptions {
3939
ignoreErrors?: boolean;
4040
/** CSS selectors for elements to exclude during HTML processing */
4141
excludeSelectors?: string[];
42+
/**
43+
* Determines the HTML processing strategy.
44+
* - 'fetch': Use a simple DOM parser (faster, less JS support).
45+
* - 'playwright': Use a headless browser (slower, full JS support).
46+
* - 'auto': Automatically select the best strategy (currently defaults to 'playwright').
47+
* @default 'auto'
48+
*/
49+
scrapeMode?: "fetch" | "playwright" | "auto";
4250
/** Optional AbortSignal for cancellation */
4351
signal?: AbortSignal;
4452
}

src/tools/ScrapeTool.test.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@ describe("ScrapeTool", () => {
143143
maxDepth: 2, // Overridden
144144
maxConcurrency: 5, // Test override
145145
ignoreErrors: false, // Overridden
146+
scrapeMode: "auto", // Add expected scrapeMode
146147
},
147148
);
148149
expect(mockManagerInstance.waitForJobCompletion).toHaveBeenCalledWith(MOCK_JOB_ID);

0 commit comments

Comments
 (0)