Skip to content

Commit cc6465a

Browse files
committed
feat(cli): add --scrape-mode option to fetch-url command
1 parent 924c7e6 commit cc6465a

File tree

3 files changed

+32
-5
lines changed

3 files changed

+32
-5
lines changed

README.md

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -267,7 +267,8 @@ docs-cli fetch-url <url> [options]
267267

268268
**Options:**
269269

270-
- `--no-follow-redirects`: Disable following HTTP redirects (default: follow redirects)
270+
- `--no-follow-redirects`: Disable following HTTP redirects (default: follow redirects).
271+
- `--scrape-mode <mode>`: HTML processing strategy: 'fetch' (fast, less JS), 'playwright' (slow, full JS), 'auto' (default).
271272

272273
**Examples:**
273274

src/cli.ts

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -228,10 +228,26 @@ async function main() {
228228
"--no-follow-redirects",
229229
"Disable following HTTP redirects (default: follow redirects)",
230230
)
231+
.option(
232+
"--scrape-mode <mode>",
233+
"HTML processing strategy: 'fetch' (fast, less JS), 'playwright' (slow, full JS), 'auto' (default)",
234+
(value) => {
235+
const validModes = ["fetch", "playwright", "auto"];
236+
if (!validModes.includes(value)) {
237+
console.warn(
238+
`Warning: Invalid scrape mode '${value}'. Using default 'auto'.`,
239+
);
240+
return "auto";
241+
}
242+
return value;
243+
},
244+
"auto",
245+
)
231246
.action(async (url, options) => {
232247
const content = await tools.fetchUrl.execute({
233248
url,
234249
followRedirects: options.followRedirects,
250+
scrapeMode: options.scrapeMode, // Pass the scrapeMode option
235251
});
236252
console.log(content);
237253
});

src/tools/FetchUrlTool.ts

Lines changed: 14 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -6,9 +6,9 @@ import type {
66
} from "../scraper/fetcher";
77
import { ContentProcessingPipeline } from "../scraper/middleware/ContentProcessorPipeline";
88
import {
9-
HtmlDomParserMiddleware,
109
HtmlMetadataExtractorMiddleware,
1110
HtmlSanitizerMiddleware,
11+
HtmlSelectProcessorMiddleware, // Import the new middleware
1212
HtmlToMarkdownMiddleware,
1313
MarkdownMetadataExtractorMiddleware,
1414
} from "../scraper/middleware/components";
@@ -30,6 +30,15 @@ export interface FetchUrlToolOptions {
3030
* @default true
3131
*/
3232
followRedirects?: boolean;
33+
34+
/**
35+
* Determines the HTML processing strategy.
36+
* - 'fetch': Use a simple DOM parser (faster, less JS support).
37+
* - 'playwright': Use a headless browser (slower, full JS support).
38+
* - 'auto': Automatically select the best strategy (currently defaults to 'playwright').
39+
* @default 'auto'
40+
*/
41+
scrapeMode?: "fetch" | "playwright" | "auto";
3342
}
3443

3544
/**
@@ -57,7 +66,7 @@ export class FetchUrlTool {
5766
* @throws {ToolError} If fetching or processing fails
5867
*/
5968
async execute(options: FetchUrlToolOptions): Promise<string> {
60-
const { url } = options;
69+
const { url, scrapeMode = "auto" } = options; // Destructure scrapeMode with default
6170

6271
// Check all fetchers first (helpful for testing and future extensions)
6372
const canFetchResults = this.fetchers.map((f) => f.canFetch(url));
@@ -104,17 +113,18 @@ export class FetchUrlTool {
104113
followRedirects: options.followRedirects ?? true,
105114
excludeSelectors: undefined, // Not currently configurable via this tool
106115
ignoreErrors: false,
116+
scrapeMode: scrapeMode, // Pass the scrapeMode
107117
} satisfies ScraperOptions,
108118
};
109119

110120
let pipeline: ContentProcessingPipeline;
111121
if (initialContext.contentType.startsWith("text/html")) {
112122
// Updated HTML pipeline for FetchUrlTool
113123
pipeline = new ContentProcessingPipeline([
114-
new HtmlDomParserMiddleware(),
124+
new HtmlSelectProcessorMiddleware(), // Use the selector middleware
115125
new HtmlMetadataExtractorMiddleware(), // Keep for potential future use, though title isn't returned
116126
// No Link Extractor needed
117-
new HtmlSanitizerMiddleware(), // Renamed instantiation, use default selectors
127+
new HtmlSanitizerMiddleware(), // Use default selectors
118128
new HtmlToMarkdownMiddleware(),
119129
]);
120130
} else if (

0 commit comments

Comments
 (0)