feat(cli): add --scrape-mode option and update README

arabold · arabold · commit e8e4beb57170 · 2025-04-19T09:07:52.000-07:00
diff --git a/README.md b/README.md
@@ -292,6 +292,9 @@ docs-cli scrape <library> <url> [options]
 - `-p, --max-pages <number>`: Maximum pages to scrape (default: 1000).
 - `-d, --max-depth <number>`: Maximum navigation depth (default: 3).
 - `-c, --max-concurrency <number>`: Maximum concurrent requests (default: 3).
+- `--scope <scope>`: Defines the crawling boundary: 'subpages' (default), 'hostname', or 'domain'.
+- `--no-follow-redirects`: Disable following HTTP redirects (default: follow redirects).
+- `--scrape-mode <mode>`: HTML processing strategy: 'fetch' (fast, less JS), 'playwright' (slow, full JS), 'auto' (default).
 - `--ignore-errors`: Ignore errors during scraping (default: true).
 
 **Examples:**
diff --git a/src/cli.ts b/src/cli.ts
@@ -92,6 +92,21 @@ async function main() {
         "--no-follow-redirects",
         "Disable following HTTP redirects (default: follow redirects)",
       )
+      .option(
+        "--scrape-mode <mode>",
+        "HTML processing strategy: 'fetch' (fast, less JS), 'playwright' (slow, full JS), 'auto' (default)",
+        // Known modes pass through; anything else is reported and replaced by 'auto'.
+        (rawMode) => {
+          if (["fetch", "playwright", "auto"].includes(rawMode)) {
+            return rawMode;
+          }
+          console.warn(
+            `Warning: Invalid scrape mode '${rawMode}'. Using default 'auto'.`,
+          );
+          return "auto";
+        },
+        "auto",
+      )
       .action(async (library, url, options) => {
         // Update action parameters
         const result = await tools.scrape.execute({
@@ -105,6 +120,7 @@ async function main() {
             ignoreErrors: options.ignoreErrors,
             scope: options.scope,
             followRedirects: options.followRedirects, // This will be `true` by default, or `false` if --no-follow-redirects is used
+            scrapeMode: options.scrapeMode, // Pass the new scrapeMode option
           },
           // CLI always waits for completion (default behavior)
         });
diff --git a/src/scraper/middleware/components/HtmlSmartProcessorMiddleware.test.ts b/src/scraper/middleware/components/HtmlSmartProcessorMiddleware.test.ts
@@ -0,0 +1,125 @@
+import { beforeEach, describe, expect, it, vi } from "vitest";
+import type { ContentProcessingContext } from "../types";
+import { HtmlDomParserMiddleware } from "./HtmlDomParserMiddleware";
+import { HtmlPlaywrightMiddleware } from "./HtmlPlaywrightMiddleware";
+import { HtmlSmartProcessorMiddleware } from "./HtmlSmartProcessorMiddleware";
+
+// Mock the underlying processors
+vi.mock("./HtmlDomParserMiddleware");
+vi.mock("./HtmlPlaywrightMiddleware");
+
+// Get typed mocks
+const MockedHtmlDomParserMiddleware = vi.mocked(HtmlDomParserMiddleware);
+const MockedHtmlPlaywrightMiddleware = vi.mocked(HtmlPlaywrightMiddleware);
+
+// Checks which underlying processor HtmlSmartProcessorMiddleware delegates to for each scrapeMode.
+describe("HtmlSmartProcessorMiddleware", () => {
+  const mockNext = vi.fn().mockResolvedValue(undefined);
+
+  beforeEach(() => {
+    // Reset recorded calls/instances so index 0 below always refers to the current test
+    MockedHtmlDomParserMiddleware.mockClear();
+    MockedHtmlPlaywrightMiddleware.mockClear();
+    // The `process` mocks are read from each test's own instances and are not reset separately.
+    // NOTE(review): presumably vi.mock gives every instance fresh method mocks — confirm.
+    mockNext.mockClear();
+  });
+
+  // Builds a minimal text/html context; `scrapeMode` is the only field that varies per test
+  const createContext = (
+    scrapeMode?: "fetch" | "playwright" | "auto",
+  ): ContentProcessingContext => ({
+    content: "<html><body>Test</body></html>",
+    contentType: "text/html",
+    source: "http://example.com",
+    metadata: {},
+    links: [],
+    errors: [],
+    options: {
+      url: "http://example.com",
+      library: "test-lib",
+      version: "1.0.0",
+      scrapeMode, // May be undefined to exercise the middleware's default
+    },
+  });
+
+  it("should use HtmlDomParserMiddleware when scrapeMode is 'fetch'", async () => {
+    const context = createContext("fetch");
+    const middleware = new HtmlSmartProcessorMiddleware();
+
+    await middleware.process(context, mockNext);
+
+    // Both processors are constructed once by the middleware, whatever the mode
+    expect(MockedHtmlDomParserMiddleware).toHaveBeenCalledTimes(1);
+    expect(MockedHtmlPlaywrightMiddleware).toHaveBeenCalledTimes(1);
+
+    // Grab the mock instances the middleware constructor created
+    const domInstance = MockedHtmlDomParserMiddleware.mock.instances[0];
+    const playwrightInstance = MockedHtmlPlaywrightMiddleware.mock.instances[0];
+
+    // Only the DOM parser receives the context, together with the original `next`
+    expect(domInstance.process).toHaveBeenCalledTimes(1);
+    expect(domInstance.process).toHaveBeenCalledWith(context, mockNext);
+    expect(playwrightInstance.process).not.toHaveBeenCalled();
+  });
+
+  it("should use HtmlPlaywrightMiddleware when scrapeMode is 'playwright'", async () => {
+    const context = createContext("playwright");
+    const middleware = new HtmlSmartProcessorMiddleware();
+
+    await middleware.process(context, mockNext);
+
+    // Both processors are constructed once by the middleware, whatever the mode
+    expect(MockedHtmlDomParserMiddleware).toHaveBeenCalledTimes(1);
+    expect(MockedHtmlPlaywrightMiddleware).toHaveBeenCalledTimes(1);
+
+    // Grab the mock instances the middleware constructor created
+    const domInstance = MockedHtmlDomParserMiddleware.mock.instances[0];
+    const playwrightInstance = MockedHtmlPlaywrightMiddleware.mock.instances[0];
+
+    // Only the Playwright processor receives the context, together with the original `next`
+    expect(playwrightInstance.process).toHaveBeenCalledTimes(1);
+    expect(playwrightInstance.process).toHaveBeenCalledWith(context, mockNext);
+    expect(domInstance.process).not.toHaveBeenCalled();
+  });
+
+  it("should use HtmlPlaywrightMiddleware when scrapeMode is 'auto'", async () => {
+    const context = createContext("auto");
+    const middleware = new HtmlSmartProcessorMiddleware();
+
+    await middleware.process(context, mockNext);
+
+    // Both processors are constructed once by the middleware, whatever the mode
+    expect(MockedHtmlDomParserMiddleware).toHaveBeenCalledTimes(1);
+    expect(MockedHtmlPlaywrightMiddleware).toHaveBeenCalledTimes(1);
+
+    // Grab the mock instances the middleware constructor created
+    const domInstance = MockedHtmlDomParserMiddleware.mock.instances[0];
+    const playwrightInstance = MockedHtmlPlaywrightMiddleware.mock.instances[0];
+
+    // 'auto' is expected to behave like 'playwright': only that processor is invoked
+    expect(playwrightInstance.process).toHaveBeenCalledTimes(1);
+    expect(playwrightInstance.process).toHaveBeenCalledWith(context, mockNext);
+    expect(domInstance.process).not.toHaveBeenCalled();
+  });
+
+  it("should default to 'auto' (Playwright) when scrapeMode is undefined", async () => {
+    const context = createContext(undefined); // No mode supplied at all
+    const middleware = new HtmlSmartProcessorMiddleware();
+
+    await middleware.process(context, mockNext);
+
+    // Both processors are constructed once by the middleware, whatever the mode
+    expect(MockedHtmlDomParserMiddleware).toHaveBeenCalledTimes(1);
+    expect(MockedHtmlPlaywrightMiddleware).toHaveBeenCalledTimes(1);
+
+    // Grab the mock instances the middleware constructor created
+    const domInstance = MockedHtmlDomParserMiddleware.mock.instances[0];
+    const playwrightInstance = MockedHtmlPlaywrightMiddleware.mock.instances[0];
+
+    // A missing mode is expected to fall back to the Playwright processor
+    expect(playwrightInstance.process).toHaveBeenCalledTimes(1);
+    expect(playwrightInstance.process).toHaveBeenCalledWith(context, mockNext);
+    expect(domInstance.process).not.toHaveBeenCalled();
+  });
+});
diff --git a/src/scraper/middleware/components/HtmlSmartProcessorMiddleware.ts b/src/scraper/middleware/components/HtmlSmartProcessorMiddleware.ts
@@ -0,0 +1,48 @@
+import type { ContentProcessingContext, ContentProcessorMiddleware } from "../types";
+import { HtmlDomParserMiddleware } from "./HtmlDomParserMiddleware";
+import { HtmlPlaywrightMiddleware } from "./HtmlPlaywrightMiddleware";
+
+/**
+ * Content processing middleware that routes HTML to one of two underlying
+ * processors according to the `scrapeMode` found in the context options.
+ *
+ * - 'fetch': delegates to `HtmlDomParserMiddleware`.
+ * - 'playwright': delegates to `HtmlPlaywrightMiddleware`.
+ * - 'auto', or no mode at all: delegates to `HtmlPlaywrightMiddleware`.
+ *
+ * Any value other than 'fetch' ends up on the Playwright processor.
+ */
+export class HtmlSmartProcessorMiddleware implements ContentProcessorMiddleware {
+  /** Delegate used for 'fetch' mode. */
+  private readonly domProcessor: HtmlDomParserMiddleware;
+  /** Delegate used for every other mode. */
+  private readonly playwrightProcessor: HtmlPlaywrightMiddleware;
+
+  /** Creates both delegates up front, DOM parser first. */
+  constructor() {
+    this.domProcessor = new HtmlDomParserMiddleware();
+    this.playwrightProcessor = new HtmlPlaywrightMiddleware();
+  }
+
+  /**
+   * Picks the delegate for a context; a missing `scrapeMode` counts as 'auto'.
+   * @param context - The content processing context.
+   */
+  private pickProcessor(context: ContentProcessingContext): ContentProcessorMiddleware {
+    const requestedMode = context.options?.scrapeMode ?? "auto";
+    return requestedMode === "fetch" ? this.domProcessor : this.playwrightProcessor;
+  }
+
+  /**
+   * Hands the context to the delegate selected from `context.options.scrapeMode`.
+   * `next` is forwarded to that delegate unchanged.
+   * @param context - The content processing context.
+   * @param next - A function to call to pass control to the next middleware.
+   */
+  async process(
+    context: ContentProcessingContext,
+    next: () => Promise<void>,
+  ): Promise<void> {
+    await this.pickProcessor(context).process(context, next);
+  }
+}
diff --git a/src/scraper/middleware/components/index.ts b/src/scraper/middleware/components/index.ts
@@ -4,6 +4,7 @@ export * from "./HtmlLinkExtractorMiddleware";
 export * from "./HtmlMetadataExtractorMiddleware";
 export * from "./HtmlPlaywrightMiddleware";
 export * from "./HtmlSanitizerMiddleware";
+export * from "./HtmlSmartProcessorMiddleware";
 export * from "./HtmlToMarkdownMiddleware";
 export * from "./MarkdownLinkExtractorMiddleware";
 export * from "./MarkdownMetadataExtractorMiddleware";
diff --git a/src/scraper/strategies/BaseScraperStrategy.test.ts b/src/scraper/strategies/BaseScraperStrategy.test.ts
@@ -183,7 +183,7 @@ describe("BaseScraperStrategy", () => {
             "https://example.com/path", // Without trailing slash
             "https://example.com/path?q=1",
             "https://example.com/path?q=1#anchor", // With anchor
-            "https://EXAMPLE.com/path", // Different case
+            "https://EXAMPLE.COM/path", // Different case (upper-case host, unlike the first entry)
           ],
         };
       }
diff --git a/src/scraper/strategies/WebScraperStrategy.test.ts b/src/scraper/strategies/WebScraperStrategy.test.ts
@@ -48,6 +48,7 @@ describe("WebScraperStrategy", () => {
       scope: "subpages",
       // Ensure followRedirects has a default for tests if needed by fetch mock checks
       followRedirects: true,
+      scrapeMode: "fetch", // fastest mode for testing
     };
 
     // No need to mock prototype anymore
diff --git a/src/scraper/strategies/WebScraperStrategy.ts b/src/scraper/strategies/WebScraperStrategy.ts
@@ -7,10 +7,10 @@ import type { RawContent } from "../fetcher/types";
 import { ContentProcessingPipeline } from "../middleware/ContentProcessorPipeline";
 // Import new and updated middleware from index
 import {
-  HtmlDomParserMiddleware,
   HtmlLinkExtractorMiddleware,
   HtmlMetadataExtractorMiddleware,
   HtmlSanitizerMiddleware,
+  HtmlSmartProcessorMiddleware, // Import the new middleware
   HtmlToMarkdownMiddleware,
   MarkdownLinkExtractorMiddleware,
   MarkdownMetadataExtractorMiddleware,
@@ -99,7 +99,7 @@ export class WebScraperStrategy extends BaseScraperStrategy {
       if (initialContext.contentType.startsWith("text/html")) {
         // Updated HTML pipeline order
         pipeline = new ContentProcessingPipeline([
-          new HtmlDomParserMiddleware(),
+          new HtmlSmartProcessorMiddleware(), // Use the smart processor
           new HtmlMetadataExtractorMiddleware(),
           new HtmlLinkExtractorMiddleware(), // Extract links before cleaning
           new HtmlSanitizerMiddleware(),
diff --git a/src/scraper/types.ts b/src/scraper/types.ts
@@ -39,6 +39,14 @@ export interface ScraperOptions {
   ignoreErrors?: boolean;
   /** CSS selectors for elements to exclude during HTML processing */
   excludeSelectors?: string[];
+  /**
+   * Determines the HTML processing strategy.
+   * - 'fetch': Use a simple DOM parser (faster, less JS support).
+   * - 'playwright': Use a headless browser (slower, full JS support).
+   * - 'auto': Automatically select the best strategy (currently defaults to 'playwright').
+   * @default 'auto'
+   */
+  scrapeMode?: "fetch" | "playwright" | "auto";
   /** Optional AbortSignal for cancellation */
   signal?: AbortSignal;
 }
diff --git a/src/tools/ScrapeTool.test.ts b/src/tools/ScrapeTool.test.ts
@@ -143,6 +143,7 @@ describe("ScrapeTool", () => {
         maxDepth: 2, // Overridden
         maxConcurrency: 5, // Test override
         ignoreErrors: false, // Overridden
+        scrapeMode: "auto", // Add expected scrapeMode
       },
     );
     expect(mockManagerInstance.waitForJobCompletion).toHaveBeenCalledWith(MOCK_JOB_ID);
diff --git a/src/tools/ScrapeTool.ts b/src/tools/ScrapeTool.ts
@@ -29,6 +29,14 @@ export interface ScrapeToolOptions {
     followRedirects?: boolean;
     maxConcurrency?: number; // Note: Concurrency is now set when PipelineManager is created
     ignoreErrors?: boolean;
+    /**
+     * Determines the HTML processing strategy.
+     * - 'fetch': Use a simple DOM parser (faster, less JS support).
+     * - 'playwright': Use a headless browser (slower, full JS support).
+     * - 'auto': Automatically select the best strategy (currently defaults to 'playwright').
+     * @default 'auto'
+     */
+    scrapeMode?: "fetch" | "playwright" | "auto";
   };
   /** If false, returns jobId immediately without waiting. Defaults to true. */
   waitForCompletion?: boolean;
@@ -120,6 +128,7 @@ export class ScrapeTool {
       maxDepth: scraperOptions?.maxDepth ?? DEFAULT_MAX_DEPTH,
       maxConcurrency: scraperOptions?.maxConcurrency ?? DEFAULT_MAX_CONCURRENCY,
       ignoreErrors: scraperOptions?.ignoreErrors ?? true,
+      scrapeMode: scraperOptions?.scrapeMode ?? "auto", // Pass scrapeMode
     });
 
     logger.info(`🚀 Job ${jobId} enqueued for scraping.`);
diff --git a/src/tools/SearchTool.test.ts b/src/tools/SearchTool.test.ts
@@ -32,8 +32,16 @@ describe("SearchTool", () => {
   };
 
   const mockSearchResults: StoreSearchResult[] = [
-    { url: "http://example.com/page1", content: "Content for result 1", score: 0.9 },
-    { url: "http://example.com/page2", content: "Content for result 2", score: 0.8 },
+    {
+      url: "http://example.com/page1",
+      content: "Content for result 1",
+      score: 0.9,
+    },
+    {
+      url: "http://example.com/page2",
+      content: "Content for result 2",
+      score: 0.8,
+    },
   ];
 
   // --- Search Logic & Version Resolution Tests ---
diff --git a/src/utils/url.test.ts b/src/utils/url.test.ts
@@ -28,7 +28,7 @@ describe("URL normalization", () => {
     });
 
     it("should convert to lowercase", () => {
-      expect(normalizeUrl("https://EXAMPLE.com/PAGE")).toBe("https://example.com/page");
+      expect(normalizeUrl("https://EXAMPLE.COM/PAGE")).toBe("https://example.com/page");
     });
   });
 
@@ -137,7 +137,7 @@ describe("URL comparison utilities", () => {
 
     it("should return true for same hostname with different case", () => {
       const urlA = new URL("https://example.com/path");
-      const urlB = new URL("https://EXAMPLE.com/path");
+      const urlB = new URL("https://EXAMPLE.COM/path"); // same host, written in upper case
       expect(hasSameHostname(urlA, urlB)).toBe(true);
     });
 

Original file line number	Diff line number	Diff line change
`@@ -183,7 +183,7 @@ describe("BaseScraperStrategy", () => {`
`183`	`183`	`"https://example.com/path", // Without trailing slash`
`184`	`184`	`"https://example.com/path?q=1",`
`185`	`185`	`"https://example.com/path?q=1#anchor", // With anchor`
`186`		`- "https://EXAMPLE.com/path", // Different case`
	`186`	`+ "https://EXAMPLE.COM/path", // Different case`
`187`	`187`	`],`
`188`	`188`	`};`
`189`	`189`	`}`