Skip to content

Commit 1dc2a11

Browse files
committed
feat(scraper): implement configurable subpage scraping behavior
Added explicit support for controlling subpage scraping behavior through `subpagesOnly` option. Previously this was hardcoded to true; now it can be configured: - Added `subpagesOnly` to ScrapeTool options and pipeline configuration - Enhanced WebScraperStrategy to properly handle the subpagesOnly setting - Added comprehensive tests for both subpagesOnly=true/false scenarios - Added cross-origin link filtering for better security - Fixed error log sequencing in DocumentManagementService This change enables more flexibility in documentation scraping while maintaining security through origin checks.
1 parent e22b124 commit 1dc2a11

File tree

6 files changed

+212
-26
lines changed

6 files changed

+212
-26
lines changed

src/mcp/index.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ export async function startServer() {
6060
.default(true)
6161
.describe("Only scrape pages under the initial URL path"),
6262
},
63-
async ({ url, library, version, maxPages, maxDepth }) => {
63+
async ({ url, library, version, maxPages, maxDepth, subpagesOnly }) => {
6464
try {
6565
const result = await tools.scrape.execute({
6666
url,
@@ -69,6 +69,7 @@ export async function startServer() {
6969
options: {
7070
maxPages,
7171
maxDepth,
72+
subpagesOnly,
7273
},
7374
});
7475

src/scraper/strategies/WebScraperStrategy.test.ts

Lines changed: 152 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,27 @@
1-
import { beforeEach, describe, expect, it, vi } from "vitest";
1+
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
22
import { HttpFetcher } from "../fetcher";
3+
import { HtmlProcessor, type ProcessedContent } from "../processor";
34
import type { ScraperOptions } from "../types";
45
import { WebScraperStrategy } from "./WebScraperStrategy";
6+
7+
// Mock the processor module
8+
vi.mock("../processor");
9+
510
describe("WebScraperStrategy", () => {
611
let options: ScraperOptions;
12+
let defaultProcessResult: ProcessedContent;
713

814
beforeEach(() => {
15+
// Provide a default valid result for the processor mock
16+
defaultProcessResult = {
17+
content: "Default mock content",
18+
title: "Default Title",
19+
source: "mock-source",
20+
links: [],
21+
metadata: {},
22+
};
23+
vi.spyOn(HtmlProcessor.prototype, "process").mockResolvedValue(defaultProcessResult);
24+
925
options = {
1026
url: "https://example.com",
1127
library: "test",
@@ -68,4 +84,139 @@ describe("WebScraperStrategy", () => {
6884

6985
expect(HttpFetcher.prototype.fetch).toHaveBeenCalledTimes(1);
7086
});
87+
88+
// Restore mocks after each test
89+
afterEach(() => {
90+
// Restore mocks *before* the next test runs its beforeEach
91+
vi.restoreAllMocks();
92+
});
93+
94+
it("should only follow subpage links when subpagesOnly is true (default)", async () => {
95+
const strategy = new WebScraperStrategy();
96+
const options: ScraperOptions = {
97+
url: "https://example.com/docs/", // Base path with trailing slash
98+
library: "test",
99+
version: "1.0",
100+
maxPages: 5, // Allow multiple pages
101+
maxDepth: 2, // Allow following links
102+
subpagesOnly: true, // Explicitly true (also default)
103+
};
104+
const progressCallback = vi.fn();
105+
106+
const fetchSpy = vi
107+
.spyOn(HttpFetcher.prototype, "fetch")
108+
.mockImplementation(async (url) => ({
109+
// Simple fetch mock, processor will provide links
110+
content: `Content for ${url}`,
111+
mimeType: "text/html",
112+
source: url,
113+
}));
114+
115+
// Mock HtmlProcessor to return specific links for the root URL
116+
const processSpy = vi
117+
.spyOn(HtmlProcessor.prototype, "process")
118+
.mockImplementation(async (rawContent) => {
119+
if (rawContent.source === "https://example.com/docs/") {
120+
return {
121+
content: "Processed content",
122+
title: "Docs Index",
123+
source: rawContent.source,
124+
links: [
125+
"https://example.com/docs/page1", // Subpage
126+
"https://example.com/other/page2", // Outside /docs/
127+
"https://example.com/docs/page3/", // Subpage with slash
128+
"https://anothersite.com/", // Cross-origin link
129+
"/docs/relative", // Relative subpage
130+
"/other/relative", // Relative outside
131+
],
132+
metadata: {},
133+
};
134+
}
135+
// Return no links for subsequent pages
136+
return {
137+
content: "Processed subpage",
138+
title: "Subpage",
139+
source: rawContent.source,
140+
links: [],
141+
metadata: {},
142+
};
143+
});
144+
145+
await strategy.scrape(options, progressCallback);
146+
147+
// Should fetch: root + page1 + page3 + relative
148+
// Should fetch: root + page1 + page3 + relative (4 total)
149+
expect(fetchSpy).toHaveBeenCalledTimes(4);
150+
expect(fetchSpy).toHaveBeenCalledWith("https://example.com/docs/");
151+
expect(fetchSpy).toHaveBeenCalledWith("https://example.com/docs/page1");
152+
expect(fetchSpy).toHaveBeenCalledWith("https://example.com/docs/page3/");
153+
expect(fetchSpy).toHaveBeenCalledWith("https://example.com/docs/relative");
154+
expect(fetchSpy).not.toHaveBeenCalledWith("https://example.com/other/page2");
155+
expect(fetchSpy).not.toHaveBeenCalledWith("https://anothersite.com/");
156+
expect(fetchSpy).not.toHaveBeenCalledWith("https://example.com/other/relative");
157+
expect(processSpy).toHaveBeenCalledTimes(4); // Processor called for each fetched page
158+
});
159+
160+
it("should follow links outside base path when subpagesOnly is false", async () => {
161+
const strategy = new WebScraperStrategy();
162+
const options: ScraperOptions = {
163+
url: "https://example.com/docs/",
164+
library: "test",
165+
version: "1.0",
166+
maxPages: 5,
167+
maxDepth: 2,
168+
subpagesOnly: false, // Explicitly false
169+
};
170+
const progressCallback = vi.fn();
171+
172+
const fetchSpy = vi
173+
.spyOn(HttpFetcher.prototype, "fetch")
174+
.mockImplementation(async (url) => ({
175+
// Simple fetch mock
176+
content: `Content for ${url}`,
177+
mimeType: "text/html",
178+
source: url,
179+
}));
180+
181+
// Mock HtmlProcessor to return specific links for the root URL
182+
const processSpy = vi
183+
.spyOn(HtmlProcessor.prototype, "process")
184+
.mockImplementation(async (rawContent) => {
185+
if (rawContent.source === "https://example.com/docs/") {
186+
return {
187+
content: "Processed content",
188+
title: "Docs Index",
189+
source: rawContent.source,
190+
links: [
191+
"https://example.com/docs/page1", // Subpage
192+
"https://example.com/other/page2", // Outside /docs/
193+
"https://anothersite.com/", // Cross-origin link
194+
"/docs/relative", // Relative subpage
195+
"/other/relative", // Relative outside
196+
],
197+
metadata: {},
198+
};
199+
}
200+
// Return no links for subsequent pages
201+
return {
202+
content: "Processed subpage",
203+
title: "Subpage",
204+
source: rawContent.source,
205+
links: [],
206+
metadata: {},
207+
};
208+
});
209+
210+
await strategy.scrape(options, progressCallback);
211+
212+
// Should fetch: root + page1 + page2 + relative_sub + relative_other (5 total)
213+
expect(fetchSpy).toHaveBeenCalledTimes(5);
214+
expect(fetchSpy).toHaveBeenCalledWith("https://example.com/docs/");
215+
expect(fetchSpy).toHaveBeenCalledWith("https://example.com/docs/page1");
216+
expect(fetchSpy).toHaveBeenCalledWith("https://example.com/other/page2"); // Included now
217+
expect(fetchSpy).toHaveBeenCalledWith("https://example.com/docs/relative");
218+
expect(fetchSpy).toHaveBeenCalledWith("https://example.com/other/relative"); // Included now
219+
expect(fetchSpy).not.toHaveBeenCalledWith("https://anothersite.com/"); // Link removed from mock
220+
expect(processSpy).toHaveBeenCalledTimes(5); // Processor called for each fetched page
221+
});
71222
});

src/scraper/strategies/WebScraperStrategy.ts

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
import type { Document, ProgressCallback } from "../../types";
1+
import type { Document } from "../../types";
22
import { logger } from "../../utils/logger";
33
import type { UrlNormalizerOptions } from "../../utils/url";
44
import { HttpFetcher } from "../fetcher";
5-
import type { ScraperOptions, ScraperProgress } from "../types";
5+
import type { ScraperOptions } from "../types";
66
import { BaseScraperStrategy, type QueueItem } from "./BaseScraperStrategy";
77

88
export interface WebScraperStrategyOptions {
@@ -54,6 +54,11 @@ export class WebScraperStrategy extends BaseScraperStrategy {
5454
const links = result.links.filter((link) => {
5555
try {
5656
const targetUrl = new URL(link, baseUrl);
57+
// Always ensure the target is on the same origin
58+
if (targetUrl.origin !== baseUrl.origin) {
59+
return false;
60+
}
61+
// Apply subpagesOnly and custom filter logic
5762
return (
5863
(!options.subpagesOnly || this.isSubpage(baseUrl, targetUrl)) &&
5964
(!this.shouldFollowLinkFn || this.shouldFollowLinkFn(baseUrl, targetUrl))

src/store/DocumentManagementService.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,12 +107,12 @@ export class DocumentManagementService {
107107
); // listVersions already filters for semver
108108

109109
if (validSemverVersions.length === 0) {
110-
logger.warn(`⚠️ No valid semver versions found for ${library}`);
111110
if (hasUnversioned) {
112111
logger.info(`ℹ️ Unversioned documents exist for ${library}`);
113112
return { bestMatch: null, hasUnversioned: true };
114113
}
115114
// Throw error only if NO versions (semver or unversioned) exist
115+
logger.warn(`⚠️ No valid versions found for ${library}`);
116116
throw new VersionNotFoundError(library, targetVersion ?? "", []);
117117
}
118118

src/tools/ScrapeTool.test.ts

Lines changed: 48 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ describe("ScrapeTool", () => {
2222
// Mock implementation for pipeline callbacks
2323
let pipelineCallbacks: {
2424
onProgress?: (progress: ScraperProgress) => Promise<void>;
25-
onError?: (error: Error, doc?: any) => Promise<void>;
25+
onError?: (error: Error, doc?: unknown) => Promise<void>;
2626
} = {};
2727

2828
beforeEach(() => {
@@ -94,14 +94,14 @@ describe("ScrapeTool", () => {
9494
async (invalidVersion) => {
9595
const options = getBaseOptions(invalidVersion);
9696

97-
await expect(scrapeTool.execute(options)).rejects.toThrow(
98-
/Invalid version format for scraping/,
99-
);
100-
// Initialize IS called before the version check throws
101-
expect(mockDocService.initialize).toHaveBeenCalledOnce();
102-
expect(mockDocService.removeAllDocuments).not.toHaveBeenCalled();
103-
expect(mockPipelineInstance.process).not.toHaveBeenCalled();
104-
},
97+
await expect(scrapeTool.execute(options)).rejects.toThrow(
98+
/Invalid version format for scraping/,
99+
);
100+
// Initialize IS called before the version check throws
101+
expect(mockDocService.initialize).toHaveBeenCalledOnce();
102+
expect(mockDocService.removeAllDocuments).not.toHaveBeenCalled();
103+
expect(mockPipelineInstance.process).not.toHaveBeenCalled();
104+
},
105105
);
106106

107107
// --- Pipeline Execution Tests ---
@@ -134,8 +134,20 @@ describe("ScrapeTool", () => {
134134
// Simulate progress callback updating pagesScraped
135135
(mockPipelineInstance.process as Mock).mockImplementation(async () => {
136136
if (pipelineCallbacks.onProgress) {
137-
await pipelineCallbacks.onProgress({ pagesScraped: 10, maxPages: 100, currentUrl: "url1", depth: 1, maxDepth: 3 });
138-
await pipelineCallbacks.onProgress({ pagesScraped: 25, maxPages: 100, currentUrl: "url2", depth: 2, maxDepth: 3 });
137+
await pipelineCallbacks.onProgress({
138+
pagesScraped: 10,
139+
maxPages: 100,
140+
currentUrl: "url1",
141+
depth: 1,
142+
maxDepth: 3,
143+
});
144+
await pipelineCallbacks.onProgress({
145+
pagesScraped: 25,
146+
maxPages: 100,
147+
currentUrl: "url2",
148+
depth: 2,
149+
maxDepth: 3,
150+
});
139151
}
140152
});
141153

@@ -159,7 +171,13 @@ describe("ScrapeTool", () => {
159171
(mockPipelineInstance.process as Mock).mockImplementation(async () => {
160172
// Simulate pipeline calling its progress callback
161173
if (pipelineCallbacks.onProgress) {
162-
await pipelineCallbacks.onProgress({ pagesScraped: 5, maxPages: 10, currentUrl: "http://page.com", depth: 1, maxDepth: 2 });
174+
await pipelineCallbacks.onProgress({
175+
pagesScraped: 5,
176+
maxPages: 10,
177+
currentUrl: "http://page.com",
178+
depth: 1,
179+
maxDepth: 2,
180+
});
163181
}
164182
});
165183

@@ -172,7 +190,7 @@ describe("ScrapeTool", () => {
172190
});
173191

174192
it("should call onProgress callback when pipeline reports an error", async () => {
175-
const options = getBaseOptions("1.0.0", mockOnProgress);
193+
const options = getBaseOptions("1.0.0", mockOnProgress);
176194
const docError = new Error("Failed to parse");
177195
(mockPipelineInstance.process as Mock).mockImplementation(async () => {
178196
// Simulate pipeline calling its error callback
@@ -184,24 +202,34 @@ describe("ScrapeTool", () => {
184202
await scrapeTool.execute(options);
185203

186204
expect(mockOnProgress).toHaveBeenCalledOnce();
187-
expect(mockOnProgress).toHaveBeenCalledWith({
188-
content: [{ type: "text", text: expect.stringContaining("Error processing Bad Doc: Failed to parse") }],
205+
expect(mockOnProgress).toHaveBeenCalledWith({
206+
content: [
207+
{
208+
type: "text",
209+
text: expect.stringContaining("Error processing Bad Doc: Failed to parse"),
210+
},
211+
],
189212
});
190213
});
191214

192-
it("should not fail if onProgress is not provided", async () => {
215+
it("should not fail if onProgress is not provided", async () => {
193216
const options = getBaseOptions("1.0.0"); // No onProgress callback
194-
(mockPipelineInstance.process as Mock).mockImplementation(async () => {
217+
(mockPipelineInstance.process as Mock).mockImplementation(async () => {
195218
if (pipelineCallbacks.onProgress) {
196-
await pipelineCallbacks.onProgress({ pagesScraped: 1, maxPages: 10, currentUrl: "url", depth: 0, maxDepth: 1 });
219+
await pipelineCallbacks.onProgress({
220+
pagesScraped: 1,
221+
maxPages: 10,
222+
currentUrl: "url",
223+
depth: 0,
224+
maxDepth: 1,
225+
});
197226
}
198-
if (pipelineCallbacks.onError) {
227+
if (pipelineCallbacks.onError) {
199228
await pipelineCallbacks.onError(new Error("Test Error"));
200229
}
201230
});
202231

203232
// Expect no error to be thrown during execution when callbacks fire internally
204233
await expect(scrapeTool.execute(options)).resolves.toBeDefined();
205234
});
206-
207235
});

src/tools/ScrapeTool.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ export interface ScrapeToolOptions {
1313
options?: {
1414
maxPages?: number;
1515
maxDepth?: number;
16+
subpagesOnly?: boolean;
1617
maxConcurrency?: number;
1718
ignoreErrors?: boolean;
1819
};
@@ -115,7 +116,7 @@ export class ScrapeTool {
115116
url: url,
116117
library: library,
117118
version: internalVersion, // Pass the normalized internal version to the pipeline process
118-
subpagesOnly: true,
119+
subpagesOnly: scraperOptions?.subpagesOnly ?? true, // Use passed value or default
119120
maxPages: scraperOptions?.maxPages ?? 100,
120121
maxDepth: scraperOptions?.maxDepth ?? 3,
121122
maxConcurrency: scraperOptions?.maxConcurrency ?? 3,

0 commit comments

Comments (0)