Skip to content

Commit f6c3baa

Browse files
committed
feat: add configurable concurrency for web scraping
- Add maxConcurrency option to control parallel page requests
- Implement batch-based concurrent scraping in DefaultScraperStrategy
- Update CLI and types to support configurable concurrency
- Document concurrency model in architecture
1 parent 6f71856 commit f6c3baa

File tree

5 files changed

+129
-75
lines changed

5 files changed

+129
-75
lines changed

ARCHITECTURE.md

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,15 @@ The Documentation MCP Server is designed with a modular architecture that ensure
1818
```
1919
src/
2020
├── cli.ts # CLI interface implementation
21-
├── index.ts # MCP server interface
22-
├── pipeline/ # Document processing pipeline
23-
├── scraper/ # Web scraping implementation
24-
│ └── strategies/ # Scraping strategies for different sources
25-
├── splitter/ # Document splitting and chunking
26-
├── store/ # Vector store and document storage
27-
├── tools/ # Core functionality tools
28-
├── types/ # Shared type definitions
29-
└── utils/ # Common utilities and helpers
21+
├── index.ts # MCP server interface
22+
├── pipeline/ # Document processing pipeline
23+
├── scraper/ # Web scraping implementation
24+
│ └── strategies/ # Scraping strategies for different sources
25+
├── splitter/ # Document splitting and chunking
26+
├── store/ # Vector store and document storage
27+
├── tools/ # Core functionality tools
28+
├── types/ # Shared type definitions
29+
└── utils/ # Common utilities and helpers
3030
```
3131

3232
## Tools Layer
@@ -67,7 +67,7 @@ The project uses a unified progress reporting system with typed callbacks for al
6767
- Provides real-time feedback across multiple levels (page, document, storage)
6868
- Ensures consistent progress tracking across components
6969
- Supports different output formats for CLI and MCP interfaces
70-
- Enables parallel processing with individual progress tracking
70+
- Enables parallel processing with individual progress tracking through configurable batch-based concurrency
7171

7272
### Logging Strategy
7373

src/cli.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,11 @@ async function main() {
4444
.description("Scrape and index documentation from a URL")
4545
.option("-p, --max-pages <number>", "Maximum pages to scrape", "100")
4646
.option("-d, --max-depth <number>", "Maximum navigation depth", "3")
47+
.option(
48+
"-c, --max-concurrency <number>",
49+
"Maximum concurrent page requests",
50+
"3"
51+
)
4752
.option(
4853
"--subpages-only",
4954
"Allow scraping pages outside the initial URL path",
@@ -57,6 +62,7 @@ async function main() {
5762
options: {
5863
maxPages: Number.parseInt(options.maxPages),
5964
maxDepth: Number.parseInt(options.maxDepth),
65+
maxConcurrency: Number.parseInt(options.maxConcurrency),
6066
},
6167
});
6268
console.log(`✅ Successfully scraped ${result.pagesScraped} pages`);

src/scraper/strategies/DefaultScraperStrategy.ts

Lines changed: 110 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,97 @@ export class DefaultScraperStrategy implements ScraperStrategy {
5555
}
5656
}
5757

58+
private async processUrl(
59+
item: { url: string; depth: number },
60+
options: ScrapeOptions,
61+
progressCallback?: ProgressCallback<ScrapingProgress>
62+
): Promise<string[]> {
63+
const { url, depth } = item;
64+
const normalizedUrl = normalizeUrl(url, this.urlNormalizerOptions);
65+
66+
logger.info(
67+
`🌐 Scraping page ${this.pageCount}/${options.maxPages} (depth ${depth}/${options.maxDepth}): ${normalizedUrl}`
68+
);
69+
70+
try {
71+
const result = await this.htmlScraper.scrapePageWithRetry(url);
72+
73+
// Convert and emit the document immediately
74+
await progressCallback?.({
75+
pagesScraped: this.pageCount,
76+
maxPages: options.maxPages,
77+
currentUrl: normalizedUrl,
78+
depth,
79+
maxDepth: options.maxDepth,
80+
document: {
81+
content: result.content,
82+
metadata: {
83+
url: result.url,
84+
title: result.title,
85+
library: options.library,
86+
version: options.version,
87+
} satisfies ScraperMetadata,
88+
},
89+
});
90+
91+
// Return links to be processed by the main loop
92+
return result.links;
93+
} catch (error) {
94+
logger.error(`Failed to scrape page ${url}: ${error}`);
95+
return [];
96+
}
97+
}
98+
99+
private async processBatch(
100+
batch: Array<{ url: string; depth: number }>,
101+
baseUrl: URL,
102+
options: ScrapeOptions,
103+
progressCallback?: ProgressCallback<ScrapingProgress>
104+
): Promise<Array<{ url: string; depth: number }>> {
105+
// Process all URLs in the batch concurrently
106+
const results = await Promise.all(
107+
batch.map(async (item) => {
108+
// Increment page count before processing each URL
109+
this.pageCount++;
110+
const links = await this.processUrl(item, options, progressCallback);
111+
112+
if (item.depth < options.maxDepth) {
113+
return links
114+
.map((link) => {
115+
try {
116+
const targetUrl = new URL(link);
117+
const normalizedLink = normalizeUrl(
118+
link,
119+
this.urlNormalizerOptions
120+
);
121+
122+
if (
123+
!this.visited.has(normalizedLink) &&
124+
(!options.subpagesOnly ||
125+
this.isSubpage(baseUrl, targetUrl)) &&
126+
(!this.shouldFollowLinkFn ||
127+
this.shouldFollowLinkFn(baseUrl, targetUrl))
128+
) {
129+
this.visited.add(normalizedLink);
130+
return { url: link, depth: item.depth + 1 };
131+
}
132+
} catch (error) {
133+
// Invalid URL
134+
}
135+
return null;
136+
})
137+
.filter(
138+
(item): item is { url: string; depth: number } => item !== null
139+
);
140+
}
141+
return [];
142+
})
143+
);
144+
145+
// Flatten and return all new URLs to process
146+
return results.flat();
147+
}
148+
58149
async scrape(
59150
options: ScrapeOptions,
60151
progressCallback?: ProgressCallback<ScrapingProgress>
@@ -68,78 +159,32 @@ export class DefaultScraperStrategy implements ScraperStrategy {
68159
];
69160

70161
// Track URLs we've seen (either queued or visited)
71-
// Add starting URL to the tracking set
72162
this.visited.add(normalizeUrl(options.url, this.urlNormalizerOptions));
73163

74164
while (queue.length > 0 && this.pageCount < options.maxPages) {
75-
const current = queue.shift();
76-
if (!current) continue;
77-
78-
const { url, depth } = current;
79-
const normalizedUrl = normalizeUrl(url, this.urlNormalizerOptions);
80-
81-
// Since we track at queueing time, this check is mostly
82-
// for safety in case of URL normalization differences
83-
if (!this.visited.has(normalizedUrl)) {
84-
// This shouldn't happen if our normalization is consistent,
85-
// but let's add it to visited to be safe
86-
this.visited.add(normalizedUrl);
165+
// Take a batch of URLs to process
166+
const remainingPages = options.maxPages - this.pageCount;
167+
if (remainingPages <= 0) {
168+
break;
87169
}
88170

89-
this.pageCount++;
90-
91-
logger.info(
92-
`🌐 Scraping page ${this.pageCount}/${options.maxPages} (depth ${depth}/${options.maxDepth}): ${normalizedUrl}`
171+
const batchSize = Math.min(
172+
options.maxConcurrency ?? 3,
173+
remainingPages,
174+
queue.length
175+
);
176+
const batch = queue.splice(0, batchSize);
177+
178+
// Process the batch and get new URLs
179+
const newUrls = await this.processBatch(
180+
batch,
181+
baseUrl,
182+
options,
183+
progressCallback
93184
);
94185

95-
try {
96-
const result = await this.htmlScraper.scrapePageWithRetry(url);
97-
98-
// Convert and emit the document immediately
99-
await progressCallback?.({
100-
pagesScraped: this.pageCount,
101-
maxPages: options.maxPages,
102-
currentUrl: normalizedUrl,
103-
depth,
104-
maxDepth: options.maxDepth,
105-
document: {
106-
content: result.content,
107-
metadata: {
108-
url: result.url,
109-
title: result.title,
110-
library: options.library,
111-
version: options.version,
112-
} satisfies ScraperMetadata,
113-
},
114-
});
115-
116-
// Queue child pages if we haven't reached max depth
117-
if (depth < options.maxDepth) {
118-
for (const link of result.links) {
119-
const targetUrl = new URL(link);
120-
const normalizedLink = normalizeUrl(
121-
link,
122-
this.urlNormalizerOptions
123-
);
124-
125-
// Skip if already visited or queued (now combined in this.visited)
126-
if (
127-
this.visited.has(normalizedLink) ||
128-
(options.subpagesOnly && !this.isSubpage(baseUrl, targetUrl)) ||
129-
(this.shouldFollowLinkFn &&
130-
!this.shouldFollowLinkFn(baseUrl, targetUrl))
131-
) {
132-
continue;
133-
}
134-
135-
// Add to queue and track immediately in visited set
136-
queue.push({ url: link, depth: depth + 1 });
137-
this.visited.add(normalizedLink);
138-
}
139-
}
140-
} catch (error) {
141-
logger.error(`Failed to scrape page ${url}: ${error}`);
142-
}
186+
// Add new URLs to the queue
187+
queue.push(...newUrls);
143188
}
144189
}
145190
}

src/tools/ScrapeTool.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ export interface ScrapeToolOptions {
1111
options?: {
1212
maxPages?: number;
1313
maxDepth?: number;
14+
maxConcurrency?: number;
1415
};
1516
}
1617

@@ -83,6 +84,7 @@ export class ScrapeTool {
8384
version: version,
8485
maxPages: scraperOptions?.maxPages ?? 100,
8586
maxDepth: scraperOptions?.maxDepth ?? 3,
87+
maxConcurrency: scraperOptions?.maxConcurrency ?? 3,
8688
subpagesOnly: true,
8789
});
8890

src/types/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ export interface ScrapeOptions {
2020
maxPages: number;
2121
maxDepth: number;
2222
subpagesOnly?: boolean;
23+
maxConcurrency?: number;
2324
}
2425

2526
export interface PageResult {

0 commit comments

Comments
 (0)