Skip to content

Commit 45d0e93

Browse files
committed
feat(scraper): enhance crawler controls with scope and redirect options
This commit introduces more powerful and flexible control over the web scraper's crawling behavior. It closes #15.

1. Added a new 'scope' option with three levels of crawling boundaries:
   - 'subpages': only crawl URLs on the same hostname and under the same path (default)
   - 'hostname': crawl any URL on the same hostname, regardless of path
   - 'domain': crawl across subdomains of the same top-level domain
2. Added a 'followRedirects' option to control HTTP redirect handling:
   - When true: redirects are followed automatically (default)
   - When false: redirects trigger a RedirectError and the page is skipped
3. Removed the deprecated 'subpagesOnly' option, replacing it with the more flexible 'scope' option for better crawling control.
4. Added CLI options to expose these new features:
   - '--scope <scope>' to control crawling boundaries
   - '--no-follow-redirects' to disable following HTTP redirects

These changes give users more precise control over what gets crawled, helping with cases like:
- only crawling documentation pages in a specific section of a site
- crawling across different sections of the same site
- following documentation across subdomains (api.example.com, docs.example.com)
- preventing redirects to external sites

The implementation leverages the Public Suffix List (psl) for proper domain boundary detection and adds comprehensive tests for all new functionality.
1 parent 5602894 commit 45d0e93

File tree

15 files changed

+570
-43
lines changed

15 files changed

+570
-43
lines changed

package-lock.json

Lines changed: 11 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 2 additions & 0 deletions
@@ -48,6 +48,7 @@
     "jsdom": "^26.0.0",
     "langchain": "0.3.19",
     "pg": "^8.14.0",
+    "psl": "^1.15.0",
     "remark": "^15.0.1",
     "remark-gfm": "^4.0.1",
     "remark-html": "^16.0.1",
@@ -70,6 +71,7 @@
     "@types/node": "^20.17.23",
     "@types/node-fetch": "^2.6.12",
     "@types/pg": "~8.11.11",
+    "@types/psl": "^1.1.3",
     "@types/semver": "^7.5.8",
     "@types/turndown": "^5.0.5",
     "drizzle-kit": "^0.30.5",

src/cli.ts

Lines changed: 19 additions & 0 deletions
@@ -54,6 +54,23 @@ async function main() {
     .option("-d, --max-depth <number>", "Maximum navigation depth", "3")
     .option("-c, --max-concurrency <number>", "Maximum concurrent page requests", "3")
     .option("--ignore-errors", "Ignore errors during scraping", true)
+    .option(
+      "--scope <scope>",
+      "Crawling boundary: 'subpages' (default), 'hostname', or 'domain'",
+      (value) => {
+        const validScopes = ["subpages", "hostname", "domain"];
+        if (!validScopes.includes(value)) {
+          console.warn(`Warning: Invalid scope '${value}'. Using default 'subpages'.`);
+          return "subpages";
+        }
+        return value;
+      },
+      "subpages",
+    )
+    .option(
+      "--no-follow-redirects",
+      "Disable following HTTP redirects (default: follow redirects)",
+    )
     .action(async (library, url, options) => {
       // Update action parameters
       const result = await tools.scrape.execute({
@@ -65,6 +82,8 @@ async function main() {
           maxDepth: Number.parseInt(options.maxDepth),
           maxConcurrency: Number.parseInt(options.maxConcurrency),
           ignoreErrors: options.ignoreErrors,
+          scope: options.scope,
+          followRedirects: options.followRedirects, // This will be `true` by default, or `false` if --no-follow-redirects is used
         },
         // CLI always waits for completion (default behavior)
       });

src/mcp/index.ts

Lines changed: 11 additions & 6 deletions
@@ -66,7 +66,7 @@ export async function startServer() {
 
   // --- Tool Definitions ---
 
-  // Scrape docs tool (Keep as is for now, but likely needs ScrapeTool refactor)
+  // Scrape docs tool
   server.tool(
     "scrape_docs",
     "Scrape and index documentation from a URL",
@@ -80,17 +80,21 @@ export async function startServer() {
         .default(100)
         .describe("Maximum number of pages to scrape"),
       maxDepth: z.number().optional().default(3).describe("Maximum navigation depth"),
-      subpagesOnly: z
+      scope: z
+        .enum(["subpages", "hostname", "domain"])
+        .optional()
+        .default("subpages")
+        .describe("Defines the crawling boundary: 'subpages', 'hostname', or 'domain'"),
+      followRedirects: z
         .boolean()
         .optional()
         .default(true)
-        .describe("Only scrape pages under the initial URL path"),
+        .describe("Whether to follow HTTP redirects (3xx responses)"),
     },
     // Remove context as it's not used without progress reporting
-    async ({ url, library, version, maxPages, maxDepth, subpagesOnly }) => {
+    async ({ url, library, version, maxPages, maxDepth, scope, followRedirects }) => {
       try {
         // Execute scrape tool without waiting and without progress callback
-        // NOTE: This might fail if ScrapeTool relies on docService.getPipelineManager()
         const result = await tools.scrape.execute({
           url,
           library,
@@ -100,7 +104,8 @@ export async function startServer() {
           options: {
             maxPages,
             maxDepth,
-            subpagesOnly,
+            scope,
+            followRedirects,
           },
         });

src/scraper/fetcher/HttpFetcher.test.ts

Lines changed: 83 additions & 1 deletion
@@ -1,6 +1,6 @@
 import axios from "axios";
 import { afterAll, beforeEach, describe, expect, it, vi } from "vitest";
-import { ScraperError } from "../../utils/errors";
+import { RedirectError, ScraperError } from "../../utils/errors";
 import { HttpFetcher } from "./HttpFetcher";
 
 vi.mock("axios");
@@ -104,6 +104,88 @@ describe("HttpFetcher", () => {
       responseType: "arraybuffer",
       headers,
       timeout: undefined,
+      maxRedirects: 5, // Default follows redirects
+    });
+  });
+
+  describe("redirect handling", () => {
+    it("should follow redirects by default", async () => {
+      const fetcher = new HttpFetcher();
+      const mockResponse = {
+        data: "<html><body><h1>Hello</h1></body></html>",
+        headers: { "content-type": "text/html" },
+      };
+      mockedAxios.get.mockResolvedValue(mockResponse);
+
+      await fetcher.fetch("https://example.com");
+      expect(mockedAxios.get).toHaveBeenCalledWith("https://example.com", {
+        responseType: "arraybuffer",
+        headers: undefined,
+        timeout: undefined,
+        maxRedirects: 5, // Default follows redirects
+        signal: undefined,
+      });
+    });
+
+    it("should follow redirects when followRedirects is true", async () => {
+      const fetcher = new HttpFetcher();
+      const mockResponse = {
+        data: "<html><body><h1>Hello</h1></body></html>",
+        headers: { "content-type": "text/html" },
+      };
+      mockedAxios.get.mockResolvedValue(mockResponse);
+
+      await fetcher.fetch("https://example.com", { followRedirects: true });
+      expect(mockedAxios.get).toHaveBeenCalledWith("https://example.com", {
+        responseType: "arraybuffer",
+        headers: undefined,
+        timeout: undefined,
+        maxRedirects: 5,
+        signal: undefined,
+      });
+    });
+
+    it("should not follow redirects when followRedirects is false", async () => {
+      const fetcher = new HttpFetcher();
+      const mockResponse = {
+        data: "<html><body><h1>Hello</h1></body></html>",
+        headers: { "content-type": "text/html" },
+      };
+      mockedAxios.get.mockResolvedValue(mockResponse);
+
+      await fetcher.fetch("https://example.com", { followRedirects: false });
+      expect(mockedAxios.get).toHaveBeenCalledWith("https://example.com", {
+        responseType: "arraybuffer",
+        headers: undefined,
+        timeout: undefined,
+        maxRedirects: 0, // No redirects allowed
+        signal: undefined,
+      });
+    });
+
+    it("should throw RedirectError when a redirect is encountered and followRedirects is false", async () => {
+      const fetcher = new HttpFetcher();
+      const redirectError = {
+        response: {
+          status: 301,
+          headers: {
+            location: "https://new-example.com",
+          },
+        },
+      };
+      mockedAxios.get.mockRejectedValue(redirectError);
+
+      await expect(
+        fetcher.fetch("https://example.com", { followRedirects: false }),
+      ).rejects.toBeInstanceOf(RedirectError);
+
+      await expect(
+        fetcher.fetch("https://example.com", { followRedirects: false }),
+      ).rejects.toMatchObject({
+        originalUrl: "https://example.com",
+        redirectUrl: "https://new-example.com",
+        statusCode: 301,
+      });
     });
   });
 });

src/scraper/fetcher/HttpFetcher.ts

Lines changed: 18 additions & 4 deletions
@@ -1,5 +1,5 @@
-import axios, { type AxiosError } from "axios";
-import { ScraperError } from "../../utils/errors";
+import axios, { type AxiosError, type AxiosRequestConfig } from "axios";
+import { RedirectError, ScraperError } from "../../utils/errors";
 import { logger } from "../../utils/logger";
 import type { ContentFetcher, FetchOptions, RawContent } from "./types";
 
@@ -21,15 +21,21 @@ export class HttpFetcher implements ContentFetcher {
   async fetch(source: string, options?: FetchOptions): Promise<RawContent> {
     const maxRetries = options?.maxRetries ?? this.MAX_RETRIES;
     const baseDelay = options?.retryDelay ?? this.BASE_DELAY;
+    // Default to following redirects if not specified
+    const followRedirects = options?.followRedirects ?? true;
 
     for (let attempt = 0; attempt <= maxRetries; attempt++) {
       try {
-        const response = await axios.get(source, {
+        const config: AxiosRequestConfig = {
           responseType: "arraybuffer", // For handling both text and binary
           headers: options?.headers,
           timeout: options?.timeout,
           signal: options?.signal, // Pass signal to axios
-        });
+          // Axios follows redirects by default, we need to explicitly disable it if needed
+          maxRedirects: followRedirects ? 5 : 0,
+        };
+
+        const response = await axios.get(source, config);
 
         return {
           content: response.data,
@@ -42,6 +48,14 @@ export class HttpFetcher implements ContentFetcher {
         const status = axiosError.response?.status;
         const code = axiosError.code;
 
+        // Handle redirect errors (status codes 301, 302, 303, 307, 308)
+        if (!followRedirects && status && status >= 300 && status < 400) {
+          const location = axiosError.response?.headers?.location;
+          if (location) {
+            throw new RedirectError(source, location, status);
+          }
+        }
+
         if (
           attempt < maxRetries &&
           (status === undefined || (status >= 500 && status < 600))

src/scraper/fetcher/types.ts

Lines changed: 2 additions & 0 deletions
@@ -27,6 +27,8 @@ export interface FetchOptions {
   timeout?: number;
   /** AbortSignal for cancellation */
   signal?: AbortSignal;
+  /** Whether to follow HTTP redirects (3xx responses) */
+  followRedirects?: boolean;
 }
 
 /**

0 commit comments

Comments
 (0)