Skip to content

Commit 45d0e93

Browse files
committed
feat(scraper): enhance crawler controls with scope and redirect options
This commit introduces more powerful and flexible control over the web scraper's crawling behavior. It closes #15.

1. Added a new 'scope' option with three levels of crawling boundaries:
   - 'subpages': only crawl URLs on the same hostname and under the same path (default)
   - 'hostname': crawl any URL on the same hostname, regardless of path
   - 'domain': crawl across subdomains of the same top-level domain
2. Added a 'followRedirects' option to control HTTP redirect handling:
   - When true: redirects are followed automatically (default)
   - When false: redirects trigger a RedirectError and the page is skipped
3. Removed the deprecated 'subpagesOnly' option, replacing it with the more flexible 'scope' option for better crawling control.
4. Added CLI options to expose these new features:
   - '--scope <scope>' to control crawling boundaries
   - '--no-follow-redirects' to disable following HTTP redirects

These changes give users more precise control over what gets crawled, helping with cases like:
- only crawling documentation pages in a specific section of a site
- crawling across different sections of the same site
- following documentation across subdomains (api.example.com, docs.example.com)
- preventing redirects to external sites

The implementation leverages the Public Suffix List (psl) for proper domain boundary detection and adds comprehensive tests for all new functionality.
1 parent 5602894 commit 45d0e93

File tree

15 files changed

+570
-43
lines changed

15 files changed

+570
-43
lines changed

package-lock.json

Lines changed: 11 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 2 additions & 0 deletions
@@ -48,6 +48,7 @@
     "jsdom": "^26.0.0",
     "langchain": "0.3.19",
     "pg": "^8.14.0",
+    "psl": "^1.15.0",
     "remark": "^15.0.1",
     "remark-gfm": "^4.0.1",
     "remark-html": "^16.0.1",
@@ -70,6 +71,7 @@
     "@types/node": "^20.17.23",
     "@types/node-fetch": "^2.6.12",
     "@types/pg": "~8.11.11",
+    "@types/psl": "^1.1.3",
     "@types/semver": "^7.5.8",
     "@types/turndown": "^5.0.5",
     "drizzle-kit": "^0.30.5",

src/cli.ts

Lines changed: 19 additions & 0 deletions
@@ -54,6 +54,23 @@ async function main() {
     .option("-d, --max-depth <number>", "Maximum navigation depth", "3")
     .option("-c, --max-concurrency <number>", "Maximum concurrent page requests", "3")
     .option("--ignore-errors", "Ignore errors during scraping", true)
+    .option(
+      "--scope <scope>",
+      "Crawling boundary: 'subpages' (default), 'hostname', or 'domain'",
+      (value) => {
+        const validScopes = ["subpages", "hostname", "domain"];
+        if (!validScopes.includes(value)) {
+          console.warn(`Warning: Invalid scope '${value}'. Using default 'subpages'.`);
+          return "subpages";
+        }
+        return value;
+      },
+      "subpages",
+    )
+    .option(
+      "--no-follow-redirects",
+      "Disable following HTTP redirects (default: follow redirects)",
+    )
     .action(async (library, url, options) => {
       // Update action parameters
       const result = await tools.scrape.execute({
@@ -65,6 +82,8 @@ async function main() {
           maxDepth: Number.parseInt(options.maxDepth),
           maxConcurrency: Number.parseInt(options.maxConcurrency),
           ignoreErrors: options.ignoreErrors,
+          scope: options.scope,
+          followRedirects: options.followRedirects, // This will be `true` by default, or `false` if --no-follow-redirects is used
         },
         // CLI always waits for completion (default behavior)
       });

src/mcp/index.ts

Lines changed: 11 additions & 6 deletions
@@ -66,7 +66,7 @@ export async function startServer() {
 
   // --- Tool Definitions ---
 
-  // Scrape docs tool (Keep as is for now, but likely needs ScrapeTool refactor)
+  // Scrape docs tool
   server.tool(
     "scrape_docs",
     "Scrape and index documentation from a URL",
@@ -80,17 +80,21 @@ export async function startServer() {
         .default(100)
         .describe("Maximum number of pages to scrape"),
       maxDepth: z.number().optional().default(3).describe("Maximum navigation depth"),
-      subpagesOnly: z
+      scope: z
+        .enum(["subpages", "hostname", "domain"])
+        .optional()
+        .default("subpages")
+        .describe("Defines the crawling boundary: 'subpages', 'hostname', or 'domain'"),
+      followRedirects: z
         .boolean()
         .optional()
         .default(true)
-        .describe("Only scrape pages under the initial URL path"),
+        .describe("Whether to follow HTTP redirects (3xx responses)"),
     },
     // Remove context as it's not used without progress reporting
-    async ({ url, library, version, maxPages, maxDepth, subpagesOnly }) => {
+    async ({ url, library, version, maxPages, maxDepth, scope, followRedirects }) => {
       try {
         // Execute scrape tool without waiting and without progress callback
-        // NOTE: This might fail if ScrapeTool relies on docService.getPipelineManager()
         const result = await tools.scrape.execute({
           url,
           library,
@@ -100,7 +104,8 @@ export async function startServer() {
           options: {
             maxPages,
             maxDepth,
-            subpagesOnly,
+            scope,
+            followRedirects,
           },
         });

src/scraper/fetcher/HttpFetcher.test.ts

Lines changed: 83 additions & 1 deletion
@@ -1,6 +1,6 @@
 import axios from "axios";
 import { afterAll, beforeEach, describe, expect, it, vi } from "vitest";
-import { ScraperError } from "../../utils/errors";
+import { RedirectError, ScraperError } from "../../utils/errors";
 import { HttpFetcher } from "./HttpFetcher";
 
 vi.mock("axios");
@@ -104,6 +104,88 @@ describe("HttpFetcher", () => {
       responseType: "arraybuffer",
       headers,
       timeout: undefined,
+      maxRedirects: 5, // Default follows redirects
+    });
+  });
+
+  describe("redirect handling", () => {
+    it("should follow redirects by default", async () => {
+      const fetcher = new HttpFetcher();
+      const mockResponse = {
+        data: "<html><body><h1>Hello</h1></body></html>",
+        headers: { "content-type": "text/html" },
+      };
+      mockedAxios.get.mockResolvedValue(mockResponse);
+
+      await fetcher.fetch("https://example.com");
+      expect(mockedAxios.get).toHaveBeenCalledWith("https://example.com", {
+        responseType: "arraybuffer",
+        headers: undefined,
+        timeout: undefined,
+        maxRedirects: 5, // Default follows redirects
+        signal: undefined,
+      });
+    });
+
+    it("should follow redirects when followRedirects is true", async () => {
+      const fetcher = new HttpFetcher();
+      const mockResponse = {
+        data: "<html><body><h1>Hello</h1></body></html>",
+        headers: { "content-type": "text/html" },
+      };
+      mockedAxios.get.mockResolvedValue(mockResponse);
+
+      await fetcher.fetch("https://example.com", { followRedirects: true });
+      expect(mockedAxios.get).toHaveBeenCalledWith("https://example.com", {
+        responseType: "arraybuffer",
+        headers: undefined,
+        timeout: undefined,
+        maxRedirects: 5,
+        signal: undefined,
+      });
+    });
+
+    it("should not follow redirects when followRedirects is false", async () => {
+      const fetcher = new HttpFetcher();
+      const mockResponse = {
+        data: "<html><body><h1>Hello</h1></body></html>",
+        headers: { "content-type": "text/html" },
+      };
+      mockedAxios.get.mockResolvedValue(mockResponse);
+
+      await fetcher.fetch("https://example.com", { followRedirects: false });
+      expect(mockedAxios.get).toHaveBeenCalledWith("https://example.com", {
+        responseType: "arraybuffer",
+        headers: undefined,
+        timeout: undefined,
+        maxRedirects: 0, // No redirects allowed
+        signal: undefined,
+      });
+    });
+
+    it("should throw RedirectError when a redirect is encountered and followRedirects is false", async () => {
+      const fetcher = new HttpFetcher();
+      const redirectError = {
+        response: {
+          status: 301,
+          headers: {
+            location: "https://new-example.com",
+          },
+        },
+      };
+      mockedAxios.get.mockRejectedValue(redirectError);
+
+      await expect(
+        fetcher.fetch("https://example.com", { followRedirects: false }),
+      ).rejects.toBeInstanceOf(RedirectError);
+
+      await expect(
+        fetcher.fetch("https://example.com", { followRedirects: false }),
+      ).rejects.toMatchObject({
+        originalUrl: "https://example.com",
+        redirectUrl: "https://new-example.com",
+        statusCode: 301,
+      });
     });
   });
 });

src/scraper/fetcher/HttpFetcher.ts

Lines changed: 18 additions & 4 deletions
@@ -1,5 +1,5 @@
-import axios, { type AxiosError } from "axios";
-import { ScraperError } from "../../utils/errors";
+import axios, { type AxiosError, type AxiosRequestConfig } from "axios";
+import { RedirectError, ScraperError } from "../../utils/errors";
 import { logger } from "../../utils/logger";
 import type { ContentFetcher, FetchOptions, RawContent } from "./types";
 
@@ -21,15 +21,21 @@ export class HttpFetcher implements ContentFetcher {
   async fetch(source: string, options?: FetchOptions): Promise<RawContent> {
     const maxRetries = options?.maxRetries ?? this.MAX_RETRIES;
     const baseDelay = options?.retryDelay ?? this.BASE_DELAY;
+    // Default to following redirects if not specified
+    const followRedirects = options?.followRedirects ?? true;
 
     for (let attempt = 0; attempt <= maxRetries; attempt++) {
       try {
-        const response = await axios.get(source, {
+        const config: AxiosRequestConfig = {
           responseType: "arraybuffer", // For handling both text and binary
           headers: options?.headers,
           timeout: options?.timeout,
           signal: options?.signal, // Pass signal to axios
-        });
+          // Axios follows redirects by default, we need to explicitly disable it if needed
+          maxRedirects: followRedirects ? 5 : 0,
+        };
+
+        const response = await axios.get(source, config);
 
         return {
           content: response.data,
@@ -42,6 +48,14 @@ export class HttpFetcher implements ContentFetcher {
         const status = axiosError.response?.status;
         const code = axiosError.code;
 
+        // Handle redirect errors (status codes 301, 302, 303, 307, 308)
+        if (!followRedirects && status && status >= 300 && status < 400) {
+          const location = axiosError.response?.headers?.location;
+          if (location) {
+            throw new RedirectError(source, location, status);
+          }
+        }
+
         if (
           attempt < maxRetries &&
           (status === undefined || (status >= 500 && status < 600))

src/scraper/fetcher/types.ts

Lines changed: 2 additions & 0 deletions
@@ -27,6 +27,8 @@ export interface FetchOptions {
   timeout?: number;
   /** AbortSignal for cancellation */
   signal?: AbortSignal;
+  /** Whether to follow HTTP redirects (3xx responses) */
+  followRedirects?: boolean;
 }
 
 /**

0 commit comments

Comments
 (0)