Skip to content

Commit 1dc2a11

Browse files
committed
feat(scraper): implement configurable subpage scraping behavior
Added explicit support for controlling subpage scraping behavior through `subpagesOnly` option. Previously this was hardcoded to true; now it can be configured: - Added `subpagesOnly` to ScrapeTool options and pipeline configuration - Enhanced WebScraperStrategy to properly handle the subpagesOnly setting - Added comprehensive tests for both subpagesOnly=true/false scenarios - Added cross-origin link filtering for better security - Fixed error log sequencing in DocumentManagementService This change enables more flexibility in documentation scraping while maintaining security through origin checks.
1 parent e22b124 commit 1dc2a11

File tree

6 files changed

+212
-26
lines changed

6 files changed

+212
-26
lines changed

src/mcp/index.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ export async function startServer() {
6060
.default(true)
6161
.describe("Only scrape pages under the initial URL path"),
6262
},
63-
async ({ url, library, version, maxPages, maxDepth }) => {
63+
async ({ url, library, version, maxPages, maxDepth, subpagesOnly }) => {
6464
try {
6565
const result = await tools.scrape.execute({
6666
url,
@@ -69,6 +69,7 @@ export async function startServer() {
6969
options: {
7070
maxPages,
7171
maxDepth,
72+
subpagesOnly,
7273
},
7374
});
7475

src/scraper/strategies/WebScraperStrategy.test.ts

Lines changed: 152 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,27 @@
1-
import { beforeEach, describe, expect, it, vi } from "vitest";
1+
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
22
import { HttpFetcher } from "../fetcher";
3+
import { HtmlProcessor, type ProcessedContent } from "../processor";
34
import type { ScraperOptions } from "../types";
45
import { WebScraperStrategy } from "./WebScraperStrategy";
6+
7+
// Mock the processor module
8+
vi.mock("../processor");
9+
510
describe("WebScraperStrategy", () => {
611
let options: ScraperOptions;
12+
let defaultProcessResult: ProcessedContent;
713

814
beforeEach(() => {
15+
// Provide a default valid result for the processor mock
16+
defaultProcessResult = {
17+
content: "Default mock content",
18+
title: "Default Title",
19+
source: "mock-source",
20+
links: [],
21+
metadata: {},
22+
};
23+
vi.spyOn(HtmlProcessor.prototype, "process").mockResolvedValue(defaultProcessResult);
24+
925
options = {
1026
url: "https://example.com",
1127
library: "test",
@@ -68,4 +84,139 @@ describe("WebScraperStrategy", () => {
6884

6985
expect(HttpFetcher.prototype.fetch).toHaveBeenCalledTimes(1);
7086
});
87+
88+
// Restore mocks after each test
89+
afterEach(() => {
90+
// Restore mocks *before* the next test runs its beforeEach
91+
vi.restoreAllMocks();
92+
});
93+
94+
it("should only follow subpage links when subpagesOnly is true (default)", async () => {
95+
const strategy = new WebScraperStrategy();
96+
const options: ScraperOptions = {
97+
url: "https://example.com/docs/", // Base path with trailing slash
98+
library: "test",
99+
version: "1.0",
100+
maxPages: 5, // Allow multiple pages
101+
maxDepth: 2, // Allow following links
102+
subpagesOnly: true, // Explicitly true (also default)
103+
};
104+
const progressCallback = vi.fn();
105+
106+
const fetchSpy = vi
107+
.spyOn(HttpFetcher.prototype, "fetch")
108+
.mockImplementation(async (url) => ({
109+
// Simple fetch mock, processor will provide links
110+
content: `Content for ${url}`,
111+
mimeType: "text/html",
112+
source: url,
113+
}));
114+
115+
// Mock HtmlProcessor to return specific links for the root URL
116+
const processSpy = vi
117+
.spyOn(HtmlProcessor.prototype, "process")
118+
.mockImplementation(async (rawContent) => {
119+
if (rawContent.source === "https://example.com/docs/") {
120+
return {
121+
content: "Processed content",
122+
title: "Docs Index",
123+
source: rawContent.source,
124+
links: [
125+
"https://example.com/docs/page1", // Subpage
126+
"https://example.com/other/page2", // Outside /docs/
127+
"https://example.com/docs/page3/", // Subpage with slash
128+
"https://anothersite.com/", // Cross-origin link
129+
"/docs/relative", // Relative subpage
130+
"/other/relative", // Relative outside
131+
],
132+
metadata: {},
133+
};
134+
}
135+
// Return no links for subsequent pages
136+
return {
137+
content: "Processed subpage",
138+
title: "Subpage",
139+
source: rawContent.source,
140+
links: [],
141+
metadata: {},
142+
};
143+
});
144+
145+
await strategy.scrape(options, progressCallback);
146+
147+
// Should fetch: root + page1 + page3 + relative
148+
// Should fetch: root + page1 + page3 + relative (4 total)
149+
expect(fetchSpy).toHaveBeenCalledTimes(4);
150+
expect(fetchSpy).toHaveBeenCalledWith("https://example.com/docs/");
151+
expect(fetchSpy).toHaveBeenCalledWith("https://example.com/docs/page1");
152+
expect(fetchSpy).toHaveBeenCalledWith("https://example.com/docs/page3/");
153+
expect(fetchSpy).toHaveBeenCalledWith("https://example.com/docs/relative");
154+
expect(fetchSpy).not.toHaveBeenCalledWith("https://example.com/other/page2");
155+
expect(fetchSpy).not.toHaveBeenCalledWith("https://anothersite.com/");
156+
expect(fetchSpy).not.toHaveBeenCalledWith("https://example.com/other/relative");
157+
expect(processSpy).toHaveBeenCalledTimes(4); // Processor called for each fetched page
158+
});
159+
160+
it("should follow links outside base path when subpagesOnly is false", async () => {
161+
const strategy = new WebScraperStrategy();
162+
const options: ScraperOptions = {
163+
url: "https://example.com/docs/",
164+
library: "test",
165+
version: "1.0",
166+
maxPages: 5,
167+
maxDepth: 2,
168+
subpagesOnly: false, // Explicitly false
169+
};
170+
const progressCallback = vi.fn();
171+
172+
const fetchSpy = vi
173+
.spyOn(HttpFetcher.prototype, "fetch")
174+
.mockImplementation(async (url) => ({
175+
// Simple fetch mock
176+
content: `Content for ${url}`,
177+
mimeType: "text/html",
178+
source: url,
179+
}));
180+
181+
// Mock HtmlProcessor to return specific links for the root URL
182+
const processSpy = vi
183+
.spyOn(HtmlProcessor.prototype, "process")
184+
.mockImplementation(async (rawContent) => {
185+
if (rawContent.source === "https://example.com/docs/") {
186+
return {
187+
content: "Processed content",
188+
title: "Docs Index",
189+
source: rawContent.source,
190+
links: [
191+
"https://example.com/docs/page1", // Subpage
192+
"https://example.com/other/page2", // Outside /docs/
193+
"https://anothersite.com/", // Cross-origin link
194+
"/docs/relative", // Relative subpage
195+
"/other/relative", // Relative outside
196+
],
197+
metadata: {},
198+
};
199+
}
200+
// Return no links for subsequent pages
201+
return {
202+
content: "Processed subpage",
203+
title: "Subpage",
204+
source: rawContent.source,
205+
links: [],
206+
metadata: {},
207+
};
208+
});
209+
210+
await strategy.scrape(options, progressCallback);
211+
212+
// Should fetch: root + page1 + page2 + relative_sub + relative_other (5 total)
213+
expect(fetchSpy).toHaveBeenCalledTimes(5);
214+
expect(fetchSpy).toHaveBeenCalledWith("https://example.com/docs/");
215+
expect(fetchSpy).toHaveBeenCalledWith("https://example.com/docs/page1");
216+
expect(fetchSpy).toHaveBeenCalledWith("https://example.com/other/page2"); // Included now
217+
expect(fetchSpy).toHaveBeenCalledWith("https://example.com/docs/relative");
218+
expect(fetchSpy).toHaveBeenCalledWith("https://example.com/other/relative"); // Included now
219+
expect(fetchSpy).not.toHaveBeenCalledWith("https://anothersite.com/"); // Link removed from mock
220+
expect(processSpy).toHaveBeenCalledTimes(5); // Processor called for each fetched page
221+
});
71222
});

src/scraper/strategies/WebScraperStrategy.ts

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
import type { Document, ProgressCallback } from "../../types";
1+
import type { Document } from "../../types";
22
import { logger } from "../../utils/logger";
33
import type { UrlNormalizerOptions } from "../../utils/url";
44
import { HttpFetcher } from "../fetcher";
5-
import type { ScraperOptions, ScraperProgress } from "../types";
5+
import type { ScraperOptions } from "../types";
66
import { BaseScraperStrategy, type QueueItem } from "./BaseScraperStrategy";
77

88
export interface WebScraperStrategyOptions {
@@ -54,6 +54,11 @@ export class WebScraperStrategy extends BaseScraperStrategy {
5454
const links = result.links.filter((link) => {
5555
try {
5656
const targetUrl = new URL(link, baseUrl);
57+
// Always ensure the target is on the same origin
58+
if (targetUrl.origin !== baseUrl.origin) {
59+
return false;
60+
}
61+
// Apply subpagesOnly and custom filter logic
5762
return (
5863
(!options.subpagesOnly || this.isSubpage(baseUrl, targetUrl)) &&
5964
(!this.shouldFollowLinkFn || this.shouldFollowLinkFn(baseUrl, targetUrl))

src/store/DocumentManagementService.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,12 +107,12 @@ export class DocumentManagementService {
107107
); // listVersions already filters for semver
108108

109109
if (validSemverVersions.length === 0) {
110-
logger.warn(`⚠️ No valid semver versions found for ${library}`);
111110
if (hasUnversioned) {
112111
logger.info(`ℹ️ Unversioned documents exist for ${library}`);
113112
return { bestMatch: null, hasUnversioned: true };
114113
}
115114
// Throw error only if NO versions (semver or unversioned) exist
115+
logger.warn(`⚠️ No valid versions found for ${library}`);
116116
throw new VersionNotFoundError(library, targetVersion ?? "", []);
117117
}
118118

src/tools/ScrapeTool.test.ts

Lines changed: 48 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ describe("ScrapeTool", () => {
2222
// Mock implementation for pipeline callbacks
2323
let pipelineCallbacks: {
2424
onProgress?: (progress: ScraperProgress) => Promise<void>;
25-
onError?: (error: Error, doc?: any) => Promise<void>;
25+
onError?: (error: Error, doc?: unknown) => Promise<void>;
2626
} = {};
2727

2828
beforeEach(() => {
@@ -94,14 +94,14 @@ describe("ScrapeTool", () => {
9494
async (invalidVersion) => {
9595
const options = getBaseOptions(invalidVersion);
9696

97-
await expect(scrapeTool.execute(options)).rejects.toThrow(
98-
/Invalid version format for scraping/,
99-
);
100-
// Initialize IS called before the version check throws
101-
expect(mockDocService.initialize).toHaveBeenCalledOnce();
102-
expect(mockDocService.removeAllDocuments).not.toHaveBeenCalled();
103-
expect(mockPipelineInstance.process).not.toHaveBeenCalled();
104-
},
97+
await expect(scrapeTool.execute(options)).rejects.toThrow(
98+
/Invalid version format for scraping/,
99+
);
100+
// Initialize IS called before the version check throws
101+
expect(mockDocService.initialize).toHaveBeenCalledOnce();
102+
expect(mockDocService.removeAllDocuments).not.toHaveBeenCalled();
103+
expect(mockPipelineInstance.process).not.toHaveBeenCalled();
104+
},
105105
);
106106

107107
// --- Pipeline Execution Tests ---
@@ -134,8 +134,20 @@ describe("ScrapeTool", () => {
134134
// Simulate progress callback updating pagesScraped
135135
(mockPipelineInstance.process as Mock).mockImplementation(async () => {
136136
if (pipelineCallbacks.onProgress) {
137-
await pipelineCallbacks.onProgress({ pagesScraped: 10, maxPages: 100, currentUrl: "url1", depth: 1, maxDepth: 3 });
138-
await pipelineCallbacks.onProgress({ pagesScraped: 25, maxPages: 100, currentUrl: "url2", depth: 2, maxDepth: 3 });
137+
await pipelineCallbacks.onProgress({
138+
pagesScraped: 10,
139+
maxPages: 100,
140+
currentUrl: "url1",
141+
depth: 1,
142+
maxDepth: 3,
143+
});
144+
await pipelineCallbacks.onProgress({
145+
pagesScraped: 25,
146+
maxPages: 100,
147+
currentUrl: "url2",
148+
depth: 2,
149+
maxDepth: 3,
150+
});
139151
}
140152
});
141153

@@ -159,7 +171,13 @@ describe("ScrapeTool", () => {
159171
(mockPipelineInstance.process as Mock).mockImplementation(async () => {
160172
// Simulate pipeline calling its progress callback
161173
if (pipelineCallbacks.onProgress) {
162-
await pipelineCallbacks.onProgress({ pagesScraped: 5, maxPages: 10, currentUrl: "http://page.com", depth: 1, maxDepth: 2 });
174+
await pipelineCallbacks.onProgress({
175+
pagesScraped: 5,
176+
maxPages: 10,
177+
currentUrl: "http://page.com",
178+
depth: 1,
179+
maxDepth: 2,
180+
});
163181
}
164182
});
165183

@@ -172,7 +190,7 @@ describe("ScrapeTool", () => {
172190
});
173191

174192
it("should call onProgress callback when pipeline reports an error", async () => {
175-
const options = getBaseOptions("1.0.0", mockOnProgress);
193+
const options = getBaseOptions("1.0.0", mockOnProgress);
176194
const docError = new Error("Failed to parse");
177195
(mockPipelineInstance.process as Mock).mockImplementation(async () => {
178196
// Simulate pipeline calling its error callback
@@ -184,24 +202,34 @@ describe("ScrapeTool", () => {
184202
await scrapeTool.execute(options);
185203

186204
expect(mockOnProgress).toHaveBeenCalledOnce();
187-
expect(mockOnProgress).toHaveBeenCalledWith({
188-
content: [{ type: "text", text: expect.stringContaining("Error processing Bad Doc: Failed to parse") }],
205+
expect(mockOnProgress).toHaveBeenCalledWith({
206+
content: [
207+
{
208+
type: "text",
209+
text: expect.stringContaining("Error processing Bad Doc: Failed to parse"),
210+
},
211+
],
189212
});
190213
});
191214

192-
it("should not fail if onProgress is not provided", async () => {
215+
it("should not fail if onProgress is not provided", async () => {
193216
const options = getBaseOptions("1.0.0"); // No onProgress callback
194-
(mockPipelineInstance.process as Mock).mockImplementation(async () => {
217+
(mockPipelineInstance.process as Mock).mockImplementation(async () => {
195218
if (pipelineCallbacks.onProgress) {
196-
await pipelineCallbacks.onProgress({ pagesScraped: 1, maxPages: 10, currentUrl: "url", depth: 0, maxDepth: 1 });
219+
await pipelineCallbacks.onProgress({
220+
pagesScraped: 1,
221+
maxPages: 10,
222+
currentUrl: "url",
223+
depth: 0,
224+
maxDepth: 1,
225+
});
197226
}
198-
if (pipelineCallbacks.onError) {
227+
if (pipelineCallbacks.onError) {
199228
await pipelineCallbacks.onError(new Error("Test Error"));
200229
}
201230
});
202231

203232
// Expect no error to be thrown during execution when callbacks fire internally
204233
await expect(scrapeTool.execute(options)).resolves.toBeDefined();
205234
});
206-
207235
});

src/tools/ScrapeTool.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ export interface ScrapeToolOptions {
1313
options?: {
1414
maxPages?: number;
1515
maxDepth?: number;
16+
subpagesOnly?: boolean;
1617
maxConcurrency?: number;
1718
ignoreErrors?: boolean;
1819
};
@@ -115,7 +116,7 @@ export class ScrapeTool {
115116
url: url,
116117
library: library,
117118
version: internalVersion, // Pass the normalized internal version to the pipeline process
118-
subpagesOnly: true,
119+
subpagesOnly: scraperOptions?.subpagesOnly ?? true, // Use passed value or default
119120
maxPages: scraperOptions?.maxPages ?? 100,
120121
maxDepth: scraperOptions?.maxDepth ?? 3,
121122
maxConcurrency: scraperOptions?.maxConcurrency ?? 3,

0 commit comments

Comments (0)