Skip to content

Commit 88b7e7a

Browse files
committed
feat(scraper): enable external script fetching in sandbox
Introduces a fetchScriptContent callback to executeJsInSandbox, allowing the HtmlJsExecutorMiddleware to securely fetch external scripts using the context's fetcher before executing them in the Node.js vm sandbox. Updates relevant types, strategies, and tests.
1 parent 19dea10 commit 88b7e7a

File tree

7 files changed

+560
-50
lines changed

7 files changed

+560
-50
lines changed

src/scraper/middleware/components/HtmlJsExecutorMiddleware.test.ts

Lines changed: 147 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,19 @@
11
import type { DOMWindow } from "jsdom";
2-
import { type Mock, beforeEach, describe, expect, it, vi } from "vitest";
2+
import {
3+
type Mock,
4+
type MockedObject,
5+
beforeEach,
6+
describe,
7+
expect,
8+
it,
9+
vi,
10+
} from "vitest";
11+
import type { ContentFetcher, RawContent } from "../../fetcher/types";
312
import { executeJsInSandbox } from "../../utils/sandbox";
4-
import type { SandboxExecutionResult } from "../../utils/sandbox";
13+
import type {
14+
SandboxExecutionOptions,
15+
SandboxExecutionResult,
16+
} from "../../utils/sandbox";
517
import type { ContentProcessingContext } from "../types";
618
import { HtmlJsExecutorMiddleware } from "./HtmlJsExecutorMiddleware";
719

@@ -15,10 +27,17 @@ describe("HtmlJsExecutorMiddleware", () => {
1527
let mockContext: ContentProcessingContext;
1628
let mockNext: Mock;
1729
let mockSandboxResult: SandboxExecutionResult;
30+
let mockFetcher: MockedObject<ContentFetcher>;
1831

1932
beforeEach(() => {
2033
vi.resetAllMocks();
2134

35+
// Create a mock fetcher
36+
mockFetcher = vi.mocked<ContentFetcher>({
37+
canFetch: vi.fn(),
38+
fetch: vi.fn(),
39+
});
40+
2241
mockContext = {
2342
source: "http://example.com",
2443
content: "", // Will be set in tests
@@ -31,8 +50,10 @@ describe("HtmlJsExecutorMiddleware", () => {
3150
url: "http://example.com", // Can reuse context.source
3251
library: "test-lib",
3352
version: "1.0.0",
53+
signal: undefined, // Initialize signal
3454
// Add other optional ScraperOptions properties if needed for specific tests
3555
},
56+
fetcher: mockFetcher,
3657
// dom property might be added by the middleware
3758
};
3859
mockNext = vi.fn().mockResolvedValue(undefined);
@@ -53,10 +74,14 @@ describe("HtmlJsExecutorMiddleware", () => {
5374
await middleware.process(mockContext, mockNext);
5475

5576
expect(executeJsInSandbox).toHaveBeenCalledOnce();
56-
expect(executeJsInSandbox).toHaveBeenCalledWith({
57-
html: "<p>Initial</p><script></script>",
58-
url: "http://example.com",
59-
});
77+
// Verify fetchScriptContent is passed as a function
78+
expect(executeJsInSandbox).toHaveBeenCalledWith(
79+
expect.objectContaining({
80+
html: "<p>Initial</p><script></script>",
81+
url: "http://example.com",
82+
fetchScriptContent: expect.any(Function),
83+
}),
84+
);
6085
});
6186

6287
it("should update context.content with finalHtml from sandbox result", async () => {
@@ -130,10 +155,14 @@ describe("HtmlJsExecutorMiddleware", () => {
130155
await middleware.process(mockContext, mockNext);
131156

132157
expect(executeJsInSandbox).toHaveBeenCalledOnce();
133-
expect(executeJsInSandbox).toHaveBeenCalledWith({
134-
html: initialHtml,
135-
url: "http://example.com",
136-
});
158+
// Updated assertion to expect fetchScriptContent
159+
expect(executeJsInSandbox).toHaveBeenCalledWith(
160+
expect.objectContaining({
161+
html: initialHtml,
162+
url: "http://example.com",
163+
fetchScriptContent: expect.any(Function),
164+
}),
165+
);
137166
expect(mockNext).toHaveBeenCalledOnce();
138167
});
139168

@@ -152,4 +181,112 @@ describe("HtmlJsExecutorMiddleware", () => {
152181
);
153182
expect(mockNext).not.toHaveBeenCalled(); // Should not proceed if middleware itself fails
154183
});
184+
185+
// --- Tests for fetchScriptContent callback logic ---
186+
187+
it("fetchScriptContent callback should use context.fetcher to fetch script", async () => {
188+
mockContext.content = "<p>Initial</p><script src='ext.js'></script>";
189+
const middleware = new HtmlJsExecutorMiddleware();
190+
const mockScriptContent = "console.log('fetched');";
191+
const mockRawContent: RawContent = {
192+
content: Buffer.from(mockScriptContent),
193+
mimeType: "application/javascript",
194+
source: "http://example.com/ext.js",
195+
};
196+
mockFetcher.fetch.mockResolvedValue(mockRawContent);
197+
198+
await middleware.process(mockContext, mockNext);
199+
200+
// Get the options passed to the sandbox mock
201+
const sandboxOptions = (executeJsInSandbox as Mock).mock
202+
.calls[0][0] as SandboxExecutionOptions;
203+
expect(sandboxOptions.fetchScriptContent).toBeDefined();
204+
205+
// Invoke the callback to test its behavior
206+
const fetchedContent = await sandboxOptions.fetchScriptContent!(
207+
"http://example.com/ext.js",
208+
);
209+
210+
expect(mockFetcher.fetch).toHaveBeenCalledWith("http://example.com/ext.js", {
211+
signal: undefined,
212+
followRedirects: true,
213+
});
214+
expect(fetchedContent).toBe(mockScriptContent);
215+
expect(mockContext.errors).toHaveLength(0); // No errors expected during fetch
216+
});
217+
218+
it("fetchScriptContent callback should handle fetcher errors", async () => {
219+
mockContext.content = "<p>Initial</p><script src='ext.js'></script>";
220+
const middleware = new HtmlJsExecutorMiddleware();
221+
const fetchError = new Error("Network Failed");
222+
mockFetcher.fetch.mockRejectedValue(fetchError);
223+
224+
await middleware.process(mockContext, mockNext);
225+
226+
const sandboxOptions = (executeJsInSandbox as Mock).mock
227+
.calls[0][0] as SandboxExecutionOptions;
228+
const fetchedContent = await sandboxOptions.fetchScriptContent!(
229+
"http://example.com/ext.js",
230+
);
231+
232+
expect(mockFetcher.fetch).toHaveBeenCalledWith("http://example.com/ext.js", {
233+
signal: undefined,
234+
followRedirects: true,
235+
});
236+
expect(fetchedContent).toBeNull();
237+
expect(mockContext.errors).toHaveLength(1);
238+
expect(mockContext.errors[0].message).toContain(
239+
"Failed to fetch external script http://example.com/ext.js: Network Failed",
240+
);
241+
expect(mockContext.errors[0].cause).toBe(fetchError);
242+
});
243+
244+
it("fetchScriptContent callback should handle non-JS MIME types", async () => {
245+
mockContext.content = "<p>Initial</p><script src='style.css'></script>";
246+
const middleware = new HtmlJsExecutorMiddleware();
247+
const mockRawContent: RawContent = {
248+
content: "body { color: red; }",
249+
mimeType: "text/css", // Incorrect MIME type
250+
source: "http://example.com/style.css",
251+
};
252+
mockFetcher.fetch.mockResolvedValue(mockRawContent);
253+
254+
await middleware.process(mockContext, mockNext);
255+
256+
const sandboxOptions = (executeJsInSandbox as Mock).mock
257+
.calls[0][0] as SandboxExecutionOptions;
258+
const fetchedContent = await sandboxOptions.fetchScriptContent!(
259+
"http://example.com/style.css",
260+
);
261+
262+
expect(mockFetcher.fetch).toHaveBeenCalledWith("http://example.com/style.css", {
263+
signal: undefined,
264+
followRedirects: true,
265+
});
266+
expect(fetchedContent).toBeNull();
267+
expect(mockContext.errors).toHaveLength(1);
268+
expect(mockContext.errors[0].message).toContain(
269+
"Skipping execution of external script http://example.com/style.css due to unexpected MIME type: text/css",
270+
);
271+
});
272+
273+
it("fetchScriptContent callback should handle missing fetcher in context", async () => {
274+
mockContext.content = "<p>Initial</p><script src='ext.js'></script>";
275+
mockContext.fetcher = undefined; // Remove fetcher for this test
276+
const middleware = new HtmlJsExecutorMiddleware();
277+
278+
await middleware.process(mockContext, mockNext);
279+
280+
const sandboxOptions = (executeJsInSandbox as Mock).mock
281+
.calls[0][0] as SandboxExecutionOptions;
282+
const fetchedContent = await sandboxOptions.fetchScriptContent!(
283+
"http://example.com/ext.js",
284+
);
285+
286+
expect(mockFetcher.fetch).not.toHaveBeenCalled(); // Fetcher should not be called
287+
expect(fetchedContent).toBeNull();
288+
expect(mockContext.errors).toHaveLength(0); // Only logs a warning, doesn't add error
289+
// We can't easily verify logger.warn was called without mocking it again here,
290+
// but the null return and lack of fetch call imply the check worked.
291+
});
155292
});

src/scraper/middleware/components/HtmlJsExecutorMiddleware.ts

Lines changed: 89 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { logger } from "../../../utils/logger";
2-
import { executeJsInSandbox } from "../../utils/sandbox"; // Updated import path
2+
import type { FetchOptions, RawContent } from "../../fetcher/types";
3+
import { executeJsInSandbox } from "../../utils/sandbox";
34
import type { ContentProcessingContext, ContentProcessorMiddleware } from "../types";
45

56
/**
@@ -32,10 +33,97 @@ export class HtmlJsExecutorMiddleware implements ContentProcessorMiddleware {
3233
`Executing JavaScript in sandbox for HTML content from ${context.source}`,
3334
);
3435

36+
// Define the callback for fetching external scripts
37+
const fetchScriptContentCallback = async (
38+
scriptUrl: string,
39+
): Promise<string | null> => {
40+
if (!context.fetcher) {
41+
logger.warn(
42+
`No fetcher available in context to fetch external script: ${scriptUrl}`,
43+
);
44+
return null;
45+
}
46+
try {
47+
logger.debug(`Fetching external script via context fetcher: ${scriptUrl}`);
48+
// Pass relevant options, especially the signal for cancellation
49+
const fetchOptions: FetchOptions = {
50+
signal: context.options?.signal, // Pass signal from context if available
51+
followRedirects: true, // Generally want to follow redirects for scripts
52+
// timeout: context.options?.fetchTimeout // Add if timeout is configurable at context level
53+
};
54+
const rawContent: RawContent = await context.fetcher.fetch(
55+
scriptUrl,
56+
fetchOptions,
57+
);
58+
59+
// Optional: Check MIME type to be reasonably sure it's JavaScript
60+
const allowedMimeTypes = [
61+
"application/javascript",
62+
"text/javascript",
63+
"application/x-javascript",
64+
];
65+
// Allow common JS types or be lenient if type is generic/unknown
66+
const mimeTypeLower = rawContent.mimeType.toLowerCase();
67+
if (
68+
!allowedMimeTypes.includes(mimeTypeLower) &&
69+
!["application/octet-stream", "unknown/unknown", ""].includes(mimeTypeLower) // Allow empty MIME type as well
70+
) {
71+
logger.warn(
72+
`Skipping execution of external script ${scriptUrl} due to unexpected MIME type: ${rawContent.mimeType}`,
73+
);
74+
context.errors.push(
75+
new Error(
76+
`Skipping execution of external script ${scriptUrl} due to unexpected MIME type: ${rawContent.mimeType}`,
77+
),
78+
);
79+
return null;
80+
}
81+
82+
// Convert content to string using provided encoding or default to utf-8
83+
const contentBuffer = Buffer.isBuffer(rawContent.content)
84+
? rawContent.content
85+
: Buffer.from(rawContent.content);
86+
87+
// Validate encoding before using it
88+
const validEncodings: BufferEncoding[] = [
89+
"ascii",
90+
"utf8",
91+
"utf-8",
92+
"utf16le",
93+
"ucs2",
94+
"ucs-2",
95+
"base64",
96+
"base64url",
97+
"latin1",
98+
"binary",
99+
"hex",
100+
];
101+
const encoding =
102+
rawContent.encoding &&
103+
validEncodings.includes(rawContent.encoding.toLowerCase() as BufferEncoding)
104+
? (rawContent.encoding.toLowerCase() as BufferEncoding)
105+
: "utf-8";
106+
107+
return contentBuffer.toString(encoding);
108+
} catch (fetchError) {
109+
// fetcher.fetch is expected to throw on error (e.g., 404, network error)
110+
const message =
111+
fetchError instanceof Error ? fetchError.message : String(fetchError);
112+
logger.warn(`Failed to fetch external script ${scriptUrl}: ${message}`); // Use warn for fetch failures like 404
113+
context.errors.push(
114+
new Error(`Failed to fetch external script ${scriptUrl}: ${message}`, {
115+
cause: fetchError,
116+
}),
117+
);
118+
return null; // Indicate failure to the sandbox runner
119+
}
120+
};
121+
35122
// TODO: Plumb timeout options from context.options if available
36123
const sandboxOptions = {
37124
html: initialHtml,
38125
url: context.source,
126+
fetchScriptContent: fetchScriptContentCallback, // Pass the callback
39127
// timeout: context.options?.scriptTimeout // Example for future enhancement
40128
};
41129

src/scraper/middleware/types.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import type { DOMWindow } from "jsdom";
2+
import type { ContentFetcher } from "../fetcher/types";
23
import type { ScraperOptions } from "../types";
34

45
/**
@@ -22,6 +23,9 @@ export interface ContentProcessingContext {
2223

2324
/** Optional JSDOM window object for HTML processing. */
2425
dom?: DOMWindow;
26+
27+
/** Optional fetcher instance for resolving resources relative to the source. */
28+
fetcher?: ContentFetcher;
2529
}
2630

2731
/**

src/scraper/strategies/WebScraperStrategy.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,8 @@ export class WebScraperStrategy extends BaseScraperStrategy {
9191
metadata: {},
9292
links: [],
9393
errors: [],
94-
options: options, // Pass the full options object
94+
options,
95+
fetcher: this.httpFetcher,
9596
};
9697

9798
let pipeline: ContentProcessingPipeline;

src/scraper/types.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ export interface ScraperOptions {
3939
ignoreErrors?: boolean;
4040
/** CSS selectors for elements to exclude during HTML processing */
4141
excludeSelectors?: string[];
42+
/** Optional AbortSignal for cancellation */
43+
signal?: AbortSignal;
4244
}
4345

4446
/**

0 commit comments

Comments
 (0)