Skip to content

Commit ee3118f

Browse files
committed
feat(scraper): add HtmlPlaywrightMiddleware for dynamic content rendering
Implements `HtmlPlaywrightMiddleware` as an alternative to `HtmlDomParserMiddleware`. This middleware uses Playwright (Chromium) to load the initial HTML, execute JavaScript, and block unnecessary resources (CSS, images). It updates `context.content` with the fully rendered HTML and then parses that HTML with JSDOM to populate `context.dom`, ensuring compatibility with subsequent middleware that expects a JSDOM object.

Includes:
- Playwright dependency and `postinstall` script for browser download.
- Basic unit tests for the new middleware.

Refs #19
1 parent db4df20 commit ee3118f

File tree

5 files changed

+417
-10
lines changed

5 files changed

+417
-10
lines changed

package-lock.json

Lines changed: 41 additions & 9 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@
3232
"lint": "biome check .",
3333
"format": "biome format . --write",
3434
"db:generate": "drizzle-kit generate",
35-
"db:push": "drizzle-kit push"
35+
"db:push": "drizzle-kit push",
36+
"postinstall": "npx playwright install --with-deps chromium"
3637
},
3738
"dependencies": {
3839
"@joplin/turndown-plugin-gfm": "^1.0.61",
@@ -54,6 +55,7 @@
5455
"jsdom": "^26.0.0",
5556
"langchain": "0.3.19",
5657
"pg": "^8.14.0",
58+
"playwright": "^1.52.0",
5759
"psl": "^1.15.0",
5860
"remark": "^15.0.1",
5961
"remark-gfm": "^4.0.1",
Lines changed: 230 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,230 @@
1+
import { type MockedObject, afterAll, afterEach, describe, expect, it, vi } from "vitest";
2+
import type { ScraperOptions } from "../../types";
3+
import type { ContentProcessingContext } from "../types"; // Adjusted path
4+
import { HtmlPlaywrightMiddleware } from "./HtmlPlaywrightMiddleware";
5+
6+
// Replace the logger module with a mock so the tests do not print log output.
vi.mock("../../../utils/logger");

// Register pass-through factories for playwright and jsdom. Each factory
// resolves to the real module's exports (jsdom as a shallow copy); individual
// tests below install spies on `chromium.launch` and on `JSDOM` on top of them.
vi.mock("playwright", (loadActual) => loadActual<typeof import("playwright")>());

vi.mock("jsdom", async (loadActual) => ({
  ...(await loadActual<typeof import("jsdom")>()),
}));
19+
20+
import { type Browser, type Page, chromium } from "playwright";
21+
22+
/**
 * Builds a minimal, valid `ScraperOptions` object for use in tests.
 *
 * @param url - Value for the `url` field; defaults to "http://example.com".
 * @param excludeSelectors - Value for the `excludeSelectors` field; an empty
 *   array is used when the argument is omitted.
 * @returns Options with fixed test values for every other field.
 */
const createMockScraperOptions = (
  url = "http://example.com",
  excludeSelectors?: string[],
): ScraperOptions => ({
  url,
  library: "test-lib",
  version: "1.0.0",
  maxDepth: 0,
  maxPages: 1,
  maxConcurrency: 1,
  scope: "subpages",
  followRedirects: true,
  // `??` substitutes the default only for null/undefined, which is the intent here.
  excludeSelectors: excludeSelectors ?? [],
  ignoreErrors: false,
});
38+
39+
/**
 * Assembles a `ContentProcessingContext` for exercising a middleware in isolation.
 *
 * @param content - Raw content placed on the context.
 * @param contentType - MIME type placed on the context.
 * @param source - Source URL; also used as the `url` of the default options.
 * @param options - Optional overrides merged on top of the default options.
 * @returns A context with empty `metadata`, `links` and `errors`.
 */
const createPipelineTestContext = (
  content: string | Buffer,
  contentType: string,
  source = "http://example.com",
  options?: Partial<ScraperOptions>,
): ContentProcessingContext => ({
  content,
  contentType,
  source,
  metadata: {},
  links: [],
  errors: [],
  // Defaults derived from `source`, with any caller-supplied fields layered on top.
  options: { ...createMockScraperOptions(source), ...options },
  // `dom` is deliberately left unset; the middleware under test populates it.
});
59+
60+
// --- Tests for HtmlPlaywrightMiddleware ---
61+
// Note: These tests require Playwright and a browser (Chromium) to be installed.
62+
describe("HtmlPlaywrightMiddleware", () => {
63+
// Use a shared instance for tests to avoid launching browser repeatedly
64+
const playwrightMiddleware = new HtmlPlaywrightMiddleware();
65+
66+
// Tear down whatever browser the previous test left on the middleware so that
// every test starts from a clean state. The close is awaited: a floating
// close() promise would let the next test start while the browser is still
// shutting down, and a rejection would surface outside of any test.
afterEach(async () => {
  // @ts-ignore -- reads the middleware's internal `browser` field (presumably private; confirm)
  const activeBrowser = playwrightMiddleware.browser;
  // Reset the field first so the middleware is clean even if close() rejects.
  // @ts-ignore -- writes the same internal field
  playwrightMiddleware.browser = null;
  await activeBrowser?.close();
});
74+
75+
// Once the whole suite has finished, ask the middleware to close its browser.
afterAll(async function closeSharedBrowser() {
  await playwrightMiddleware.closeBrowser();
});
79+
80+
it("should render simple HTML and update context.content and context.dom", async () => {
  // The inline script rewrites the paragraph text, so the new text can only
  // appear in the output if the page's JavaScript was actually executed.
  const sourceHtml =
    "<html><head><title>Initial</title></head><body><p>Hello</p><script>document.querySelector('p').textContent = 'Hello Playwright!';</script></body></html>";
  // Using a unique domain helps isolate Playwright's network interception.
  const pageUrl = "https://f8b6e5ad-46ca-5934-bf4d-0409f8375e9a.com/test";
  const context = createPipelineTestContext(sourceHtml, "text/html", pageUrl);
  const next = vi.fn();

  // The middleware is invoked directly rather than through a pipeline.
  await playwrightMiddleware.process(context, next);

  // Rendering succeeded and the script's effect is visible in the content.
  expect(context.errors).toHaveLength(0);
  expect(context.content).toContain("<p>Hello Playwright!</p>");

  // The rendered HTML was parsed into a JSDOM instance on the context.
  const dom = context.dom;
  expect(dom).toBeDefined();
  expect(dom?.window).toBeDefined();
  const renderedDocument = dom?.window.document;
  expect(renderedDocument?.querySelector("p")?.textContent).toBe("Hello Playwright!");
  // The title is untouched by the script and must survive rendering.
  expect(renderedDocument?.title).toBe("Initial");

  // Successful processing hands over to the next middleware.
  expect(next).toHaveBeenCalled();
});
109+
110+
it("should handle invalid HTML without throwing unhandled errors and call next", async () => {
  const invalidHtml = "<html><body><p>Mismatched tag</div></html>";
  const context = createPipelineTestContext(
    invalidHtml,
    "text/html",
    // Using a unique domain helps isolate Playwright's network interception
    "https://f8b6e5ad-46ca-5934-bf4d-0409f8375e9a.com/test-invalid",
  );
  const next = vi.fn();

  // If process() rejects, this await fails the test; that is the
  // "does not throw" half of what this case verifies.
  await playwrightMiddleware.process(context, next);

  // Deliberately no assertion on context.errors: the browser and JSDOM may or
  // may not report problems for malformed markup, and either outcome is fine.
  // What matters is that a DOM was still produced and the chain continued.
  expect(context.dom).toBeDefined();
  expect(next).toHaveBeenCalled();
});
130+
131+
it("should skip processing for non-HTML content and call next", async () => {
  const context = createPipelineTestContext("# Hello", "text/markdown");
  // Capture the content before processing to prove it is left untouched.
  const contentBefore = context.content;
  const next = vi.fn();

  await playwrightMiddleware.process(context, next);

  // Non-HTML input passes through unchanged: same content, no DOM, no errors.
  expect(context.content).toBe(contentBefore);
  expect(context.dom).toBeUndefined();
  expect(context.errors).toHaveLength(0);
  // The chain continues even though this middleware did nothing.
  expect(next).toHaveBeenCalled();
});
143+
144+
it("should add error to context if Playwright page.goto fails and call next", async () => {
  const html = "<html><body>Good</body></html>";
  const context = createPipelineTestContext(
    html,
    "text/html",
    "https://f8b6e5ad-46ca-5934-bf4d-0409f8375e9a.com/goto-fail",
  );
  const next = vi.fn();

  // Fake page whose navigation always rejects.
  const pageSpy = {
    route: vi.fn().mockResolvedValue(undefined),
    goto: vi.fn().mockRejectedValue(new Error("Simulated navigation failure")),
    content: vi.fn(), // Never reached because goto fails first
    close: vi.fn().mockResolvedValue(undefined),
  } as MockedObject<Page>;
  // Fake browser that hands out the fake page.
  const browserSpy = {
    newPage: vi.fn().mockResolvedValue(pageSpy),
    isConnected: vi.fn().mockReturnValue(true),
    on: vi.fn(),
    close: vi.fn().mockResolvedValue(undefined),
  } as MockedObject<Browser>;

  // Intercept launch so the middleware receives the fake browser.
  const launchSpy = vi.spyOn(chromium, "launch").mockResolvedValue(browserSpy);

  try {
    await playwrightMiddleware.process(context, next);

    // The navigation failure is recorded on the context instead of thrown.
    expect(context.errors.length).toBeGreaterThan(0);
    expect(context.errors[0].message).toContain("Simulated navigation failure");
    expect(context.dom).toBeUndefined(); // DOM should not be set
    expect(next).toHaveBeenCalled(); // Next should still be called
  } finally {
    // Restore even when an assertion above fails, so the fake launch does not
    // leak into the tests that run afterwards.
    launchSpy.mockRestore();
  }
});
179+
180+
it("should add error to context if JSDOM parsing fails after Playwright and not call next", async () => {
  const html = "<html><body><p>Rendered</p></body></html>";
  const renderedHtml = "<html><body>Rendered by Playwright</body></html>";
  const context = createPipelineTestContext(
    html,
    "text/html",
    "https://f8b6e5ad-46ca-5934-bf4d-0409f8375e9a.com/jsdom-fail",
  );
  const next = vi.fn();

  // Fake page that navigates and renders successfully.
  const pageSpy = {
    route: vi.fn().mockResolvedValue(undefined),
    goto: vi.fn().mockResolvedValue(null),
    content: vi.fn().mockResolvedValue(renderedHtml),
    close: vi.fn().mockResolvedValue(undefined),
  } as MockedObject<Page>;
  // Fake browser that hands out the fake page.
  const browserSpy = {
    newPage: vi.fn().mockResolvedValue(pageSpy),
    isConnected: vi.fn().mockReturnValue(true),
    on: vi.fn(),
    close: vi.fn().mockResolvedValue(undefined),
  } as MockedObject<Browser>;

  // Intercept launch so the middleware receives the fake browser.
  const launchSpy = vi.spyOn(chromium, "launch").mockResolvedValue(browserSpy);
  // Make the JSDOM constructor throw, so only the parsing step fails.
  const jsdomSpy = vi.spyOn(await import("jsdom"), "JSDOM").mockImplementation(() => {
    throw new Error("Simulated JSDOM parsing error");
  });

  try {
    await playwrightMiddleware.process(context, next);

    // The parsing failure is recorded on the context and the chain stops.
    expect(context.errors.length).toBeGreaterThan(0);
    expect(context.errors[0].message).toContain("Simulated JSDOM parsing error");
    expect(context.dom).toBeUndefined(); // DOM should not be set
    expect(next).not.toHaveBeenCalled(); // Next should NOT be called
  } finally {
    // Restore even when an assertion above fails, so neither fake leaks into
    // the tests that run afterwards.
    jsdomSpy.mockRestore();
    launchSpy.mockRestore();
  }
});
230+
});

0 commit comments

Comments
 (0)