Skip to content

Commit d3aa894

Browse files
committed
feat: enhance web scraping and error handling
- Add error handling to BaseScraperStrategy
- Improve HTML processing with additional selectors and link extraction
- Update CLI with ignoreErrors option
- Add tests for BaseScraperStrategy
1 parent bd83392 commit d3aa894

File tree

10 files changed

+406
-131
lines changed

10 files changed

+406
-131
lines changed

src/cli.ts

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,12 @@ import os from "node:os";
44
import path from "node:path";
55
import { Command } from "commander";
66
import { VectorStoreService } from "./store/VectorStoreService";
7-
import { FindVersionTool, ListLibrariesTool, ScrapeTool, SearchTool } from "./tools";
7+
import {
8+
FindVersionTool,
9+
ListLibrariesTool,
10+
ScrapeTool,
11+
SearchTool,
12+
} from "./tools";
813

914
const formatOutput = (data: unknown) => JSON.stringify(data, null, 2);
1015

@@ -39,12 +44,12 @@ async function main() {
3944
.description("Scrape and index documentation from a URL")
4045
.option("-p, --max-pages <number>", "Maximum pages to scrape", "100")
4146
.option("-d, --max-depth <number>", "Maximum navigation depth", "3")
42-
.option("-c, --max-concurrency <number>", "Maximum concurrent page requests", "3")
4347
.option(
44-
"--subpages-only",
45-
"Allow scraping pages outside the initial URL path",
46-
true,
48+
"-c, --max-concurrency <number>",
49+
"Maximum concurrent page requests",
50+
"3"
4751
)
52+
.option("--ignore-errors", "Ignore errors during scraping", true)
4853
.action(async (library, version, url, options) => {
4954
const result = await tools.scrape.execute({
5055
url,
@@ -54,6 +59,7 @@ async function main() {
5459
maxPages: Number.parseInt(options.maxPages),
5560
maxDepth: Number.parseInt(options.maxDepth),
5661
maxConcurrency: Number.parseInt(options.maxConcurrency),
62+
ignoreErrors: options.ignoreErrors,
5763
},
5864
});
5965
console.log(`✅ Successfully scraped ${result.pagesScraped} pages`);
@@ -66,13 +72,13 @@ async function main() {
6672
" - search react 18.0.0 'hooks' -> matches docs for React 18.0.0 or earlier versions\n" +
6773
" - search react 18.0.0 'hooks' --exact-match -> only matches React 18.0.0\n" +
6874
" - search typescript 5.x 'types' -> matches any TypeScript 5.x.x version\n" +
69-
" - search typescript 5.2.x 'types' -> matches any TypeScript 5.2.x version",
75+
" - search typescript 5.2.x 'types' -> matches any TypeScript 5.2.x version"
7076
)
7177
.option("-l, --limit <number>", "Maximum number of results", "5")
7278
.option(
7379
"-e, --exact-match",
7480
"Only use exact version match (e.g., '18.0.0' matches only 18.0.0, not 17.x.x) (default: false)",
75-
false,
81+
false
7682
)
7783
.action(async (library, version, query, options) => {
7884
const result = await tools.search.execute({
@@ -109,7 +115,10 @@ async function main() {
109115

110116
await program.parseAsync();
111117
} catch (error) {
112-
console.error("Error:", error instanceof Error ? error.message : String(error));
118+
console.error(
119+
"Error:",
120+
error instanceof Error ? error.message : String(error)
121+
);
113122
await storeService.shutdown();
114123
process.exit(1);
115124
}

src/scraper/ScraperService.test.ts

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,13 @@ describe("ScraperService", () => {
2727
maxPages: 10,
2828
maxDepth: 1,
2929
};
30+
const progressCallback: ProgressCallback<ScraperProgress> = vi.fn();
3031

3132
mockRegistry.getStrategy.mockReturnValue(mockStrategy);
32-
await service.scrape(options);
33+
await service.scrape(options, progressCallback);
3334

3435
expect(mockRegistry.getStrategy).toHaveBeenCalledWith(options.url);
35-
expect(mockStrategy.scrape).toHaveBeenCalledWith(options, undefined);
36+
expect(mockStrategy.scrape).toHaveBeenCalledWith(options, progressCallback);
3637
});
3738

3839
it("should pass progress callback to strategy", async () => {
@@ -65,12 +66,13 @@ describe("ScraperService", () => {
6566
maxPages: 10,
6667
maxDepth: 1,
6768
};
69+
const progressCallback: ProgressCallback<ScraperProgress> = vi.fn();
6870

6971
mockRegistry.getStrategy.mockReturnValue(mockStrategy);
70-
await service.scrape(options);
72+
await service.scrape(options, progressCallback);
7173

7274
expect(mockRegistry.getStrategy).toHaveBeenCalledWith(options.url);
73-
expect(mockStrategy.scrape).toHaveBeenCalledWith(options, undefined);
75+
expect(mockStrategy.scrape).toHaveBeenCalledWith(options, progressCallback);
7476
});
7577

7678
it("should throw error if no strategy found", async () => {
@@ -84,11 +86,14 @@ describe("ScraperService", () => {
8486
maxPages: 10,
8587
maxDepth: 1,
8688
};
89+
const progressCallback: ProgressCallback<ScraperProgress> = vi.fn();
8790

8891
mockRegistry.getStrategy.mockReturnValue(null);
8992

90-
await expect(service.scrape(options)).rejects.toThrow(ScraperError);
91-
await expect(service.scrape(options)).rejects.toThrow(
93+
await expect(service.scrape(options, progressCallback)).rejects.toThrow(
94+
ScraperError
95+
);
96+
await expect(service.scrape(options, progressCallback)).rejects.toThrow(
9297
"No scraper strategy found for URL: unknown://example.com"
9398
);
9499
});
@@ -104,10 +109,13 @@ describe("ScraperService", () => {
104109
maxPages: 10,
105110
maxDepth: 1,
106111
};
112+
const progressCallback: ProgressCallback<ScraperProgress> = vi.fn();
107113

108114
mockRegistry.getStrategy.mockReturnValue(mockStrategy);
109115
mockStrategy.scrape.mockRejectedValue(new Error("Strategy error"));
110116

111-
await expect(service.scrape(options)).rejects.toThrow("Strategy error");
117+
await expect(service.scrape(options, progressCallback)).rejects.toThrow(
118+
"Strategy error"
119+
);
112120
});
113121
});

src/scraper/ScraperService.ts

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,15 @@ export class ScraperService {
2020
*/
2121
async scrape(
2222
options: ScraperOptions,
23-
progressCallback?: ProgressCallback<ScraperProgress>,
23+
progressCallback: ProgressCallback<ScraperProgress>
2424
): Promise<void> {
2525
// Find strategy for this URL
2626
const strategy = this.registry.getStrategy(options.url);
2727
if (!strategy) {
28-
throw new ScraperError(`No scraper strategy found for URL: ${options.url}`, false);
28+
throw new ScraperError(
29+
`No scraper strategy found for URL: ${options.url}`,
30+
false
31+
);
2932
}
3033

3134
await strategy.scrape(options, progressCallback);

src/scraper/processor/HtmlProcessor.test.ts

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,4 +83,103 @@ describe("HtmlProcessor", () => {
8383
const result = await processor.process(rawContent);
8484
expect(result.links).toEqual([]);
8585
});
86+
87+
it("should extract links from nav sidebar before removing tags", async () => {
88+
const processor = new HtmlProcessor();
89+
const rawContent: RawContent = {
90+
content:
91+
'<html><head><title>Test</title></head><body><nav><ul><li><a href="/home">Home</a></li><li><a href="/about">About</a></li></ul></nav><p>Other content</p></body></html>',
92+
mimeType: "text/html",
93+
source: "https://example.com",
94+
};
95+
const result = await processor.process(rawContent);
96+
expect(result.links).toEqual([
97+
"https://example.com/home",
98+
"https://example.com/about",
99+
]);
100+
});
101+
102+
it("should remove unwanted tags and keep text from allowed tags", async () => {
103+
const processor = new HtmlProcessor();
104+
const rawContent: RawContent = {
105+
content:
106+
"<html><head><title>Test</title></head><body><nav><ul><li><a href=\"/home\">Home</a></li></ul></nav><p>This text should remain.</p><script>alert('This should be removed');</script></body></html>",
107+
mimeType: "text/html",
108+
source: "https://example.com",
109+
};
110+
const result = await processor.process(rawContent);
111+
expect(result.content).toContain("This text should remain.");
112+
expect(result.content).not.toContain("Home");
113+
expect(result.content).not.toContain("This should be removed");
114+
});
115+
116+
describe("Code block language detection", () => {
117+
const processor = new HtmlProcessor();
118+
119+
it("should detect language from highlight-source-<language> on a parent", async () => {
120+
const rawContent: RawContent = {
121+
content:
122+
'<html><head><title>Test</title></head><body><div class="highlight-source-python"><pre><code>print("Hello")</code></pre></div></body></html>',
123+
mimeType: "text/html",
124+
source: "https://example.com",
125+
};
126+
const result = await processor.process(rawContent);
127+
expect(result.content).toContain("```python");
128+
});
129+
130+
it("should detect language from highlight-<language> on a parent", async () => {
131+
const rawContent: RawContent = {
132+
content:
133+
'<html><head><title>Test</title></head><body><div class="highlight-javascript"><pre><code>console.log("Hello")</code></pre></div></body></html>',
134+
mimeType: "text/html",
135+
source: "https://example.com",
136+
};
137+
const result = await processor.process(rawContent);
138+
expect(result.content).toContain("```javascript");
139+
});
140+
141+
it("should detect language from language-<language> on a parent", async () => {
142+
const rawContent: RawContent = {
143+
content:
144+
'<html><head><title>Test</title></head><body><div class="language-typescript"><pre><code>console.log("Hello")</code></pre></div></body></html>',
145+
mimeType: "text/html",
146+
source: "https://example.com",
147+
};
148+
const result = await processor.process(rawContent);
149+
expect(result.content).toContain("```typescript");
150+
});
151+
152+
it("should detect language from language-<language> on the pre tag itself", async () => {
153+
const rawContent: RawContent = {
154+
content:
155+
'<html><head><title>Test</title></head><body><pre class="language-java"><code>System.out.println("Hello")</code></pre></body></html>',
156+
mimeType: "text/html",
157+
source: "https://example.com",
158+
};
159+
const result = await processor.process(rawContent);
160+
expect(result.content).toContain("```java");
161+
});
162+
163+
it("should default to empty language if no language class is present", async () => {
164+
const rawContent: RawContent = {
165+
content:
166+
'<html><head><title>Test</title></head><body><pre><code>print("Hello")</code></pre></body></html>',
167+
mimeType: "text/html",
168+
source: "https://example.com",
169+
};
170+
const result = await processor.process(rawContent);
171+
expect(result.content).toContain("```\n");
172+
});
173+
174+
it("should prioritize data-language attribute", async () => {
175+
const rawContent: RawContent = {
176+
content:
177+
'<html><head><title>Test</title></head><body><div class="highlight-source-python"><pre data-language="typescript"><code>print("Hello")</code></pre></div></body></html>',
178+
mimeType: "text/html",
179+
source: "https://example.com",
180+
};
181+
const result = await processor.process(rawContent);
182+
expect(result.content).toContain("```typescript");
183+
});
184+
});
86185
});

0 commit comments

Comments
 (0)