Skip to content

Commit 0cebe97

Browse files
committed
fix: restore progress callbacks in scraper
- Add onProgress option to DocumentationScraperDispatcher constructor
- Fix progress callback propagation through strategy chain
- Update scrape tool to use correct metadata structure
- Simplify progress callback passing in scrape tool
1 parent 48b35b7 commit 0cebe97

File tree

2 files changed

+16
-19
lines changed

2 files changed

+16
-19
lines changed

src/scraper/index.ts

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,13 @@ import { NpmScraperStrategy } from "./strategies/npm-strategy";
99
import { PyPiScraperStrategy } from "./strategies/pypi-strategy";
1010

1111
export class DocumentationScraperDispatcher {
12-
private determineStrategy(
13-
url: string,
14-
options?: { onProgress?: ScrapingProgressCallback }
15-
) {
12+
private readonly onProgress?: ScrapingProgressCallback;
13+
14+
constructor(options?: { onProgress?: ScrapingProgressCallback }) {
15+
this.onProgress = options?.onProgress;
16+
}
17+
18+
private determineStrategy(url: string) {
1619
// Validate URL before determining strategy
1720
validateUrl(url);
1821
const { hostname } = new URL(url);
@@ -23,16 +26,16 @@ export class DocumentationScraperDispatcher {
2326
hostname === "npmjs.com" ||
2427
hostname === "www.npmjs.com"
2528
) {
26-
return new NpmScraperStrategy(options);
29+
return new NpmScraperStrategy({ onProgress: this.onProgress });
2730
}
2831

2932
// PyPI domain
3033
if (hostname === "pypi.org" || hostname === "www.pypi.org") {
31-
return new PyPiScraperStrategy(options);
34+
return new PyPiScraperStrategy({ onProgress: this.onProgress });
3235
}
3336

3437
// Default strategy for all other domains
35-
return new DefaultScraperStrategy(options);
38+
return new DefaultScraperStrategy({ onProgress: this.onProgress });
3639
}
3740

3841
async scrape(
@@ -41,9 +44,7 @@ export class DocumentationScraperDispatcher {
4144
): Promise<DocContent[]> {
4245
// Validate config URL before proceeding
4346
validateUrl(config.url);
44-
const strategy = this.determineStrategy(config.url, {
45-
onProgress: progressCallback,
46-
});
47+
const strategy = this.determineStrategy(config.url);
4748
return strategy.scrape(config, progressCallback);
4849
}
4950
}

src/tools/scrape.ts

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { DocumentationScraper } from "../scraper/index.js";
1+
import { DocumentationScraperDispatcher } from "../scraper/index.js";
22
import type { VectorStoreManager } from "../store/index.js";
33
import type {
44
ScraperConfig,
@@ -35,12 +35,8 @@ export const scrape = async (options: ScrapeOptions): Promise<ScrapeResult> => {
3535
subpagesOnly,
3636
} = options;
3737

38-
const scraper = new DocumentationScraper({
39-
onProgress: (progress: ScrapingProgress) => {
40-
if (onProgress) {
41-
return onProgress(progress);
42-
}
43-
},
38+
const scraper = new DocumentationScraperDispatcher({
39+
onProgress,
4440
});
4541

4642
const config: ScraperConfig = {
@@ -61,8 +57,8 @@ export const scrape = async (options: ScrapeOptions): Promise<ScrapeResult> => {
6157
const doc = new Document({
6258
pageContent: result.content,
6359
metadata: {
64-
url: result.url,
65-
title: result.title,
60+
url: result.metadata.url,
61+
title: result.metadata.title,
6662
library,
6763
version,
6864
},

0 commit comments

Comments
 (0)