Skip to content

Commit da16170

Browse files
committed
refactor: improve type organization and method signatures
BREAKING CHANGE: DocumentStore and VectorStoreService method signatures have changed

- Reorganize types across domains:
  * Move domain-specific types closer to their implementations
  * Keep only shared types in src/types/index.ts
  * Add domain prefixes to type names for clarity
- Standardize method signatures:
  * Replace filter objects with explicit library/version parameters
  * Make parameter order consistent across all methods
  * Update all tests to match new signatures
- Improve type naming:
  * Rename DocContent -> Document
  * Rename PageResult -> ScrapedPage
  * Rename ScrapeOptions -> ScraperOptions
  * Rename ScrapingProgress -> ScraperProgress
  * Rename SearchResult -> StoreSearchResult
  * Rename VersionInfo -> LibraryVersion
  * Rename SplitterOptions -> MarkdownSplitterOptions

The changes improve code organization, make dependencies clearer, and provide a more consistent and explicit API across the codebase.
1 parent f6c3baa commit da16170

20 files changed: +242 additions, -207 deletions

src/pipeline/DocumentProcessingPipeline.test.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import { DocumentProcessingPipeline } from "./DocumentProcessingPipeline";
22
import { ScraperRegistry, ScraperService } from "../scraper";
33
import { VectorStoreService } from "../store";
44
import { logger } from "../utils/logger";
5-
import type { ScrapingProgress, ScrapeOptions } from "../types";
5+
import type { ScraperProgress, ScraperOptions } from "../scraper/types";
66
import { describe, it, expect, beforeEach, vi } from "vitest";
77
import { DocumentProcessingError, PipelineStateError } from "./errors";
88

@@ -38,7 +38,7 @@ vi.mock("../utils/logger", () => ({
3838
describe("DocumentProcessingPipeline", () => {
3939
let pipeline: DocumentProcessingPipeline;
4040

41-
const mockScrapeOptions: ScrapeOptions = {
41+
const mockScrapeOptions: ScraperOptions = {
4242
url: "https://example.com",
4343
library: "test-lib",
4444
version: "1.0.0",
@@ -47,7 +47,7 @@ describe("DocumentProcessingPipeline", () => {
4747
subpagesOnly: true,
4848
};
4949

50-
const mockProgress: ScrapingProgress = {
50+
const mockProgress: ScraperProgress = {
5151
currentUrl: "https://example.com/page",
5252
pagesScraped: 1,
5353
maxPages: 10,
@@ -164,7 +164,7 @@ describe("DocumentProcessingPipeline", () => {
164164
});
165165

166166
it("should handle stop during processing", async () => {
167-
let progressCallback: ((progress: ScrapingProgress) => void) | undefined;
167+
let progressCallback: ((progress: ScraperProgress) => void) | undefined;
168168

169169
mockScrape.mockImplementation((_, callback) => {
170170
progressCallback = callback;

src/pipeline/DocumentProcessingPipeline.ts

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,7 @@
11
import { ScraperRegistry, ScraperService } from "../scraper";
22
import type { VectorStoreService } from "../store";
3-
import type {
4-
DocumentPipeline,
5-
DocumentPipelineCallbacks,
6-
ScrapeOptions,
7-
ScrapingProgress,
8-
} from "../types";
3+
import type { DocumentPipeline, DocumentPipelineCallbacks } from "./types";
4+
import type { ScraperOptions, ScraperProgress } from "../scraper/types";
95
import { logger } from "../utils/logger";
106
import { DocumentProcessingError, PipelineStateError } from "./errors";
117

@@ -44,14 +40,14 @@ export class DocumentProcessingPipeline implements DocumentPipeline {
4440
* Initiates document processing pipeline.
4541
* Coordinates scraping and storage operations with progress tracking
4642
*/
47-
async process(options: ScrapeOptions): Promise<void> {
43+
async process(options: ScraperOptions): Promise<void> {
4844
if (this.isProcessing) {
4945
throw new PipelineStateError("Pipeline is already processing");
5046
}
5147

5248
this.isProcessing = true;
5349
try {
54-
await this.scraperService.scrape(options, (progress: ScrapingProgress) =>
50+
await this.scraperService.scrape(options, (progress: ScraperProgress) =>
5551
this.handleScrapingProgress(progress)
5652
);
5753
logger.info("✅ Pipeline processing complete");
@@ -72,7 +68,7 @@ export class DocumentProcessingPipeline implements DocumentPipeline {
7268
}
7369

7470
private async handleScrapingProgress(
75-
progress: ScrapingProgress
71+
progress: ScraperProgress
7672
): Promise<void> {
7773
if (!this.isProcessing) return;
7874

src/pipeline/types.ts

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
import type { Document } from "../types";
2+
import type { ScraperOptions, ScraperProgress } from "../scraper/types";
3+
4+
/**
5+
* Interface for document processing pipeline implementations
6+
*/
7+
export interface DocumentPipeline {
8+
process(options: ScraperOptions): Promise<void>;
9+
setCallbacks(callbacks: DocumentPipelineCallbacks): void;
10+
stop(): Promise<void>;
11+
}
12+
13+
/**
14+
* Callbacks for pipeline progress and error handling
15+
*/
16+
export interface DocumentPipelineCallbacks {
17+
onProgress?: (progress: ScraperProgress) => Promise<void>;
18+
onError?: (error: Error, document?: Document) => Promise<void>;
19+
}

src/scraper/HtmlScraper.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ import { validateUrl } from "../utils/url";
55
import { logger } from "../utils/logger";
66
import createDOMPurify, { type WindowLike } from "dompurify";
77
import { Window } from "happy-dom";
8-
import type { PageResult } from "../types";
8+
import type { ScrapedPage } from "./types";
99

1010
export type RetryOptions = {
1111
maxRetries?: number;
@@ -162,7 +162,7 @@ export class HtmlScraper {
162162
* Performs single attempt at scraping page content and converting to markdown.
163163
* Handles HTML sanitization and URL normalization
164164
*/
165-
public async scrapePage(url: string): Promise<PageResult> {
165+
public async scrapePage(url: string): Promise<ScrapedPage> {
166166
validateUrl(url);
167167

168168
const { data } = await scrapeIt<{
@@ -238,7 +238,7 @@ export class HtmlScraper {
238238
public async scrapePageWithRetry(
239239
url: string,
240240
options?: RetryOptions
241-
): Promise<PageResult> {
241+
): Promise<ScrapedPage> {
242242
validateRetryOptions(options);
243243
const maxRetries = options?.maxRetries ?? this.MAX_RETRIES;
244244
const baseDelay = options?.baseDelay ?? this.BASE_DELAY;

src/scraper/ScraperRegistry.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import type { ScraperStrategy } from "../types";
1+
import type { ScraperStrategy } from "./types";
22
import { validateUrl } from "../utils/url";
33
import { DefaultScraperStrategy } from "./strategies/DefaultScraperStrategy";
44
import { GitHubScraperStrategy } from "./strategies/GitHubScraperStrategy";

src/scraper/ScraperService.ts

Lines changed: 22 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,36 @@
1-
import type {
2-
ScrapeOptions,
3-
ProgressCallback,
4-
ScrapingProgress,
5-
} from "../types";
6-
import { ScraperError } from "../utils/errors";
7-
import { logger } from "../utils/logger";
81
import type { ScraperRegistry } from "./ScraperRegistry";
2+
import type { ScraperOptions, ScraperProgress } from "./types";
3+
import { ScraperError } from "../utils/errors";
4+
import type { ProgressCallback } from "../types";
95

106
/**
11-
* Orchestrates web scraping operations using registered scraping strategies.
12-
* Delegates scraping to appropriate strategies based on URL patterns and provides
13-
* a unified error handling layer, wrapping domain-specific errors into ScraperErrors
14-
* for consistent error management throughout the application.
7+
* Orchestrates document scraping operations using registered scraping strategies.
8+
* Automatically selects appropriate strategy based on URL patterns.
159
*/
1610
export class ScraperService {
17-
constructor(private registry: ScraperRegistry) {}
11+
private registry: ScraperRegistry;
12+
13+
constructor(registry: ScraperRegistry) {
14+
this.registry = registry;
15+
}
1816

1917
/**
20-
* Executes scraping using appropriate strategy for given URL.
21-
* Provides progress tracking and error handling wrapper
18+
* Scrapes content from the provided URL using the appropriate strategy.
19+
* Reports progress via callback and handles errors.
2220
*/
2321
async scrape(
24-
options: ScrapeOptions,
25-
progressCallback?: ProgressCallback<ScrapingProgress>
22+
options: ScraperOptions,
23+
progressCallback?: ProgressCallback<ScraperProgress>
2624
): Promise<void> {
27-
try {
28-
const strategy = this.registry.getStrategy(options.url);
29-
await strategy.scrape(options, progressCallback);
30-
} catch (error) {
31-
logger.error(
32-
`❌ Scraping failed: ${error instanceof Error ? error.message : "Unknown error"}`
25+
// Find strategy for this URL
26+
const strategy = this.registry.getStrategy(options.url);
27+
if (!strategy) {
28+
throw new ScraperError(
29+
`No scraper strategy found for URL: ${options.url}`,
30+
false
3331
);
34-
// Wrap non-ScraperError errors in ScraperError
35-
if (!(error instanceof ScraperError)) {
36-
throw new ScraperError(
37-
`Failed to scrape ${options.url}: ${error instanceof Error ? error.message : "Unknown error"}`,
38-
false,
39-
error as Error
40-
);
41-
}
42-
throw error;
4332
}
33+
34+
await strategy.scrape(options, progressCallback);
4435
}
4536
}

0 commit comments

Comments (0)