Commit 00f9a2f

feat: refactor content processing to middleware pipeline
Refactors the scraper's content processing logic from dedicated processor classes (`HtmlProcessor`, `MarkdownProcessor`) to a more flexible middleware pipeline architecture (`src/scraper/middleware/`). Highlights:

- Introduces `ContentProcessingPipeline` and the `ContentProcessorMiddleware` interface.
- Creates individual middleware components for parsing, metadata/link extraction, sanitization/cleaning, and HTML-to-Markdown conversion.
- Updates the strategies (`WebScraperStrategy`, `LocalFileStrategy`) and `FetchUrlTool` to construct and use the appropriate middleware pipeline based on content type.
- Removes the old processor classes and related files/tests.
- Adds new tests for the individual middleware components and updates the strategy tests.
- Updates `ARCHITECTURE.md` to reflect the new design.

This improves the modularity, testability, and configurability of the content processing flow.

Closes #17
1 parent 2725c19 commit 00f9a2f

36 files changed: +2,596 −1,265 lines
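The pattern this commit introduces can be sketched in a few lines. This is a minimal illustration with simplified, assumed types; the real interfaces live in `src/scraper/middleware/types.ts` and carry more fields (such as the `dom` property and processing options):

```typescript
// Minimal sketch of the middleware pipeline pattern. Types are simplified
// assumptions, not the project's real interfaces.

interface ContentProcessingContext {
  content: string;                    // raw at first, transformed by middleware
  mimeType: string;
  source: string;
  metadata: Record<string, unknown>;  // filled in by extractor middleware
  links: string[];
  errors: Error[];
}

interface ContentProcessorMiddleware {
  process(ctx: ContentProcessingContext): Promise<void>;
}

class ContentProcessingPipeline {
  constructor(private readonly middleware: ContentProcessorMiddleware[]) {}

  // Runs each middleware in order, threading one shared context through.
  async run(ctx: ContentProcessingContext): Promise<ContentProcessingContext> {
    for (const m of this.middleware) {
      await m.process(ctx);
    }
    return ctx;
  }
}

// Toy middleware standing in for MarkdownMetadataExtractorMiddleware:
// pulls the first ATX heading out of the content as the title.
const titleExtractor: ContentProcessorMiddleware = {
  async process(ctx) {
    const match = ctx.content.match(/^#\s+(.+)$/m);
    if (match) ctx.metadata.title = match[1];
  },
};
```

A strategy would assemble something like `new ContentProcessingPipeline([titleExtractor, ...])` for the detected content type and run it over the fetched content.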

ARCHITECTURE.md

Lines changed: 64 additions & 12 deletions
````diff
@@ -29,7 +29,10 @@ src/
 │   │   └── LocalFileStrategy.ts   # Handles local filesystem content
 │   │   └── ...
 │   ├── fetcher/           # Content fetching abstractions
-│   ├── processor/         # Content processing abstractions
+│   ├── middleware/        # Content processing pipeline & middleware
+│   │   ├── Pipeline.ts    # Orchestrates middleware execution
+│   │   ├── types.ts       # Context and middleware interfaces
+│   │   └── components/    # Individual middleware implementations
 │   └── ...
 ├── splitter/              # Document splitting and chunking
 ├── store/                 # Document storage and retrieval
@@ -56,20 +59,69 @@ Each source type has a dedicated strategy that understands its specific protocol
 
 ### Content Processing Flow
 
+Raw content fetched by a strategy's `fetcher` (e.g., HTML, Markdown) is processed through a configurable middleware pipeline. See the Middleware Pipeline section below for details.
+
 ```mermaid
-graph LR
-    S[Source URL] --> R[Registry]
-    R --> ST[Strategy Selection]
-    ST --> F[Fetch Content]
-    F --> P[Process Content]
-    P --> D[Document Creation]
-```
+graph TD
+    subgraph Strategy Execution
+        F[Fetcher Fetches RawContent]
+        CtxIn[Create Initial Context]
+        Pipe[Run Pipeline]
+        CtxOut[Get Final Context]
+        Doc[Create Document from Context]
+    end
 
-The registry automatically selects the appropriate strategy based on the URL scheme, ensuring:
+    subgraph ContentProcessingPipeline
+        direction LR
+        M1[Middleware 1] --> M2[Middleware 2] --> M3[...]
+    end
+
+    F --> CtxIn
+    CtxIn --> Pipe
+    Pipe -- Passes Context --> M1
+    M1 -- Passes Context --> M2
+    M2 -- Passes Context --> M3
+    M3 -- Returns Final Context --> CtxOut
+    CtxOut --> Doc
+```
 
-- Consistent handling across different content sources
-- Unified document format for storage
-- Reusable content processing logic
+- **`ContentProcessingContext`**: An object passed through the pipeline, carrying the content (initially raw, potentially transformed), MIME type, source URL, extracted metadata, links, errors, and options. HTML processing also uses a `dom` property on the context to hold the parsed JSDOM object.
+- **`ContentProcessorMiddleware`**: Individual, reusable components that perform specific tasks on the context, such as:
+  - Parsing HTML (`HtmlDomParserMiddleware`)
+  - Extracting metadata (`HtmlMetadataExtractorMiddleware`, `MarkdownMetadataExtractorMiddleware`)
+  - Extracting links (`HtmlLinkExtractorMiddleware`, `MarkdownLinkExtractorMiddleware`)
+  - Sanitizing and cleaning HTML (`HtmlSanitizerMiddleware`)
+  - Converting HTML to Markdown (`HtmlToMarkdownMiddleware`)
+- **`ContentProcessingPipeline`**: Executes a sequence of middleware components in order, passing the context object between them.
+- **Strategies (`WebScraperStrategy`, `LocalFileStrategy`, etc.)**: Construct and run the appropriate pipeline based on the fetched content's MIME type. After the pipeline completes, the strategy uses the final `content` and `metadata` from the context to create the `Document` object.
+
+This middleware approach ensures:
+
+- **Modularity:** Processing steps are isolated and reusable.
+- **Configurability:** Pipelines can be easily assembled for different content types.
+- **Testability:** Individual middleware components can be tested independently.
+- **Consistency:** Ensures a unified document format regardless of the source.
+
+### Middleware Pipeline
+
+The core of content processing is the middleware pipeline (`ContentProcessingPipeline` located in `src/scraper/middleware/`). This pattern allows for modular and reusable processing steps.
+
+- **`ContentProcessingContext`**: An object passed through the pipeline, carrying the content (initially raw, potentially transformed), MIME type, source URL, extracted metadata, links, errors, and options. HTML processing also uses a `dom` property on the context to hold the parsed JSDOM object.
+- **`ContentProcessorMiddleware`**: Individual, reusable components that perform specific tasks on the context, such as:
+  - Parsing HTML (`HtmlDomParserMiddleware`)
+  - Extracting metadata (`HtmlMetadataExtractorMiddleware`, `MarkdownMetadataExtractorMiddleware`)
+  - Extracting links (`HtmlLinkExtractorMiddleware`, `MarkdownLinkExtractorMiddleware`)
+  - Sanitizing and cleaning HTML (`HtmlSanitizerMiddleware`)
+  - Converting HTML to Markdown (`HtmlToMarkdownMiddleware`)
+- **`ContentProcessingPipeline`**: Executes a sequence of middleware components in order, passing the context object between them.
+- **Strategies (`WebScraperStrategy`, `LocalFileStrategy`, etc.)**: Construct and run the appropriate pipeline based on the fetched content's MIME type. After the pipeline completes, the strategy uses the final `content` and `metadata` from the context to create the `Document` object.
+
+This middleware approach ensures:
+
+- **Modularity:** Processing steps are isolated and reusable.
+- **Configurability:** Pipelines can be easily assembled for different content types.
+- **Testability:** Individual middleware components can be tested independently.
+- **Consistency:** Ensures a unified document format regardless of the source.
 
 ## Tools Layer
 
````
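The HTML chain described in the diff above can be illustrated with a self-contained sketch. The step bodies here are regex-based stand-ins, not the real JSDOM-backed components, and the `Ctx` shape is a simplified assumption:

```typescript
// Simplified stand-ins for the HTML middleware chain in ARCHITECTURE.md.
// The real components operate on a parsed DOM via the context's `dom`
// property; the regexes below are for illustration only.
type Ctx = {
  content: string;
  metadata: Record<string, string>;
  links: string[];
};
type Step = (ctx: Ctx) => void;

// Stand-in for HtmlMetadataExtractorMiddleware: grab the <title>.
const extractMetadata: Step = (ctx) => {
  const m = ctx.content.match(/<title>([^<]*)<\/title>/i);
  if (m) ctx.metadata.title = m[1];
};

// Stand-in for HtmlLinkExtractorMiddleware: collect href targets.
const extractLinks: Step = (ctx) => {
  for (const m of ctx.content.matchAll(/href="([^"]+)"/g)) {
    ctx.links.push(m[1]);
  }
};

// Stand-in for HtmlSanitizerMiddleware: drop script blocks.
const stripScripts: Step = (ctx) => {
  ctx.content = ctx.content.replace(/<script[\s\S]*?<\/script>/gi, "");
};

// Order matters: extract first, then sanitize (then, in the real
// pipeline, convert to Markdown).
const htmlSteps: Step[] = [extractMetadata, extractLinks, stripScripts];

function runSteps(ctx: Ctx, steps: Step[]): Ctx {
  for (const step of steps) step(ctx);
  return ctx;
}
```

A Markdown pipeline would swap in the Markdown extractors and skip the DOM-dependent steps, which is the configurability the design aims for.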

biome.json

Lines changed: 13 additions & 1 deletion
```diff
@@ -12,5 +12,17 @@
   },
   "files": {
     "include": ["src/**/*.ts"]
-  }
+  },
+  "overrides": [
+    {
+      "include": ["src/**/*.test.ts"],
+      "linter": {
+        "rules": {
+          "style": {
+            "noNonNullAssertion": "off"
+          }
+        }
+      }
+    }
+  ]
 }
```
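The override above disables Biome's `noNonNullAssertion` style rule for test files only. A hypothetical test-file snippet shows the pattern it permits:

```typescript
// In a src/**/*.test.ts file, the override lets assertions like this pass
// lint: Map.get returns `number | undefined`, and `!` asserts non-null.
const results = new Map<string, number>([["processed", 3]]);
const processed: number = results.get("processed")!; // flagged outside tests
```

Outside test files the rule still applies, keeping production code on explicit `undefined` handling.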

package-lock.json

Lines changed: 9 additions & 2 deletions
Some generated files are not rendered by default.

package.json

Lines changed: 1 addition & 0 deletions
```diff
@@ -35,6 +35,7 @@
     "db:push": "drizzle-kit push"
   },
   "dependencies": {
+    "@joplin/turndown-plugin-gfm": "^1.0.61",
     "@langchain/aws": "^0.1.8",
     "@langchain/community": "^0.3.34",
     "@langchain/google-genai": "^0.2.3",
```

src/cli.ts

Lines changed: 1 addition & 6 deletions
```diff
@@ -5,7 +5,6 @@ import packageJson from "../package.json";
 import { DEFAULT_MAX_CONCURRENCY, DEFAULT_MAX_DEPTH, DEFAULT_MAX_PAGES } from "./config";
 import { PipelineManager } from "./pipeline/PipelineManager";
 import { FileFetcher, HttpFetcher } from "./scraper/fetcher";
-import { HtmlProcessor } from "./scraper/processor";
 import { DocumentManagementService } from "./store/DocumentManagementService";
 import {
   FetchUrlTool,
@@ -36,11 +35,7 @@ async function main() {
     findVersion: new FindVersionTool(docService),
     scrape: new ScrapeTool(docService, pipelineManager), // Pass manager
     search: new SearchTool(docService),
-    fetchUrl: new FetchUrlTool(
-      new HttpFetcher(),
-      new FileFetcher(),
-      new HtmlProcessor(),
-    ),
+    fetchUrl: new FetchUrlTool(new HttpFetcher(), new FileFetcher()),
   };
 
   const program = new Command();
```

src/mcp/index.ts

Lines changed: 2 additions & 6 deletions
```diff
@@ -7,7 +7,6 @@ import { DEFAULT_MAX_DEPTH, DEFAULT_MAX_PAGES } from "../config";
 import { PipelineManager } from "../pipeline/PipelineManager";
 import { PipelineJobStatus } from "../pipeline/types";
 import { FileFetcher, HttpFetcher } from "../scraper/fetcher";
-import { HtmlProcessor } from "../scraper/processor";
 import { DocumentManagementService } from "../store/DocumentManagementService";
 import {
   CancelJobTool,
@@ -49,11 +48,8 @@ export async function startServer() {
     getJobInfo: new GetJobInfoTool(pipelineManager),
     cancelJob: new CancelJobTool(pipelineManager),
     remove: new RemoveTool(docService),
-    fetchUrl: new FetchUrlTool(
-      new HttpFetcher(),
-      new FileFetcher(),
-      new HtmlProcessor(),
-    ),
+    // FetchUrlTool now uses middleware pipeline internally
+    fetchUrl: new FetchUrlTool(new HttpFetcher(), new FileFetcher()),
   };
 
   const server = new McpServer(
```
