Skip to content

Commit 604175f

Browse files
committed
feat: add fetch-url tool to CLI and MCP server
- Add FetchUrlTool supporting both HTTP and local file URLs
- Implement fetch-url CLI command with --no-follow-redirects flag
- Add fetch_url MCP tool with followRedirects option
- Add comprehensive tests covering HTTP and file URL scenarios
- Update documentation in README.md and ARCHITECTURE.md

Closes #34
1 parent 2f8eb4a commit 604175f

File tree

8 files changed

+516
-12
lines changed

8 files changed

+516
-12
lines changed

ARCHITECTURE.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,7 @@ This design allows easy addition of new embedding providers while maintaining co
212212

213213
**Database Location:** The application determines the database file (`documents.db`) location dynamically:
214214

215-
1. It first checks for a `.store` directory in the current working directory (`process.cwd()`). If `.store/documents.db` exists, it uses this path. This prioritizes local development databases.
215+
1. It first checks for a `.store` directory in the current project directory. If `.store/documents.db` exists, it uses this path. This prioritizes local development databases.
216216
2. If the local `.store/documents.db` does not exist, it defaults to a standard, OS-specific application data directory (e.g., `~/Library/Application Support/docs-mcp-server/` on macOS, `~/.local/share/docs-mcp-server/` on Linux) determined using the `env-paths` library. This ensures a stable, persistent location when running via `npx` or outside a local project context.
217217

218218
Documents are stored with URLs and sequential ordering to maintain source context:

README.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ The server exposes MCP tools for:
2525
- Listing indexed libraries (`list_libraries`).
2626
- Finding appropriate versions (`find_version`).
2727
- Removing indexed documents (`remove_docs`).
28+
- Fetching single URLs (`fetch_url`): Fetches a URL and returns its content as Markdown.
2829

2930
## Configuration
3031

@@ -250,11 +251,31 @@ npx -y --package=@arabold/docs-mcp-server docs-cli --help
250251
```bash
251252
docs-cli scrape --help
252253
docs-cli search --help
254+
docs-cli fetch-url --help
253255
docs-cli find-version --help
254256
docs-cli remove --help
255257
docs-cli list --help
256258
```
257259

260+
### Fetching Single URLs (`fetch-url`)
261+
262+
Fetches a single URL and converts its content to Markdown. Unlike `scrape`, this command does not crawl links or store the content.
263+
264+
```bash
265+
docs-cli fetch-url <url> [options]
266+
```
267+
268+
**Options:**
269+
270+
- `--no-follow-redirects`: Disable following HTTP redirects (default: follow redirects)
271+
272+
**Examples:**
273+
274+
```bash
275+
# Fetch a URL and convert to Markdown
276+
docs-cli fetch-url https://example.com/page.html
277+
```
278+
258279
### Scraping Documentation (`scrape`)
259280

260281
Scrapes and indexes documentation from a given URL for a specific library.

src/cli.ts

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,16 @@ import "dotenv/config";
33
import { Command } from "commander";
44
import packageJson from "../package.json";
55
import { PipelineManager } from "./pipeline/PipelineManager";
6+
import { FileFetcher, HttpFetcher } from "./scraper/fetcher";
7+
import { HtmlProcessor } from "./scraper/processor";
68
import { DocumentManagementService } from "./store/DocumentManagementService";
7-
import { FindVersionTool, ListLibrariesTool, ScrapeTool, SearchTool } from "./tools";
9+
import {
10+
FetchUrlTool,
11+
FindVersionTool,
12+
ListLibrariesTool,
13+
ScrapeTool,
14+
SearchTool,
15+
} from "./tools";
816
import { LogLevel, setLogLevel } from "./utils/logger";
917

1018
const formatOutput = (data: unknown) => JSON.stringify(data, null, 2);
@@ -27,6 +35,11 @@ async function main() {
2735
findVersion: new FindVersionTool(docService),
2836
scrape: new ScrapeTool(docService, pipelineManager), // Pass manager
2937
search: new SearchTool(docService),
38+
fetchUrl: new FetchUrlTool(
39+
new HttpFetcher(),
40+
new FileFetcher(),
41+
new HtmlProcessor(),
42+
),
3043
};
3144

3245
const program = new Command();
@@ -184,6 +197,21 @@ async function main() {
184197
}
185198
});
186199

200+
program
201+
.command("fetch-url <url>")
202+
.description("Fetch a URL and convert its content to Markdown")
203+
.option(
204+
"--no-follow-redirects",
205+
"Disable following HTTP redirects (default: follow redirects)",
206+
)
207+
.action(async (url, options) => {
208+
const content = await tools.fetchUrl.execute({
209+
url,
210+
followRedirects: options.followRedirects,
211+
});
212+
console.log(content);
213+
});
214+
187215
// Hook to set log level after parsing global options but before executing command action
188216
program.hook("preAction", (thisCommand) => {
189217
// Global options are attached to the program (thisCommand)

src/mcp/index.ts

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,12 @@ import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js"
55
import { z } from "zod";
66
import { PipelineManager } from "../pipeline/PipelineManager";
77
import { PipelineJobStatus } from "../pipeline/types";
8+
import { FileFetcher, HttpFetcher } from "../scraper/fetcher";
9+
import { HtmlProcessor } from "../scraper/processor";
810
import { DocumentManagementService } from "../store/DocumentManagementService";
911
import {
1012
CancelJobTool,
13+
FetchUrlTool,
1114
FindVersionTool,
1215
GetJobInfoTool,
1316
ListJobsTool,
@@ -39,15 +42,17 @@ export async function startServer() {
3942
const tools = {
4043
listLibraries: new ListLibrariesTool(docService),
4144
findVersion: new FindVersionTool(docService),
42-
// TODO: Update ScrapeTool constructor if needed to accept PipelineManager
43-
// ScrapeTool currently uses docService.getPipelineManager() which doesn't exist.
44-
// Pass both docService and pipelineManager to ScrapeTool constructor
4545
scrape: new ScrapeTool(docService, pipelineManager),
4646
search: new SearchTool(docService),
4747
listJobs: new ListJobsTool(pipelineManager),
4848
getJobInfo: new GetJobInfoTool(pipelineManager),
4949
cancelJob: new CancelJobTool(pipelineManager),
50-
remove: new RemoveTool(docService), // Instantiate RemoveTool
50+
remove: new RemoveTool(docService),
51+
fetchUrl: new FetchUrlTool(
52+
new HttpFetcher(),
53+
new FileFetcher(),
54+
new HtmlProcessor(),
55+
),
5156
};
5257

5358
const server = new McpServer(
@@ -305,6 +310,30 @@ ${formattedResults.join("")}`,
305310
},
306311
);
307312

313+
// Fetch URL tool
314+
server.tool(
315+
"fetch_url",
316+
"Fetch a single URL and convert its content to Markdown",
317+
{
318+
url: z.string().url().describe("The URL to fetch and convert to markdown"),
319+
followRedirects: z
320+
.boolean()
321+
.optional()
322+
.default(true)
323+
.describe("Whether to follow HTTP redirects (3xx responses)"),
324+
},
325+
async ({ url, followRedirects }) => {
326+
try {
327+
const result = await tools.fetchUrl.execute({ url, followRedirects });
328+
return createResponse(result);
329+
} catch (error) {
330+
return createError(
331+
`Failed to fetch URL: ${error instanceof Error ? error.message : String(error)}`,
332+
);
333+
}
334+
},
335+
);
336+
308337
// Cancel job tool
309338
server.tool(
310339
"cancel_job",

src/scraper/processor/types.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import type { RawContent } from "../fetcher/types";
22

3+
export type { RawContent };
4+
35
/**
46
* Processed content in normalized form (markdown)
57
* with extracted metadata and references

0 commit comments

Comments (0)