Commit 636978f

Merge pull request #27 from arabold/feat/26-handle-model-dimensions

feat(#26): Support custom OpenAI API endpoints and handle model dimensions

2 parents: 98b0e9e + c5fbedd

File tree: 8 files changed (+216, −20 lines)

.env.example

Lines changed: 14 additions & 1 deletion

```diff
@@ -1,6 +1,19 @@
-# OpenAI
+# OpenAI Configuration
+# Required: Your OpenAI API Key
 OPENAI_API_KEY=your-key-here
 
+# Optional: Your OpenAI Organization ID (handled automatically by LangChain if set)
+OPENAI_ORG_ID=
+
+# Optional: Custom base URL for OpenAI API (e.g., for Azure OpenAI or compatible APIs)
+OPENAI_API_BASE=
+
+# Optional: Embedding model name (defaults to "text-embedding-3-small")
+# Must produce vectors with ≤1536 dimensions (smaller dimensions are padded with zeros)
+# Examples: text-embedding-3-small (1536), text-embedding-ada-002 (1536)
+# Note: text-embedding-3-large (3072) is not supported due to dimension limit
+DOCS_MCP_EMBEDDING_MODEL=
+
 # Optional: Specify a custom directory to store the SQLite database file (documents.db).
 # If set, this path takes precedence over the default locations.
 # Default behavior (if unset):
```

Dockerfile

Lines changed: 5 additions & 0 deletions

```diff
@@ -31,7 +31,12 @@ COPY --from=builder /app/dist ./dist
 RUN ln -s /app/dist/cli.js /app/docs-cli
 
 # Define the data directory environment variable and volume
+# Environment variables
 ENV DOCS_MCP_STORE_PATH=/data
+ENV OPENAI_API_BASE=
+ENV OPENAI_ORG_ID=
+ENV DOCS_MCP_EMBEDDING_MODEL=
+
 VOLUME /data
 
 # Set the command to run the application
```

README.md

Lines changed: 35 additions & 1 deletion

````diff
@@ -26,6 +26,19 @@ The server exposes MCP tools for:
 - Finding appropriate versions (`find_version`).
 - Removing indexed documents (`remove_docs`).
 
+## Configuration
+
+The following environment variables are supported to configure the OpenAI API and embedding behavior:
+
+- `OPENAI_API_KEY`: **Required.** Your OpenAI API key for generating embeddings.
+- `OPENAI_ORG_ID`: **Optional.** Your OpenAI Organization ID (handled automatically by LangChain if set).
+- `OPENAI_API_BASE`: **Optional.** Custom base URL for OpenAI API (e.g., for Azure OpenAI or compatible APIs).
+- `DOCS_MCP_EMBEDDING_MODEL`: **Optional.** Embedding model name (defaults to "text-embedding-3-small"). Must produce vectors with ≤1536 dimensions. Smaller dimensions are automatically padded with zeros.
+
+The database schema uses a fixed dimension of 1536 for embedding vectors. Models that produce larger vectors are not supported and will cause an error. Models with smaller vectors (e.g., older embedding models) are automatically padded with zeros to match the required dimension.
+
+These variables can be set regardless of how you run the server (Docker, npx, or from source).
+
 ## Running the MCP Server
 
 There are two ways to run the docs-mcp-server:
@@ -76,6 +89,17 @@ This is the recommended approach for most users. It's easy, straightforward, and
 - `-e OPENAI_API_KEY`: **Required.** Set your OpenAI API key.
 - `-v docs-mcp-data:/data`: **Required for persistence.** Mounts a Docker named volume `docs-mcp-data` to store the database. You can replace with a specific host path if preferred (e.g., `-v /path/on/host:/data`).
 
+Any of the configuration environment variables (see [Configuration](#configuration) above) can be passed to the container using the `-e` flag. For example:
+
+```bash
+docker run -i --rm \
+  -e OPENAI_API_KEY="your-key-here" \
+  -e DOCS_MCP_EMBEDDING_MODEL="text-embedding-3-large" \
+  -e OPENAI_API_BASE="http://your-api-endpoint" \
+  -v docs-mcp-data:/data \
+  ghcr.io/arabold/docs-mcp-server:latest
+```
+
 ### Option 2: Using npx
 
 This approach is recommended when you need local file access (e.g., indexing documentation from your local file system). While this can also be achieved by mounting paths into a Docker container, using npx is simpler but requires a Node.js installation.
@@ -122,7 +146,7 @@ docker run --rm \
 docs-cli <command> [options]
 ```
 
-Make sure to use the same volume name (`docs-mcp-data` in this example) as you did for the server.
+Make sure to use the same volume name (`docs-mcp-data` in this example) as you did for the server. Any of the configuration environment variables (see [Configuration](#configuration) above) can be passed using `-e` flags, just like with the server.
 
 ### Using npx CLI
 
@@ -339,6 +363,16 @@ This method is useful for contributing to the project or running un-published ve
 # Required: Your OpenAI API key for generating embeddings.
 OPENAI_API_KEY=your-api-key-here
 
+# Optional: Your OpenAI Organization ID (handled automatically by LangChain if set)
+OPENAI_ORG_ID=
+
+# Optional: Custom base URL for OpenAI API (e.g., for Azure OpenAI or compatible APIs)
+OPENAI_API_BASE=
+
+# Optional: Embedding model name (defaults to "text-embedding-3-small")
+# Examples: text-embedding-3-large, text-embedding-ada-002
+DOCS_MCP_EMBEDDING_MODEL=
+
 # Optional: Specify a custom directory to store the SQLite database file (documents.db).
 # If set, this path takes precedence over the default locations.
 # Default behavior (if unset):
````

package.json

Lines changed: 3 additions & 1 deletion

```diff
@@ -26,7 +26,9 @@
     "dev:cli": "npm run build && node --enable-source-maps dist/cli.js",
     "server": "node --enable-source-maps --watch dist/server.js",
     "dev:server": "run-p \"build -- --watch\" \"server\"",
-    "test": "vitest",
+    "test": "vitest run",
+    "test:watch": "vitest",
+    "test:coverage": "vitest run --coverage",
     "lint": "biome check .",
     "format": "biome format . --write",
     "db:generate": "drizzle-kit generate",
```

src/mcp/index.ts

Lines changed: 1 addition & 1 deletion

```diff
@@ -91,7 +91,7 @@ export async function startServer() {
         .default(true)
         .describe("Whether to follow HTTP redirects (3xx responses)"),
     },
-    // Remove context as it's not used without progress reporting
+
     async ({ url, library, version, maxPages, maxDepth, scope, followRedirects }) => {
       try {
         // Execute scrape tool without waiting and without progress callback
```

src/store/DocumentStore.test.ts

Lines changed: 85 additions & 4 deletions

```diff
@@ -23,7 +23,11 @@ const mockPrepare = vi.fn().mockReturnValue(mockStatement);
 const mockDb = {
   prepare: mockPrepare,
   exec: vi.fn(),
-  transaction: vi.fn((fn) => fn()),
+  transaction: vi.fn(
+    (fn) =>
+      (...args: unknown[]) =>
+        fn(...args),
+  ),
   close: vi.fn(),
 };
 vi.mock("better-sqlite3", () => ({
@@ -58,6 +62,9 @@ describe("DocumentStore", () => {
     }));
     mockPrepare.mockReturnValue(mockStatement); // <-- Re-configure prepare mock return value
 
+    // Reset embedQuery to handle initialization vector
+    mockEmbedQuery.mockResolvedValue(new Array(1536).fill(0.1));
+
     // Now create the store and initialize.
     // initialize() will call 'new OpenAIEmbeddings()', which uses our fresh mock implementation.
     documentStore = new DocumentStore(":memory:");
@@ -79,9 +86,10 @@ describe("DocumentStore", () => {
 
     await documentStore.findByContent(library, version, query, limit);
 
-    // 1. Check if embedQuery was called
-    expect(mockEmbedQuery).toHaveBeenCalledWith(query);
-    expect(mockEmbedQuery).toHaveBeenCalledTimes(1);
+    // 1. Check if embedQuery was called with correct args
+    // Note: embedQuery is called twice - once during init and once for search
+    const embedCalls = mockEmbedQuery.mock.calls;
+    expect(embedCalls[embedCalls.length - 1][0]).toBe(query); // Last call should be our search
 
     // 2. Check if db.prepare was called correctly during findByContent
     // It's called multiple times during initialize, so check the specific call
@@ -150,4 +158,77 @@ describe("DocumentStore", () => {
       expect(lastCallArgs?.[6]).toBe(expectedFtsQuery);
     });
   });
+
+  describe("Embedding Model Dimensions", () => {
+    it("should accept a model that produces 1536-dimensional vectors", async () => {
+      // Mock a 1536-dimensional vector
+      mockEmbedQuery.mockResolvedValueOnce(new Array(1536).fill(0.1));
+      documentStore = new DocumentStore(":memory:");
+      await expect(documentStore.initialize()).resolves.not.toThrow();
+    });
+
+    it("should accept and pad vectors from models with smaller dimensions", async () => {
+      // Mock 768-dimensional vectors
+      mockEmbedQuery.mockResolvedValueOnce(new Array(768).fill(0.1));
+      mockEmbedDocuments.mockResolvedValueOnce([new Array(768).fill(0.1)]);
+
+      documentStore = new DocumentStore(":memory:");
+      await documentStore.initialize();
+
+      // Should pad to 1536 when inserting
+      const doc = {
+        pageContent: "test content",
+        metadata: { title: "test", url: "http://test.com", path: ["test"] },
+      };
+
+      // This should succeed (vectors are padded internally)
+      await expect(
+        documentStore.addDocuments("test-lib", "1.0.0", [doc]),
+      ).resolves.not.toThrow();
+    });
+
+    it("should reject models that produce vectors larger than 1536 dimensions", async () => {
+      // Mock a 3072-dimensional vector (like text-embedding-3-large)
+      mockEmbedQuery.mockResolvedValueOnce(new Array(3072).fill(0.1));
+      documentStore = new DocumentStore(":memory:");
+      await expect(documentStore.initialize()).rejects.toThrow(/exceeds.*1536/);
+    });
+
+    it("should pad both document and query vectors consistently", async () => {
+      // Mock 768-dimensional vectors for both init and subsequent operations
+      const smallVector = new Array(768).fill(0.1);
+      mockEmbedQuery
+        .mockResolvedValueOnce(smallVector) // for initialization
+        .mockResolvedValueOnce(smallVector); // for search query
+      mockEmbedDocuments.mockResolvedValueOnce([smallVector]); // for document embeddings
+
+      documentStore = new DocumentStore(":memory:");
+      await documentStore.initialize();
+
+      const doc = {
+        pageContent: "test content",
+        metadata: { title: "test", url: "http://test.com", path: ["test"] },
+      };
+
+      // Add a document (this pads the document vector)
+      await documentStore.addDocuments("test-lib", "1.0.0", [doc]);
+
+      // Search should work (query vector gets padded too)
+      await expect(
+        documentStore.findByContent("test-lib", "1.0.0", "test query", 5),
+      ).resolves.not.toThrow();
+
+      // Verify both vectors were padded (via the JSON stringification)
+      const insertCall = mockStatement.run.mock.calls.find(
+        (call) => call[0]?.toString().startsWith("1"), // Looking for rowid=1
+      );
+      const searchCall = mockStatementAll.mock.lastCall;
+
+      // Both vectors should be stringified arrays of length 1536
+      const insertVector = JSON.parse(insertCall?.[3] || "[]");
+      const searchVector = JSON.parse(searchCall?.[2] || "[]");
+      expect(insertVector.length).toBe(1536);
+      expect(searchVector.length).toBe(1536);
+    });
+  });
 });
```

src/store/DocumentStore.ts

Lines changed: 58 additions & 11 deletions

```diff
@@ -3,7 +3,7 @@ import { OpenAIEmbeddings } from "@langchain/openai";
 import Database, { type Database as DatabaseType } from "better-sqlite3";
 import * as sqliteVec from "sqlite-vec";
 import type { DocumentMetadata } from "../types";
-import { ConnectionError, StoreError } from "./errors";
+import { ConnectionError, DimensionError, StoreError } from "./errors";
 import { createTablesSQL } from "./schema";
 import { type DbDocument, type DbQueryResult, mapDbDocumentToDocument } from "./types";
@@ -27,6 +27,8 @@ interface RankedResult extends RawSearchResult {
 export class DocumentStore {
   private readonly db: DatabaseType;
   private embeddings!: OpenAIEmbeddings;
+  private readonly dbDimension: number = 1536; // Fixed dimension from schema.ts
+  private modelDimension!: number;
   private statements!: {
     getById: Database.Statement;
     insertDocument: Database.Statement;
@@ -167,14 +169,53 @@ export class DocumentStore {
   }
 
   /**
-   * Initializes embeddings client
+   * Pads a vector to the fixed database dimension by appending zeros.
+   * Throws an error if the input vector is longer than the database dimension.
    */
-  private initializeEmbeddings(): void {
-    this.embeddings = new OpenAIEmbeddings({
-      modelName: "text-embedding-3-small",
+  private padVector(vector: number[]): number[] {
+    if (vector.length > this.dbDimension) {
+      throw new Error(
+        `Vector dimension ${vector.length} exceeds database dimension ${this.dbDimension}`,
+      );
+    }
+    if (vector.length === this.dbDimension) {
+      return vector;
+    }
+    return [...vector, ...new Array(this.dbDimension - vector.length).fill(0)];
+  }
+
+  /**
+   * Initializes embeddings client using environment variables for configuration.
+   *
+   * Supports:
+   * - OPENAI_API_KEY (handled automatically by LangChain)
+   * - OPENAI_ORG_ID (handled automatically by LangChain)
+   * - DOCS_MCP_EMBEDDING_MODEL (optional, defaults to "text-embedding-3-small")
+   * - OPENAI_API_BASE (optional)
+   */
+  private async initializeEmbeddings(): Promise<void> {
+    const modelName = process.env.DOCS_MCP_EMBEDDING_MODEL || "text-embedding-3-small";
+    const baseURL = process.env.OPENAI_API_BASE;
+
+    const config: ConstructorParameters<typeof OpenAIEmbeddings>[0] = {
       stripNewLines: true,
       batchSize: 512,
-    });
+      modelName,
+    };
+
+    if (baseURL) {
+      config.configuration = { baseURL };
+    }
+
+    this.embeddings = new OpenAIEmbeddings(config);
+
+    // Determine the model's actual dimension by embedding a test string
+    const testVector = await this.embeddings.embedQuery("test");
+    this.modelDimension = testVector.length;
+
+    if (this.modelDimension > this.dbDimension) {
+      throw new DimensionError(modelName, this.modelDimension, this.dbDimension);
+    }
   }
 
   /**
@@ -202,9 +243,13 @@ export class DocumentStore {
       // 3. Initialize prepared statements
       this.prepareStatements();
 
-      // 4. Initialize embeddings client
-      this.initializeEmbeddings();
+      // 4. Initialize embeddings client (await to catch errors)
+      await this.initializeEmbeddings();
     } catch (error) {
+      // Re-throw StoreError directly, wrap others in ConnectionError
+      if (error instanceof StoreError) {
+        throw error;
+      }
       throw new ConnectionError("Failed to initialize database connection", error);
     }
   }
@@ -281,7 +326,8 @@ export class DocumentStore {
       const header = `<title>${doc.metadata.title}</title>\n<url>${doc.metadata.url}</url>\n<path>${doc.metadata.path.join(" / ")}</path>\n`;
       return `${header}${doc.pageContent}`;
     });
-    const embeddings = await this.embeddings.embedDocuments(texts);
+    const rawEmbeddings = await this.embeddings.embedDocuments(texts);
+    const paddedEmbeddings = rawEmbeddings.map((vector) => this.padVector(vector));
 
     // Insert documents in a transaction
     const transaction = this.db.transaction((docs: typeof documents) => {
@@ -308,7 +354,7 @@ export class DocumentStore {
           BigInt(rowId),
           library.toLowerCase(),
           version.toLowerCase(),
-          JSON.stringify(embeddings[i]),
+          JSON.stringify(paddedEmbeddings[i]),
         );
       }
     });
@@ -364,7 +410,8 @@ export class DocumentStore {
     limit: number,
   ): Promise<Document[]> {
     try {
-      const embedding = await this.embeddings.embedQuery(query);
+      const rawEmbedding = await this.embeddings.embedQuery(query);
+      const embedding = this.padVector(rawEmbedding);
       const ftsQuery = this.escapeFtsQuery(query); // Escape the query for FTS
 
       const stmt = this.db.prepare(`
```
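The padding behavior introduced here can be illustrated in isolation. The following is a minimal standalone sketch of the `padVector` logic (the real method lives on `DocumentStore` and works against the fixed 1536-dimensional schema; `DB_DIMENSION` below is a free-standing stand-in for that value):

```typescript
// Standalone sketch of the zero-padding scheme from this commit.
// Assumption: DB_DIMENSION mirrors the fixed 1536 dimension in schema.ts.
const DB_DIMENSION = 1536;

function padVector(vector: number[], dbDimension: number = DB_DIMENSION): number[] {
  // Oversized vectors cannot be truncated safely, so they are rejected
  if (vector.length > dbDimension) {
    throw new Error(
      `Vector dimension ${vector.length} exceeds database dimension ${dbDimension}`,
    );
  }
  // Exact-size vectors pass through unchanged
  if (vector.length === dbDimension) {
    return vector;
  }
  // Smaller vectors (e.g. 768-dim models) are padded with trailing zeros
  return [...vector, ...new Array(dbDimension - vector.length).fill(0)];
}

// A 768-dim vector grows to 1536; original values precede the zero padding
const padded = padVector(new Array(768).fill(0.1));
console.log(padded.length); // 1536
console.log(padded[767], padded[768]); // 0.1 0
```

Because both document embeddings (`addDocuments`) and query embeddings (`findByContent`) pass through the same function, cosine distances in the padded region contribute nothing and search stays consistent across model sizes.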

src/store/errors.ts

Lines changed: 15 additions & 1 deletion

```diff
@@ -14,6 +14,20 @@ class StoreError extends Error {
   }
 }
 
+class DimensionError extends StoreError {
+  constructor(
+    public readonly modelName: string,
+    public readonly modelDimension: number,
+    public readonly dbDimension: number,
+  ) {
+    super(
+      `Model "${modelName}" produces ${modelDimension}-dimensional vectors, ` +
+        `which exceeds the database's fixed dimension of ${dbDimension}. ` +
+        `Please use a model with dimension ≤ ${dbDimension}.`,
+    );
+  }
+}
+
 class ConnectionError extends StoreError {}
 
 class DocumentNotFoundError extends StoreError {
@@ -22,4 +36,4 @@ class DocumentNotFoundError extends StoreError {
   }
 }
 
-export { StoreError, ConnectionError, DocumentNotFoundError };
+export { StoreError, ConnectionError, DocumentNotFoundError, DimensionError };
```
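The new error class can be exercised on its own. This sketch reuses the same message format as the diff above, with `StoreError` reduced to a bare `Error` subclass for brevity (the real base class carries extra constructor logic):

```typescript
// Standalone sketch of DimensionError; StoreError is simplified here.
class StoreError extends Error {}

class DimensionError extends StoreError {
  constructor(
    public readonly modelName: string,
    public readonly modelDimension: number,
    public readonly dbDimension: number,
  ) {
    super(
      `Model "${modelName}" produces ${modelDimension}-dimensional vectors, ` +
        `which exceeds the database's fixed dimension of ${dbDimension}. ` +
        `Please use a model with dimension ≤ ${dbDimension}.`,
    );
  }
}

// The dimensions are kept as structured fields, not just message text,
// so callers can branch on them without parsing the string.
const err = new DimensionError("text-embedding-3-large", 3072, 1536);
console.log(err instanceof StoreError); // true
console.log(err.modelDimension, err.dbDimension); // 3072 1536
```

Subclassing `StoreError` is what lets `initialize()` re-throw this error directly instead of wrapping it in a generic `ConnectionError`, so the user sees the actionable dimension message.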

0 commit comments