From 135325245c9b3ebc69e44a4191449b8c65d1d4d2 Mon Sep 17 00:00:00 2001 From: Helweg Date: Thu, 9 Apr 2026 08:48:33 +0200 Subject: [PATCH 01/20] feat: add pluggable reranker config --- src/config/schema.ts | 104 ++++++++++++++++++++++++++++--------------- tests/config.test.ts | 69 ++++++++++++++++++++++++++++ 2 files changed, 137 insertions(+), 36 deletions(-) diff --git a/src/config/schema.ts b/src/config/schema.ts index f71d3dc..be005ff 100644 --- a/src/config/schema.ts +++ b/src/config/schema.ts @@ -49,6 +49,25 @@ export interface SearchConfig { contextLines: number; } +export type RerankerProvider = "cohere" | "jina" | "custom"; + +export interface RerankerConfig { + /** Whether to enable reranking. Default: false */ + enabled: boolean; + /** Provider shortcut for hosted rerank APIs. Use 'custom' to provide only baseUrl. */ + provider: RerankerProvider; + /** Model name for reranking */ + model: string; + /** Base URL of the rerank API endpoint */ + baseUrl: string; + /** API key for the rerank service */ + apiKey?: string; + /** Number of top documents to rerank */ + topN: number; + /** Request timeout in milliseconds */ + timeoutMs: number; +} + export type LogLevel = "error" | "warn" | "info" | "debug"; export interface DebugConfig { @@ -83,21 +102,6 @@ export interface CustomProviderConfig { max_batch_size?: number; } -export interface RerankerConfig { - /** Whether to enable reranking. Default: false */ - enabled: boolean; - /** Base URL of the rerank API endpoint (e.g. "https://api.siliconflow.cn/v1") */ - baseUrl: string; - /** Model name for reranking (e.g. "BAAI/bge-reranker-v2-m3") */ - model: string; - /** API key for the rerank service */ - apiKey?: string; - /** Number of top documents to rerank. Default: 20 */ - topN?: number; - /** Request timeout in milliseconds. Default: 30000 */ - timeoutMs?: number; -} - export interface CodebaseIndexConfig { embeddingProvider: EmbeddingProvider | 'custom' | 'auto'; embeddingModel?: EmbeddingModelName; @@ -164,6 +168,21 @@ function isValidFusionStrategy(value: unknown): value is SearchConfig["fusionStr return value === "weighted" || value === "rrf"; } +function isValidRerankerProvider(value: unknown): value is RerankerProvider { + return value === "cohere" || value === "jina" || value === "custom"; +} + +function getDefaultRerankerBaseUrl(provider: RerankerProvider): string { + switch (provider) { + case "cohere": + return "https://api.cohere.ai/v1"; + case "jina": + return "https://api.jina.ai/v1"; + case "custom": + return ""; + } +} + function getDefaultDebugConfig(): DebugConfig { return { enabled: false, @@ -177,16 +196,6 @@ function getDefaultDebugConfig(): DebugConfig { }; } -function getDefaultRerankerConfig(): RerankerConfig { - return { - enabled: false, - baseUrl: "https://api.siliconflow.cn/v1", - model: "BAAI/bge-reranker-v2-m3", - topN: 20, - timeoutMs: 30000, - }; -} - const VALID_SCOPES: IndexScope[] = ["project", "global"]; const VALID_LOG_LEVELS: LogLevel[] = ["error", "warn", "info", "debug"]; @@ -282,17 +291,6 @@ export function parseConfig(raw: unknown): ParsedCodebaseIndexConfig { metrics: typeof rawDebug.metrics === "boolean" ? rawDebug.metrics : defaultDebug.metrics, }; - const defaultReranker = getDefaultRerankerConfig(); - const rawReranker = (input.reranker && typeof input.reranker === "object" ? input.reranker : {}) as Record; - const reranker: RerankerConfig = { - enabled: typeof rawReranker.enabled === "boolean" ? rawReranker.enabled : defaultReranker.enabled, - baseUrl: typeof rawReranker.baseUrl === "string" ? rawReranker.baseUrl.trim().replace(/\/+$/, '') : defaultReranker.baseUrl, - model: typeof rawReranker.model === "string" ? rawReranker.model : defaultReranker.model, - apiKey: getResolvedString(rawReranker.apiKey, "$root.reranker.apiKey"), - topN: typeof rawReranker.topN === "number" ? Math.max(1, Math.min(200, Math.floor(rawReranker.topN))) : defaultReranker.topN, - timeoutMs: typeof rawReranker.timeoutMs === "number" ? Math.max(1000, rawReranker.timeoutMs) : defaultReranker.timeoutMs, - }; - const rawKnowledgeBases = input.knowledgeBases; const knowledgeBases: string[] = isStringArray(rawKnowledgeBases) ? rawKnowledgeBases.filter(p => typeof p === "string" && p.trim().length > 0).map(p => p.trim()) @@ -306,6 +304,7 @@ export function parseConfig(raw: unknown): ParsedCodebaseIndexConfig { let embeddingProvider: EmbeddingProvider | 'custom' | 'auto'; let embeddingModel: EmbeddingModelName | undefined = undefined; let customProvider: CustomProviderConfig | undefined = undefined; + let reranker: RerankerConfig | undefined = undefined; if (embeddingProviderValue === 'custom') { embeddingProvider = 'custom'; @@ -359,6 +358,39 @@ export function parseConfig(raw: unknown): ParsedCodebaseIndexConfig { embeddingProvider = 'auto'; } + const rawReranker = (input.reranker && typeof input.reranker === "object" + ? input.reranker + : {}) as Record; + const rerankerEnabled = typeof rawReranker.enabled === "boolean" ? rawReranker.enabled : false; + if (rerankerEnabled) { + const provider = isValidRerankerProvider(rawReranker.provider) ? rawReranker.provider : "custom"; + const model = getResolvedString(rawReranker.model, "$root.reranker.model"); + if (!model || model.trim().length === 0) { + throw new Error("reranker is enabled but reranker.model is missing or invalid."); + } + + const configuredBaseUrl = getResolvedString(rawReranker.baseUrl, "$root.reranker.baseUrl"); + const baseUrl = configuredBaseUrl?.trim() || getDefaultRerankerBaseUrl(provider); + if (baseUrl.length === 0) { + throw new Error("reranker is enabled but reranker.baseUrl is missing or invalid for provider 'custom'."); + } + + const apiKey = getResolvedString(rawReranker.apiKey, "$root.reranker.apiKey"); + if ((provider === "cohere" || provider === "jina") && (!apiKey || apiKey.trim().length === 0)) { + throw new Error(`reranker provider '${provider}' requires reranker.apiKey when enabled.`); + } + + reranker = { + enabled: true, + provider, + model: model.trim(), + baseUrl: baseUrl.replace(/\/+$/, ""), + apiKey: apiKey?.trim() || undefined, + topN: typeof rawReranker.topN === "number" ? Math.min(50, Math.max(1, Math.floor(rawReranker.topN))) : 15, + timeoutMs: typeof rawReranker.timeoutMs === "number" ? Math.max(1000, Math.floor(rawReranker.timeoutMs)) : 10000, + }; + } + return { embeddingProvider, embeddingModel, diff --git a/tests/config.test.ts b/tests/config.test.ts index 59cfc7f..ac71f5d 100644 --- a/tests/config.test.ts +++ b/tests/config.test.ts @@ -321,6 +321,75 @@ describe("config schema", () => { it("should handle non-object search", () => { expect(parseConfig({ search: "invalid" }).search.maxResults).toBe(20); }); + + it("should parse reranker config when enabled", () => { + const config = parseConfig({ + reranker: { + enabled: true, + provider: "cohere", + model: "rerank-v3.5", + apiKey: "test-key", + topN: 12, + timeoutMs: 4000, + }, + }); + + expect(config.reranker).toEqual({ + enabled: true, + provider: "cohere", + model: "rerank-v3.5", + baseUrl: "https://api.cohere.ai/v1", + apiKey: "test-key", + topN: 12, + timeoutMs: 4000, + }); + }); + + it("should require model for enabled reranker", () => { + expect(() => parseConfig({ + reranker: { + enabled: true, + provider: "cohere", + apiKey: "test-key", + }, + })).toThrow("reranker is enabled but reranker.model is missing or invalid."); + }); + + it("should require apiKey for hosted reranker providers", () => { + expect(() => parseConfig({ + reranker: { + enabled: true, + provider: "jina", + model: "jina-reranker-v2-base-multilingual", + }, + })).toThrow("reranker provider 'jina' requires reranker.apiKey when enabled."); + }); + + it("should require baseUrl for custom reranker provider", () => { + expect(() => parseConfig({ + reranker: { + enabled: true, + provider: "custom", + model: "custom-reranker", + }, + })).toThrow("reranker is enabled but reranker.baseUrl is missing or invalid for provider 'custom'."); + }); + + it("should clamp reranker topN and timeoutMs", () => { + const config = parseConfig({ + reranker: { + enabled: true, + provider: "custom", + model: "custom-reranker", + baseUrl: "https://rerank.example/v1", + topN: 999, + timeoutMs: 100, + }, + }); + + expect(config.reranker?.topN).toBe(50); + expect(config.reranker?.timeoutMs).toBe(1000); + }); }); describe("custom provider config", () => { From 65945ba034116b310c5e230267c6677bba7a008f Mon Sep 17 00:00:00 2001 From: Helweg Date: Thu, 9 Apr 2026 08:48:56 +0200 Subject: [PATCH 02/20] feat: add optional external reranker stage --- src/indexer/index.ts | 132 +++++++++++++++++++++++++++++++- tests/retrieval-ranking.test.ts | 88 +++++++++++++++++++++ 2 files changed, 218 insertions(+), 2 deletions(-) diff --git a/src/indexer/index.ts b/src/indexer/index.ts index b5ccaf2..b1b2c31 100644 --- a/src/indexer/index.ts +++ b/src/indexer/index.ts @@ -4,7 +4,7 @@ import { performance } from "perf_hooks"; import PQueue from "p-queue"; import pRetry from "p-retry"; -import { ParsedCodebaseIndexConfig } from "../config/schema.js"; +import { ParsedCodebaseIndexConfig, type RerankerConfig } from "../config/schema.js"; import { detectEmbeddingProvider, ConfiguredProviderInfo, tryDetectProvider, createCustomProviderInfo } from "../embeddings/detector.js"; import { createEmbeddingProvider, @@ -158,6 +158,11 @@ interface FailedBatch { type RankedCandidate = { id: string; score: number; metadata: ChunkMetadata }; +interface RerankDocumentPayload { + id: string; + text: string; +} + interface HybridRankOptions { fusionStrategy: "weighted" | "rrf"; rrfK: number; @@ -343,6 +348,20 @@ function splitNameTokens(name: string): Set { return tokens; } +function createRerankerDocumentText(candidate: RankedCandidate): string { + const parts = [ + `path: ${candidate.metadata.filePath}`, + `chunk_type: ${candidate.metadata.chunkType}`, + `language: ${candidate.metadata.language}`, + ]; + + if (candidate.metadata.name) { + parts.push(`name: ${candidate.metadata.name}`); + } + + return parts.join("\n"); +} + function chunkTypeBoost(chunkType: string): number { switch (chunkType) { case "function": @@ -1448,6 +1467,114 @@ export class Indexer { } } + private async rerankCandidatesWithApi( + query: string, + candidates: RankedCandidate[] + ): Promise { + const reranker = this.config.reranker; + if (!reranker || !reranker.enabled || candidates.length <= 1) { + return candidates; + } + + const topN = Math.min(reranker.topN, candidates.length); + const head = candidates.slice(0, topN); + const tail = candidates.slice(topN); + const documents = head.map((candidate) => ({ + id: candidate.id, + text: createRerankerDocumentText(candidate), + })); + + try { + const rankedIds = await this.callExternalReranker(query, documents, reranker); + if (rankedIds.length === 0) { + return candidates; + } + + const order = new Map(rankedIds.map((id, index) => [id, index])); + const rerankedHead = [...head].sort((a, b) => { + const aRank = order.get(a.id) ?? Number.MAX_SAFE_INTEGER; + const bRank = order.get(b.id) ?? Number.MAX_SAFE_INTEGER; + if (aRank !== bRank) { + return aRank - bRank; + } + if (b.score !== a.score) { + return b.score - a.score; + } + return a.id.localeCompare(b.id); + }); + + this.logger.search("debug", "Applied external reranker", { + provider: reranker.provider, + model: reranker.model, + candidateCount: head.length, + }); + + return [...rerankedHead, ...tail]; + } catch (error) { + this.logger.search("warn", "External reranker failed; using deterministic order", { + provider: reranker.provider, + model: reranker.model, + error: getErrorMessage(error), + }); + return candidates; + } + } + + private async callExternalReranker( + query: string, + documents: RerankDocumentPayload[], + reranker: RerankerConfig + ): Promise { + const headers: Record = { + "Content-Type": "application/json", + }; + if (reranker.apiKey) { + headers.Authorization = `Bearer ${reranker.apiKey}`; + } + + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), reranker.timeoutMs); + try { + const response = await fetch(`${reranker.baseUrl}/rerank`, { + method: "POST", + headers, + body: JSON.stringify({ + model: reranker.model, + query, + documents: documents.map((document) => document.text), + top_n: documents.length, + return_documents: false, + }), + signal: controller.signal, + }); + + if (!response.ok) { + throw new Error(`Reranker API error: ${response.status} - ${await response.text()}`); + } + + const body = await response.json() as { + results?: Array<{ index?: number; relevance_score?: number }>; + }; + if (!Array.isArray(body.results)) { + throw new Error("Reranker API returned unexpected response format."); + } + + return body.results + .map((result) => { + const index = typeof result.index === "number" ? result.index : -1; + return documents[index]?.id; + }) + .filter((id): id is string => typeof id === "string"); + } catch (error) { + if (error instanceof Error && error.name === "AbortError") { + throw new Error(`Reranker request timed out after ${reranker.timeoutMs}ms`); + } + throw error; + } finally { + clearTimeout(timeout); + } + } + async initialize(): Promise { if (this.config.embeddingProvider === 'custom') { if (!this.config.customProvider) { @@ -2477,11 +2604,12 @@ export class Indexer { hybridWeight, prioritizeSourcePaths: sourceIntent, }); + const rerankedCombined = await this.rerankCandidatesWithApi(query, combined); const fusionMs = performance.now() - fusionStartTime; const rescued = promoteIdentifierMatches( query, - combined, + rerankedCombined, semanticCandidates, keywordCandidates, database, diff --git a/tests/retrieval-ranking.test.ts b/tests/retrieval-ranking.test.ts index 11e1900..dab710a 100644 --- a/tests/retrieval-ranking.test.ts +++ b/tests/retrieval-ranking.test.ts @@ -2,6 +2,7 @@ import { describe, expect, it } from "vitest"; import type { ChunkMetadata } from "../src/native/index.js"; import { + Indexer, extractFilePathHint, fuseResultsRrf, fuseResultsWeighted, @@ -11,6 +12,7 @@ import { stripFilePathHint, rerankResults, } from "../src/indexer/index.js"; +import { parseConfig } from "../src/config/schema.js"; type Candidate = { id: string; score: number; metadata: ChunkMetadata }; @@ -333,4 +335,90 @@ describe("retrieval ranking", () => { const query = "where is createSystem implementation in packages/react/src/styled-system/system.ts"; expect(stripFilePathHint(query)).toBe("where is createSystem implementation"); }); + + it("applies external reranker ordering when configured", async () => { + const config = parseConfig({ + embeddingProvider: "custom", + customProvider: { + baseUrl: "http://localhost:11434/v1", + model: "mock-embed", + dimensions: 8, + }, + reranker: { + enabled: true, + provider: "custom", + model: "mock-reranker", + baseUrl: "https://rerank.example/v1", + topN: 3, + }, + }); + const indexer = new Indexer("/repo", config); + + const fetchSpy = globalThis.fetch; + globalThis.fetch = (async (input) => { + if (String(input).includes("/rerank")) { + return new Response(JSON.stringify({ + results: [ + { index: 2, relevance_score: 0.99 }, + { index: 0, relevance_score: 0.72 }, + { index: 1, relevance_score: 0.4 }, + ], + }), { status: 200 }); + } + return new Response(JSON.stringify({ data: [{ embedding: Array.from({ length: 8 }, () => 0.1) }], usage: { total_tokens: 1 } }), { status: 200 }); + }) as typeof fetch; + + const candidates: Candidate[] = [ + { id: "first", score: 0.9, metadata: meta({ filePath: "/repo/src/first.ts", name: "firstThing", chunkType: "function" }) }, + { id: "second", score: 0.89, metadata: meta({ filePath: "/repo/src/second.ts", name: "secondThing", chunkType: "function" }) }, + { id: "third", score: 0.88, metadata: meta({ filePath: "/repo/src/third.ts", name: "thirdThing", chunkType: "function" }) }, + ]; + + const reranked = await (indexer as unknown as { + rerankCandidatesWithApi(query: string, items: Candidate[]): Promise; + }).rerankCandidatesWithApi("find third thing", candidates); + + expect(reranked.map((candidate) => candidate.id)).toEqual(["third", "first", "second"]); + globalThis.fetch = fetchSpy; + }); + + it("falls back to deterministic order when external reranker fails", async () => { + const config = parseConfig({ + embeddingProvider: "custom", + customProvider: { + baseUrl: "http://localhost:11434/v1", + model: "mock-embed", + dimensions: 8, + }, + reranker: { + enabled: true, + provider: "custom", + model: "mock-reranker", + baseUrl: "https://rerank.example/v1", + topN: 2, + }, + }); + const indexer = new Indexer("/repo", config); + + const fetchSpy = globalThis.fetch; + globalThis.fetch = (async (input) => { + if (String(input).includes("/rerank")) { + return new Response("boom", { status: 500 }); + } + return new Response(JSON.stringify({ data: [{ embedding: Array.from({ length: 8 }, () => 0.1) }], usage: { total_tokens: 1 } }), { status: 200 }); + }) as typeof fetch; + + const candidates: Candidate[] = [ + { id: "first", score: 0.9, metadata: meta({ filePath: "/repo/src/first.ts", name: "firstThing", chunkType: "function" }) }, + { id: "second", score: 0.89, metadata: meta({ filePath: "/repo/src/second.ts", name: "secondThing", chunkType: "function" }) }, + { id: "third", score: 0.88, metadata: meta({ filePath: "/repo/src/third.ts", name: "thirdThing", chunkType: "function" }) }, + ]; + + const reranked = await (indexer as unknown as { + rerankCandidatesWithApi(query: string, items: Candidate[]): Promise; + }).rerankCandidatesWithApi("find third thing", candidates); + + expect(reranked.map((candidate) => candidate.id)).toEqual(["first", "second", "third"]); + globalThis.fetch = fetchSpy; + }); }); From 8b044d932de24ca6fc35e412279ba4d1d3566f77 Mon Sep 17 00:00:00 2001 From: Helweg Date: Sat, 11 Apr 2026 21:16:33 +0200 Subject: [PATCH 03/20] feat: improve external reranker payload quality --- README.md | 44 ++++++++++---------------- src/indexer/index.ts | 55 ++++++++++++++++++++++----------- tests/retrieval-ranking.test.ts | 53 ++++++++++++++++++++++++++----- 3 files changed, 98 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index 5260db7..55085e5 100644 --- a/README.md +++ b/README.md @@ -532,18 +532,14 @@ Zero-config by default (uses `auto` mode). Customize in `.opencode/codebase-inde "rerankTopN": 20, // Deterministic rerank depth "contextLines": 0 // Extra lines before/after match }, - - // === Reranking API === "reranker": { - "enabled": true, // Enable API reranking - "baseUrl": "https://api.siliconflow.cn/v1", - "model": "BAAI/bge-reranker-v2-m3", - "apiKey": "{env:SILICONFLOW_API_KEY}", - "topN": 20, // Number of results to rerank - "timeoutMs": 30000 // Request timeout (ms) + "enabled": false, + "provider": "cohere", + "model": "rerank-v3.5", + "apiKey": "{env:RERANK_API_KEY}", + "topN": 15, + "timeoutMs": 10000 }, - - // === Debug === "debug": { "enabled": false, // Enable debug logging "logLevel": "info", // error | warn | info | debug @@ -604,23 +600,14 @@ String values in `codebase-index.json` can reference environment variables with | `rrfK` | `60` | RRF smoothing constant. Higher values flatten rank impact, lower values prioritize top-ranked candidates more strongly | | `rerankTopN` | `20` | Deterministic rerank depth cap. Applies lightweight name/path/chunk-type rerank to top-N only | | `contextLines` | `0` | Extra lines to include before/after each match | -| **reranker** | | | -| `reranker.enabled` | `false` | Enable API-based reranking | -| `reranker.baseUrl` | - | Rerank API endpoint URL | -| `reranker.model` | - | Reranking model name (e.g. `BAAI/bge-reranker-v2-m3`) | -| `reranker.apiKey` | - | API key for reranking service (use `{env:VAR}` for security) | -| `reranker.topN` | `20` | Number of top results to rerank via API | -| `reranker.timeoutMs` | `30000` | Rerank API request timeout in milliseconds | -| **customProvider** | | | -| `customProvider.baseUrl` | - | Base URL of OpenAI-compatible embeddings API (e.g. `https://api.siliconflow.cn/v1`) | -| `customProvider.model` | - | Model name (e.g. `BAAI/bge-m3`, `nomic-embed-text`) | -| `customProvider.dimensions` | - | Vector dimensions (e.g. `1024` for BGE-M3, `768` for nomic-embed-text) | -| `customProvider.apiKey` | - | API key (use `{env:VAR}` for security) | -| `customProvider.maxTokens` | `8192` | Max tokens per input text | -| `customProvider.timeoutMs` | `30000` | Request timeout in milliseconds | -| `customProvider.concurrency` | `3` | Max concurrent embedding requests | -| `customProvider.requestIntervalMs` | `1000` | Minimum delay between requests (ms). Set to `0` for local servers | -| `customProvider.maxBatchSize` | - | Max inputs per `/embeddings` request. Cap for servers with batch limits | +| **reranker** | | Optional second-stage model reranker for the top candidate pool | +| `enabled` | `false` | Turn external reranking on/off | +| `provider` | `"custom"` | Hosted shortcuts: `cohere`, `jina`, or `custom` | +| `model` | — | Reranker model name required when enabled | +| `baseUrl` | provider default | Override reranker endpoint base URL. `cohere` → `https://api.cohere.ai/v1`, `jina` → `https://api.jina.ai/v1` | +| `apiKey` | — | API key for hosted reranker providers | +| `topN` | `15` | Number of top candidates to send to the external reranker | +| `timeoutMs` | `10000` | Timeout for external rerank requests | | **debug** | | | | `enabled` | `false` | Enable debug logging and metrics collection | | `logLevel` | `"info"` | Log level: `error`, `warn`, `info`, `debug` | @@ -633,9 +620,10 @@ String values in `codebase-index.json` can reference environment variables with ### Retrieval ranking behavior -- `codebase_search` and `codebase_peek` use the hybrid path: semantic + keyword retrieval → fusion (`fusionStrategy`) → deterministic rerank (`rerankTopN`) → filtering. +- `codebase_search` and `codebase_peek` use the hybrid path: semantic + keyword retrieval → fusion (`fusionStrategy`) → deterministic rerank (`rerankTopN`) → optional external reranker (`reranker`) → filtering. - `find_similar` stays semantic-only: semantic retrieval + deterministic rerank only (no keyword retrieval, no RRF). - For compatibility rollbacks, set `search.fusionStrategy` to `"weighted"` to use the legacy weighted fusion path. +- When enabled, the external reranker sees path metadata plus a bounded on-disk code snippet for each candidate so it can distinguish real implementations from docs/tests more reliably. - Retrieval benchmark artifacts are separated by role: - baseline (versioned): `benchmarks/baselines/retrieval-baseline.json` - latest candidate run (generated): `benchmark-results/retrieval-candidate.json` diff --git a/src/indexer/index.ts b/src/indexer/index.ts index b1b2c31..bd66dd9 100644 --- a/src/indexer/index.ts +++ b/src/indexer/index.ts @@ -348,20 +348,6 @@ function splitNameTokens(name: string): Set { return tokens; } -function createRerankerDocumentText(candidate: RankedCandidate): string { - const parts = [ - `path: ${candidate.metadata.filePath}`, - `chunk_type: ${candidate.metadata.chunkType}`, - `language: ${candidate.metadata.language}`, - ]; - - if (candidate.metadata.name) { - parts.push(`name: ${candidate.metadata.name}`); - } - - return parts.join("\n"); -} - function chunkTypeBoost(chunkType: string): number { switch (chunkType) { case "function": @@ -1479,10 +1465,12 @@ export class Indexer { const topN = Math.min(reranker.topN, candidates.length); const head = candidates.slice(0, topN); const tail = candidates.slice(topN); - const documents = head.map((candidate) => ({ - id: candidate.id, - text: createRerankerDocumentText(candidate), - })); + const documents = await Promise.all( + head.map(async (candidate) => ({ + id: candidate.id, + text: await this.createRerankerDocumentText(candidate), + })) + ); try { const rankedIds = await this.callExternalReranker(query, documents, reranker); @@ -1575,6 +1563,37 @@ export class Indexer { } } + private async createRerankerDocumentText(candidate: RankedCandidate): Promise { + const parts = [ + `path: ${candidate.metadata.filePath}`, + `chunk_type: ${candidate.metadata.chunkType}`, + `language: ${candidate.metadata.language}`, + `lines: ${candidate.metadata.startLine}-${candidate.metadata.endLine}`, + ]; + + if (candidate.metadata.name) { + parts.push(`name: ${candidate.metadata.name}`); + } + + const intent = isLikelyImplementationPath(candidate.metadata.filePath) ? "implementation" : "doc_or_test"; + parts.push(`intent_hint: ${intent}`); + + try { + const fileContent = await fsPromises.readFile(candidate.metadata.filePath, "utf-8"); + const lines = fileContent.split("\n"); + const snippetStartLine = Math.max(1, candidate.metadata.startLine - 2); + const snippetEndLine = Math.min(lines.length, candidate.metadata.endLine + 2); + const snippet = lines.slice(snippetStartLine - 1, snippetEndLine).join("\n").trim(); + parts.push("snippet:"); + parts.push(snippet.length > 0 ? snippet : "[empty]"); + } catch { + parts.push("snippet:"); + parts.push("[unavailable]"); + } + + return parts.join("\n"); + } + async initialize(): Promise { if (this.config.embeddingProvider === 'custom') { if (!this.config.customProvider) { diff --git a/tests/retrieval-ranking.test.ts b/tests/retrieval-ranking.test.ts index dab710a..c286aad 100644 --- a/tests/retrieval-ranking.test.ts +++ b/tests/retrieval-ranking.test.ts @@ -1,4 +1,8 @@ -import { describe, expect, it } from "vitest"; +import * as fs from "fs"; +import * as os from "os"; +import * as path from "path"; + +import { afterEach, describe, expect, it } from "vitest"; import type { ChunkMetadata } from "../src/native/index.js"; import { @@ -16,6 +20,8 @@ import { parseConfig } from "../src/config/schema.js"; type Candidate = { id: string; score: number; metadata: ChunkMetadata }; +const tempDirs: string[] = []; + function meta(overrides: Partial): ChunkMetadata { return { filePath: "/repo/src/unknown.ts", @@ -28,7 +34,25 @@ function meta(overrides: Partial): ChunkMetadata { }; } +function createTempFile(relativePath: string, content: string): string { + const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "reranker-doc-")); + tempDirs.push(tempDir); + const filePath = path.join(tempDir, relativePath); + fs.mkdirSync(path.dirname(filePath), { recursive: true }); + fs.writeFileSync(filePath, content, "utf-8"); + return filePath; +} + describe("retrieval ranking", () => { + afterEach(() => { + while (tempDirs.length > 0) { + const dir = tempDirs.pop(); + if (dir) { + fs.rmSync(dir, { recursive: true, force: true }); + } + } + }); + it("fuses hybrid results using RRF rank ordering", () => { const semantic: Candidate[] = [ { id: "a", score: 0.91, metadata: meta({ filePath: "/repo/src/auth.ts", name: "validateAuth", chunkType: "function" }) }, @@ -354,9 +378,15 @@ describe("retrieval ranking", () => { }); const indexer = new Indexer("/repo", config); + const firstPath = createTempFile("src/first.ts", "export function firstThing() {\n return 'first';\n}\n"); + const secondPath = createTempFile("src/second.ts", "export function secondThing() {\n return 'second';\n}\n"); + const thirdPath = createTempFile("src/third.ts", "export function thirdThing() {\n return 'third';\n}\n"); + const fetchSpy = globalThis.fetch; - globalThis.fetch = (async (input) => { + let rerankBody: { documents?: string[] } | undefined; + globalThis.fetch = (async (input, init) => { if (String(input).includes("/rerank")) { + rerankBody = JSON.parse(String(init?.body ?? "{}")) as { documents?: string[] }; return new Response(JSON.stringify({ results: [ { index: 2, relevance_score: 0.99 }, @@ -369,9 +399,9 @@ describe("retrieval ranking", () => { }) as typeof fetch; const candidates: Candidate[] = [ - { id: "first", score: 0.9, metadata: meta({ filePath: "/repo/src/first.ts", name: "firstThing", chunkType: "function" }) }, - { id: "second", score: 0.89, metadata: meta({ filePath: "/repo/src/second.ts", name: "secondThing", chunkType: "function" }) }, - { id: "third", score: 0.88, metadata: meta({ filePath: "/repo/src/third.ts", name: "thirdThing", chunkType: "function" }) }, + { id: "first", score: 0.9, metadata: meta({ filePath: firstPath, name: "firstThing", chunkType: "function", startLine: 1, endLine: 3 }) }, + { id: "second", score: 0.89, metadata: meta({ filePath: secondPath, name: "secondThing", chunkType: "function", startLine: 1, endLine: 3 }) }, + { id: "third", score: 0.88, metadata: meta({ filePath: thirdPath, name: "thirdThing", chunkType: "function", startLine: 1, endLine: 3 }) }, ]; const reranked = await (indexer as unknown as { @@ -379,6 +409,9 @@ describe("retrieval ranking", () => { }).rerankCandidatesWithApi("find third thing", candidates); expect(reranked.map((candidate) => candidate.id)).toEqual(["third", "first", "second"]); + expect(rerankBody?.documents?.[0]).toContain("snippet:"); + expect(rerankBody?.documents?.[0]).toContain("export function firstThing()"); + expect(rerankBody?.documents?.[0]).toContain("intent_hint: implementation"); globalThis.fetch = fetchSpy; }); @@ -400,6 +433,10 @@ describe("retrieval ranking", () => { }); const indexer = new Indexer("/repo", config); + const firstPath = createTempFile("src/first.ts", "export function firstThing() {\n return 'first';\n}\n"); + const secondPath = createTempFile("src/second.ts", "export function secondThing() {\n return 'second';\n}\n"); + const thirdPath = createTempFile("src/third.ts", "export function thirdThing() {\n return 'third';\n}\n"); + const fetchSpy = globalThis.fetch; globalThis.fetch = (async (input) => { if (String(input).includes("/rerank")) { @@ -409,9 +446,9 @@ describe("retrieval ranking", () => { }) as typeof fetch; const candidates: Candidate[] = [ - { id: "first", score: 0.9, metadata: meta({ filePath: "/repo/src/first.ts", name: "firstThing", chunkType: "function" }) }, - { id: "second", score: 0.89, metadata: meta({ filePath: "/repo/src/second.ts", name: "secondThing", chunkType: "function" }) }, - { id: "third", score: 0.88, metadata: meta({ filePath: "/repo/src/third.ts", name: "thirdThing", chunkType: "function" }) }, + { id: "first", score: 0.9, metadata: meta({ filePath: firstPath, name: "firstThing", chunkType: "function", startLine: 1, endLine: 3 }) }, + { id: "second", score: 0.89, metadata: meta({ filePath: secondPath, name: "secondThing", chunkType: "function", startLine: 1, endLine: 3 }) }, + { id: "third", score: 0.88, metadata: meta({ filePath: thirdPath, name: "thirdThing", chunkType: "function", startLine: 1, endLine: 3 }) }, ]; const reranked = await (indexer as unknown as { From dfbf6e9a601d19e9f5c3923f3383904df22cb2cd Mon Sep 17 00:00:00 2001 From: Helweg Date: Sat, 11 Apr 2026 21:22:10 +0200 Subject: [PATCH 04/20] feat: guard external reranker intent ordering --- src/indexer/index.ts | 112 +++++++++++++++++++++----- tests/search-integration.test.ts | 132 +++++++++++++++++++++++++++++++ 2 files changed, 223 insertions(+), 21 deletions(-) diff --git a/src/indexer/index.ts b/src/indexer/index.ts index bd66dd9..6294716 100644 --- a/src/indexer/index.ts +++ b/src/indexer/index.ts @@ -163,6 +163,8 @@ interface RerankDocumentPayload { text: string; } +type ExternalRerankBand = "implementation" | "documentation" | "test" | "other"; + interface HybridRankOptions { fusionStrategy: "weighted" | "rrf"; rrfK: number; @@ -388,6 +390,42 @@ function isLikelyImplementationPath(filePath: string): boolean { return true; } +function isDocumentationPath(filePath: string): boolean { + const lowered = filePath.toLowerCase(); + const ext = lowered.split(".").pop() ?? ""; + return lowered.includes("readme") || ["md", "mdx", "rst", "adoc", "txt"].includes(ext); +} + +function classifyExternalRerankBand( + candidate: RankedCandidate, + preferSourcePaths: boolean, + docIntent: boolean +): ExternalRerankBand { + const isDocOrTest = isTestOrDocPath(candidate.metadata.filePath); + const isDocumentation = isDocumentationPath(candidate.metadata.filePath); + const isImplementation = isLikelyImplementationPath(candidate.metadata.filePath) && + isImplementationChunkType(candidate.metadata.chunkType); + + if (preferSourcePaths) { + if (isImplementation) return "implementation"; + if (isDocumentation) return "documentation"; + if (isDocOrTest) return "test"; + return "other"; + } + + if (docIntent) { + if (isDocumentation) return "documentation"; + if (isImplementation) return "implementation"; + if (isDocOrTest) return "test"; + return "other"; + } + + if (isImplementation) return "implementation"; + if (isDocumentation) return "documentation"; + if (isDocOrTest) return "test"; + return "other"; +} + function classifyQueryIntent(tokens: string[]): "source" | "doc_test" { const sourceIntentHits = tokens.filter((t) => SOURCE_INTENT_HINTS.has(t)).length; const docTestIntentHits = tokens.filter((t) => DOC_TEST_INTENT_HINTS.has(t)).length; @@ -1462,39 +1500,71 @@ export class Indexer { return candidates; } + const queryTokens = Array.from(tokenizeTextForRanking(query)); + const preferSourcePaths = classifyQueryIntentRaw(query) === "source"; + const docIntent = classifyDocIntent(queryTokens) === "docs"; + const topN = Math.min(reranker.topN, candidates.length); const head = candidates.slice(0, topN); const tail = candidates.slice(topN); - const documents = await Promise.all( - head.map(async (candidate) => ({ - id: candidate.id, - text: await this.createRerankerDocumentText(candidate), - })) - ); + const grouped = new Map([ + ["implementation", []], + ["documentation", []], + ["test", []], + ["other", []], + ]); - try { - const rankedIds = await this.callExternalReranker(query, documents, reranker); - if (rankedIds.length === 0) { - return candidates; - } + for (const candidate of head) { + const band = classifyExternalRerankBand(candidate, preferSourcePaths, docIntent); + grouped.get(band)?.push(candidate); + } + + const orderedBands: ExternalRerankBand[] = preferSourcePaths + ? ["implementation", "other", "documentation", "test"] + : docIntent + ? ["documentation", "implementation", "other", "test"] + : ["implementation", "other", "documentation", "test"]; - const order = new Map(rankedIds.map((id, index) => [id, index])); - const rerankedHead = [...head].sort((a, b) => { - const aRank = order.get(a.id) ?? Number.MAX_SAFE_INTEGER; - const bRank = order.get(b.id) ?? Number.MAX_SAFE_INTEGER; - if (aRank !== bRank) { - return aRank - bRank; + try { + const rerankedHead: RankedCandidate[] = []; + for (const band of orderedBands) { + const bandCandidates = grouped.get(band) ?? []; + if (bandCandidates.length <= 1) { + rerankedHead.push(...bandCandidates); + continue; } - if (b.score !== a.score) { - return b.score - a.score; + + const documents = await Promise.all( + bandCandidates.map(async (candidate) => ({ + id: candidate.id, + text: await this.createRerankerDocumentText(candidate), + })) + ); + const rankedIds = await this.callExternalReranker(query, documents, reranker); + if (rankedIds.length === 0) { + rerankedHead.push(...bandCandidates); + continue; } - return a.id.localeCompare(b.id); - }); + + const order = new Map(rankedIds.map((id, index) => [id, index])); + rerankedHead.push(...[...bandCandidates].sort((a, b) => { + const aRank = order.get(a.id) ?? Number.MAX_SAFE_INTEGER; + const bRank = order.get(b.id) ?? Number.MAX_SAFE_INTEGER; + if (aRank !== bRank) { + return aRank - bRank; + } + if (b.score !== a.score) { + return b.score - a.score; + } + return a.id.localeCompare(b.id); + })); + } this.logger.search("debug", "Applied external reranker", { provider: reranker.provider, model: reranker.model, candidateCount: head.length, + bands: orderedBands, }); return [...rerankedHead, ...tail]; diff --git a/tests/search-integration.test.ts b/tests/search-integration.test.ts index df69ffd..b2bbc72 100644 --- a/tests/search-integration.test.ts +++ b/tests/search-integration.test.ts @@ -214,4 +214,136 @@ export function rerankResults(query: string) { return rankHybridResults(query); expect(withOverride[0]?.filePath).toContain("/app/indexer/index.ts"); expect(withOverride[0]?.filePath).not.toContain("/README.md"); }); + + it("keeps implementation results ahead of docs even when external reranker prefers docs for implementation intent", async () => { + fetchSpy.mockImplementation(async (url, init) => { + if (String(url).includes("/rerank")) { + return new Response(JSON.stringify({ + results: [ + { index: 0, relevance_score: 0.99 }, + { index: 1, relevance_score: 0.5 }, + ], + }), { status: 200 }); + } + + const body = JSON.parse(String(init?.body ?? "{}")) as { input?: string[] }; + const texts = Array.isArray(body.input) ? body.input : []; + const data = texts.map((text) => { + let seed = 0; + for (const ch of text) { + seed = (seed * 31 + ch.charCodeAt(0)) % 1000; + } + const embedding = Array.from({ length: 8 }, (_, idx) => ((seed + idx * 17) % 997) / 997); + return { embedding }; + }); + + return new Response(JSON.stringify({ + data, + usage: { total_tokens: Math.max(1, texts.length * 8) }, + }), { status: 200 }); + }); + + const config = parseConfig({ + embeddingProvider: "custom", + customProvider: { + baseUrl: "http://localhost:11434/v1", + model: "mock-embedding-model", + dimensions: 8, + }, + reranker: { + enabled: true, + provider: "custom", + model: "mock-reranker", + baseUrl: "https://rerank.example/v1", + topN: 10, + }, + indexing: { + watchFiles: false, + }, + search: { + maxResults: 10, + minScore: 0, + fusionStrategy: "rrf", + rrfK: 60, + rerankTopN: 20, + }, + }); + + const indexer = new Indexer(tempDir, config); + await indexer.index(); + + const results = await indexer.search("where is rankHybridResults implementation", 5, { + metadataOnly: true, + filterByBranch: false, + }); + + expect(results[0]?.filePath).toContain("/app/indexer/index.ts"); + expect(results[0]?.filePath).not.toContain("/README.md"); + }); + + it("keeps documentation results ahead of code when external reranker prefers code for doc intent", async () => { + fetchSpy.mockImplementation(async (url, init) => { + if (String(url).includes("/rerank")) { + return new Response(JSON.stringify({ + results: [ + { index: 1, relevance_score: 0.99 }, + { index: 0, relevance_score: 0.4 }, + ], + }), { status: 200 }); + } + + const body = JSON.parse(String(init?.body ?? "{}")) as { input?: string[] }; + const texts = Array.isArray(body.input) ? body.input : []; + const data = texts.map((text) => { + let seed = 0; + for (const ch of text) { + seed = (seed * 31 + ch.charCodeAt(0)) % 1000; + } + const embedding = Array.from({ length: 8 }, (_, idx) => ((seed + idx * 17) % 997) / 997); + return { embedding }; + }); + + return new Response(JSON.stringify({ + data, + usage: { total_tokens: Math.max(1, texts.length * 8) }, + }), { status: 200 }); + }); + + const config = parseConfig({ + embeddingProvider: "custom", + customProvider: { + baseUrl: "http://localhost:11434/v1", + model: "mock-embedding-model", + dimensions: 8, + }, + reranker: { + enabled: true, + provider: "custom", + model: "mock-reranker", + baseUrl: "https://rerank.example/v1", + topN: 10, + }, + indexing: { + watchFiles: false, + }, + search: { + maxResults: 10, + minScore: 0, + fusionStrategy: "rrf", + rrfK: 60, + rerankTopN: 20, + }, + }); + + const indexer = new Indexer(tempDir, config); + await indexer.index(); + + const results = await indexer.search("where is rankHybridResults documentation", 5, { + metadataOnly: true, + filterByBranch: false, + }); + + expect(results[0]?.filePath).toContain("/README.md"); + expect(results[0]?.filePath).not.toContain("/app/indexer/index.ts"); + }); }); From 6fcfcf50966fd65d2372e4ca4b46d0d769057469 Mon Sep 17 00:00:00 2001 From: Helweg Date: Sat, 11 Apr 2026 21:25:10 +0200 Subject: [PATCH 05/20] feat: skip external reranking for exact definitions --- src/indexer/index.ts | 18 +++++++++-- tests/retrieval-ranking.test.ts | 55 +++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 3 deletions(-) diff --git a/src/indexer/index.ts b/src/indexer/index.ts index 6294716..9b096fd 100644 --- a/src/indexer/index.ts +++ b/src/indexer/index.ts @@ -1493,13 +1493,21 @@ export class Indexer { private async rerankCandidatesWithApi( query: string, - candidates: RankedCandidate[] + candidates: RankedCandidate[], + options?: { + definitionIntent?: boolean; + hasIdentifierHints?: boolean; + } ): Promise { const reranker = this.config.reranker; if (!reranker || !reranker.enabled || candidates.length <= 1) { return candidates; } + if (options?.definitionIntent === true || options?.hasIdentifierHints === true) { + return candidates; + } + const queryTokens = Array.from(tokenizeTextForRanking(query)); const preferSourcePaths = classifyQueryIntentRaw(query) === "source"; const docIntent = classifyDocIntent(queryTokens) === "docs"; @@ -2620,6 +2628,7 @@ export class Indexer { const rerankTopN = this.config.search.rerankTopN; const filterByBranch = options?.filterByBranch ?? true; const sourceIntent = options?.definitionIntent === true || classifyQueryIntentRaw(query) === "source"; + const identifierHints = extractIdentifierHints(query); this.logger.search("debug", "Starting search", { query, @@ -2693,7 +2702,10 @@ export class Indexer { hybridWeight, prioritizeSourcePaths: sourceIntent, }); - const rerankedCombined = await this.rerankCandidatesWithApi(query, combined); + const rerankedCombined = await this.rerankCandidatesWithApi(query, combined, { + definitionIntent: options?.definitionIntent === true, + hasIdentifierHints: identifierHints.length > 0, + }); const fusionMs = performance.now() - fusionStartTime; const rescued = promoteIdentifierMatches( @@ -2734,7 +2746,7 @@ export class Indexer { const prePrimaryLane = mergeTieredResults(deterministicIdentifierLane, identifierLane, maxResults * 4); const primaryLane = mergeTieredResults(prePrimaryLane, symbolLane, maxResults * 4); const tiered = mergeTieredResults(primaryLane, rescued, maxResults * 4); - const hasCodeHints = extractCodeTermHints(query).length > 0 || extractIdentifierHints(query).length > 0; + const hasCodeHints = extractCodeTermHints(query).length > 0 || identifierHints.length > 0; const baseFiltered = tiered.filter((r) => { if (r.score < this.config.search.minScore) return false; diff --git a/tests/retrieval-ranking.test.ts b/tests/retrieval-ranking.test.ts index c286aad..5bebc52 100644 --- a/tests/retrieval-ranking.test.ts +++ b/tests/retrieval-ranking.test.ts @@ -458,4 +458,59 @@ describe("retrieval ranking", () => { expect(reranked.map((candidate) => candidate.id)).toEqual(["first", "second", "third"]); globalThis.fetch = fetchSpy; }); + + it("skips external reranker for definition-intent queries with identifier hints", async () => { + const config = parseConfig({ + embeddingProvider: "custom", + customProvider: { + baseUrl: "http://localhost:11434/v1", + model: "mock-embed", + dimensions: 8, + }, + reranker: { + enabled: true, + provider: "custom", + model: "mock-reranker", + baseUrl: "https://rerank.example/v1", + topN: 3, + }, + }); + const indexer = new Indexer("/repo", config); + + const fetchSpy = globalThis.fetch; + let rerankCalled = false; + globalThis.fetch = (async (input) => { + if (String(input).includes("/rerank")) { + rerankCalled = true; + return new Response(JSON.stringify({ + results: [ + { index: 2, relevance_score: 0.99 }, + { index: 0, relevance_score: 0.72 }, + { index: 1, relevance_score: 0.4 }, + ], + }), { status: 200 }); + } + return new Response(JSON.stringify({ data: [{ embedding: Array.from({ length: 8 }, () => 0.1) }], usage: { total_tokens: 1 } }), { status: 200 }); + }) as typeof fetch; + + const candidates: Candidate[] = [ + { id: "first", score: 0.9, metadata: meta({ filePath: "/repo/src/first.ts", name: "rankHybridResults", chunkType: "function", startLine: 1, endLine: 3 }) }, + { id: "second", score: 0.89, metadata: meta({ filePath: "/repo/src/second.ts", name: "otherThing", chunkType: "function", startLine: 1, endLine: 3 }) }, + { id: "third", score: 0.88, metadata: meta({ filePath: "/repo/README.md", name: "docs", chunkType: "other", startLine: 1, endLine: 3 }) }, + ]; + + const reranked = await (indexer as unknown as { + rerankCandidatesWithApi( + query: string, + items: Candidate[], + options?: { definitionIntent?: boolean; hasIdentifierHints?: boolean } + ): Promise; + }).rerankCandidatesWithApi("where is rankHybridResults implementation", candidates, { + hasIdentifierHints: true, + }); + + expect(rerankCalled).toBe(false); + expect(reranked.map((candidate) => candidate.id)).toEqual(["first", "second", "third"]); + globalThis.fetch = fetchSpy; + }); }); From eaac1e14c390b4c4f1e75ca6e14f18c93a181b48 Mon Sep 17 00:00:00 2001 From: Helweg Date: Sat, 11 Apr 2026 21:28:38 +0200 Subject: [PATCH 06/20] feat: diversify exploratory rerank results --- src/indexer/index.ts | 41 ++++++++++++++++++++++++++++++++- tests/retrieval-ranking.test.ts | 22 ++++++++++++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/src/indexer/index.ts b/src/indexer/index.ts index 9b096fd..7266c90 100644 --- a/src/indexer/index.ts +++ b/src/indexer/index.ts @@ -887,8 +887,47 @@ export function rerankResults( }); } + const shouldDiversify = !(preferSourcePaths && identifierHints.length > 0); + const diversifiedHead = shouldDiversify ? diversifyRerankedHead(head) : head; + const tail = candidates.slice(topN); - return [...head.map((entry) => entry.candidate), ...tail]; + return [...diversifiedHead.map((entry) => entry.candidate), ...tail]; +} + +function diversifyRerankedHead(head: T[]): T[] { + if (head.length <= 2) { + return head; + } + + const seenFiles = new Set(); + const firstPass: T[] = []; + const remainder: T[] = []; + + for (const entry of head) { + const filePath = entry.candidate.metadata.filePath; + if (!seenFiles.has(filePath)) { + seenFiles.add(filePath); + firstPass.push(entry); + } else { + remainder.push(entry); + } + } + + if (remainder.length === 0) { + return head; + } + + return [...firstPass, ...remainder].sort((a, b) => { + const aPrimary = firstPass.includes(a) ? 1 : 0; + const bPrimary = firstPass.includes(b) ? 1 : 0; + if (aPrimary !== bPrimary) { + return bPrimary - aPrimary; + } + return a.originalIndex - b.originalIndex; + }); } export function rankHybridResults( diff --git a/tests/retrieval-ranking.test.ts b/tests/retrieval-ranking.test.ts index 5bebc52..14e9cb7 100644 --- a/tests/retrieval-ranking.test.ts +++ b/tests/retrieval-ranking.test.ts @@ -106,6 +106,28 @@ describe("retrieval ranking", () => { expect(rerankedAgain.map(r => r.id)).toEqual(["exactName", "pathOverlap", "generic"]); }); + it("diversifies exploratory queries to avoid same-file duplicates dominating top results", () => { + const candidates: Candidate[] = [ + { id: "fileA-1", score: 0.96, metadata: meta({ filePath: "/repo/src/auth.ts", name: "validateAuth", chunkType: "function" }) }, + { id: "fileA-2", score: 0.95, metadata: meta({ filePath: "/repo/src/auth.ts", name: "refreshAuth", chunkType: "function" }) }, + { id: "fileB-1", score: 0.94, metadata: meta({ filePath: "/repo/src/session.ts", name: "loadSession", chunkType: "function" }) }, + ]; + + const reranked = rerankResults("auth flow", candidates, 10); + expect(reranked.map((candidate) => candidate.id).slice(0, 2)).toEqual(["fileA-1", "fileB-1"]); + }); + + it("does not diversify away exact-definition ranking for identifier queries", () => { + const candidates: Candidate[] = [ + { id: "target", score: 0.96, metadata: meta({ filePath: "/repo/src/auth.ts", name: "rankHybridResults", chunkType: "function" }) }, + { id: "same-file-secondary", score: 0.95, metadata: meta({ filePath: "/repo/src/auth.ts", name: "rankHybridResultsHelper", chunkType: "function" }) }, + { id: "other-file", score: 0.94, metadata: meta({ filePath: "/repo/src/session.ts", name: "loadSession", chunkType: "function" }) }, + ]; + + const reranked = rerankResults("where is rankHybridResults implementation", candidates, 10); + expect(reranked.map((candidate) => candidate.id).slice(0, 2)).toEqual(["target", "same-file-secondary"]); + }); + it("applies hybrid ranking path for search and semantic-only rerank for findSimilar", () => { const semantic: Candidate[] = [ { id: "s1", score: 0.95, metadata: meta({ filePath: "/repo/src/auth.ts", name: "auth", chunkType: "function" }) }, From fd3213b06162c08196e5864384984a1eb80c7bee Mon Sep 17 00:00:00 2001 From: Helweg Date: Sat, 11 Apr 2026 21:35:53 +0200 Subject: [PATCH 07/20] feat: diversify external reranker output --- src/indexer/index.ts | 28 +++++++++++++++-- tests/retrieval-ranking.test.ts | 54 +++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 2 deletions(-) diff --git a/src/indexer/index.ts b/src/indexer/index.ts index 7266c90..1550bcd 100644 --- a/src/indexer/index.ts +++ b/src/indexer/index.ts @@ -930,6 +930,28 @@ function diversifyRerankedHead(); + const primary: RankedCandidate[] = []; + const remainder: RankedCandidate[] = []; + + for (const candidate of candidates) { + const filePath = candidate.metadata.filePath; + if (!seenFiles.has(filePath)) { + seenFiles.add(filePath); + primary.push(candidate); + } else { + remainder.push(candidate); + } + } + + return [...primary, ...remainder]; +} + export function rankHybridResults( query: string, semanticResults: RankedCandidate[], @@ -1594,7 +1616,7 @@ export class Indexer { } const order = new Map(rankedIds.map((id, index) => [id, index])); - rerankedHead.push(...[...bandCandidates].sort((a, b) => { + const bandReranked = [...bandCandidates].sort((a, b) => { const aRank = order.get(a.id) ?? Number.MAX_SAFE_INTEGER; const bRank = order.get(b.id) ?? Number.MAX_SAFE_INTEGER; if (aRank !== bRank) { @@ -1604,7 +1626,9 @@ export class Indexer { return b.score - a.score; } return a.id.localeCompare(b.id); - })); + }); + const shouldDiversifyBand = !options?.hasIdentifierHints; + rerankedHead.push(...diversifyCandidatesByFile(bandReranked, shouldDiversifyBand)); } this.logger.search("debug", "Applied external reranker", { diff --git a/tests/retrieval-ranking.test.ts b/tests/retrieval-ranking.test.ts index 14e9cb7..7a50717 100644 --- a/tests/retrieval-ranking.test.ts +++ b/tests/retrieval-ranking.test.ts @@ -535,4 +535,58 @@ describe("retrieval ranking", () => { expect(reranked.map((candidate) => candidate.id)).toEqual(["first", "second", "third"]); globalThis.fetch = fetchSpy; }); + + it("diversifies external reranker output for exploratory queries", async () => { + const config = parseConfig({ + embeddingProvider: "custom", + customProvider: { + baseUrl: "http://localhost:11434/v1", + model: "mock-embed", + dimensions: 8, + }, + reranker: { + enabled: true, + provider: "custom", + model: "mock-reranker", + baseUrl: "https://rerank.example/v1", + topN: 3, + }, + }); + const indexer = new Indexer("/repo", config); + + const fileA1 = createTempFile("src/auth.ts", "export function validateAuth() {\n return true;\n}\n"); + const fileA2 = fileA1; + const fileB = createTempFile("src/session.ts", "export function loadSession() {\n return 'session';\n}\n"); + + const fetchSpy = globalThis.fetch; + globalThis.fetch = (async (input) => { + if (String(input).includes("/rerank")) { + return new Response(JSON.stringify({ + results: [ + { index: 0, relevance_score: 0.99 }, + { index: 1, relevance_score: 0.98 }, + { index: 2, relevance_score: 0.4 }, + ], + }), { status: 200 }); + } + return new Response(JSON.stringify({ data: [{ embedding: Array.from({ length: 8 }, () => 0.1) }], usage: { total_tokens: 1 } }), { status: 200 }); + }) as typeof fetch; + + const candidates: Candidate[] = [ + { id: "fileA-1", score: 0.95, metadata: meta({ filePath: fileA1, name: "validateAuth", chunkType: "function", startLine: 1, endLine: 3 }) }, + { id: "fileA-2", score: 0.94, metadata: meta({ filePath: fileA2, name: "refreshAuth", chunkType: "function", startLine: 1, endLine: 3 }) }, + { id: "fileB", score: 0.93, metadata: meta({ filePath: fileB, name: "loadSession", chunkType: "function", startLine: 1, endLine: 3 }) }, + ]; + + const reranked = await (indexer as unknown as { + rerankCandidatesWithApi( + query: string, + items: Candidate[], + options?: { definitionIntent?: boolean; hasIdentifierHints?: boolean } + ): Promise; + }).rerankCandidatesWithApi("auth flow", candidates); + + expect(reranked.map((candidate) => candidate.id).slice(0, 2)).toEqual(["fileA-1", "fileB"]); + globalThis.fetch = fetchSpy; + }); }); From d59b4699361a50f46a8c253fb8eb9c1baf5e9577 Mon Sep 17 00:00:00 2001 From: Helweg Date: Sat, 11 Apr 2026 21:39:07 +0200 Subject: [PATCH 08/20] feat: suppress duplicate rerank results by symbol --- src/indexer/index.ts | 101 ++++++++++++++++++++------------ tests/retrieval-ranking.test.ts | 63 ++++++++++++++++++++ 2 files changed, 126 insertions(+), 38 deletions(-) diff --git a/src/indexer/index.ts b/src/indexer/index.ts index 1550bcd..29ea48f 100644 --- a/src/indexer/index.ts +++ b/src/indexer/index.ts @@ -888,70 +888,95 @@ export function rerankResults( } const shouldDiversify = !(preferSourcePaths && identifierHints.length > 0); - const diversifiedHead = shouldDiversify ? diversifyRerankedHead(head) : head; + const diversifiedHead = diversifyEntriesByFileAndSymbol(head, (entry) => entry.candidate, shouldDiversify); const tail = candidates.slice(topN); return [...diversifiedHead.map((entry) => entry.candidate), ...tail]; } -function diversifyRerankedHead(head: T[]): T[] { - if (head.length <= 2) { - return head; +function diversifyEntriesByFileAndSymbol( + entries: T[], + getCandidate: (entry: T) => RankedCandidate, + enabled: boolean +): T[] { + if (!enabled || entries.length <= 2) { + return entries; } - const seenFiles = new Set(); - const firstPass: T[] = []; - const remainder: T[] = []; + const groups = new Map(); + const groupOrder: string[] = []; - for (const entry of head) { - const filePath = entry.candidate.metadata.filePath; - if (!seenFiles.has(filePath)) { - seenFiles.add(filePath); - firstPass.push(entry); - } else { - remainder.push(entry); + for (const entry of entries) { + const candidate = getCandidate(entry); + const filePath = candidate.metadata.filePath; + if (!groups.has(filePath)) { + groups.set(filePath, []); + groupOrder.push(filePath); } + groups.get(filePath)?.push(entry); } - if (remainder.length === 0) { - return head; - } + const diversifiedGroups = groupOrder.map((filePath) => { + const group = groups.get(filePath) ?? []; + return diversifyGroupBySymbol(group, getCandidate); + }); - return [...firstPass, ...remainder].sort((a, b) => { - const aPrimary = firstPass.includes(a) ? 1 : 0; - const bPrimary = firstPass.includes(b) ? 1 : 0; - if (aPrimary !== bPrimary) { - return bPrimary - aPrimary; + const result: T[] = []; + let added = true; + let round = 0; + while (added) { + added = false; + for (const group of diversifiedGroups) { + const entry = group[round]; + if (entry !== undefined) { + result.push(entry); + added = true; + } } - return a.originalIndex - b.originalIndex; - }); + round += 1; + } + + return result; } function diversifyCandidatesByFile(candidates: RankedCandidate[], enabled: boolean): RankedCandidate[] { - if (!enabled || candidates.length <= 2) { - return candidates; + return diversifyEntriesByFileAndSymbol(candidates, (candidate) => candidate, enabled); +} + +function diversifyGroupBySymbol( + entries: T[], + getCandidate: (entry: T) => RankedCandidate +): T[] { + if (entries.length <= 2) { + return entries; } - const seenFiles = new Set(); - const primary: RankedCandidate[] = []; - const remainder: RankedCandidate[] = []; + const seenKeys = new Set(); + const primary: T[] = []; + const remainder: T[] = []; - for (const candidate of candidates) { - const filePath = candidate.metadata.filePath; - if (!seenFiles.has(filePath)) { - seenFiles.add(filePath); - primary.push(candidate); + for (const entry of entries) { + const key = buildDiversityKey(getCandidate(entry).metadata); + if (!seenKeys.has(key)) { + seenKeys.add(key); + primary.push(entry); } else { - remainder.push(candidate); + remainder.push(entry); } } return [...primary, ...remainder]; } +function buildDiversityKey(metadata: ChunkMetadata): string { + const normalizedPath = metadata.filePath.toLowerCase(); + const normalizedName = (metadata.name ?? "").trim().toLowerCase(); + if (normalizedName.length > 0) { + return `${normalizedPath}#${normalizedName}`; + } + return normalizedPath; +} + export function rankHybridResults( query: string, semanticResults: RankedCandidate[], diff --git a/tests/retrieval-ranking.test.ts b/tests/retrieval-ranking.test.ts index 7a50717..82ab623 100644 --- a/tests/retrieval-ranking.test.ts +++ b/tests/retrieval-ranking.test.ts @@ -117,6 +117,17 @@ describe("retrieval ranking", () => { expect(reranked.map((candidate) => candidate.id).slice(0, 2)).toEqual(["fileA-1", "fileB-1"]); }); + it("treats same-symbol duplicates as lower priority before distinct symbols", () => { + const candidates: Candidate[] = [ + { id: "same-symbol-1", score: 0.96, metadata: meta({ filePath: "/repo/src/auth.ts", name: "validateAuth", chunkType: "function" }) }, + { id: "same-symbol-2", score: 0.95, metadata: meta({ filePath: "/repo/src/auth.ts", name: "validateAuth", chunkType: "function" }) }, + { id: "different-symbol", score: 0.94, metadata: meta({ filePath: "/repo/src/auth.ts", name: "refreshAuth", chunkType: "function" }) }, + ]; + + const reranked = rerankResults("auth flow", candidates, 10); + expect(reranked.map((candidate) => candidate.id).slice(0, 2)).toEqual(["same-symbol-1", "different-symbol"]); + }); + it("does not diversify away exact-definition ranking for identifier queries", () => { const candidates: Candidate[] = [ { id: "target", score: 0.96, metadata: meta({ filePath: "/repo/src/auth.ts", name: "rankHybridResults", chunkType: "function" }) }, @@ -589,4 +600,56 @@ describe("retrieval ranking", () => { expect(reranked.map((candidate) => candidate.id).slice(0, 2)).toEqual(["fileA-1", "fileB"]); globalThis.fetch = fetchSpy; }); + + it("diversifies external reranker duplicates by symbol before repeating the same symbol", async () => { + const config = parseConfig({ + embeddingProvider: "custom", + customProvider: { + baseUrl: "http://localhost:11434/v1", + model: "mock-embed", + dimensions: 8, + }, + reranker: { + enabled: true, + provider: "custom", + model: "mock-reranker", + baseUrl: "https://rerank.example/v1", + topN: 3, + }, + }); + const indexer = new Indexer("/repo", config); + + const authFile = createTempFile("src/auth.ts", "export function validateAuth() {\n return true;\n}\nexport function refreshAuth() {\n return false;\n}\n"); + + const fetchSpy = globalThis.fetch; + globalThis.fetch = (async (input) => { + if (String(input).includes("/rerank")) { + return new Response(JSON.stringify({ + results: [ + { index: 0, relevance_score: 0.99 }, + { index: 1, relevance_score: 0.98 }, + { index: 2, relevance_score: 0.4 }, + ], + }), { status: 200 }); + } + return new Response(JSON.stringify({ data: [{ embedding: Array.from({ length: 8 }, () => 0.1) }], usage: { total_tokens: 1 } }), { status: 200 }); + }) as typeof fetch; + + const candidates: Candidate[] = [ + { id: "same-symbol-1", score: 0.95, metadata: meta({ filePath: authFile, name: "validateAuth", chunkType: "function", startLine: 1, endLine: 3 }) }, + { id: "same-symbol-2", score: 0.94, metadata: meta({ filePath: authFile, name: "validateAuth", chunkType: "function", startLine: 1, endLine: 3 }) }, + { id: "different-symbol", score: 0.93, metadata: meta({ filePath: authFile, name: "refreshAuth", chunkType: "function", startLine: 4, endLine: 6 }) }, + ]; + + const reranked = await (indexer as unknown as { + rerankCandidatesWithApi( + query: string, + items: Candidate[], + options?: { definitionIntent?: boolean; hasIdentifierHints?: boolean } + ): Promise; + }).rerankCandidatesWithApi("auth flow", candidates); + + expect(reranked.map((candidate) => candidate.id).slice(0, 2)).toEqual(["same-symbol-1", "different-symbol"]); + globalThis.fetch = fetchSpy; + }); }); From c6c6bd5c60e3e6f0a375fb9051ef11a33deb2652 Mon Sep 17 00:00:00 2001 From: Helweg Date: Sat, 11 Apr 2026 21:43:19 +0200 Subject: [PATCH 09/20] test: add reranker diversity benchmark coverage --- benchmarks/baselines/retrieval-baseline.json | 1 + tests/retrieval-benchmark.test.ts | 42 +++++++++++++++++++- 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/benchmarks/baselines/retrieval-baseline.json b/benchmarks/baselines/retrieval-baseline.json index 726d8df..2e83428 100644 --- a/benchmarks/baselines/retrieval-baseline.json +++ b/benchmarks/baselines/retrieval-baseline.json @@ -2,6 +2,7 @@ "generatedAt": "2026-03-13T14:21:43.213Z", "queryCount": 3, "hitAt5": 1, + "distinctTop3Ratio": 0.9166666666666666, "medianMs": 0.010916000000008808, "p95Ms": 0.024666000000024724 } diff --git a/tests/retrieval-benchmark.test.ts b/tests/retrieval-benchmark.test.ts index 4a9ef6a..45fc2c4 100644 --- a/tests/retrieval-benchmark.test.ts +++ b/tests/retrieval-benchmark.test.ts @@ -23,6 +23,7 @@ interface BenchmarkArtifact { generatedAt: string; queryCount: number; hitAt5: number; + distinctTop3Ratio: number; medianMs: number; p95Ms: number; } @@ -80,6 +81,26 @@ function computeHitAt5(queries: BenchmarkQuery[]): number { return queries.length === 0 ? 0 : hits / queries.length; } +function computeDistinctTop3Ratio(queries: BenchmarkQuery[]): number { + if (queries.length === 0) return 0; + + let totalRatio = 0; + for (const q of queries) { + const ranked = rankHybridResults(q.query, q.semantic, q.keyword, { + fusionStrategy: "rrf", + rrfK: 60, + rerankTopN: 20, + limit: 10, + hybridWeight: 0.5, + }); + const top3 = ranked.slice(0, 3); + const distinctFiles = new Set(top3.map((r) => r.metadata.filePath)).size; + totalRatio += distinctFiles / Math.max(1, top3.length); + } + + return totalRatio / queries.length; +} + function runLatency(queries: BenchmarkQuery[]): { medianMs: number; p95Ms: number } { const allSamples: LatencySample[] = []; const batchP95: number[] = []; @@ -187,16 +208,18 @@ function loadBaseline(): BenchmarkArtifact { const parsed = JSON.parse(raw) as Partial; if ( typeof parsed.hitAt5 !== "number" || + typeof parsed.distinctTop3Ratio !== "number" || typeof parsed.medianMs !== "number" || typeof parsed.p95Ms !== "number" ) { - throw new Error("retrieval-baseline.json is invalid: expected numeric hitAt5, medianMs, and p95Ms"); + throw new Error("retrieval-baseline.json is invalid: expected numeric hitAt5, distinctTop3Ratio, medianMs, and p95Ms"); } return { generatedAt: typeof parsed.generatedAt === "string" ? parsed.generatedAt : new Date(0).toISOString(), queryCount: typeof parsed.queryCount === "number" ? parsed.queryCount : 0, hitAt5: parsed.hitAt5, + distinctTop3Ratio: parsed.distinctTop3Ratio, medianMs: parsed.medianMs, p95Ms: parsed.p95Ms, }; @@ -242,15 +265,31 @@ describe("retrieval benchmark", () => { { id: "k-doc", score: 10, metadata: meta("/repo/README.md", "find similar", "other") }, ], }, + { + query: "auth flow exploration", + expectedTop5: ["/repo/src/auth.ts", "/repo/src/session.ts"], + semantic: [ + { id: "s-auth-1", score: 0.96, metadata: meta("/repo/src/auth.ts", "validateAuth") }, + { id: "s-auth-2", score: 0.95, metadata: meta("/repo/src/auth.ts", "refreshAuth") }, + { id: "s-session", score: 0.94, metadata: meta("/repo/src/session.ts", "loadSession") }, + ], + keyword: [ + { id: "s-auth-1", score: 25, metadata: meta("/repo/src/auth.ts", "validateAuth") }, + { id: "s-auth-2", score: 24, metadata: meta("/repo/src/auth.ts", "refreshAuth") }, + { id: "s-session", score: 10, metadata: meta("/repo/src/session.ts", "loadSession") }, + ], + }, ]; const hitAt5 = computeHitAt5(queries); + const distinctTop3Ratio = computeDistinctTop3Ratio(queries); const latency = runLatency(queries); const candidate: BenchmarkArtifact = { generatedAt: new Date().toISOString(), queryCount: queries.length, hitAt5, + distinctTop3Ratio, medianMs: latency.medianMs, p95Ms: latency.p95Ms, }; @@ -261,6 +300,7 @@ describe("retrieval benchmark", () => { const baseline = loadBaseline(); expect(candidate.hitAt5).toBeGreaterThanOrEqual(baseline.hitAt5); + expect(candidate.distinctTop3Ratio).toBeGreaterThanOrEqual(baseline.distinctTop3Ratio); const medianBudget = Math.max( baseline.medianMs * 1.15 + LATENCY_BUDGET_ABSOLUTE_JITTER_MS, LATENCY_BUDGET_MEDIAN_MIN_MS From b9409afba0f75bf77e32eee6628d24c22d1dd2f7 Mon Sep 17 00:00:00 2001 From: Helweg Date: Sat, 11 Apr 2026 21:49:08 +0200 Subject: [PATCH 10/20] feat: track eval distinct top-k diversity --- src/eval/compare.ts | 1 + src/eval/metrics.ts | 10 ++++++++++ src/eval/reports.ts | 4 ++++ src/eval/types.ts | 2 ++ 4 files changed, 17 insertions(+) diff --git a/src/eval/compare.ts b/src/eval/compare.ts index 989fc0f..6c5f97d 100644 --- a/src/eval/compare.ts +++ b/src/eval/compare.ts @@ -21,6 +21,7 @@ export function compareSummaries(current: EvalSummary, baseline: EvalSummary, ag hitAt10: metricDelta(current.metrics.hitAt10, baseline.metrics.hitAt10), mrrAt10: metricDelta(current.metrics.mrrAt10, baseline.metrics.mrrAt10), ndcgAt10: metricDelta(current.metrics.ndcgAt10, baseline.metrics.ndcgAt10), + distinctTop3Ratio: metricDelta(current.metrics.distinctTop3Ratio, baseline.metrics.distinctTop3Ratio), latencyP50Ms: metricDelta(current.metrics.latencyMs.p50, baseline.metrics.latencyMs.p50), latencyP95Ms: metricDelta(current.metrics.latencyMs.p95, baseline.metrics.latencyMs.p95), latencyP99Ms: metricDelta(current.metrics.latencyMs.p99, baseline.metrics.latencyMs.p99), diff --git a/src/eval/metrics.ts b/src/eval/metrics.ts index c4ea8bf..c6c52a3 100644 --- a/src/eval/metrics.ts +++ b/src/eval/metrics.ts @@ -39,6 +39,13 @@ function uniqueResultsByPath(results: PerQueryEvalResult["results"]): PerQueryEv return unique; } +function distinctTopKRatio(results: PerQueryEvalResult["results"], k: number): number { + const top = results.slice(0, k); + if (top.length === 0) return 0; + const distinct = new Set(top.map((result) => normalizePath(result.filePath))).size; + return distinct / top.length; +} + export function pathMatchesExpected(actualPath: string, expectedPath: string): boolean { const actual = normalizePath(actualPath); const expected = normalizePath(expectedPath); @@ -172,6 +179,7 @@ export function computeEvalMetrics( hitAt10: 0, mrrAt10: 0, ndcgAt10: 0, + distinctTop3Ratio: 0, }; const failureBuckets: Record = { @@ -190,6 +198,7 @@ export function computeEvalMetrics( if (query.hitAt10) sum.hitAt10 += 1; sum.mrrAt10 += query.reciprocalRankAt10; sum.ndcgAt10 += query.ndcgAt10; + sum.distinctTop3Ratio += distinctTopKRatio(query.results, 3); if (query.failureBucket) { failureBuckets[query.failureBucket] += 1; } @@ -204,6 +213,7 @@ export function computeEvalMetrics( hitAt10: safeDiv(sum.hitAt10), mrrAt10: safeDiv(sum.mrrAt10), ndcgAt10: safeDiv(sum.ndcgAt10), + distinctTop3Ratio: safeDiv(sum.distinctTop3Ratio), latencyMs: { p50: percentile(latencies, 0.5), p95: percentile(latencies, 0.95), diff --git a/src/eval/reports.ts b/src/eval/reports.ts index a0b1a9f..3bc6807 100644 --- a/src/eval/reports.ts +++ b/src/eval/reports.ts @@ -74,6 +74,7 @@ export function createSummaryMarkdown( lines.push(`| Hit@10 | ${formatPct(summary.metrics.hitAt10)} |`); lines.push(`| MRR@10 | ${summary.metrics.mrrAt10.toFixed(4)} |`); lines.push(`| nDCG@10 | ${summary.metrics.ndcgAt10.toFixed(4)} |`); + lines.push(`| Distinct Top@3 | ${formatPct(summary.metrics.distinctTop3Ratio)} |`); lines.push(`| Latency p50 | ${formatMs(summary.metrics.latencyMs.p50)} |`); lines.push(`| Latency p95 | ${formatMs(summary.metrics.latencyMs.p95)} |`); lines.push(`| Latency p99 | ${formatMs(summary.metrics.latencyMs.p99)} |`); @@ -116,6 +117,9 @@ export function createSummaryMarkdown( lines.push( `| nDCG@10 | ${comparison.deltas.ndcgAt10.baseline.toFixed(4)} | ${comparison.deltas.ndcgAt10.current.toFixed(4)} | ${signed(comparison.deltas.ndcgAt10.absolute)} |` ); + lines.push( + `| Distinct Top@3 | ${formatPct(comparison.deltas.distinctTop3Ratio.baseline)} | ${formatPct(comparison.deltas.distinctTop3Ratio.current)} | ${signed(comparison.deltas.distinctTop3Ratio.absolute)} |` + ); lines.push( `| p95 latency (ms) | ${comparison.deltas.latencyP95Ms.baseline.toFixed(3)} | ${comparison.deltas.latencyP95Ms.current.toFixed(3)} | ${signed(comparison.deltas.latencyP95Ms.absolute, 3)} |` ); diff --git a/src/eval/types.ts b/src/eval/types.ts index 30b51ef..7b28cf0 100644 --- a/src/eval/types.ts +++ b/src/eval/types.ts @@ -78,6 +78,7 @@ export interface EvalMetrics { hitAt10: number; mrrAt10: number; ndcgAt10: number; + distinctTop3Ratio: number; latencyMs: { p50: number; p95: number; @@ -123,6 +124,7 @@ export interface EvalComparison { hitAt10: MetricDelta; mrrAt10: MetricDelta; ndcgAt10: MetricDelta; + distinctTop3Ratio: MetricDelta; latencyP50Ms: MetricDelta; latencyP95Ms: MetricDelta; latencyP99Ms: MetricDelta; From dc49b1a85c1b91c5b81a907f26eb4df62ad157ac Mon Sep 17 00:00:00 2001 From: Helweg Date: Sat, 11 Apr 2026 21:49:19 +0200 Subject: [PATCH 11/20] test: cover eval distinct top-k diversity --- tests/eval-metrics.test.ts | 41 ++++++++++++++++++++++++++++++++++++++ tests/eval-runner.test.ts | 3 +++ 2 files changed, 44 insertions(+) diff --git a/tests/eval-metrics.test.ts b/tests/eval-metrics.test.ts index 948d954..32b7413 100644 --- a/tests/eval-metrics.test.ts +++ b/tests/eval-metrics.test.ts @@ -160,11 +160,52 @@ describe("eval metrics", () => { expect(metrics.hitAt1).toBe(0.5); expect(metrics.hitAt3).toBe(1); expect(metrics.mrrAt10).toBeCloseTo(0.75, 5); + expect(metrics.distinctTop3Ratio).toBe(1); expect(metrics.latencyMs.p50).toBeGreaterThan(0); expect(metrics.embedding.callCount).toBe(20); expect(metrics.embedding.estimatedCostUsd).toBeCloseTo(0.00002, 8); }); + it("tracks distinctTop3Ratio on per-query eval output", () => { + const queries: GoldenQuery[] = [query({ id: "q-dup" })]; + const perQuery = [ + buildPerQueryResult( + queries[0], + [ + { + filePath: "/repo/src/indexer/index.ts", + startLine: 1, + endLine: 2, + score: 1, + chunkType: "function", + name: "rankHybridResults", + }, + { + filePath: "/repo/src/indexer/index.ts", + startLine: 10, + endLine: 20, + score: 0.95, + chunkType: "function", + name: "rerankResults", + }, + { + filePath: "/repo/src/tools/index.ts", + startLine: 1, + endLine: 2, + score: 0.9, + chunkType: "function", + name: "codebase_search", + }, + ], + 10, + 10 + ), + ]; + + const metrics = computeEvalMetrics(queries, perQuery, 0, 0, 0); + expect(metrics.distinctTop3Ratio).toBe(1); + }); + it("uses deterministic percentile behavior for tiny samples", () => { const q = query(); const build = (id: string, latencyMs: number) => diff --git a/tests/eval-runner.test.ts b/tests/eval-runner.test.ts index 60a7d4e..e1739ef 100644 --- a/tests/eval-runner.test.ts +++ b/tests/eval-runner.test.ts @@ -121,7 +121,9 @@ describe("eval runner", () => { }); expect(result.summary.queryCount).toBe(1); + expect(typeof result.summary.metrics.distinctTop3Ratio).toBe("number"); expect(readFileSync(path.join(result.outputDir, "summary.json"), "utf-8")).toContain("\"metrics\""); + expect(readFileSync(path.join(result.outputDir, "summary.md"), "utf-8")).toContain("Distinct Top@3"); expect(readFileSync(path.join(result.outputDir, "summary.md"), "utf-8")).toContain("# Evaluation Summary"); expect(readFileSync(path.join(result.outputDir, "per-query.json"), "utf-8")).toContain("\"queries\""); }); @@ -152,6 +154,7 @@ describe("eval runner", () => { }); expect(compareRun.comparison).toBeDefined(); + expect(readFileSync(path.join(compareRun.outputDir, "compare.json"), "utf-8")).toContain("\"distinctTop3Ratio\""); expect(readFileSync(path.join(compareRun.outputDir, "compare.json"), "utf-8")).toContain("\"deltas\""); }); From ad0b70f822a5fcbbbdd68c086b8f39263c068972 Mon Sep 17 00:00:00 2001 From: Helweg Date: Sat, 11 Apr 2026 21:54:14 +0200 Subject: [PATCH 12/20] feat: track raw eval top-k diversity --- src/eval/compare.ts | 1 + src/eval/metrics.ts | 4 ++++ src/eval/reports.ts | 4 ++++ src/eval/types.ts | 3 +++ 4 files changed, 12 insertions(+) diff --git a/src/eval/compare.ts b/src/eval/compare.ts index 6c5f97d..c648f3e 100644 --- a/src/eval/compare.ts +++ b/src/eval/compare.ts @@ -22,6 +22,7 @@ export function compareSummaries(current: EvalSummary, baseline: EvalSummary, ag mrrAt10: metricDelta(current.metrics.mrrAt10, baseline.metrics.mrrAt10), ndcgAt10: metricDelta(current.metrics.ndcgAt10, baseline.metrics.ndcgAt10), distinctTop3Ratio: metricDelta(current.metrics.distinctTop3Ratio, baseline.metrics.distinctTop3Ratio), + rawDistinctTop3Ratio: metricDelta(current.metrics.rawDistinctTop3Ratio, baseline.metrics.rawDistinctTop3Ratio), latencyP50Ms: metricDelta(current.metrics.latencyMs.p50, baseline.metrics.latencyMs.p50), latencyP95Ms: metricDelta(current.metrics.latencyMs.p95, baseline.metrics.latencyMs.p95), latencyP99Ms: metricDelta(current.metrics.latencyMs.p99, baseline.metrics.latencyMs.p99), diff --git a/src/eval/metrics.ts b/src/eval/metrics.ts index c6c52a3..3693d64 100644 --- a/src/eval/metrics.ts +++ b/src/eval/metrics.ts @@ -156,6 +156,7 @@ export function buildPerQueryResult( reciprocalRankAt10: reciprocalRankAtK(deduped, relevantPaths, 10), ndcgAt10: ndcgAtK(deduped, relevantPaths, 10), failureBucket: classifyFailureBucket(query, results, k), + rawTop3DistinctRatio: distinctTopKRatio(results, 3), results: deduped, }; @@ -180,6 +181,7 @@ export function computeEvalMetrics( mrrAt10: 0, ndcgAt10: 0, distinctTop3Ratio: 0, + rawDistinctTop3Ratio: 0, }; const failureBuckets: Record = { @@ -199,6 +201,7 @@ export function computeEvalMetrics( sum.mrrAt10 += query.reciprocalRankAt10; sum.ndcgAt10 += query.ndcgAt10; sum.distinctTop3Ratio += distinctTopKRatio(query.results, 3); + sum.rawDistinctTop3Ratio += query.rawTop3DistinctRatio; if (query.failureBucket) { failureBuckets[query.failureBucket] += 1; } @@ -214,6 +217,7 @@ export function computeEvalMetrics( mrrAt10: safeDiv(sum.mrrAt10), ndcgAt10: safeDiv(sum.ndcgAt10), distinctTop3Ratio: safeDiv(sum.distinctTop3Ratio), + rawDistinctTop3Ratio: safeDiv(sum.rawDistinctTop3Ratio), latencyMs: { p50: percentile(latencies, 0.5), p95: percentile(latencies, 0.95), diff --git a/src/eval/reports.ts b/src/eval/reports.ts index 3bc6807..f55c21c 100644 --- a/src/eval/reports.ts +++ b/src/eval/reports.ts @@ -75,6 +75,7 @@ export function createSummaryMarkdown( lines.push(`| MRR@10 | ${summary.metrics.mrrAt10.toFixed(4)} |`); lines.push(`| nDCG@10 | ${summary.metrics.ndcgAt10.toFixed(4)} |`); lines.push(`| Distinct Top@3 | ${formatPct(summary.metrics.distinctTop3Ratio)} |`); + lines.push(`| Raw Distinct Top@3 | ${formatPct(summary.metrics.rawDistinctTop3Ratio)} |`); lines.push(`| Latency p50 | ${formatMs(summary.metrics.latencyMs.p50)} |`); lines.push(`| Latency p95 | ${formatMs(summary.metrics.latencyMs.p95)} |`); lines.push(`| Latency p99 | ${formatMs(summary.metrics.latencyMs.p99)} |`); @@ -120,6 +121,9 @@ export function createSummaryMarkdown( lines.push( `| Distinct Top@3 | ${formatPct(comparison.deltas.distinctTop3Ratio.baseline)} | ${formatPct(comparison.deltas.distinctTop3Ratio.current)} | ${signed(comparison.deltas.distinctTop3Ratio.absolute)} |` ); + lines.push( + `| Raw Distinct Top@3 | ${formatPct(comparison.deltas.rawDistinctTop3Ratio.baseline)} | ${formatPct(comparison.deltas.rawDistinctTop3Ratio.current)} | ${signed(comparison.deltas.rawDistinctTop3Ratio.absolute)} |` + ); lines.push( `| p95 latency (ms) | ${comparison.deltas.latencyP95Ms.baseline.toFixed(3)} | ${comparison.deltas.latencyP95Ms.current.toFixed(3)} | ${signed(comparison.deltas.latencyP95Ms.absolute, 3)} |` ); diff --git a/src/eval/types.ts b/src/eval/types.ts index 7b28cf0..72ae3f6 100644 --- a/src/eval/types.ts +++ b/src/eval/types.ts @@ -68,6 +68,7 @@ export interface PerQueryEvalResult { reciprocalRankAt10: number; ndcgAt10: number; failureBucket?: FailureBucket; + rawTop3DistinctRatio: number; results: EvalSearchResult[]; } @@ -79,6 +80,7 @@ export interface EvalMetrics { mrrAt10: number; ndcgAt10: number; distinctTop3Ratio: number; + rawDistinctTop3Ratio: number; latencyMs: { p50: number; p95: number; @@ -125,6 +127,7 @@ export interface EvalComparison { mrrAt10: MetricDelta; ndcgAt10: MetricDelta; distinctTop3Ratio: MetricDelta; + rawDistinctTop3Ratio: MetricDelta; latencyP50Ms: MetricDelta; latencyP95Ms: MetricDelta; latencyP99Ms: MetricDelta; From dba17fb8dc3015012a29033b2127c858937990df Mon Sep 17 00:00:00 2001 From: Helweg Date: Sat, 11 Apr 2026 21:54:26 +0200 Subject: [PATCH 13/20] test: cover raw eval top-k diversity --- tests/eval-metrics.test.ts | 5 ++++- tests/eval-runner.test.ts | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/eval-metrics.test.ts b/tests/eval-metrics.test.ts index 32b7413..f500c96 100644 --- a/tests/eval-metrics.test.ts +++ b/tests/eval-metrics.test.ts @@ -161,12 +161,13 @@ describe("eval metrics", () => { expect(metrics.hitAt3).toBe(1); expect(metrics.mrrAt10).toBeCloseTo(0.75, 5); expect(metrics.distinctTop3Ratio).toBe(1); + expect(metrics.rawDistinctTop3Ratio).toBe(1); expect(metrics.latencyMs.p50).toBeGreaterThan(0); expect(metrics.embedding.callCount).toBe(20); expect(metrics.embedding.estimatedCostUsd).toBeCloseTo(0.00002, 8); }); - it("tracks distinctTop3Ratio on per-query eval output", () => { + it("tracks deduped and raw distinctTop3 ratios separately", () => { const queries: GoldenQuery[] = [query({ id: "q-dup" })]; const perQuery = [ buildPerQueryResult( @@ -204,6 +205,8 @@ describe("eval metrics", () => { const metrics = computeEvalMetrics(queries, perQuery, 0, 0, 0); expect(metrics.distinctTop3Ratio).toBe(1); + expect(metrics.rawDistinctTop3Ratio).toBeCloseTo(2 / 3, 6); + expect(perQuery[0].rawTop3DistinctRatio).toBeCloseTo(2 / 3, 6); }); it("uses deterministic percentile behavior for tiny samples", () => { diff --git a/tests/eval-runner.test.ts b/tests/eval-runner.test.ts index e1739ef..558201b 100644 --- a/tests/eval-runner.test.ts +++ b/tests/eval-runner.test.ts @@ -122,8 +122,10 @@ describe("eval runner", () => { expect(result.summary.queryCount).toBe(1); expect(typeof result.summary.metrics.distinctTop3Ratio).toBe("number"); + expect(typeof result.summary.metrics.rawDistinctTop3Ratio).toBe("number"); expect(readFileSync(path.join(result.outputDir, "summary.json"), "utf-8")).toContain("\"metrics\""); expect(readFileSync(path.join(result.outputDir, "summary.md"), "utf-8")).toContain("Distinct Top@3"); + expect(readFileSync(path.join(result.outputDir, "summary.md"), "utf-8")).toContain("Raw Distinct Top@3"); expect(readFileSync(path.join(result.outputDir, "summary.md"), "utf-8")).toContain("# Evaluation Summary"); expect(readFileSync(path.join(result.outputDir, "per-query.json"), "utf-8")).toContain("\"queries\""); }); @@ -155,6 +157,7 @@ describe("eval runner", () => { expect(compareRun.comparison).toBeDefined(); expect(readFileSync(path.join(compareRun.outputDir, "compare.json"), "utf-8")).toContain("\"distinctTop3Ratio\""); + expect(readFileSync(path.join(compareRun.outputDir, "compare.json"), "utf-8")).toContain("\"rawDistinctTop3Ratio\""); expect(readFileSync(path.join(compareRun.outputDir, "compare.json"), "utf-8")).toContain("\"deltas\""); }); From 481b29b20c8fdcfe592f092e35fb87139208cd2e Mon Sep 17 00:00:00 2001 From: Helweg Date: Sat, 11 Apr 2026 21:59:31 +0200 Subject: [PATCH 14/20] feat: gate eval raw top-k diversity --- src/eval/budget.ts | 20 ++++++++++++++++++++ src/eval/schema.ts | 14 ++++++++++++++ src/eval/types.ts | 2 ++ 3 files changed, 36 insertions(+) diff --git a/src/eval/budget.ts b/src/eval/budget.ts index 6a61b60..d558cd9 100644 --- a/src/eval/budget.ts +++ b/src/eval/budget.ts @@ -24,6 +24,16 @@ export function evaluateBudgetGate( }); } + if ( + thresholds.minRawDistinctTop3Ratio !== undefined && + summary.metrics.rawDistinctTop3Ratio < thresholds.minRawDistinctTop3Ratio + ) { + violations.push({ + metric: "minRawDistinctTop3Ratio", + message: `Raw Distinct Top@3 ${summary.metrics.rawDistinctTop3Ratio.toFixed(4)} is below minimum ${thresholds.minRawDistinctTop3Ratio.toFixed(4)}`, + }); + } + if (comparison) { if ( thresholds.hitAt5MaxDrop !== undefined && @@ -45,6 +55,16 @@ export function evaluateBudgetGate( }); } + if ( + thresholds.rawDistinctTop3RatioMaxDrop !== undefined && + comparison.deltas.rawDistinctTop3Ratio.absolute < -thresholds.rawDistinctTop3RatioMaxDrop + ) { + violations.push({ + metric: "rawDistinctTop3RatioMaxDrop", + message: `Raw Distinct Top@3 drop ${comparison.deltas.rawDistinctTop3Ratio.absolute.toFixed(4)} exceeds allowed -${thresholds.rawDistinctTop3RatioMaxDrop.toFixed(4)}`, + }); + } + if (thresholds.p95LatencyMaxMultiplier !== undefined) { const baselineP95 = comparison.deltas.latencyP95Ms.baseline; if (baselineP95 > BASELINE_P95_EPSILON_MS) { diff --git a/src/eval/schema.ts b/src/eval/schema.ts index 4293989..9bc0d67 100644 --- a/src/eval/schema.ts +++ b/src/eval/schema.ts @@ -195,6 +195,13 @@ export function parseBudget(raw: unknown, sourceLabel: string): EvalBudget { thresholds.mrrAt10MaxDrop === undefined ? undefined : asPositiveNumber(thresholds.mrrAt10MaxDrop, `${sourceLabel}.thresholds.mrrAt10MaxDrop`), + rawDistinctTop3RatioMaxDrop: + thresholds.rawDistinctTop3RatioMaxDrop === undefined + ? undefined + : asPositiveNumber( + thresholds.rawDistinctTop3RatioMaxDrop, + `${sourceLabel}.thresholds.rawDistinctTop3RatioMaxDrop` + ), p95LatencyMaxMultiplier: thresholds.p95LatencyMaxMultiplier === undefined ? undefined @@ -217,6 +224,13 @@ export function parseBudget(raw: unknown, sourceLabel: string): EvalBudget { thresholds.minMrrAt10 === undefined ? undefined : asPositiveNumber(thresholds.minMrrAt10, `${sourceLabel}.thresholds.minMrrAt10`), + minRawDistinctTop3Ratio: + thresholds.minRawDistinctTop3Ratio === undefined + ? undefined + : asPositiveNumber( + thresholds.minRawDistinctTop3Ratio, + `${sourceLabel}.thresholds.minRawDistinctTop3Ratio` + ), }, }; } diff --git a/src/eval/types.ts b/src/eval/types.ts index 72ae3f6..43b5fde 100644 --- a/src/eval/types.ts +++ b/src/eval/types.ts @@ -34,10 +34,12 @@ export interface EvalBudget { thresholds: { hitAt5MaxDrop?: number; mrrAt10MaxDrop?: number; + rawDistinctTop3RatioMaxDrop?: number; p95LatencyMaxMultiplier?: number; p95LatencyMaxAbsoluteMs?: number; minHitAt5?: number; minMrrAt10?: number; + minRawDistinctTop3Ratio?: number; }; } From 44caf10c95453d3c47d1cf8939651c72473fcf45 Mon Sep 17 00:00:00 2001 From: Helweg Date: Sat, 11 Apr 2026 21:59:44 +0200 Subject: [PATCH 15/20] test: cover raw diversity budget gating --- tests/eval-budget.test.ts | 64 +++++++++++++++++++++++++++++++++++++++ tests/eval-schema.test.ts | 6 +++- 2 files changed, 69 insertions(+), 1 deletion(-) diff --git a/tests/eval-budget.test.ts b/tests/eval-budget.test.ts index d4b8c8c..f77b81f 100644 --- a/tests/eval-budget.test.ts +++ b/tests/eval-budget.test.ts @@ -25,6 +25,8 @@ function summary(p95: number): EvalSummary { hitAt10: 1, mrrAt10: 1, ndcgAt10: 1, + distinctTop3Ratio: 1, + rawDistinctTop3Ratio: 1, latencyMs: { p50: p95, p95, @@ -59,6 +61,8 @@ function comparisonWithBaselineP95(baselineP95: number): EvalComparison { hitAt10: { current: 1, baseline: 1, absolute: 0, relativePct: 0 }, mrrAt10: { current: 1, baseline: 1, absolute: 0, relativePct: 0 }, ndcgAt10: { current: 1, baseline: 1, absolute: 0, relativePct: 0 }, + distinctTop3Ratio: { current: 1, baseline: 1, absolute: 0, relativePct: 0 }, + rawDistinctTop3Ratio: { current: 1, baseline: 1, absolute: 0, relativePct: 0 }, latencyP50Ms: { current: 5, baseline: baselineP95, absolute: 5 - baselineP95, relativePct: 0 }, latencyP95Ms: { current: 5, baseline: baselineP95, absolute: 5 - baselineP95, relativePct: 0 }, latencyP99Ms: { current: 5, baseline: baselineP95, absolute: 5 - baselineP95, relativePct: 0 }, @@ -97,4 +101,64 @@ describe("eval budget gate", () => { expect(gate.passed).toBe(false); expect(gate.violations.some((v) => v.metric === "p95LatencyMaxAbsoluteMs")).toBe(true); }); + + it("fails when raw distinct top3 ratio drops below minimum", () => { + const budget: EvalBudget = { + name: "default", + failOnMissingBaseline: true, + thresholds: { + minRawDistinctTop3Ratio: 0.9, + }, + }; + + const gate = evaluateBudgetGate( + budget, + { + ...summary(5), + metrics: { + ...summary(5).metrics, + rawDistinctTop3Ratio: 0.5, + }, + } + ); + expect(gate.passed).toBe(false); + expect(gate.violations.some((v) => v.metric === "minRawDistinctTop3Ratio")).toBe(true); + }); + + it("fails when raw distinct top3 ratio regresses beyond allowed drop", () => { + const budget: EvalBudget = { + name: "default", + failOnMissingBaseline: true, + thresholds: { + rawDistinctTop3RatioMaxDrop: 0.1, + }, + }; + + const comparison: EvalComparison = { + ...comparisonWithBaselineP95(5), + deltas: { + ...comparisonWithBaselineP95(5).deltas, + rawDistinctTop3Ratio: { + current: 0.6, + baseline: 0.8, + absolute: -0.2, + relativePct: -25, + }, + }, + }; + + const gate = evaluateBudgetGate( + budget, + { + ...summary(5), + metrics: { + ...summary(5).metrics, + rawDistinctTop3Ratio: 0.6, + }, + }, + comparison + ); + expect(gate.passed).toBe(false); + expect(gate.violations.some((v) => v.metric === "rawDistinctTop3RatioMaxDrop")).toBe(true); + }); }); diff --git a/tests/eval-schema.test.ts b/tests/eval-schema.test.ts index 399b2ea..8c149da 100644 --- a/tests/eval-schema.test.ts +++ b/tests/eval-schema.test.ts @@ -82,13 +82,17 @@ describe("eval schema", () => { thresholds: { hitAt5MaxDrop: 0.05, mrrAt10MaxDrop: 0.02, + rawDistinctTop3RatioMaxDrop: 0.1, p95LatencyMaxMultiplier: 1.5, + minRawDistinctTop3Ratio: 0.7, }, }, "budget.json" ); expect(budget.thresholds.hitAt5MaxDrop).toBe(0.05); + expect(budget.thresholds.rawDistinctTop3RatioMaxDrop).toBe(0.1); + expect(budget.thresholds.minRawDistinctTop3Ratio).toBe(0.7); expect(budget.failOnMissingBaseline).toBe(true); }); @@ -98,7 +102,7 @@ describe("eval schema", () => { { name: "default", thresholds: { - hitAt5MaxDrop: "bad", + rawDistinctTop3RatioMaxDrop: "bad", }, }, "budget.json" From 3694b4c18f85528f6cad6065a26b0f6cd05a0cc1 Mon Sep 17 00:00:00 2001 From: Helweg Date: Sat, 11 Apr 2026 22:01:38 +0200 Subject: [PATCH 16/20] test: configure raw diversity eval budgets --- benchmarks/baselines/eval-baseline-summary.json | 2 ++ benchmarks/budgets/default.json | 4 +++- benchmarks/budgets/github-models.json | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/benchmarks/baselines/eval-baseline-summary.json b/benchmarks/baselines/eval-baseline-summary.json index 7efdc86..68d3371 100644 --- a/benchmarks/baselines/eval-baseline-summary.json +++ b/benchmarks/baselines/eval-baseline-summary.json @@ -19,6 +19,8 @@ "hitAt10": 1, "mrrAt10": 0.875, "ndcgAt10": 0.9127302324517832, + "distinctTop3Ratio": 1, + "rawDistinctTop3Ratio": 1, "latencyMs": { "p50": 26.173166000000037, "p95": 52.931082999999944, diff --git a/benchmarks/budgets/default.json b/benchmarks/budgets/default.json index 67c1ca3..9f62fde 100644 --- a/benchmarks/budgets/default.json +++ b/benchmarks/budgets/default.json @@ -5,9 +5,11 @@ "thresholds": { "hitAt5MaxDrop": 0.03, "mrrAt10MaxDrop": 0.03, + "rawDistinctTop3RatioMaxDrop": 0.1, "p95LatencyMaxMultiplier": 1.35, "p95LatencyMaxAbsoluteMs": 4000, "minHitAt5": 0.4, - "minMrrAt10": 0.25 + "minMrrAt10": 0.25, + "minRawDistinctTop3Ratio": 0.5 } } diff --git a/benchmarks/budgets/github-models.json b/benchmarks/budgets/github-models.json index ca73498..cee0cac 100644 --- a/benchmarks/budgets/github-models.json +++ b/benchmarks/budgets/github-models.json @@ -4,6 +4,7 @@ "thresholds": { "minHitAt5": 0.5, "minMrrAt10": 0.45, + "minRawDistinctTop3Ratio": 0.5, "p95LatencyMaxAbsoluteMs": 500 } } From 01d4082d168977f1960061a47c3add18e7fc01d5 Mon Sep 17 00:00:00 2001 From: Helweg Date: Sat, 11 Apr 2026 22:57:02 +0200 Subject: [PATCH 17/20] fix: keep reranker config optional after rebase --- src/config/schema.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/config/schema.ts b/src/config/schema.ts index be005ff..26c938c 100644 --- a/src/config/schema.ts +++ b/src/config/schema.ts @@ -127,7 +127,7 @@ export type ParsedCodebaseIndexConfig = CodebaseIndexConfig & { indexing: IndexingConfig; search: SearchConfig; debug: DebugConfig; - reranker: RerankerConfig; + reranker?: RerankerConfig; knowledgeBases: string[]; additionalInclude: string[]; }; From 2372debdaf9e7f960f326c663679955e8f2bc7c2 Mon Sep 17 00:00:00 2001 From: Helweg Date: Sat, 11 Apr 2026 22:57:11 +0200 Subject: [PATCH 18/20] fix: preserve doc intent after rerank rebase merge --- src/indexer/index.ts | 77 ++++---------------------------- tests/retrieval-ranking.test.ts | 59 ++++++++++++++++++++++++ tests/search-integration.test.ts | 4 +- 3 files changed, 70 insertions(+), 70 deletions(-) diff --git a/src/indexer/index.ts b/src/indexer/index.ts index 29ea48f..0a806c9 100644 --- a/src/indexer/index.ts +++ b/src/indexer/index.ts @@ -1590,14 +1590,18 @@ export class Indexer { return candidates; } - if (options?.definitionIntent === true || options?.hasIdentifierHints === true) { - return candidates; - } - const queryTokens = Array.from(tokenizeTextForRanking(query)); const preferSourcePaths = classifyQueryIntentRaw(query) === "source"; const docIntent = classifyDocIntent(queryTokens) === "docs"; + if (options?.definitionIntent === true) { + return candidates; + } + + if (options?.hasIdentifierHints === true && preferSourcePaths && !docIntent) { + return candidates; + } + const topN = Math.min(reranker.topN, candidates.length); const head = candidates.slice(0, topN); const tail = candidates.slice(topN); @@ -2867,70 +2871,7 @@ export class Indexer { : baseFiltered ).slice(0, maxResults); - // Apply reranking if enabled and available - let finalResults = filtered; - if (this.reranker?.isAvailable() && filtered.length > 1) { - const rerankStartTime = performance.now(); - - // Read content for reranking - const documentsForRerank = await Promise.all( - filtered.map(async (r) => { - try { - const fileContent = await fsPromises.readFile(r.metadata.filePath, "utf-8"); - const lines = fileContent.split("\n"); - return lines.slice(r.metadata.startLine - 1, r.metadata.endLine).join("\n"); - } catch { - return r.metadata.name ?? r.metadata.chunkType; - } - }) - ); - - try { - const rerankResponse = await this.reranker.rerank( - query, - documentsForRerank, - this.config.reranker?.topN ?? filtered.length - ); - - if (rerankResponse.results.length > 0) { - // Create a map of original index to rerank score - const rerankScores = new Map(); - for (const result of rerankResponse.results) { - rerankScores.set(result.index, result.relevanceScore); - } - - // Reorder results based on rerank scores - const rerankedIndices = rerankResponse.results - .sort((a, b) => b.relevanceScore - a.relevanceScore) - .map(r => r.index); - - // Build final results: reranked first, then remaining - const rerankedSet = new Set(rerankedIndices); - const reranked = rerankedIndices - .filter(idx => idx < filtered.length) - .map(idx => ({ - ...filtered[idx], - score: rerankScores.get(idx) ?? filtered[idx].score, - })); - const remaining = filtered - .filter((_, idx) => !rerankedSet.has(idx)); - - finalResults = [...reranked, ...remaining].slice(0, maxResults); - } - - const rerankMs = performance.now() - rerankStartTime; - this.logger.search("debug", "Reranking complete", { - documentsReranked: documentsForRerank.length, - rerankMs: Math.round(rerankMs * 100) / 100, - tokensUsed: rerankResponse.tokensUsed, - }); - } catch (error) { - // Reranking failed, use original results - this.logger.search("warn", "Reranking failed, using original results", { - error: error instanceof Error ? error.message : String(error), - }); - } - } + const finalResults = filtered; const totalSearchMs = performance.now() - searchStartTime; this.logger.recordSearch(totalSearchMs, { diff --git a/tests/retrieval-ranking.test.ts b/tests/retrieval-ranking.test.ts index 82ab623..64d45f6 100644 --- a/tests/retrieval-ranking.test.ts +++ b/tests/retrieval-ranking.test.ts @@ -547,6 +547,65 @@ describe("retrieval ranking", () => { globalThis.fetch = fetchSpy; }); + it("allows external reranker for documentation intent even when identifier hints are present", async () => { + const config = parseConfig({ + embeddingProvider: "custom", + customProvider: { + baseUrl: "http://localhost:11434/v1", + model: "mock-embed", + dimensions: 8, + }, + reranker: { + enabled: true, + provider: "custom", + model: "mock-reranker", + baseUrl: "https://rerank.example/v1", + topN: 3, + }, + }); + const indexer = new Indexer("/repo", config); + + const fetchSpy = globalThis.fetch; + let rerankCalled = false; + let rerankDocuments: string[] | undefined; + globalThis.fetch = (async (input, init) => { + if (String(input).includes("/rerank")) { + rerankCalled = true; + rerankDocuments = (JSON.parse(String(init?.body ?? "{}")) as { documents?: string[] }).documents; + return new Response(JSON.stringify({ + results: [ + { index: 1, relevance_score: 0.99 }, + { index: 0, relevance_score: 0.6 }, + ], + }), { status: 200 }); + } + return new Response(JSON.stringify({ data: [{ embedding: Array.from({ length: 8 }, () => 0.1) }], usage: { total_tokens: 1 } }), { status: 200 }); + }) as typeof fetch; + + const candidates: Candidate[] = [ + { id: "impl", score: 0.9, metadata: meta({ filePath: "/repo/src/indexer/index.ts", name: "rankHybridResults", chunkType: "function", startLine: 1, endLine: 3 }) }, + { id: "docs-readme", score: 0.89, metadata: meta({ filePath: "/repo/README.md", name: "retrieval documentation", chunkType: "other", startLine: 1, endLine: 3 }) }, + { id: "docs-guide", score: 0.88, metadata: meta({ filePath: "/repo/docs/guide.md", name: "rankHybridResults guide", chunkType: "other", startLine: 1, endLine: 3 }) }, + ]; + + const reranked = await (indexer as unknown as { + rerankCandidatesWithApi( + query: string, + items: Candidate[], + options?: { definitionIntent?: boolean; hasIdentifierHints?: boolean } + ): Promise; + }).rerankCandidatesWithApi("rankHybridResults documentation guide", candidates, { + hasIdentifierHints: true, + }); + + expect(rerankCalled).toBe(true); + expect(reranked.map((candidate) => candidate.id)).toEqual(["docs-guide", "docs-readme", "impl"]); + expect(rerankDocuments?.length).toBe(2); + expect(rerankDocuments?.[0]).toContain("path: /repo/README.md"); + expect(rerankDocuments?.[1]).toContain("path: /repo/docs/guide.md"); + globalThis.fetch = fetchSpy; + }); + it("diversifies external reranker output for exploratory queries", async () => { const config = parseConfig({ embeddingProvider: "custom", diff --git a/tests/search-integration.test.ts b/tests/search-integration.test.ts index b2bbc72..0272d23 100644 --- a/tests/search-integration.test.ts +++ b/tests/search-integration.test.ts @@ -133,7 +133,7 @@ export function rerankResults(query: string) { return rankHybridResults(query); const indexer = new Indexer(tempDir, config); await indexer.index(); - const results = await indexer.search("where is rankHybridResults documentation", 5, { + const results = await indexer.search("rankHybridResults documentation guide", 5, { metadataOnly: true, filterByBranch: false, }); @@ -338,7 +338,7 @@ export function rerankResults(query: string) { return rankHybridResults(query); const indexer = new Indexer(tempDir, config); await indexer.index(); - const results = await indexer.search("where is rankHybridResults documentation", 5, { + const results = await indexer.search("rankHybridResults documentation guide", 5, { metadataOnly: true, filterByBranch: false, }); From 454766fa28bcc6c4347ca22a671fa9bb25907f0e Mon Sep 17 00:00:00 2001 From: Helweg Date: Sun, 12 Apr 2026 18:18:45 +0200 Subject: [PATCH 19/20] fix: reject legacy eval baselines missing diversity metrics --- src/eval/reports.ts | 27 +++++++++++- tests/eval-runner.test.ts | 88 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 114 insertions(+), 1 deletion(-) diff --git a/src/eval/reports.ts b/src/eval/reports.ts index f55c21c..e9b2136 100644 --- a/src/eval/reports.ts +++ b/src/eval/reports.ts @@ -9,6 +9,31 @@ import type { SweepAggregateReport, } from "./types.js"; +function assertFiniteNumber(value: unknown, path: string): number { + if (typeof value !== "number" || Number.isNaN(value) || !Number.isFinite(value)) { + throw new Error(`${path} must be a finite number`); + } + return value; +} + +function validateSummary(summary: EvalSummary, summaryPath: string): EvalSummary { + assertFiniteNumber(summary.metrics.hitAt1, `${summaryPath}.metrics.hitAt1`); + assertFiniteNumber(summary.metrics.hitAt3, `${summaryPath}.metrics.hitAt3`); + assertFiniteNumber(summary.metrics.hitAt5, `${summaryPath}.metrics.hitAt5`); + assertFiniteNumber(summary.metrics.hitAt10, `${summaryPath}.metrics.hitAt10`); + assertFiniteNumber(summary.metrics.mrrAt10, `${summaryPath}.metrics.mrrAt10`); + assertFiniteNumber(summary.metrics.ndcgAt10, `${summaryPath}.metrics.ndcgAt10`); + assertFiniteNumber(summary.metrics.distinctTop3Ratio, `${summaryPath}.metrics.distinctTop3Ratio`); + assertFiniteNumber(summary.metrics.rawDistinctTop3Ratio, `${summaryPath}.metrics.rawDistinctTop3Ratio`); + assertFiniteNumber(summary.metrics.latencyMs.p50, `${summaryPath}.metrics.latencyMs.p50`); + assertFiniteNumber(summary.metrics.latencyMs.p95, `${summaryPath}.metrics.latencyMs.p95`); + assertFiniteNumber(summary.metrics.latencyMs.p99, `${summaryPath}.metrics.latencyMs.p99`); + assertFiniteNumber(summary.metrics.embedding.callCount, `${summaryPath}.metrics.embedding.callCount`); + assertFiniteNumber(summary.metrics.embedding.estimatedCostUsd, `${summaryPath}.metrics.embedding.estimatedCostUsd`); + + return summary; +} + function formatPct(value: number): string { return `${(value * 100).toFixed(2)}%`; } @@ -28,7 +53,7 @@ function signed(value: number, digits = 4): string { export function loadSummary(summaryPath: string): EvalSummary { const raw = readFileSync(summaryPath, "utf-8"); - return JSON.parse(raw) as EvalSummary; + return validateSummary(JSON.parse(raw) as EvalSummary, summaryPath); } export function createRunDirectory(outputRoot: string, timestampOverride?: string): string { diff --git a/tests/eval-runner.test.ts b/tests/eval-runner.test.ts index 558201b..7aac67d 100644 --- a/tests/eval-runner.test.ts +++ b/tests/eval-runner.test.ts @@ -161,6 +161,94 @@ describe("eval runner", () => { expect(readFileSync(path.join(compareRun.outputDir, "compare.json"), "utf-8")).toContain("\"deltas\""); }); + it("fails fast when baseline summary is missing required diversity metrics", async () => { + const baselineRun = await runEvaluation({ + projectRoot: tempDir, + datasetPath: "benchmarks/golden/small.json", + outputRoot: "benchmarks/results", + ciMode: false, + reindex: false, + }); + + const legacyBaseline = { + ...baselineRun.summary, + metrics: { + ...baselineRun.summary.metrics, + }, + } as Record; + + delete (legacyBaseline.metrics as Record).distinctTop3Ratio; + delete (legacyBaseline.metrics as Record).rawDistinctTop3Ratio; + + const baselinePath = path.join(tempDir, "benchmarks", "baselines", "legacy-baseline-summary.json"); + writeFileSync(baselinePath, JSON.stringify(legacyBaseline, null, 2), "utf-8"); + + await expect( + runEvaluation({ + projectRoot: tempDir, + datasetPath: "benchmarks/golden/small.json", + outputRoot: "benchmarks/results", + againstPath: "benchmarks/baselines/legacy-baseline-summary.json", + ciMode: false, + reindex: false, + }) + ).rejects.toThrow(/metrics\.distinctTop3Ratio must be a finite number/); + }); + + it("fails ci mode when budget baseline summary is missing required diversity metrics", async () => { + const baselineRun = await runEvaluation({ + projectRoot: tempDir, + datasetPath: "benchmarks/golden/small.json", + outputRoot: "benchmarks/results", + ciMode: false, + reindex: false, + }); + + const legacyBaseline = { + ...baselineRun.summary, + metrics: { + ...baselineRun.summary.metrics, + }, + } as Record; + + delete (legacyBaseline.metrics as Record).distinctTop3Ratio; + delete (legacyBaseline.metrics as Record).rawDistinctTop3Ratio; + + writeFileSync( + path.join(tempDir, "benchmarks", "baselines", "legacy-baseline-summary.json"), + JSON.stringify(legacyBaseline, null, 2), + "utf-8" + ); + + writeFileSync( + path.join(tempDir, "benchmarks", "budgets", "legacy-check.json"), + JSON.stringify( + { + name: "legacy-check", + baselinePath: "benchmarks/baselines/legacy-baseline-summary.json", + failOnMissingBaseline: true, + thresholds: { + rawDistinctTop3RatioMaxDrop: 0.1, + }, + }, + null, + 2 + ), + "utf-8" + ); + + await expect( + runEvaluation({ + projectRoot: tempDir, + datasetPath: "benchmarks/golden/small.json", + outputRoot: "benchmarks/results", + ciMode: true, + budgetPath: "benchmarks/budgets/legacy-check.json", + reindex: false, + }) + ).rejects.toThrow(/metrics\.distinctTop3Ratio must be a finite number/); + }); + it("fails ci gate when thresholds regress beyond tolerance", async () => { const baselineRun = await runEvaluation({ projectRoot: tempDir, From 926c828208fc1f904078bbb1d773cc48f1283815 Mon Sep 17 00:00:00 2001 From: Helweg Date: Sun, 12 Apr 2026 18:58:50 +0200 Subject: [PATCH 20/20] fix: allow eval diff to read legacy summaries --- src/eval/cli.ts | 8 +++++-- src/eval/reports.ts | 31 +++++++++++++++++++++----- tests/eval-cli.test.ts | 50 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 82 insertions(+), 7 deletions(-) diff --git a/src/eval/cli.ts b/src/eval/cli.ts index 62b1bcd..4b77efb 100644 --- a/src/eval/cli.ts +++ b/src/eval/cli.ts @@ -336,8 +336,12 @@ export async function handleEvalCommand(args: string[], cwd: string): Promise 0 ? `+${formatted}` : formatted; } -export function loadSummary(summaryPath: string): EvalSummary { +export function loadSummary(summaryPath: string, options?: LoadSummaryOptions): EvalSummary { const raw = readFileSync(summaryPath, "utf-8"); - return validateSummary(JSON.parse(raw) as EvalSummary, summaryPath); + return validateSummary(JSON.parse(raw) as EvalSummary, summaryPath, options); } export function createRunDirectory(outputRoot: string, timestampOverride?: string): string { diff --git a/tests/eval-cli.test.ts b/tests/eval-cli.test.ts index ff60715..6ae5768 100644 --- a/tests/eval-cli.test.ts +++ b/tests/eval-cli.test.ts @@ -108,6 +108,8 @@ describe("eval cli", () => { hitAt10: 1, mrrAt10: 1, ndcgAt10: 1, + distinctTop3Ratio: 1, + rawDistinctTop3Ratio: 1, latencyMs: { p50: 1, p95: 2, p99: 3 }, tokenEstimate: { queryTokens: 10, embeddingTokensUsed: 20 }, embedding: { callCount: 1, estimatedCostUsd: 0, costPer1MTokensUsd: 0 }, @@ -130,4 +132,52 @@ describe("eval cli", () => { expect(exitCode).toBe(0); }); + + it("allows eval diff to read legacy summaries missing diversity metrics", async () => { + const currentSummaryPath = path.join(tempDir, "current.json"); + const baselineSummaryPath = path.join(tempDir, "baseline.json"); + + const legacySummary = { + generatedAt: new Date().toISOString(), + projectRoot: tempDir, + datasetPath: "benchmarks/golden/small.json", + datasetName: "small", + datasetVersion: "1.0.0", + queryCount: 1, + topK: 10, + searchConfig: { + fusionStrategy: "rrf", + hybridWeight: 0.4, + rrfK: 60, + rerankTopN: 20, + }, + metrics: { + hitAt1: 1, + hitAt3: 1, + hitAt5: 1, + hitAt10: 1, + mrrAt10: 1, + ndcgAt10: 1, + latencyMs: { p50: 1, p95: 2, p99: 3 }, + tokenEstimate: { queryTokens: 10, embeddingTokensUsed: 20 }, + embedding: { callCount: 1, estimatedCostUsd: 0, costPer1MTokensUsd: 0 }, + failureBuckets: { + "wrong-file": 0, + "wrong-symbol": 0, + "docs-tests-outranking-source": 0, + "no-relevant-hit-top-k": 0, + }, + }, + }; + + writeFileSync(currentSummaryPath, JSON.stringify(legacySummary, null, 2), "utf-8"); + writeFileSync(baselineSummaryPath, JSON.stringify(legacySummary, null, 2), "utf-8"); + + const exitCode = await handleEvalCommand( + ["diff", "--current", "current.json", "--against", "baseline.json"], + tempDir + ); + + expect(exitCode).toBe(0); + }); });