Skip to content

Commit a77e26c

Browse files
authored
Merge pull request #52 from Helweg/feature/pluggable-reranker
feat: add pluggable reranker and diversity eval gating
2 parents 363b7b0 + 926c828 commit a77e26c

23 files changed

+1416
-142
lines changed

README.md

Lines changed: 16 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -532,18 +532,14 @@ Zero-config by default (uses `auto` mode). Customize in `.opencode/codebase-inde
532532
"rerankTopN": 20, // Deterministic rerank depth
533533
"contextLines": 0 // Extra lines before/after match
534534
},
535-
536-
// === Reranking API ===
537535
"reranker": {
538-
"enabled": true, // Enable API reranking
539-
"baseUrl": "https://api.siliconflow.cn/v1",
540-
"model": "BAAI/bge-reranker-v2-m3",
541-
"apiKey": "{env:SILICONFLOW_API_KEY}",
542-
"topN": 20, // Number of results to rerank
543-
"timeoutMs": 30000 // Request timeout (ms)
536+
"enabled": false,
537+
"provider": "cohere",
538+
"model": "rerank-v3.5",
539+
"apiKey": "{env:RERANK_API_KEY}",
540+
"topN": 15,
541+
"timeoutMs": 10000
544542
},
545-
546-
// === Debug ===
547543
"debug": {
548544
"enabled": false, // Enable debug logging
549545
"logLevel": "info", // error | warn | info | debug
@@ -604,23 +600,14 @@ String values in `codebase-index.json` can reference environment variables with
604600
| `rrfK` | `60` | RRF smoothing constant. Higher values flatten rank impact, lower values prioritize top-ranked candidates more strongly |
605601
| `rerankTopN` | `20` | Deterministic rerank depth cap. Applies lightweight name/path/chunk-type rerank to top-N only |
606602
| `contextLines` | `0` | Extra lines to include before/after each match |
607-
| **reranker** | | |
608-
| `reranker.enabled` | `false` | Enable API-based reranking |
609-
| `reranker.baseUrl` | - | Rerank API endpoint URL |
610-
| `reranker.model` | - | Reranking model name (e.g. `BAAI/bge-reranker-v2-m3`) |
611-
| `reranker.apiKey` | - | API key for reranking service (use `{env:VAR}` for security) |
612-
| `reranker.topN` | `20` | Number of top results to rerank via API |
613-
| `reranker.timeoutMs` | `30000` | Rerank API request timeout in milliseconds |
614-
| **customProvider** | | |
615-
| `customProvider.baseUrl` | - | Base URL of OpenAI-compatible embeddings API (e.g. `https://api.siliconflow.cn/v1`) |
616-
| `customProvider.model` | - | Model name (e.g. `BAAI/bge-m3`, `nomic-embed-text`) |
617-
| `customProvider.dimensions` | - | Vector dimensions (e.g. `1024` for BGE-M3, `768` for nomic-embed-text) |
618-
| `customProvider.apiKey` | - | API key (use `{env:VAR}` for security) |
619-
| `customProvider.maxTokens` | `8192` | Max tokens per input text |
620-
| `customProvider.timeoutMs` | `30000` | Request timeout in milliseconds |
621-
| `customProvider.concurrency` | `3` | Max concurrent embedding requests |
622-
| `customProvider.requestIntervalMs` | `1000` | Minimum delay between requests (ms). Set to `0` for local servers |
623-
| `customProvider.maxBatchSize` | - | Max inputs per `/embeddings` request. Cap for servers with batch limits |
603+
| **reranker** | | Optional second-stage model reranker for the top candidate pool |
604+
| `enabled` | `false` | Turn external reranking on/off |
605+
| `provider` | `"custom"` | Hosted shortcuts: `cohere`, `jina`, or `custom` |
606+
| `model` | - | Reranker model name; required when `enabled` is `true` |
607+
| `baseUrl` | provider default | Override reranker endpoint base URL. Defaults: `cohere` → `https://api.cohere.ai/v1`, `jina` → `https://api.jina.ai/v1`; required for `custom` |
608+
| `apiKey` | - | API key for the reranker service; required for the hosted `cohere` and `jina` providers (use `{env:VAR}` for security) |
609+
| `topN` | `15` | Number of top candidates to send to the external reranker |
610+
| `timeoutMs` | `10000` | Timeout for external rerank requests |
624611
| **debug** | | |
625612
| `enabled` | `false` | Enable debug logging and metrics collection |
626613
| `logLevel` | `"info"` | Log level: `error`, `warn`, `info`, `debug` |
@@ -633,9 +620,10 @@ String values in `codebase-index.json` can reference environment variables with
633620

634621
### Retrieval ranking behavior
635622

636-
- `codebase_search` and `codebase_peek` use the hybrid path: semantic + keyword retrieval → fusion (`fusionStrategy`) → deterministic rerank (`rerankTopN`) → filtering.
623+
- `codebase_search` and `codebase_peek` use the hybrid path: semantic + keyword retrieval → fusion (`fusionStrategy`) → deterministic rerank (`rerankTopN`) → optional external reranker (`reranker`) → filtering.
637624
- `find_similar` stays semantic-only: semantic retrieval + deterministic rerank only (no keyword retrieval, no RRF).
638625
- For compatibility rollbacks, set `search.fusionStrategy` to `"weighted"` to use the legacy weighted fusion path.
626+
- When enabled, the external reranker sees path metadata plus a bounded on-disk code snippet for each candidate so it can distinguish real implementations from docs/tests more reliably.
639627
- Retrieval benchmark artifacts are separated by role:
640628
- baseline (versioned): `benchmarks/baselines/retrieval-baseline.json`
641629
- latest candidate run (generated): `benchmark-results/retrieval-candidate.json`

benchmarks/baselines/eval-baseline-summary.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
"hitAt10": 1,
2020
"mrrAt10": 0.875,
2121
"ndcgAt10": 0.9127302324517832,
22+
"distinctTop3Ratio": 1,
23+
"rawDistinctTop3Ratio": 1,
2224
"latencyMs": {
2325
"p50": 26.173166000000037,
2426
"p95": 52.931082999999944,

benchmarks/baselines/retrieval-baseline.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
"generatedAt": "2026-03-13T14:21:43.213Z",
33
"queryCount": 3,
44
"hitAt5": 1,
5+
"distinctTop3Ratio": 0.9166666666666666,
56
"medianMs": 0.010916000000008808,
67
"p95Ms": 0.024666000000024724
78
}

benchmarks/budgets/default.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,11 @@
55
"thresholds": {
66
"hitAt5MaxDrop": 0.03,
77
"mrrAt10MaxDrop": 0.03,
8+
"rawDistinctTop3RatioMaxDrop": 0.1,
89
"p95LatencyMaxMultiplier": 1.35,
910
"p95LatencyMaxAbsoluteMs": 4000,
1011
"minHitAt5": 0.4,
11-
"minMrrAt10": 0.25
12+
"minMrrAt10": 0.25,
13+
"minRawDistinctTop3Ratio": 0.5
1214
}
1315
}

benchmarks/budgets/github-models.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
"thresholds": {
55
"minHitAt5": 0.5,
66
"minMrrAt10": 0.45,
7+
"minRawDistinctTop3Ratio": 0.5,
78
"p95LatencyMaxAbsoluteMs": 500
89
}
910
}

src/config/schema.ts

Lines changed: 69 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,25 @@ export interface SearchConfig {
4949
contextLines: number;
5050
}
5151

52+
export type RerankerProvider = "cohere" | "jina" | "custom";
53+
54+
export interface RerankerConfig {
55+
/** Whether to enable reranking. Default: false */
56+
enabled: boolean;
57+
/** Provider shortcut for hosted rerank APIs. Use 'custom' to provide only baseUrl. */
58+
provider: RerankerProvider;
59+
/** Model name for reranking */
60+
model: string;
61+
/** Base URL of the rerank API endpoint */
62+
baseUrl: string;
63+
/** API key for the rerank service */
64+
apiKey?: string;
65+
/** Number of top documents to rerank */
66+
topN: number;
67+
/** Request timeout in milliseconds */
68+
timeoutMs: number;
69+
}
70+
5271
export type LogLevel = "error" | "warn" | "info" | "debug";
5372

5473
export interface DebugConfig {
@@ -83,21 +102,6 @@ export interface CustomProviderConfig {
83102
max_batch_size?: number;
84103
}
85104

86-
export interface RerankerConfig {
87-
/** Whether to enable reranking. Default: false */
88-
enabled: boolean;
89-
/** Base URL of the rerank API endpoint (e.g. "https://api.siliconflow.cn/v1") */
90-
baseUrl: string;
91-
/** Model name for reranking (e.g. "BAAI/bge-reranker-v2-m3") */
92-
model: string;
93-
/** API key for the rerank service */
94-
apiKey?: string;
95-
/** Number of top documents to rerank. Default: 20 */
96-
topN?: number;
97-
/** Request timeout in milliseconds. Default: 30000 */
98-
timeoutMs?: number;
99-
}
100-
101105
export interface CodebaseIndexConfig {
102106
embeddingProvider: EmbeddingProvider | 'custom' | 'auto';
103107
embeddingModel?: EmbeddingModelName;
@@ -123,7 +127,7 @@ export type ParsedCodebaseIndexConfig = CodebaseIndexConfig & {
123127
indexing: IndexingConfig;
124128
search: SearchConfig;
125129
debug: DebugConfig;
126-
reranker: RerankerConfig;
130+
reranker?: RerankerConfig;
127131
knowledgeBases: string[];
128132
additionalInclude: string[];
129133
};
@@ -164,6 +168,21 @@ function isValidFusionStrategy(value: unknown): value is SearchConfig["fusionStr
164168
return value === "weighted" || value === "rrf";
165169
}
166170

171+
function isValidRerankerProvider(value: unknown): value is RerankerProvider {
172+
return value === "cohere" || value === "jina" || value === "custom";
173+
}
174+
175+
function getDefaultRerankerBaseUrl(provider: RerankerProvider): string {
176+
switch (provider) {
177+
case "cohere":
178+
return "https://api.cohere.ai/v1";
179+
case "jina":
180+
return "https://api.jina.ai/v1";
181+
case "custom":
182+
return "";
183+
}
184+
}
185+
167186
function getDefaultDebugConfig(): DebugConfig {
168187
return {
169188
enabled: false,
@@ -177,16 +196,6 @@ function getDefaultDebugConfig(): DebugConfig {
177196
};
178197
}
179198

180-
function getDefaultRerankerConfig(): RerankerConfig {
181-
return {
182-
enabled: false,
183-
baseUrl: "https://api.siliconflow.cn/v1",
184-
model: "BAAI/bge-reranker-v2-m3",
185-
topN: 20,
186-
timeoutMs: 30000,
187-
};
188-
}
189-
190199
const VALID_SCOPES: IndexScope[] = ["project", "global"];
191200
const VALID_LOG_LEVELS: LogLevel[] = ["error", "warn", "info", "debug"];
192201

@@ -282,17 +291,6 @@ export function parseConfig(raw: unknown): ParsedCodebaseIndexConfig {
282291
metrics: typeof rawDebug.metrics === "boolean" ? rawDebug.metrics : defaultDebug.metrics,
283292
};
284293

285-
const defaultReranker = getDefaultRerankerConfig();
286-
const rawReranker = (input.reranker && typeof input.reranker === "object" ? input.reranker : {}) as Record<string, unknown>;
287-
const reranker: RerankerConfig = {
288-
enabled: typeof rawReranker.enabled === "boolean" ? rawReranker.enabled : defaultReranker.enabled,
289-
baseUrl: typeof rawReranker.baseUrl === "string" ? rawReranker.baseUrl.trim().replace(/\/+$/, '') : defaultReranker.baseUrl,
290-
model: typeof rawReranker.model === "string" ? rawReranker.model : defaultReranker.model,
291-
apiKey: getResolvedString(rawReranker.apiKey, "$root.reranker.apiKey"),
292-
topN: typeof rawReranker.topN === "number" ? Math.max(1, Math.min(200, Math.floor(rawReranker.topN))) : defaultReranker.topN,
293-
timeoutMs: typeof rawReranker.timeoutMs === "number" ? Math.max(1000, rawReranker.timeoutMs) : defaultReranker.timeoutMs,
294-
};
295-
296294
const rawKnowledgeBases = input.knowledgeBases;
297295
const knowledgeBases: string[] = isStringArray(rawKnowledgeBases)
298296
? rawKnowledgeBases.filter(p => typeof p === "string" && p.trim().length > 0).map(p => p.trim())
@@ -306,6 +304,7 @@ export function parseConfig(raw: unknown): ParsedCodebaseIndexConfig {
306304
let embeddingProvider: EmbeddingProvider | 'custom' | 'auto';
307305
let embeddingModel: EmbeddingModelName | undefined = undefined;
308306
let customProvider: CustomProviderConfig | undefined = undefined;
307+
let reranker: RerankerConfig | undefined = undefined;
309308

310309
if (embeddingProviderValue === 'custom') {
311310
embeddingProvider = 'custom';
@@ -359,6 +358,39 @@ export function parseConfig(raw: unknown): ParsedCodebaseIndexConfig {
359358
embeddingProvider = 'auto';
360359
}
361360

361+
const rawReranker = (input.reranker && typeof input.reranker === "object"
362+
? input.reranker
363+
: {}) as Record<string, unknown>;
364+
const rerankerEnabled = typeof rawReranker.enabled === "boolean" ? rawReranker.enabled : false;
365+
if (rerankerEnabled) {
366+
const provider = isValidRerankerProvider(rawReranker.provider) ? rawReranker.provider : "custom";
367+
const model = getResolvedString(rawReranker.model, "$root.reranker.model");
368+
if (!model || model.trim().length === 0) {
369+
throw new Error("reranker is enabled but reranker.model is missing or invalid.");
370+
}
371+
372+
const configuredBaseUrl = getResolvedString(rawReranker.baseUrl, "$root.reranker.baseUrl");
373+
const baseUrl = configuredBaseUrl?.trim() || getDefaultRerankerBaseUrl(provider);
374+
if (baseUrl.length === 0) {
375+
throw new Error("reranker is enabled but reranker.baseUrl is missing or invalid for provider 'custom'.");
376+
}
377+
378+
const apiKey = getResolvedString(rawReranker.apiKey, "$root.reranker.apiKey");
379+
if ((provider === "cohere" || provider === "jina") && (!apiKey || apiKey.trim().length === 0)) {
380+
throw new Error(`reranker provider '${provider}' requires reranker.apiKey when enabled.`);
381+
}
382+
383+
reranker = {
384+
enabled: true,
385+
provider,
386+
model: model.trim(),
387+
baseUrl: baseUrl.replace(/\/+$/, ""),
388+
apiKey: apiKey?.trim() || undefined,
389+
topN: typeof rawReranker.topN === "number" ? Math.min(50, Math.max(1, Math.floor(rawReranker.topN))) : 15,
390+
timeoutMs: typeof rawReranker.timeoutMs === "number" ? Math.max(1000, Math.floor(rawReranker.timeoutMs)) : 10000,
391+
};
392+
}
393+
362394
return {
363395
embeddingProvider,
364396
embeddingModel,

src/eval/budget.ts

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,16 @@ export function evaluateBudgetGate(
2424
});
2525
}
2626

27+
if (
28+
thresholds.minRawDistinctTop3Ratio !== undefined &&
29+
summary.metrics.rawDistinctTop3Ratio < thresholds.minRawDistinctTop3Ratio
30+
) {
31+
violations.push({
32+
metric: "minRawDistinctTop3Ratio",
33+
message: `Raw Distinct Top@3 ${summary.metrics.rawDistinctTop3Ratio.toFixed(4)} is below minimum ${thresholds.minRawDistinctTop3Ratio.toFixed(4)}`,
34+
});
35+
}
36+
2737
if (comparison) {
2838
if (
2939
thresholds.hitAt5MaxDrop !== undefined &&
@@ -45,6 +55,16 @@ export function evaluateBudgetGate(
4555
});
4656
}
4757

58+
if (
59+
thresholds.rawDistinctTop3RatioMaxDrop !== undefined &&
60+
comparison.deltas.rawDistinctTop3Ratio.absolute < -thresholds.rawDistinctTop3RatioMaxDrop
61+
) {
62+
violations.push({
63+
metric: "rawDistinctTop3RatioMaxDrop",
64+
message: `Raw Distinct Top@3 drop ${comparison.deltas.rawDistinctTop3Ratio.absolute.toFixed(4)} exceeds allowed -${thresholds.rawDistinctTop3RatioMaxDrop.toFixed(4)}`,
65+
});
66+
}
67+
4868
if (thresholds.p95LatencyMaxMultiplier !== undefined) {
4969
const baselineP95 = comparison.deltas.latencyP95Ms.baseline;
5070
if (baselineP95 > BASELINE_P95_EPSILON_MS) {

src/eval/cli.ts

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -336,8 +336,12 @@ export async function handleEvalCommand(args: string[], cwd: string): Promise<nu
336336
if (!parsed.againstPath.endsWith(".json")) {
337337
throw new Error("eval diff --against must point to a summary JSON file");
338338
}
339-
const currentSummary = loadSummary(path.resolve(parsed.projectRoot, currentPath));
340-
const baselineSummary = loadSummary(path.resolve(parsed.projectRoot, parsed.againstPath));
339+
const currentSummary = loadSummary(path.resolve(parsed.projectRoot, currentPath), {
340+
allowLegacyDiversityMetrics: true,
341+
});
342+
const baselineSummary = loadSummary(path.resolve(parsed.projectRoot, parsed.againstPath), {
343+
allowLegacyDiversityMetrics: true,
344+
});
341345
const comparison = compareSummaries(
342346
currentSummary,
343347
baselineSummary,

src/eval/compare.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ export function compareSummaries(current: EvalSummary, baseline: EvalSummary, ag
2121
hitAt10: metricDelta(current.metrics.hitAt10, baseline.metrics.hitAt10),
2222
mrrAt10: metricDelta(current.metrics.mrrAt10, baseline.metrics.mrrAt10),
2323
ndcgAt10: metricDelta(current.metrics.ndcgAt10, baseline.metrics.ndcgAt10),
24+
distinctTop3Ratio: metricDelta(current.metrics.distinctTop3Ratio, baseline.metrics.distinctTop3Ratio),
25+
rawDistinctTop3Ratio: metricDelta(current.metrics.rawDistinctTop3Ratio, baseline.metrics.rawDistinctTop3Ratio),
2426
latencyP50Ms: metricDelta(current.metrics.latencyMs.p50, baseline.metrics.latencyMs.p50),
2527
latencyP95Ms: metricDelta(current.metrics.latencyMs.p95, baseline.metrics.latencyMs.p95),
2628
latencyP99Ms: metricDelta(current.metrics.latencyMs.p99, baseline.metrics.latencyMs.p99),

src/eval/metrics.ts

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,13 @@ function uniqueResultsByPath(results: PerQueryEvalResult["results"]): PerQueryEv
3939
return unique;
4040
}
4141

42+
function distinctTopKRatio(results: PerQueryEvalResult["results"], k: number): number {
43+
const top = results.slice(0, k);
44+
if (top.length === 0) return 0;
45+
const distinct = new Set(top.map((result) => normalizePath(result.filePath))).size;
46+
return distinct / top.length;
47+
}
48+
4249
export function pathMatchesExpected(actualPath: string, expectedPath: string): boolean {
4350
const actual = normalizePath(actualPath);
4451
const expected = normalizePath(expectedPath);
@@ -149,6 +156,7 @@ export function buildPerQueryResult(
149156
reciprocalRankAt10: reciprocalRankAtK(deduped, relevantPaths, 10),
150157
ndcgAt10: ndcgAtK(deduped, relevantPaths, 10),
151158
failureBucket: classifyFailureBucket(query, results, k),
159+
rawTop3DistinctRatio: distinctTopKRatio(results, 3),
152160
results: deduped,
153161
};
154162

@@ -172,6 +180,8 @@ export function computeEvalMetrics(
172180
hitAt10: 0,
173181
mrrAt10: 0,
174182
ndcgAt10: 0,
183+
distinctTop3Ratio: 0,
184+
rawDistinctTop3Ratio: 0,
175185
};
176186

177187
const failureBuckets: Record<FailureBucket, number> = {
@@ -190,6 +200,8 @@ export function computeEvalMetrics(
190200
if (query.hitAt10) sum.hitAt10 += 1;
191201
sum.mrrAt10 += query.reciprocalRankAt10;
192202
sum.ndcgAt10 += query.ndcgAt10;
203+
sum.distinctTop3Ratio += distinctTopKRatio(query.results, 3);
204+
sum.rawDistinctTop3Ratio += query.rawTop3DistinctRatio;
193205
if (query.failureBucket) {
194206
failureBuckets[query.failureBucket] += 1;
195207
}
@@ -204,6 +216,8 @@ export function computeEvalMetrics(
204216
hitAt10: safeDiv(sum.hitAt10),
205217
mrrAt10: safeDiv(sum.mrrAt10),
206218
ndcgAt10: safeDiv(sum.ndcgAt10),
219+
distinctTop3Ratio: safeDiv(sum.distinctTop3Ratio),
220+
rawDistinctTop3Ratio: safeDiv(sum.rawDistinctTop3Ratio),
207221
latencyMs: {
208222
p50: percentile(latencies, 0.5),
209223
p95: percentile(latencies, 0.95),

0 commit comments

Comments
 (0)