Helweg
diff --git a/‎README.md‎
Lines changed: 16 additions & 28 deletions b/‎README.md‎
Lines changed: 16 additions & 28 deletions
diff --git a/‎benchmarks/baselines/eval-baseline-summary.json‎
Lines changed: 2 additions & 0 deletions b/‎benchmarks/baselines/eval-baseline-summary.json‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎benchmarks/baselines/retrieval-baseline.json‎
Lines changed: 1 addition & 0 deletions b/‎benchmarks/baselines/retrieval-baseline.json‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎benchmarks/budgets/default.json‎
Lines changed: 3 additions & 1 deletion b/‎benchmarks/budgets/default.json‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎benchmarks/budgets/github-models.json‎
Lines changed: 1 addition & 0 deletions b/‎benchmarks/budgets/github-models.json‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/config/schema.ts‎
Lines changed: 69 additions & 37 deletions b/‎src/config/schema.ts‎
Lines changed: 69 additions & 37 deletions
diff --git a/‎src/eval/budget.ts‎
Lines changed: 20 additions & 0 deletions b/‎src/eval/budget.ts‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎src/eval/cli.ts‎
Lines changed: 6 additions & 2 deletions b/‎src/eval/cli.ts‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎src/eval/compare.ts‎
Lines changed: 2 additions & 0 deletions b/‎src/eval/compare.ts‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/eval/metrics.ts‎
Lines changed: 14 additions & 0 deletions b/‎src/eval/metrics.ts‎
Lines changed: 14 additions & 0 deletions
@@ -532,18 +532,14 @@ Zero-config by default (uses `auto` mode). Customize in `.opencode/codebase-inde
     "rerankTopN": 20,                         // Deterministic rerank depth
     "contextLines": 0                         // Extra lines before/after match
   },
-
-  // === Reranking API ===
   "reranker": {
-    "enabled": true,                          // Enable API reranking
-    "baseUrl": "https://api.siliconflow.cn/v1",
-    "model": "BAAI/bge-reranker-v2-m3",
-    "apiKey": "{env:SILICONFLOW_API_KEY}",
-    "topN": 20,                               // Number of results to rerank
-    "timeoutMs": 30000                        // Request timeout (ms)
+    "enabled": false,
+    "provider": "cohere",
+    "model": "rerank-v3.5",
+    "apiKey": "{env:RERANK_API_KEY}",
+    "topN": 15,
+    "timeoutMs": 10000
   },
-
-  // === Debug ===
   "debug": {
     "enabled": false,                         // Enable debug logging
     "logLevel": "info",                       // error | warn | info | debug
@@ -604,23 +600,14 @@ String values in `codebase-index.json` can reference environment variables with
 | `rrfK` | `60` | RRF smoothing constant. Higher values flatten rank impact, lower values prioritize top-ranked candidates more strongly |
 | `rerankTopN` | `20` | Deterministic rerank depth cap. Applies lightweight name/path/chunk-type rerank to top-N only |
 | `contextLines` | `0` | Extra lines to include before/after each match |
-| **reranker** | | |
-| `reranker.enabled` | `false` | Enable API-based reranking |
-| `reranker.baseUrl` | - | Rerank API endpoint URL |
-| `reranker.model` | - | Reranking model name (e.g. `BAAI/bge-reranker-v2-m3`) |
-| `reranker.apiKey` | - | API key for reranking service (use `{env:VAR}` for security) |
-| `reranker.topN` | `20` | Number of top results to rerank via API |
-| `reranker.timeoutMs` | `30000` | Rerank API request timeout in milliseconds |
-| **customProvider** | | |
-| `customProvider.baseUrl` | - | Base URL of OpenAI-compatible embeddings API (e.g. `https://api.siliconflow.cn/v1`) |
-| `customProvider.model` | - | Model name (e.g. `BAAI/bge-m3`, `nomic-embed-text`) |
-| `customProvider.dimensions` | - | Vector dimensions (e.g. `1024` for BGE-M3, `768` for nomic-embed-text) |
-| `customProvider.apiKey` | - | API key (use `{env:VAR}` for security) |
-| `customProvider.maxTokens` | `8192` | Max tokens per input text |
-| `customProvider.timeoutMs` | `30000` | Request timeout in milliseconds |
-| `customProvider.concurrency` | `3` | Max concurrent embedding requests |
-| `customProvider.requestIntervalMs` | `1000` | Minimum delay between requests (ms). Set to `0` for local servers |
-| `customProvider.maxBatchSize` | - | Max inputs per `/embeddings` request. Cap for servers with batch limits |
+| **reranker** | | Optional second-stage model reranker for the top candidate pool |
+| `enabled` | `false` | Turn external reranking on/off |
+| `provider` | `"custom"` | Reranker provider: `cohere` or `jina` (hosted shortcuts with a default `baseUrl`), or `custom` (bring your own endpoint) |
+| `model` | — | Reranker model name. Required when `enabled` is `true` |
+| `baseUrl` | provider default | Reranker endpoint base URL. Defaults: `cohere` → `https://api.cohere.ai/v1`, `jina` → `https://api.jina.ai/v1`. Required when `provider` is `custom` |
+| `apiKey` | — | API key for the reranker service (use `{env:VAR}` for security). Required for `cohere` and `jina` |
+| `topN` | `15` | Number of top candidates to send to the external reranker (clamped to 1–50) |
+| `timeoutMs` | `10000` | Timeout for external rerank requests, in milliseconds (minimum `1000`) |
 | **debug** | | |
 | `enabled` | `false` | Enable debug logging and metrics collection |
 | `logLevel` | `"info"` | Log level: `error`, `warn`, `info`, `debug` |
@@ -633,9 +620,10 @@ String values in `codebase-index.json` can reference environment variables with
 
 ### Retrieval ranking behavior
 
-- `codebase_search` and `codebase_peek` use the hybrid path: semantic + keyword retrieval → fusion (`fusionStrategy`) → deterministic rerank (`rerankTopN`) → filtering.
+- `codebase_search` and `codebase_peek` use the hybrid path: semantic + keyword retrieval → fusion (`fusionStrategy`) → deterministic rerank (`rerankTopN`) → optional external reranker (`reranker`) → filtering.
 - `find_similar` stays semantic-only: semantic retrieval + deterministic rerank only (no keyword retrieval, no RRF).
 - For compatibility rollbacks, set `search.fusionStrategy` to `"weighted"` to use the legacy weighted fusion path.
+- When enabled, the external reranker sees path metadata plus a bounded on-disk code snippet for each candidate so it can distinguish real implementations from docs/tests more reliably.
 - Retrieval benchmark artifacts are separated by role:
   - baseline (versioned): `benchmarks/baselines/retrieval-baseline.json`
   - latest candidate run (generated): `benchmark-results/retrieval-candidate.json`
 
@@ -19,6 +19,8 @@
     "hitAt10": 1,
     "mrrAt10": 0.875,
     "ndcgAt10": 0.9127302324517832,
+    "distinctTop3Ratio": 1,
+    "rawDistinctTop3Ratio": 1,
     "latencyMs": {
       "p50": 26.173166000000037,
       "p95": 52.931082999999944,
 
@@ -2,6 +2,7 @@
   "generatedAt": "2026-03-13T14:21:43.213Z",
   "queryCount": 3,
   "hitAt5": 1,
+  "distinctTop3Ratio": 0.9166666666666666,
   "medianMs": 0.010916000000008808,
   "p95Ms": 0.024666000000024724
 }
@@ -5,9 +5,11 @@
   "thresholds": {
     "hitAt5MaxDrop": 0.03,
     "mrrAt10MaxDrop": 0.03,
+    "rawDistinctTop3RatioMaxDrop": 0.1,
     "p95LatencyMaxMultiplier": 1.35,
     "p95LatencyMaxAbsoluteMs": 4000,
     "minHitAt5": 0.4,
-    "minMrrAt10": 0.25
+    "minMrrAt10": 0.25,
+    "minRawDistinctTop3Ratio": 0.5
   }
 }
@@ -4,6 +4,7 @@
   "thresholds": {
     "minHitAt5": 0.5,
     "minMrrAt10": 0.45,
+    "minRawDistinctTop3Ratio": 0.5,
     "p95LatencyMaxAbsoluteMs": 500
   }
 }
@@ -49,6 +49,25 @@ export interface SearchConfig {
   contextLines: number;
 }
 
+export type RerankerProvider = "cohere" | "jina" | "custom"; // "cohere"/"jina" have built-in default base URLs; "custom" needs an explicit baseUrl
+
+export interface RerankerConfig { // Settings for the optional external reranking stage.
+  /** Whether external reranking is enabled. Default: false */
+  enabled: boolean;
+  /** Provider shortcut for hosted rerank APIs. Use 'custom' to supply your own baseUrl. Default: 'custom' */
+  provider: RerankerProvider;
+  /** Model name for reranking. Required when the reranker is enabled. */
+  model: string;
+  /** Base URL of the rerank API endpoint. Defaults per provider; required for 'custom'. */
+  baseUrl: string;
+  /** API key for the rerank service. Required for 'cohere' and 'jina'. */
+  apiKey?: string;
+  /** Number of top documents to rerank. Default: 15, clamped to 1-50. */
+  topN: number;
+  /** Request timeout in milliseconds. Default: 10000, minimum 1000. */
+  timeoutMs: number;
+}
+
 export type LogLevel = "error" | "warn" | "info" | "debug";
 
 export interface DebugConfig {
@@ -83,21 +102,6 @@ export interface CustomProviderConfig {
   max_batch_size?: number;
 }
 
-export interface RerankerConfig {
-  /** Whether to enable reranking. Default: false */
-  enabled: boolean;
-  /** Base URL of the rerank API endpoint (e.g. "https://api.siliconflow.cn/v1") */
-  baseUrl: string;
-  /** Model name for reranking (e.g. "BAAI/bge-reranker-v2-m3") */
-  model: string;
-  /** API key for the rerank service */
-  apiKey?: string;
-  /** Number of top documents to rerank. Default: 20 */
-  topN?: number;
-  /** Request timeout in milliseconds. Default: 30000 */
-  timeoutMs?: number;
-}
-
 export interface CodebaseIndexConfig {
   embeddingProvider: EmbeddingProvider | 'custom' | 'auto';
   embeddingModel?: EmbeddingModelName;
@@ -123,7 +127,7 @@ export type ParsedCodebaseIndexConfig = CodebaseIndexConfig & {
   indexing: IndexingConfig;
   search: SearchConfig;
   debug: DebugConfig;
-  reranker: RerankerConfig;
+  reranker?: RerankerConfig;
   knowledgeBases: string[];
   additionalInclude: string[];
 };
@@ -164,6 +168,21 @@ function isValidFusionStrategy(value: unknown): value is SearchConfig["fusionStr
   return value === "weighted" || value === "rrf";
 }
 
+function isValidRerankerProvider(value: unknown): value is RerankerProvider { // Type guard for an unvalidated config value.
+  return value === "cohere" || value === "jina" || value === "custom"; // Keep in sync with the RerankerProvider union.
+}
+
+function getDefaultRerankerBaseUrl(provider: RerankerProvider): string { // Default API base URL for a provider shortcut.
+  switch (provider) {
+    case "cohere":
+      return "https://api.cohere.ai/v1";
+    case "jina":
+      return "https://api.jina.ai/v1";
+    case "custom":
+      return ""; // No default: parseConfig treats an empty base URL as missing and throws.
+  }
+}
+
 function getDefaultDebugConfig(): DebugConfig {
   return {
     enabled: false,
@@ -177,16 +196,6 @@ function getDefaultDebugConfig(): DebugConfig {
   };
 }
 
-function getDefaultRerankerConfig(): RerankerConfig {
-  return {
-    enabled: false,
-    baseUrl: "https://api.siliconflow.cn/v1",
-    model: "BAAI/bge-reranker-v2-m3",
-    topN: 20,
-    timeoutMs: 30000,
-  };
-}
-
 const VALID_SCOPES: IndexScope[] = ["project", "global"];
 const VALID_LOG_LEVELS: LogLevel[] = ["error", "warn", "info", "debug"];
 
@@ -282,17 +291,6 @@ export function parseConfig(raw: unknown): ParsedCodebaseIndexConfig {
     metrics: typeof rawDebug.metrics === "boolean" ? rawDebug.metrics : defaultDebug.metrics,
   };
 
-  const defaultReranker = getDefaultRerankerConfig();
-  const rawReranker = (input.reranker && typeof input.reranker === "object" ? input.reranker : {}) as Record<string, unknown>;
-  const reranker: RerankerConfig = {
-    enabled: typeof rawReranker.enabled === "boolean" ? rawReranker.enabled : defaultReranker.enabled,
-    baseUrl: typeof rawReranker.baseUrl === "string" ? rawReranker.baseUrl.trim().replace(/\/+$/, '') : defaultReranker.baseUrl,
-    model: typeof rawReranker.model === "string" ? rawReranker.model : defaultReranker.model,
-    apiKey: getResolvedString(rawReranker.apiKey, "$root.reranker.apiKey"),
-    topN: typeof rawReranker.topN === "number" ? Math.max(1, Math.min(200, Math.floor(rawReranker.topN))) : defaultReranker.topN,
-    timeoutMs: typeof rawReranker.timeoutMs === "number" ? Math.max(1000, rawReranker.timeoutMs) : defaultReranker.timeoutMs,
-  };
-
   const rawKnowledgeBases = input.knowledgeBases;
   const knowledgeBases: string[] = isStringArray(rawKnowledgeBases)
     ? rawKnowledgeBases.filter(p => typeof p === "string" && p.trim().length > 0).map(p => p.trim())
@@ -306,6 +304,7 @@ export function parseConfig(raw: unknown): ParsedCodebaseIndexConfig {
   let embeddingProvider: EmbeddingProvider | 'custom' | 'auto';
   let embeddingModel: EmbeddingModelName | undefined = undefined;
   let customProvider: CustomProviderConfig | undefined = undefined;
+  let reranker: RerankerConfig | undefined = undefined;
 
   if (embeddingProviderValue === 'custom') {
     embeddingProvider = 'custom';
@@ -359,6 +358,39 @@ export function parseConfig(raw: unknown): ParsedCodebaseIndexConfig {
     embeddingProvider = 'auto';
   }
 
+  const rawReranker = (input.reranker && typeof input.reranker === "object"
+    ? input.reranker
+    : {}) as Record<string, unknown>;
+  const rerankerEnabled = typeof rawReranker.enabled === "boolean" ? rawReranker.enabled : false;
+  if (rerankerEnabled) {
+    const provider = isValidRerankerProvider(rawReranker.provider) ? rawReranker.provider : "custom";
+    const model = getResolvedString(rawReranker.model, "$root.reranker.model");
+    if (!model || model.trim().length === 0) {
+      throw new Error("reranker is enabled but reranker.model is missing or invalid.");
+    }
+
+    const configuredBaseUrl = getResolvedString(rawReranker.baseUrl, "$root.reranker.baseUrl");
+    const baseUrl = configuredBaseUrl?.trim() || getDefaultRerankerBaseUrl(provider);
+    if (baseUrl.length === 0) {
+      throw new Error("reranker is enabled but reranker.baseUrl is missing or invalid for provider 'custom'.");
+    }
+
+    const apiKey = getResolvedString(rawReranker.apiKey, "$root.reranker.apiKey");
+    if ((provider === "cohere" || provider === "jina") && (!apiKey || apiKey.trim().length === 0)) {
+      throw new Error(`reranker provider '${provider}' requires reranker.apiKey when enabled.`);
+    }
+
+    reranker = {
+      enabled: true,
+      provider,
+      model: model.trim(),
+      baseUrl: baseUrl.replace(/\/+$/, ""),
+      apiKey: apiKey?.trim() || undefined,
+      topN: typeof rawReranker.topN === "number" ? Math.min(50, Math.max(1, Math.floor(rawReranker.topN))) : 15,
+      timeoutMs: typeof rawReranker.timeoutMs === "number" ? Math.max(1000, Math.floor(rawReranker.timeoutMs)) : 10000,
+    };
+  }
+
   return {
     embeddingProvider,
     embeddingModel,
 
@@ -24,6 +24,16 @@ export function evaluateBudgetGate(
     });
   }
 
+  if (
+    thresholds.minRawDistinctTop3Ratio !== undefined &&
+    summary.metrics.rawDistinctTop3Ratio < thresholds.minRawDistinctTop3Ratio
+  ) {
+    violations.push({
+      metric: "minRawDistinctTop3Ratio",
+      message: `Raw Distinct Top@3 ${summary.metrics.rawDistinctTop3Ratio.toFixed(4)} is below minimum ${thresholds.minRawDistinctTop3Ratio.toFixed(4)}`,
+    });
+  }
+
   if (comparison) {
     if (
       thresholds.hitAt5MaxDrop !== undefined &&
@@ -45,6 +55,16 @@ export function evaluateBudgetGate(
       });
     }
 
+    if (
+      thresholds.rawDistinctTop3RatioMaxDrop !== undefined &&
+      comparison.deltas.rawDistinctTop3Ratio.absolute < -thresholds.rawDistinctTop3RatioMaxDrop
+    ) {
+      violations.push({
+        metric: "rawDistinctTop3RatioMaxDrop",
+        message: `Raw Distinct Top@3 drop ${comparison.deltas.rawDistinctTop3Ratio.absolute.toFixed(4)} exceeds allowed -${thresholds.rawDistinctTop3RatioMaxDrop.toFixed(4)}`,
+      });
+    }
+
     if (thresholds.p95LatencyMaxMultiplier !== undefined) {
       const baselineP95 = comparison.deltas.latencyP95Ms.baseline;
       if (baselineP95 > BASELINE_P95_EPSILON_MS) {
 
@@ -336,8 +336,12 @@ export async function handleEvalCommand(args: string[], cwd: string): Promise<nu
     if (!parsed.againstPath.endsWith(".json")) {
       throw new Error("eval diff --against must point to a summary JSON file");
     }
-    const currentSummary = loadSummary(path.resolve(parsed.projectRoot, currentPath));
-    const baselineSummary = loadSummary(path.resolve(parsed.projectRoot, parsed.againstPath));
+    const currentSummary = loadSummary(path.resolve(parsed.projectRoot, currentPath), {
+      allowLegacyDiversityMetrics: true,
+    });
+    const baselineSummary = loadSummary(path.resolve(parsed.projectRoot, parsed.againstPath), {
+      allowLegacyDiversityMetrics: true,
+    });
     const comparison = compareSummaries(
       currentSummary,
       baselineSummary,
 
@@ -21,6 +21,8 @@ export function compareSummaries(current: EvalSummary, baseline: EvalSummary, ag
       hitAt10: metricDelta(current.metrics.hitAt10, baseline.metrics.hitAt10),
       mrrAt10: metricDelta(current.metrics.mrrAt10, baseline.metrics.mrrAt10),
       ndcgAt10: metricDelta(current.metrics.ndcgAt10, baseline.metrics.ndcgAt10),
+      distinctTop3Ratio: metricDelta(current.metrics.distinctTop3Ratio, baseline.metrics.distinctTop3Ratio),
+      rawDistinctTop3Ratio: metricDelta(current.metrics.rawDistinctTop3Ratio, baseline.metrics.rawDistinctTop3Ratio),
       latencyP50Ms: metricDelta(current.metrics.latencyMs.p50, baseline.metrics.latencyMs.p50),
       latencyP95Ms: metricDelta(current.metrics.latencyMs.p95, baseline.metrics.latencyMs.p95),
       latencyP99Ms: metricDelta(current.metrics.latencyMs.p99, baseline.metrics.latencyMs.p99),
 
@@ -39,6 +39,13 @@ function uniqueResultsByPath(results: PerQueryEvalResult["results"]): PerQueryEv
   return unique;
 }
 
+function distinctTopKRatio(results: PerQueryEvalResult["results"], k: number): number { // Share of distinct file paths among the first k results.
+  const top = results.slice(0, k);
+  if (top.length === 0) return 0; // Nothing to rank: report 0 instead of dividing by zero.
+  const distinct = new Set(top.map((result) => normalizePath(result.filePath))).size; // Paths are compared after normalizePath.
+  return distinct / top.length; // Denominator is the number of results actually present, which may be fewer than k.
+}
+
 export function pathMatchesExpected(actualPath: string, expectedPath: string): boolean {
   const actual = normalizePath(actualPath);
   const expected = normalizePath(expectedPath);
@@ -149,6 +156,7 @@ export function buildPerQueryResult(
     reciprocalRankAt10: reciprocalRankAtK(deduped, relevantPaths, 10),
     ndcgAt10: ndcgAtK(deduped, relevantPaths, 10),
     failureBucket: classifyFailureBucket(query, results, k),
+    rawTop3DistinctRatio: distinctTopKRatio(results, 3),
     results: deduped,
   };
 
@@ -172,6 +180,8 @@ export function computeEvalMetrics(
     hitAt10: 0,
     mrrAt10: 0,
     ndcgAt10: 0,
+    distinctTop3Ratio: 0,
+    rawDistinctTop3Ratio: 0,
   };
 
   const failureBuckets: Record<FailureBucket, number> = {
@@ -190,6 +200,8 @@ export function computeEvalMetrics(
     if (query.hitAt10) sum.hitAt10 += 1;
     sum.mrrAt10 += query.reciprocalRankAt10;
     sum.ndcgAt10 += query.ndcgAt10;
+    sum.distinctTop3Ratio += distinctTopKRatio(query.results, 3);
+    sum.rawDistinctTop3Ratio += query.rawTop3DistinctRatio;
     if (query.failureBucket) {
       failureBuckets[query.failureBucket] += 1;
     }
@@ -204,6 +216,8 @@ export function computeEvalMetrics(
     hitAt10: safeDiv(sum.hitAt10),
     mrrAt10: safeDiv(sum.mrrAt10),
     ndcgAt10: safeDiv(sum.ndcgAt10),
+    distinctTop3Ratio: safeDiv(sum.distinctTop3Ratio),
+    rawDistinctTop3Ratio: safeDiv(sum.rawDistinctTop3Ratio),
     latencyMs: {
       p50: percentile(latencies, 0.5),
       p95: percentile(latencies, 0.95),
Original file line number	Diff line number	Diff line change
`@@ -2,6 +2,7 @@`
`2`	`2`	`"generatedAt": "2026-03-13T14:21:43.213Z",`
`3`	`3`	`"queryCount": 3,`
`4`	`4`	`"hitAt5": 1,`
	`5`	`+ "distinctTop3Ratio": 0.9166666666666666,`
`5`	`6`	`"medianMs": 0.010916000000008808,`
`6`	`7`	`"p95Ms": 0.024666000000024724`
`7`	`8`	`}`
Original file line number	Diff line number	Diff line change
`@@ -5,9 +5,11 @@`
`5`	`5`	`"thresholds": {`
`6`	`6`	`"hitAt5MaxDrop": 0.03,`
`7`	`7`	`"mrrAt10MaxDrop": 0.03,`
	`8`	`+ "rawDistinctTop3RatioMaxDrop": 0.1,`
`8`	`9`	`"p95LatencyMaxMultiplier": 1.35,`
`9`	`10`	`"p95LatencyMaxAbsoluteMs": 4000,`
`10`	`11`	`"minHitAt5": 0.4,`
`11`		`- "minMrrAt10": 0.25`
	`12`	`+ "minMrrAt10": 0.25,`
	`13`	`+ "minRawDistinctTop3Ratio": 0.5`
`12`	`14`	`}`
`13`	`15`	`}`
Original file line number	Diff line number	Diff line change
`@@ -4,6 +4,7 @@`
`4`	`4`	`"thresholds": {`
`5`	`5`	`"minHitAt5": 0.5,`
`6`	`6`	`"minMrrAt10": 0.45,`
	`7`	`+ "minRawDistinctTop3Ratio": 0.5,`
`7`	`8`	`"p95LatencyMaxAbsoluteMs": 500`
`8`	`9`	`}`
`9`	`10`	`}`