Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 16 additions & 28 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -532,18 +532,14 @@ Zero-config by default (uses `auto` mode). Customize in `.opencode/codebase-inde
"rerankTopN": 20, // Deterministic rerank depth
"contextLines": 0 // Extra lines before/after match
},

// === Reranking API ===
"reranker": {
"enabled": true, // Enable API reranking
"baseUrl": "https://api.siliconflow.cn/v1",
"model": "BAAI/bge-reranker-v2-m3",
"apiKey": "{env:SILICONFLOW_API_KEY}",
"topN": 20, // Number of results to rerank
"timeoutMs": 30000 // Request timeout (ms)
"enabled": false,
"provider": "cohere",
"model": "rerank-v3.5",
"apiKey": "{env:RERANK_API_KEY}",
"topN": 15,
"timeoutMs": 10000
},

// === Debug ===
"debug": {
"enabled": false, // Enable debug logging
"logLevel": "info", // error | warn | info | debug
Expand Down Expand Up @@ -604,23 +600,14 @@ String values in `codebase-index.json` can reference environment variables with
| `rrfK` | `60` | RRF smoothing constant. Higher values flatten rank impact, lower values prioritize top-ranked candidates more strongly |
| `rerankTopN` | `20` | Deterministic rerank depth cap. Applies lightweight name/path/chunk-type rerank to top-N only |
| `contextLines` | `0` | Extra lines to include before/after each match |
| **reranker** | | |
| `reranker.enabled` | `false` | Enable API-based reranking |
| `reranker.baseUrl` | - | Rerank API endpoint URL |
| `reranker.model` | - | Reranking model name (e.g. `BAAI/bge-reranker-v2-m3`) |
| `reranker.apiKey` | - | API key for reranking service (use `{env:VAR}` for security) |
| `reranker.topN` | `20` | Number of top results to rerank via API |
| `reranker.timeoutMs` | `30000` | Rerank API request timeout in milliseconds |
| **customProvider** | | |
| `customProvider.baseUrl` | - | Base URL of OpenAI-compatible embeddings API (e.g. `https://api.siliconflow.cn/v1`) |
| `customProvider.model` | - | Model name (e.g. `BAAI/bge-m3`, `nomic-embed-text`) |
| `customProvider.dimensions` | - | Vector dimensions (e.g. `1024` for BGE-M3, `768` for nomic-embed-text) |
| `customProvider.apiKey` | - | API key (use `{env:VAR}` for security) |
| `customProvider.maxTokens` | `8192` | Max tokens per input text |
| `customProvider.timeoutMs` | `30000` | Request timeout in milliseconds |
| `customProvider.concurrency` | `3` | Max concurrent embedding requests |
| `customProvider.requestIntervalMs` | `1000` | Minimum delay between requests (ms). Set to `0` for local servers |
| `customProvider.maxBatchSize` | - | Max inputs per `/embeddings` request. Cap for servers with batch limits |
| **reranker** | | Optional second-stage model reranker for the top candidate pool |
| `enabled` | `false` | Turn external reranking on/off |
| `provider` | `"custom"` | Hosted shortcuts: `cohere`, `jina`, or `custom` |
| `model` | — | Reranker model name. Required when `enabled` is `true` |
| `baseUrl` | provider default | Override reranker endpoint base URL. `cohere` → `https://api.cohere.ai/v1`, `jina` → `https://api.jina.ai/v1` |
| `apiKey` | — | API key for the reranker service. Required when `provider` is `cohere` or `jina` (use `{env:VAR}` for security) |
| `topN` | `15` | Number of top candidates to send to the external reranker (clamped to the range 1–50) |
| `timeoutMs` | `10000` | Timeout for external rerank requests, in milliseconds (minimum `1000`) |
| **debug** | | |
| `enabled` | `false` | Enable debug logging and metrics collection |
| `logLevel` | `"info"` | Log level: `error`, `warn`, `info`, `debug` |
Expand All @@ -633,9 +620,10 @@ String values in `codebase-index.json` can reference environment variables with

### Retrieval ranking behavior

- `codebase_search` and `codebase_peek` use the hybrid path: semantic + keyword retrieval → fusion (`fusionStrategy`) → deterministic rerank (`rerankTopN`) → filtering.
- `codebase_search` and `codebase_peek` use the hybrid path: semantic + keyword retrieval → fusion (`fusionStrategy`) → deterministic rerank (`rerankTopN`) → optional external reranker (`reranker`) → filtering.
- `find_similar` stays semantic-only: semantic retrieval + deterministic rerank only (no keyword retrieval, no RRF).
- For compatibility rollbacks, set `search.fusionStrategy` to `"weighted"` to use the legacy weighted fusion path.
- When enabled, the external reranker sees path metadata plus a bounded on-disk code snippet for each candidate so it can distinguish real implementations from docs/tests more reliably.
- Retrieval benchmark artifacts are separated by role:
- baseline (versioned): `benchmarks/baselines/retrieval-baseline.json`
- latest candidate run (generated): `benchmark-results/retrieval-candidate.json`
Expand Down
2 changes: 2 additions & 0 deletions benchmarks/baselines/eval-baseline-summary.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
"hitAt10": 1,
"mrrAt10": 0.875,
"ndcgAt10": 0.9127302324517832,
"distinctTop3Ratio": 1,
"rawDistinctTop3Ratio": 1,
"latencyMs": {
"p50": 26.173166000000037,
"p95": 52.931082999999944,
Expand Down
1 change: 1 addition & 0 deletions benchmarks/baselines/retrieval-baseline.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"generatedAt": "2026-03-13T14:21:43.213Z",
"queryCount": 3,
"hitAt5": 1,
"distinctTop3Ratio": 0.9166666666666666,
"medianMs": 0.010916000000008808,
"p95Ms": 0.024666000000024724
}
4 changes: 3 additions & 1 deletion benchmarks/budgets/default.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@
"thresholds": {
"hitAt5MaxDrop": 0.03,
"mrrAt10MaxDrop": 0.03,
"rawDistinctTop3RatioMaxDrop": 0.1,
"p95LatencyMaxMultiplier": 1.35,
"p95LatencyMaxAbsoluteMs": 4000,
"minHitAt5": 0.4,
"minMrrAt10": 0.25
"minMrrAt10": 0.25,
"minRawDistinctTop3Ratio": 0.5
}
}
1 change: 1 addition & 0 deletions benchmarks/budgets/github-models.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"thresholds": {
"minHitAt5": 0.5,
"minMrrAt10": 0.45,
"minRawDistinctTop3Ratio": 0.5,
"p95LatencyMaxAbsoluteMs": 500
}
}
106 changes: 69 additions & 37 deletions src/config/schema.ts
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,25 @@ export interface SearchConfig {
contextLines: number;
}

/**
 * Identifies which rerank backend to talk to: the hosted shortcuts "cohere"
 * and "jina", or "custom" for an endpoint supplied through `baseUrl`.
 */
export type RerankerProvider = "cohere" | "jina" | "custom";

/**
 * Settings for the optional external (API-based) reranker.
 * The defaults and limits noted below are the ones applied by `parseConfig`.
 */
export interface RerankerConfig {
/** Whether to enable reranking. Default: false */
enabled: boolean;
/**
 * Provider shortcut for hosted rerank APIs. Use 'custom' to provide only baseUrl.
 * Missing or unrecognized values fall back to 'custom' during parsing.
 */
provider: RerankerProvider;
/** Model name for reranking. Must be a non-empty string when reranking is enabled. */
model: string;
/**
 * Base URL of the rerank API endpoint. When not configured, the provider's
 * default URL is used; trailing slashes are stripped during parsing.
 */
baseUrl: string;
/** API key for the rerank service. Required for the 'cohere' and 'jina' providers. */
apiKey?: string;
/** Number of top documents to rerank. Default: 15, clamped to 1–50 during parsing. */
topN: number;
/** Request timeout in milliseconds. Default: 10000, raised to at least 1000 during parsing. */
timeoutMs: number;
}

export type LogLevel = "error" | "warn" | "info" | "debug";

export interface DebugConfig {
Expand Down Expand Up @@ -83,21 +102,6 @@ export interface CustomProviderConfig {
max_batch_size?: number;
}

/**
 * Settings for API-based reranking of search results.
 * NOTE(review): another `RerankerConfig` declaration (with a `provider` field and
 * required `topN`/`timeoutMs`) is also visible in this view — confirm which one is current.
 */
export interface RerankerConfig {
/** Whether to enable reranking. Default: false */
enabled: boolean;
/** Base URL of the rerank API endpoint (e.g. "https://api.siliconflow.cn/v1") */
baseUrl: string;
/** Model name for reranking (e.g. "BAAI/bge-reranker-v2-m3") */
model: string;
/** API key for the rerank service; optional at the type level */
apiKey?: string;
/** Number of top documents to rerank. Default: 20 */
topN?: number;
/** Request timeout in milliseconds. Default: 30000 */
timeoutMs?: number;
}

export interface CodebaseIndexConfig {
embeddingProvider: EmbeddingProvider | 'custom' | 'auto';
embeddingModel?: EmbeddingModelName;
Expand All @@ -123,7 +127,7 @@ export type ParsedCodebaseIndexConfig = CodebaseIndexConfig & {
indexing: IndexingConfig;
search: SearchConfig;
debug: DebugConfig;
reranker: RerankerConfig;
reranker?: RerankerConfig;
knowledgeBases: string[];
additionalInclude: string[];
};
Expand Down Expand Up @@ -164,6 +168,21 @@ function isValidFusionStrategy(value: unknown): value is SearchConfig["fusionStr
return value === "weighted" || value === "rrf";
}

/**
 * Type guard for reranker provider identifiers.
 *
 * @param value - Arbitrary value, typically read from raw configuration.
 * @returns `true` only when `value` is exactly "cohere", "jina", or "custom".
 */
function isValidRerankerProvider(value: unknown): value is RerankerProvider {
  switch (value) {
    case "cohere":
    case "jina":
    case "custom":
      return true;
    default:
      // Non-strings and differently-cased strings never match the literals above.
      return false;
  }
}

/**
 * Returns the built-in API base URL for a reranker provider.
 *
 * @param provider - Reranker provider identifier.
 * @returns The hosted endpoint for "cohere" or "jina"; an empty string for
 *   "custom", which has no built-in endpoint.
 */
function getDefaultRerankerBaseUrl(provider: RerankerProvider): string {
  if (provider === "cohere") {
    return "https://api.cohere.ai/v1";
  }
  if (provider === "jina") {
    return "https://api.jina.ai/v1";
  }
  // Only "custom" remains: there is no default endpoint to fall back to.
  return "";
}

function getDefaultDebugConfig(): DebugConfig {
return {
enabled: false,
Expand All @@ -177,16 +196,6 @@ function getDefaultDebugConfig(): DebugConfig {
};
}

/**
 * Builds the default reranker settings.
 *
 * @returns A new object on every call: reranking disabled, the SiliconFlow
 *   base URL, the BGE reranker model, a top-N of 20 and a 30000 ms timeout.
 */
function getDefaultRerankerConfig(): RerankerConfig {
  const defaults: RerankerConfig = {
    enabled: false,
    baseUrl: "https://api.siliconflow.cn/v1",
    model: "BAAI/bge-reranker-v2-m3",
    topN: 20,
    timeoutMs: 30000,
  };
  return defaults;
}

const VALID_SCOPES: IndexScope[] = ["project", "global"];
const VALID_LOG_LEVELS: LogLevel[] = ["error", "warn", "info", "debug"];

Expand Down Expand Up @@ -282,17 +291,6 @@ export function parseConfig(raw: unknown): ParsedCodebaseIndexConfig {
metrics: typeof rawDebug.metrics === "boolean" ? rawDebug.metrics : defaultDebug.metrics,
};

const defaultReranker = getDefaultRerankerConfig();
const rawReranker = (input.reranker && typeof input.reranker === "object" ? input.reranker : {}) as Record<string, unknown>;
const reranker: RerankerConfig = {
enabled: typeof rawReranker.enabled === "boolean" ? rawReranker.enabled : defaultReranker.enabled,
baseUrl: typeof rawReranker.baseUrl === "string" ? rawReranker.baseUrl.trim().replace(/\/+$/, '') : defaultReranker.baseUrl,
model: typeof rawReranker.model === "string" ? rawReranker.model : defaultReranker.model,
apiKey: getResolvedString(rawReranker.apiKey, "$root.reranker.apiKey"),
topN: typeof rawReranker.topN === "number" ? Math.max(1, Math.min(200, Math.floor(rawReranker.topN))) : defaultReranker.topN,
timeoutMs: typeof rawReranker.timeoutMs === "number" ? Math.max(1000, rawReranker.timeoutMs) : defaultReranker.timeoutMs,
};

const rawKnowledgeBases = input.knowledgeBases;
const knowledgeBases: string[] = isStringArray(rawKnowledgeBases)
? rawKnowledgeBases.filter(p => typeof p === "string" && p.trim().length > 0).map(p => p.trim())
Expand All @@ -306,6 +304,7 @@ export function parseConfig(raw: unknown): ParsedCodebaseIndexConfig {
let embeddingProvider: EmbeddingProvider | 'custom' | 'auto';
let embeddingModel: EmbeddingModelName | undefined = undefined;
let customProvider: CustomProviderConfig | undefined = undefined;
let reranker: RerankerConfig | undefined = undefined;

if (embeddingProviderValue === 'custom') {
embeddingProvider = 'custom';
Expand Down Expand Up @@ -359,6 +358,39 @@ export function parseConfig(raw: unknown): ParsedCodebaseIndexConfig {
embeddingProvider = 'auto';
}

const rawReranker = (input.reranker && typeof input.reranker === "object"
? input.reranker
: {}) as Record<string, unknown>;
const rerankerEnabled = typeof rawReranker.enabled === "boolean" ? rawReranker.enabled : false;
if (rerankerEnabled) {
const provider = isValidRerankerProvider(rawReranker.provider) ? rawReranker.provider : "custom";
const model = getResolvedString(rawReranker.model, "$root.reranker.model");
if (!model || model.trim().length === 0) {
throw new Error("reranker is enabled but reranker.model is missing or invalid.");
}

const configuredBaseUrl = getResolvedString(rawReranker.baseUrl, "$root.reranker.baseUrl");
const baseUrl = configuredBaseUrl?.trim() || getDefaultRerankerBaseUrl(provider);
if (baseUrl.length === 0) {
throw new Error("reranker is enabled but reranker.baseUrl is missing or invalid for provider 'custom'.");
}

const apiKey = getResolvedString(rawReranker.apiKey, "$root.reranker.apiKey");
if ((provider === "cohere" || provider === "jina") && (!apiKey || apiKey.trim().length === 0)) {
throw new Error(`reranker provider '${provider}' requires reranker.apiKey when enabled.`);
}

reranker = {
enabled: true,
provider,
model: model.trim(),
baseUrl: baseUrl.replace(/\/+$/, ""),
apiKey: apiKey?.trim() || undefined,
topN: typeof rawReranker.topN === "number" ? Math.min(50, Math.max(1, Math.floor(rawReranker.topN))) : 15,
timeoutMs: typeof rawReranker.timeoutMs === "number" ? Math.max(1000, Math.floor(rawReranker.timeoutMs)) : 10000,
};
}

return {
embeddingProvider,
embeddingModel,
Expand Down
20 changes: 20 additions & 0 deletions src/eval/budget.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,16 @@ export function evaluateBudgetGate(
});
}

if (
thresholds.minRawDistinctTop3Ratio !== undefined &&
summary.metrics.rawDistinctTop3Ratio < thresholds.minRawDistinctTop3Ratio
) {
violations.push({
metric: "minRawDistinctTop3Ratio",
message: `Raw Distinct Top@3 ${summary.metrics.rawDistinctTop3Ratio.toFixed(4)} is below minimum ${thresholds.minRawDistinctTop3Ratio.toFixed(4)}`,
});
}

if (comparison) {
if (
thresholds.hitAt5MaxDrop !== undefined &&
Expand All @@ -45,6 +55,16 @@ export function evaluateBudgetGate(
});
}

if (
thresholds.rawDistinctTop3RatioMaxDrop !== undefined &&
comparison.deltas.rawDistinctTop3Ratio.absolute < -thresholds.rawDistinctTop3RatioMaxDrop
) {
violations.push({
metric: "rawDistinctTop3RatioMaxDrop",
message: `Raw Distinct Top@3 drop ${comparison.deltas.rawDistinctTop3Ratio.absolute.toFixed(4)} exceeds allowed -${thresholds.rawDistinctTop3RatioMaxDrop.toFixed(4)}`,
});
}

if (thresholds.p95LatencyMaxMultiplier !== undefined) {
const baselineP95 = comparison.deltas.latencyP95Ms.baseline;
if (baselineP95 > BASELINE_P95_EPSILON_MS) {
Expand Down
8 changes: 6 additions & 2 deletions src/eval/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -336,8 +336,12 @@ export async function handleEvalCommand(args: string[], cwd: string): Promise<nu
if (!parsed.againstPath.endsWith(".json")) {
throw new Error("eval diff --against must point to a summary JSON file");
}
const currentSummary = loadSummary(path.resolve(parsed.projectRoot, currentPath));
const baselineSummary = loadSummary(path.resolve(parsed.projectRoot, parsed.againstPath));
const currentSummary = loadSummary(path.resolve(parsed.projectRoot, currentPath), {
allowLegacyDiversityMetrics: true,
});
const baselineSummary = loadSummary(path.resolve(parsed.projectRoot, parsed.againstPath), {
allowLegacyDiversityMetrics: true,
});
const comparison = compareSummaries(
currentSummary,
baselineSummary,
Expand Down
2 changes: 2 additions & 0 deletions src/eval/compare.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ export function compareSummaries(current: EvalSummary, baseline: EvalSummary, ag
hitAt10: metricDelta(current.metrics.hitAt10, baseline.metrics.hitAt10),
mrrAt10: metricDelta(current.metrics.mrrAt10, baseline.metrics.mrrAt10),
ndcgAt10: metricDelta(current.metrics.ndcgAt10, baseline.metrics.ndcgAt10),
distinctTop3Ratio: metricDelta(current.metrics.distinctTop3Ratio, baseline.metrics.distinctTop3Ratio),
rawDistinctTop3Ratio: metricDelta(current.metrics.rawDistinctTop3Ratio, baseline.metrics.rawDistinctTop3Ratio),
latencyP50Ms: metricDelta(current.metrics.latencyMs.p50, baseline.metrics.latencyMs.p50),
latencyP95Ms: metricDelta(current.metrics.latencyMs.p95, baseline.metrics.latencyMs.p95),
latencyP99Ms: metricDelta(current.metrics.latencyMs.p99, baseline.metrics.latencyMs.p99),
Expand Down
14 changes: 14 additions & 0 deletions src/eval/metrics.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,13 @@ function uniqueResultsByPath(results: PerQueryEvalResult["results"]): PerQueryEv
return unique;
}

/**
 * Measures how many different files appear among the first `k` results.
 *
 * @param results - Ranked results; only each entry's `filePath` is read.
 * @param k - Size of the leading window to inspect.
 * @returns The count of distinct normalized file paths in the window divided
 *   by the window length, or 0 when the window is empty.
 */
function distinctTopKRatio(results: PerQueryEvalResult["results"], k: number): number {
  const window = results.slice(0, k);
  const windowSize = window.length;
  if (windowSize === 0) {
    // Avoid dividing by zero when there is nothing to inspect.
    return 0;
  }

  const seenPaths = new Set<string>();
  for (const entry of window) {
    seenPaths.add(normalizePath(entry.filePath));
  }
  return seenPaths.size / windowSize;
}

export function pathMatchesExpected(actualPath: string, expectedPath: string): boolean {
const actual = normalizePath(actualPath);
const expected = normalizePath(expectedPath);
Expand Down Expand Up @@ -149,6 +156,7 @@ export function buildPerQueryResult(
reciprocalRankAt10: reciprocalRankAtK(deduped, relevantPaths, 10),
ndcgAt10: ndcgAtK(deduped, relevantPaths, 10),
failureBucket: classifyFailureBucket(query, results, k),
rawTop3DistinctRatio: distinctTopKRatio(results, 3),
results: deduped,
};

Expand All @@ -172,6 +180,8 @@ export function computeEvalMetrics(
hitAt10: 0,
mrrAt10: 0,
ndcgAt10: 0,
distinctTop3Ratio: 0,
rawDistinctTop3Ratio: 0,
};

const failureBuckets: Record<FailureBucket, number> = {
Expand All @@ -190,6 +200,8 @@ export function computeEvalMetrics(
if (query.hitAt10) sum.hitAt10 += 1;
sum.mrrAt10 += query.reciprocalRankAt10;
sum.ndcgAt10 += query.ndcgAt10;
sum.distinctTop3Ratio += distinctTopKRatio(query.results, 3);
sum.rawDistinctTop3Ratio += query.rawTop3DistinctRatio;
if (query.failureBucket) {
failureBuckets[query.failureBucket] += 1;
}
Expand All @@ -204,6 +216,8 @@ export function computeEvalMetrics(
hitAt10: safeDiv(sum.hitAt10),
mrrAt10: safeDiv(sum.mrrAt10),
ndcgAt10: safeDiv(sum.ndcgAt10),
distinctTop3Ratio: safeDiv(sum.distinctTop3Ratio),
rawDistinctTop3Ratio: safeDiv(sum.rawDistinctTop3Ratio),
latencyMs: {
p50: percentile(latencies, 0.5),
p95: percentile(latencies, 0.95),
Expand Down
Loading
Loading