Skip to content

Commit 49a35ae

Browse files
committed
feat: implement batch operations for embeddings and chunks in the database
1 parent 9f94822 commit 49a35ae

File tree

4 files changed

+215
-21
lines changed

4 files changed

+215
-21
lines changed

AGENTS.md

Lines changed: 24 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
# AGENTS.md - AI Agent Guidelines for opencode-codebase-index
22

3-
**Generated:** 2026-01-16 | **Commit:** d02b915 | **Branch:** main
3+
**Generated:** 2025-01-16 | **Commit:** 9f94822 | **Branch:** main
44

55
Semantic codebase indexing plugin for OpenCode. Hybrid TypeScript/Rust architecture:
66
- **TypeScript** (`src/`): Plugin logic, embedding providers, OpenCode tools
@@ -90,10 +90,20 @@ skill/ # OpenCode skill guidance
9090
| `parse_files` | fn | Parallel multi-file parsing |
9191
| `hash_content` | fn | xxhash string |
9292
| `hash_file` | fn | xxhash file contents |
93-
| `VectorStore` | class | usearch wrapper (add/search/save/load) |
94-
| `Database` | class | SQLite: embeddings, chunks, branches, metadata |
93+
| `VectorStore` | class | usearch wrapper (add/addBatch/search/save/load) |
94+
| `Database` | class | SQLite: embeddings, chunks, branches, metadata (includes batch methods) |
9595
| `InvertedIndex` | class | BM25 keyword search |
9696

97+
### Database Batch Methods
98+
The `Database` class exposes batch operations for high-performance bulk inserts:
99+
| Method | Purpose | Speedup |
100+
|--------|---------|---------|
101+
| `upsertEmbeddingsBatch` | Batch insert embeddings in single transaction | ~1.3x |
102+
| `upsertChunksBatch` | Batch insert chunks in single transaction | ~12x |
103+
| `addChunksToBranchBatch` | Batch add chunks to branch in single transaction | ~18x |
104+
105+
These are used by the Indexer for all bulk operations. Prefer batch methods over sequential calls.
106+
97107
## CONVENTIONS
98108

99109
### Import Rules (CRITICAL - causes runtime errors if wrong)
@@ -180,10 +190,20 @@ afterEach(() => { fs.rmSync(tempDir, { recursive: true, force: true }); });
180190
| File | Tests |
181191
|------|-------|
182192
| `native.test.ts` | Rust bindings: parsing, vectors, hashing |
183-
| `database.test.ts` | SQLite: CRUD, branches, GC |
193+
| `database.test.ts` | SQLite: CRUD, branches, GC, batch operations |
184194
| `inverted-index.test.ts` | BM25 keyword search |
185195
| `files.test.ts` | File collection, .gitignore |
186196
| `cost.test.ts` | Token estimation |
197+
| `watcher.test.ts` | File/git branch watching |
198+
| `auto-gc.test.ts` | Automatic garbage collection |
199+
| `git.test.ts` | Git branch detection |
200+
201+
### Benchmarks
202+
```bash
203+
npx tsx benchmarks/run.ts # Performance testing for native operations
204+
```
205+
206+
Tests batch vs sequential performance for VectorStore and SQLite operations.
187207

188208
## CONFIGURATION
189209

src/indexer/index.ts

Lines changed: 25 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -297,6 +297,7 @@ export class Indexer {
297297

298298
const allMetadata = this.store.getAllMetadata();
299299
const chunkIds: string[] = [];
300+
const chunkDataBatch: ChunkData[] = [];
300301

301302
for (const { key, metadata } of allMetadata) {
302303
const chunkData: ChunkData = {
@@ -309,11 +310,14 @@ export class Indexer {
309310
name: metadata.name,
310311
language: metadata.language,
311312
};
312-
this.database.upsertChunk(chunkData);
313+
chunkDataBatch.push(chunkData);
313314
chunkIds.push(key);
314315
}
315316

316-
this.database.addChunksToBranch(this.currentBranch || "default", chunkIds);
317+
if (chunkDataBatch.length > 0) {
318+
this.database.upsertChunksBatch(chunkDataBatch);
319+
}
320+
this.database.addChunksToBranchBatch(this.currentBranch || "default", chunkIds);
317321
}
318322

319323
private async ensureInitialized(): Promise<{
@@ -434,6 +438,8 @@ export class Indexer {
434438
}
435439
}
436440

441+
const chunkDataBatch: ChunkData[] = [];
442+
437443
for (const parsed of parsedFiles) {
438444
currentFilePaths.add(parsed.path);
439445

@@ -466,7 +472,7 @@ export class Indexer {
466472
name: chunk.name,
467473
language: chunk.language,
468474
};
469-
database.upsertChunk(chunkData);
475+
chunkDataBatch.push(chunkData);
470476

471477
if (existingChunks.get(id) === contentHash) {
472478
fileChunkCount++;
@@ -489,6 +495,10 @@ export class Indexer {
489495
}
490496
}
491497

498+
if (chunkDataBatch.length > 0) {
499+
database.upsertChunksBatch(chunkDataBatch);
500+
}
501+
492502
let removedCount = 0;
493503
for (const [chunkId] of existingChunks) {
494504
if (!currentChunkIds.has(chunkId)) {
@@ -504,7 +514,7 @@ export class Indexer {
504514

505515
if (pendingChunks.length === 0 && removedCount === 0) {
506516
database.clearBranch(this.currentBranch);
507-
database.addChunksToBranch(this.currentBranch, Array.from(currentChunkIds));
517+
database.addChunksToBranchBatch(this.currentBranch, Array.from(currentChunkIds));
508518
this.fileHashCache = currentFileHashes;
509519
this.saveFileHashCache();
510520
stats.durationMs = Date.now() - startTime;
@@ -520,7 +530,7 @@ export class Indexer {
520530

521531
if (pendingChunks.length === 0) {
522532
database.clearBranch(this.currentBranch);
523-
database.addChunksToBranch(this.currentBranch, Array.from(currentChunkIds));
533+
database.addChunksToBranchBatch(this.currentBranch, Array.from(currentChunkIds));
524534
store.save();
525535
invertedIndex.save();
526536
this.fileHashCache = currentFileHashes;
@@ -615,17 +625,15 @@ export class Indexer {
615625

616626
store.addBatch(items);
617627

618-
for (let i = 0; i < batch.length; i++) {
619-
const chunk = batch[i];
620-
const embedding = result.embeddings[i];
621-
622-
database.upsertEmbedding(
623-
chunk.contentHash,
624-
float32ArrayToBuffer(embedding),
625-
chunk.text,
626-
detectedProvider.modelInfo.model
627-
);
628-
628+
const embeddingBatchItems = batch.map((chunk, i) => ({
629+
contentHash: chunk.contentHash,
630+
embedding: float32ArrayToBuffer(result.embeddings[i]),
631+
chunkText: chunk.text,
632+
model: detectedProvider.modelInfo.model,
633+
}));
634+
database.upsertEmbeddingsBatch(embeddingBatchItems);
635+
636+
for (const chunk of batch) {
629637
invertedIndex.removeChunk(chunk.id);
630638
invertedIndex.addChunk(chunk.id, chunk.content);
631639
}
@@ -659,7 +667,7 @@ export class Indexer {
659667
});
660668

661669
database.clearBranch(this.currentBranch);
662-
database.addChunksToBranch(this.currentBranch, Array.from(currentChunkIds));
670+
database.addChunksToBranchBatch(this.currentBranch, Array.from(currentChunkIds));
663671

664672
store.save();
665673
invertedIndex.save();

src/native/index.ts

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -542,6 +542,18 @@ export class Database {
542542
this.inner.upsertEmbedding(contentHash, embedding, chunkText, model);
543543
}
544544

545+
upsertEmbeddingsBatch(
546+
items: Array<{
547+
contentHash: string;
548+
embedding: Buffer;
549+
chunkText: string;
550+
model: string;
551+
}>
552+
): void {
553+
if (items.length === 0) return;
554+
this.inner.upsertEmbeddingsBatch(items);
555+
}
556+
545557
getMissingEmbeddings(contentHashes: string[]): string[] {
546558
return this.inner.getMissingEmbeddings(contentHashes);
547559
}
@@ -550,6 +562,11 @@ export class Database {
550562
this.inner.upsertChunk(chunk);
551563
}
552564

565+
upsertChunksBatch(chunks: ChunkData[]): void {
566+
if (chunks.length === 0) return;
567+
this.inner.upsertChunksBatch(chunks);
568+
}
569+
553570
getChunk(chunkId: string): ChunkData | null {
554571
return this.inner.getChunk(chunkId) ?? null;
555572
}
@@ -566,6 +583,11 @@ export class Database {
566583
this.inner.addChunksToBranch(branch, chunkIds);
567584
}
568585

586+
addChunksToBranchBatch(branch: string, chunkIds: string[]): void {
587+
if (chunkIds.length === 0) return;
588+
this.inner.addChunksToBranchBatch(branch, chunkIds);
589+
}
590+
569591
clearBranch(branch: string): number {
570592
return this.inner.clearBranch(branch);
571593
}

tests/database.test.ts

Lines changed: 144 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -293,4 +293,148 @@ describe("Database", () => {
293293
expect(stats.branchCount).toBe(1);
294294
});
295295
});
296+
297+
describe("batch operations", () => {
298+
it("should upsert embeddings in batch", () => {
299+
const items = [
300+
{
301+
contentHash: "batch_hash1",
302+
embedding: Buffer.from(new Float32Array([1.0, 2.0]).buffer),
303+
chunkText: "text1",
304+
model: "test-model",
305+
},
306+
{
307+
contentHash: "batch_hash2",
308+
embedding: Buffer.from(new Float32Array([3.0, 4.0]).buffer),
309+
chunkText: "text2",
310+
model: "test-model",
311+
},
312+
{
313+
contentHash: "batch_hash3",
314+
embedding: Buffer.from(new Float32Array([5.0, 6.0]).buffer),
315+
chunkText: "text3",
316+
model: "test-model",
317+
},
318+
];
319+
320+
db.upsertEmbeddingsBatch(items);
321+
322+
expect(db.embeddingExists("batch_hash1")).toBe(true);
323+
expect(db.embeddingExists("batch_hash2")).toBe(true);
324+
expect(db.embeddingExists("batch_hash3")).toBe(true);
325+
326+
const retrieved = db.getEmbedding("batch_hash2");
327+
expect(retrieved).not.toBeNull();
328+
const floats = new Float32Array(retrieved!.buffer, retrieved!.byteOffset, retrieved!.byteLength / 4);
329+
expect(floats[0]).toBeCloseTo(3.0);
330+
expect(floats[1]).toBeCloseTo(4.0);
331+
});
332+
333+
it("should handle empty embeddings batch", () => {
334+
db.upsertEmbeddingsBatch([]);
335+
expect(db.getStats().embeddingCount).toBe(0);
336+
});
337+
338+
it("should upsert chunks in batch", () => {
339+
const chunks: ChunkData[] = [
340+
{
341+
chunkId: "batch_chunk1",
342+
contentHash: "hash1",
343+
filePath: "/file1.ts",
344+
startLine: 1,
345+
endLine: 10,
346+
nodeType: "function",
347+
name: "func1",
348+
language: "typescript",
349+
},
350+
{
351+
chunkId: "batch_chunk2",
352+
contentHash: "hash2",
353+
filePath: "/file2.ts",
354+
startLine: 20,
355+
endLine: 30,
356+
nodeType: "class",
357+
name: "MyClass",
358+
language: "typescript",
359+
},
360+
{
361+
chunkId: "batch_chunk3",
362+
contentHash: "hash3",
363+
filePath: "/file1.ts",
364+
startLine: 50,
365+
endLine: 60,
366+
language: "typescript",
367+
},
368+
];
369+
370+
db.upsertChunksBatch(chunks);
371+
372+
const chunk1 = db.getChunk("batch_chunk1");
373+
expect(chunk1).not.toBeNull();
374+
expect(chunk1!.filePath).toBe("/file1.ts");
375+
expect(chunk1!.name).toBe("func1");
376+
377+
const chunk2 = db.getChunk("batch_chunk2");
378+
expect(chunk2).not.toBeNull();
379+
expect(chunk2!.nodeType).toBe("class");
380+
381+
const chunk3 = db.getChunk("batch_chunk3");
382+
expect(chunk3).not.toBeNull();
383+
384+
const file1Chunks = db.getChunksByFile("/file1.ts");
385+
expect(file1Chunks.length).toBe(2);
386+
});
387+
388+
it("should handle empty chunks batch", () => {
389+
db.upsertChunksBatch([]);
390+
expect(db.getStats().chunkCount).toBe(0);
391+
});
392+
393+
it("should add chunks to branch in batch", () => {
394+
const chunks: ChunkData[] = [
395+
{ chunkId: "c1", contentHash: "h1", filePath: "/f.ts", startLine: 1, endLine: 5, language: "ts" },
396+
{ chunkId: "c2", contentHash: "h2", filePath: "/f.ts", startLine: 10, endLine: 15, language: "ts" },
397+
{ chunkId: "c3", contentHash: "h3", filePath: "/f.ts", startLine: 20, endLine: 25, language: "ts" },
398+
];
399+
db.upsertChunksBatch(chunks);
400+
401+
db.addChunksToBranchBatch("feature-branch", ["c1", "c2", "c3"]);
402+
403+
const branchChunks = db.getBranchChunkIds("feature-branch");
404+
expect(branchChunks.length).toBe(3);
405+
expect(branchChunks).toContain("c1");
406+
expect(branchChunks).toContain("c2");
407+
expect(branchChunks).toContain("c3");
408+
});
409+
410+
it("should handle empty branch batch", () => {
411+
db.addChunksToBranchBatch("empty-branch", []);
412+
expect(db.getBranchChunkIds("empty-branch").length).toBe(0);
413+
});
414+
415+
it("should update existing chunks in batch", () => {
416+
db.upsertChunk({
417+
chunkId: "update_chunk",
418+
contentHash: "old_hash",
419+
filePath: "/old.ts",
420+
startLine: 1,
421+
endLine: 5,
422+
language: "typescript",
423+
});
424+
425+
db.upsertChunksBatch([{
426+
chunkId: "update_chunk",
427+
contentHash: "new_hash",
428+
filePath: "/new.ts",
429+
startLine: 10,
430+
endLine: 20,
431+
language: "typescript",
432+
}]);
433+
434+
const chunk = db.getChunk("update_chunk");
435+
expect(chunk!.contentHash).toBe("new_hash");
436+
expect(chunk!.filePath).toBe("/new.ts");
437+
expect(chunk!.startLine).toBe(10);
438+
});
439+
});
296440
});

0 commit comments

Comments (0)