Skip to content

Commit 9f94822

Browse files
committed
feat: add batch operations for embeddings and chunks in the database
1 parent ada623b commit 9f94822

File tree

3 files changed

+231
-6
lines changed

3 files changed

+231
-6
lines changed

benchmarks/run.ts

Lines changed: 82 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,15 @@ class Database {
140140
this.inner.upsertEmbedding(contentHash, embedding, chunkText, model);
141141
}
142142

143+
upsertEmbeddingsBatch(items: Array<{
144+
contentHash: string;
145+
embedding: Buffer;
146+
chunkText: string;
147+
model: string;
148+
}>): void {
149+
this.inner.upsertEmbeddingsBatch(items);
150+
}
151+
143152
upsertChunk(data: {
144153
chunkId: string;
145154
contentHash: string;
@@ -153,10 +162,27 @@ class Database {
153162
this.inner.upsertChunk(data);
154163
}
155164

165+
upsertChunksBatch(chunks: Array<{
166+
chunkId: string;
167+
contentHash: string;
168+
filePath: string;
169+
startLine: number;
170+
endLine: number;
171+
nodeType?: string;
172+
name?: string;
173+
language: string;
174+
}>): void {
175+
this.inner.upsertChunksBatch(chunks);
176+
}
177+
156178
addChunksToBranch(branch: string, chunkIds: string[]): void {
157179
this.inner.addChunksToBranch(branch, chunkIds);
158180
}
159181

182+
addChunksToBranchBatch(branch: string, chunkIds: string[]): void {
183+
this.inner.addChunksToBranchBatch(branch, chunkIds);
184+
}
185+
160186
getBranchChunkIds(branch: string): string[] {
161187
return this.inner.getBranchChunkIds(branch);
162188
}
@@ -507,18 +533,19 @@ async function runBenchmarks(): Promise<void> {
507533
// ============================================
508534
console.log("\n=== Database Performance (SQLite) ===");
509535

510-
const dbPath = path.join(tempDir, "benchmark.db");
511-
const db = new Database(dbPath);
512-
513536
const chunkCounts = [100, 1000, 5000, 10000];
514537

515538
for (const chunkCount of chunkCounts) {
539+
const dbPath = path.join(tempDir, `benchmark-${chunkCount}.db`);
540+
const db = new Database(dbPath);
541+
const dbBatch = new Database(path.join(tempDir, `benchmark-batch-${chunkCount}.db`));
542+
516543
const embedding = Buffer.from(
517544
new Float32Array(generateRandomEmbedding(dimensions)).buffer
518545
);
519546

520547
benchmark(
521-
`Insert ${chunkCount} embeddings`,
548+
`Insert ${chunkCount} embeddings (sequential)`,
522549
() => {
523550
for (let i = 0; i < chunkCount; i++) {
524551
db.upsertEmbedding(`hash-${chunkCount}-${i}`, embedding, `text-${i}`, "test-model");
@@ -528,8 +555,24 @@ async function runBenchmarks(): Promise<void> {
528555
{ embeddings: chunkCount }
529556
);
530557

558+
const embeddingBatchItems = Array.from({ length: chunkCount }, (_, i) => ({
559+
contentHash: `hash-batch-${chunkCount}-${i}`,
560+
embedding,
561+
chunkText: `text-${i}`,
562+
model: "test-model",
563+
}));
564+
531565
benchmark(
532-
`Insert ${chunkCount} chunks`,
566+
`Insert ${chunkCount} embeddings (batch)`,
567+
() => {
568+
dbBatch.upsertEmbeddingsBatch(embeddingBatchItems);
569+
},
570+
1,
571+
{ embeddings: chunkCount }
572+
);
573+
574+
benchmark(
575+
`Insert ${chunkCount} chunks (sequential)`,
533576
() => {
534577
for (let i = 0; i < chunkCount; i++) {
535578
db.upsertChunk({
@@ -548,19 +591,52 @@ async function runBenchmarks(): Promise<void> {
548591
{ chunks: chunkCount }
549592
);
550593

594+
const chunkBatchItems = Array.from({ length: chunkCount }, (_, i) => ({
595+
chunkId: `chunk-batch-${chunkCount}-${i}`,
596+
contentHash: `hash-batch-${chunkCount}-${i}`,
597+
filePath: `/file${i % 100}.ts`,
598+
startLine: i * 10,
599+
endLine: i * 10 + 10,
600+
nodeType: "function",
601+
name: `func${i}`,
602+
language: "typescript",
603+
}));
604+
605+
benchmark(
606+
`Insert ${chunkCount} chunks (batch)`,
607+
() => {
608+
dbBatch.upsertChunksBatch(chunkBatchItems);
609+
},
610+
1,
611+
{ chunks: chunkCount }
612+
);
613+
551614
const chunkIds = Array.from(
552615
{ length: chunkCount },
553616
(_, i) => `chunk-${chunkCount}-${i}`
554617
);
555618
benchmark(
556-
`Add ${chunkCount} chunks to branch`,
619+
`Add ${chunkCount} chunks to branch (sequential)`,
557620
() => {
558621
db.addChunksToBranch(`branch-${chunkCount}`, chunkIds);
559622
},
560623
1,
561624
{ chunks: chunkCount }
562625
);
563626

627+
const chunkIdsBatch = Array.from(
628+
{ length: chunkCount },
629+
(_, i) => `chunk-batch-${chunkCount}-${i}`
630+
);
631+
benchmark(
632+
`Add ${chunkCount} chunks to branch (batch)`,
633+
() => {
634+
dbBatch.addChunksToBranchBatch(`branch-batch-${chunkCount}`, chunkIdsBatch);
635+
},
636+
1,
637+
{ chunks: chunkCount }
638+
);
639+
564640
benchmark(
565641
`Get branch chunk IDs (${chunkCount})`,
566642
() => {

native/src/db.rs

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,35 @@ pub fn upsert_embedding(
150150
Ok(())
151151
}
152152

153+
/// Batch insert or update embeddings within a single transaction
154+
pub fn upsert_embeddings_batch(
155+
conn: &mut Connection,
156+
embeddings: &[(String, Vec<u8>, String, String)],
157+
) -> DbResult<()> {
158+
if embeddings.is_empty() {
159+
return Ok(());
160+
}
161+
162+
let tx = conn.transaction()?;
163+
{
164+
let mut stmt = tx.prepare(
165+
r#"
166+
INSERT INTO embeddings (content_hash, embedding, chunk_text, model, created_at)
167+
VALUES (?, ?, ?, ?, strftime('%s', 'now'))
168+
ON CONFLICT(content_hash) DO UPDATE SET
169+
embedding = excluded.embedding,
170+
model = excluded.model
171+
"#,
172+
)?;
173+
174+
for (content_hash, embedding, chunk_text, model) in embeddings {
175+
stmt.execute(params![content_hash, embedding, chunk_text, model])?;
176+
}
177+
}
178+
tx.commit()?;
179+
Ok(())
180+
}
181+
153182
/// Get multiple embeddings by content hashes
154183
#[allow(dead_code)]
155184
pub fn get_embeddings_batch(
@@ -250,6 +279,49 @@ pub fn upsert_chunk(
250279
Ok(())
251280
}
252281

282+
/// Batch insert or update chunks within a single transaction
283+
pub fn upsert_chunks_batch(
284+
conn: &mut Connection,
285+
chunks: &[ChunkRow],
286+
) -> DbResult<()> {
287+
if chunks.is_empty() {
288+
return Ok(());
289+
}
290+
291+
let tx = conn.transaction()?;
292+
{
293+
let mut stmt = tx.prepare(
294+
r#"
295+
INSERT INTO chunks (chunk_id, content_hash, file_path, start_line, end_line, node_type, name, language)
296+
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
297+
ON CONFLICT(chunk_id) DO UPDATE SET
298+
content_hash = excluded.content_hash,
299+
file_path = excluded.file_path,
300+
start_line = excluded.start_line,
301+
end_line = excluded.end_line,
302+
node_type = excluded.node_type,
303+
name = excluded.name,
304+
language = excluded.language
305+
"#,
306+
)?;
307+
308+
for chunk in chunks {
309+
stmt.execute(params![
310+
chunk.chunk_id,
311+
chunk.content_hash,
312+
chunk.file_path,
313+
chunk.start_line,
314+
chunk.end_line,
315+
chunk.node_type,
316+
chunk.name,
317+
chunk.language
318+
])?;
319+
}
320+
}
321+
tx.commit()?;
322+
Ok(())
323+
}
324+
253325
/// Get chunk by ID
254326
pub fn get_chunk(conn: &Connection, chunk_id: &str) -> DbResult<Option<ChunkRow>> {
255327
let result = conn
@@ -344,6 +416,30 @@ pub fn add_chunks_to_branch(conn: &Connection, branch: &str, chunk_ids: &[String
344416
Ok(())
345417
}
346418

419+
/// Batch add chunks to a branch within a single transaction
420+
pub fn add_chunks_to_branch_batch(
421+
conn: &mut Connection,
422+
branch: &str,
423+
chunk_ids: &[String],
424+
) -> DbResult<()> {
425+
if chunk_ids.is_empty() {
426+
return Ok(());
427+
}
428+
429+
let tx = conn.transaction()?;
430+
{
431+
let mut stmt = tx.prepare(
432+
"INSERT OR IGNORE INTO branch_chunks (branch, chunk_id) VALUES (?, ?)",
433+
)?;
434+
435+
for chunk_id in chunk_ids {
436+
stmt.execute(params![branch, chunk_id])?;
437+
}
438+
}
439+
tx.commit()?;
440+
Ok(())
441+
}
442+
347443
/// Remove all chunks from a branch (for re-indexing)
348444
pub fn clear_branch(conn: &Connection, branch: &str) -> DbResult<usize> {
349445
let count = conn.execute(

native/src/lib.rs

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,14 @@ pub struct BranchDelta {
263263
pub removed: Vec<String>,
264264
}
265265

266+
/// One row of an embeddings batch handed from JS to
/// `Database::upsert_embeddings_batch`.
#[napi(object)]
pub struct EmbeddingBatchItem {
    /// Content hash identifying the chunk; used as the upsert conflict key.
    pub content_hash: String,
    /// Raw embedding bytes (the benchmark passes a Float32Array's buffer —
    /// presumably the canonical encoding; confirm against other callers).
    pub embedding: Buffer,
    /// Source text of the chunk.
    pub chunk_text: String,
    /// Name of the model that produced the embedding.
    pub model: String,
}
273+
266274
#[napi(object)]
267275
pub struct DatabaseStats {
268276
pub embedding_count: u32,
@@ -385,6 +393,51 @@ impl Database {
385393
.map_err(|e| Error::from_reason(e.to_string()))
386394
}
387395

396+
#[napi]
397+
pub fn upsert_embeddings_batch(&self, items: Vec<EmbeddingBatchItem>) -> Result<()> {
398+
let mut conn = self.conn.lock().map_err(|e| Error::from_reason(e.to_string()))?;
399+
let batch: Vec<(String, Vec<u8>, String, String)> = items
400+
.into_iter()
401+
.map(|item| {
402+
(
403+
item.content_hash,
404+
item.embedding.to_vec(),
405+
item.chunk_text,
406+
item.model,
407+
)
408+
})
409+
.collect();
410+
db::upsert_embeddings_batch(&mut conn, &batch)
411+
.map_err(|e| Error::from_reason(e.to_string()))
412+
}
413+
414+
#[napi]
415+
pub fn upsert_chunks_batch(&self, chunks: Vec<ChunkData>) -> Result<()> {
416+
let mut conn = self.conn.lock().map_err(|e| Error::from_reason(e.to_string()))?;
417+
let batch: Vec<db::ChunkRow> = chunks
418+
.into_iter()
419+
.map(|c| db::ChunkRow {
420+
chunk_id: c.chunk_id,
421+
content_hash: c.content_hash,
422+
file_path: c.file_path,
423+
start_line: c.start_line,
424+
end_line: c.end_line,
425+
node_type: c.node_type,
426+
name: c.name,
427+
language: c.language,
428+
})
429+
.collect();
430+
db::upsert_chunks_batch(&mut conn, &batch)
431+
.map_err(|e| Error::from_reason(e.to_string()))
432+
}
433+
434+
#[napi]
435+
pub fn add_chunks_to_branch_batch(&self, branch: String, chunk_ids: Vec<String>) -> Result<()> {
436+
let mut conn = self.conn.lock().map_err(|e| Error::from_reason(e.to_string()))?;
437+
db::add_chunks_to_branch_batch(&mut conn, &branch, &chunk_ids)
438+
.map_err(|e| Error::from_reason(e.to_string()))
439+
}
440+
388441
#[napi]
389442
pub fn clear_branch(&self, branch: String) -> Result<u32> {
390443
let conn = self.conn.lock().map_err(|e| Error::from_reason(e.to_string()))?;

0 commit comments

Comments
 (0)