Skip to content

Commit e8eac5a

Browse files
Merge branch 'main' into feat/update-delete-benchmark-refactor
2 parents 4d40ade + 5ae150a commit e8eac5a

File tree

5 files changed

+163
-38
lines changed

5 files changed

+163
-38
lines changed

src/BM25/Index.php

Lines changed: 33 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,13 @@ final class Index
3838
/** @var array<int, Document> nodeId → Document */
3939
private array $documents = [];
4040

41+
/**
42+
* Per-document term list (unique terms only).
43+
* Enables O(|terms in doc|) deletion instead of O(|vocabulary|).
44+
* @var array<int, string[]>
45+
*/
46+
private array $docTerms = [];
47+
4148
public function __construct(
4249
private readonly Config $config = new Config(),
4350
private readonly TokenizerInterface $tokenizer = new SimpleTokenizer(),
@@ -73,6 +80,9 @@ public function addDocument(int $nodeId, Document $document): void
7380
foreach ($termFreqs as $term => $tf) {
7481
$this->invertedIndex[$term][$nodeId] = $tf;
7582
}
83+
84+
// Track which terms this document contributed so removal is O(|terms in doc|).
85+
$this->docTerms[$nodeId] = array_keys($termFreqs);
7686
}
7787

7888
/**
@@ -210,15 +220,15 @@ public function removeDocument(int $nodeId): bool
210220
unset($this->docLengths[$nodeId]);
211221
}
212222

213-
// Remove from inverted index.
214-
foreach ($this->invertedIndex as $term => &$postings) {
215-
unset($postings[$nodeId]);
223+
// Remove from inverted index — only touch terms this document contained.
224+
foreach ($this->docTerms[$nodeId] ?? [] as $term) {
225+
unset($this->invertedIndex[$term][$nodeId]);
216226
// Remove empty posting lists to save memory.
217-
if (empty($postings)) {
227+
if (empty($this->invertedIndex[$term])) {
218228
unset($this->invertedIndex[$term]);
219229
}
220230
}
221-
unset($postings);
231+
unset($this->docTerms[$nodeId]);
222232

223233
unset($this->documents[$nodeId]);
224234

@@ -237,7 +247,8 @@ public function vocabularySize(): int
237247
* @return array{
238248
* totalTokens: int,
239249
* docLengths: array<int, int>,
240-
* invertedIndex: array<string, array<int, int>>
250+
* invertedIndex: array<string, array<int, int>>,
251+
* docTerms: array<int, string[]>
241252
* }
242253
*/
243254
public function exportState(): array
@@ -246,6 +257,7 @@ public function exportState(): array
246257
'totalTokens' => $this->totalTokens,
247258
'docLengths' => $this->docLengths,
248259
'invertedIndex' => $this->invertedIndex,
260+
'docTerms' => $this->docTerms,
249261
];
250262
}
251263

@@ -256,7 +268,8 @@ public function exportState(): array
256268
* @param array{
257269
* totalTokens: int,
258270
* docLengths: array<int, int>,
259-
* invertedIndex: array<string, array<int, int>>
271+
* invertedIndex: array<string, array<int, int>>,
272+
* docTerms?: array<int, string[]>
260273
* } $state
261274
* @param array<int, Document> $documents nodeId → Document (from HNSW index)
262275
*/
@@ -266,5 +279,18 @@ public function importState(array $state, array $documents): void
266279
$this->docLengths = $state['docLengths'];
267280
$this->invertedIndex = $state['invertedIndex'];
268281
$this->documents = $documents;
282+
283+
// Rebuild docTerms from the inverted index when loading older snapshots
284+
// that were persisted before this field was introduced.
285+
if (isset($state['docTerms'])) {
286+
$this->docTerms = $state['docTerms'];
287+
} else {
288+
$this->docTerms = [];
289+
foreach ($this->invertedIndex as $term => $postings) {
290+
foreach (array_keys($postings) as $nId) {
291+
$this->docTerms[$nId][] = $term;
292+
}
293+
}
294+
}
269295
}
270296
}

src/HNSW/Index.php

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -361,16 +361,31 @@ public function search(array $query, int $k = 10, ?int $ef = null): array
361361
[$epDist, $ep] = $this->searchLayerGreedy($qv, $ep, $epDist, $lc);
362362
}
363363

364-
// Full beam search at layer 0.
365-
$W = $this->searchLayer($qv, [[$epDist, $ep]], $ef, 0);
364+
// Full beam search at layer 0, retrying with a larger ef when soft-deleted
365+
// nodes shrink the active result set below $k. Doubling ef on each retry
366+
// costs at most O(log(totalNodes / ef)) extra passes in the worst case.
367+
$currentEf = $ef;
368+
$totalNodes = count($this->nodes);
369+
370+
do {
371+
$W = $this->searchLayer($qv, [[$epDist, $ep]], $currentEf, 0);
372+
373+
// Filter out soft-deleted nodes.
374+
if (!empty($this->deleted)) {
375+
$W = array_values(array_filter(
376+
$W,
377+
fn(array $pair) => !isset($this->deleted[$pair[1]])
378+
));
379+
}
366380

367-
// Filter out soft-deleted nodes and take the k nearest.
368-
if (!empty($this->deleted)) {
369-
$W = array_values(array_filter(
370-
$W,
371-
fn(array $pair) => !isset($this->deleted[$pair[1]])
372-
));
373-
}
381+
// Stop when we have enough active results, or ef already spans all nodes
382+
// (further expansion cannot surface new candidates).
383+
if (count($W) >= $k || $currentEf >= $totalNodes) {
384+
break;
385+
}
386+
387+
$currentEf = min($currentEf * 2, $totalNodes);
388+
} while (true);
374389

375390
$topK = array_slice($W, 0, $k);
376391
return $this->toSearchResults($topK);

src/Persistence/DocumentStore.php

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,13 @@
2020
*/
2121
final class DocumentStore
2222
{
23-
/** @var int[] PIDs of outstanding async child processes. */
23+
/**
24+
* PIDs of outstanding async child processes, keyed by nodeId.
25+
* Keying by nodeId lets waitForNode() drain exactly one write without
26+
* blocking every other in-flight write.
27+
*
28+
* @var array<int, int> nodeId → PID
29+
*/
2430
private array $pendingPids = [];
2531

2632
public function __construct(private readonly string $docsDir) {}
@@ -59,8 +65,8 @@ public function write(
5965
$this->writeSync($nodeId, $docId, $text, $metadata);
6066
exit(0);
6167
} else {
62-
// Parent: record PID and return.
63-
$this->pendingPids[] = $pid;
68+
// Parent: record PID keyed by nodeId and return.
69+
$this->pendingPids[$nodeId] = $pid;
6470
return;
6571
}
6672
}
@@ -69,6 +75,25 @@ public function write(
6975
$this->writeSync($nodeId, $docId, $text, $metadata);
7076
}
7177

78+
/**
79+
* Block until the async write for a specific node has completed.
80+
*
81+
* Use this before deleting a node's file so a late child write cannot
82+
* recreate {nodeId}.bin after the unlink().
83+
*/
84+
public function waitForNode(int $nodeId): void
85+
{
86+
if (!isset($this->pendingPids[$nodeId])) {
87+
return;
88+
}
89+
90+
if (function_exists('pcntl_waitpid')) {
91+
pcntl_waitpid($this->pendingPids[$nodeId], $status);
92+
}
93+
94+
unset($this->pendingPids[$nodeId]);
95+
}
96+
7297
/**
7398
* Block until every outstanding async write has completed.
7499
* Must be called before index files are written (see VectorDatabase::save()).

src/VectorDatabase.php

Lines changed: 61 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,9 @@ public function addDocuments(array $documents): void
173173
* The document is soft-deleted from HNSW (excluded from results but kept
174174
* for graph connectivity) and fully removed from the BM25 index.
175175
*
176-
* When persistence is enabled, the document file is also deleted from disk.
176+
* When persistence is enabled, a tombstone marker is written immediately so
177+
* the deletion survives a crash. The physical doc file is removed during
178+
* the next `save()` call, after the indexes are fully updated on disk.
177179
* Call `save()` afterward to persist the updated index state.
178180
*
179181
* @param string|int $id The document ID to delete.
@@ -202,17 +204,24 @@ public function deleteDocument(string|int $id): bool
202204
unset($this->nodeIdToDoc[$nodeId]);
203205
unset($this->docIdToNodeId[$id]);
204206

205-
// Delete document file from disk if persistence is enabled.
207+
// Mark the document for physical deletion when persistence is enabled.
206208
if ($this->path !== null) {
207-
$docFile = $this->path . '/docs/' . $nodeId . '.bin';
208-
if (file_exists($docFile)) {
209-
// Suppress PHP warning and handle failure explicitly to keep
210-
// on-disk state consistent with in-memory indexes.
211-
if (!@unlink($docFile) && file_exists($docFile)) {
212-
throw new \RuntimeException(
213-
"Failed to delete persisted document file: {$docFile}"
214-
);
215-
}
209+
// An async pcntl_fork child may still be writing {nodeId}.bin.
210+
// Wait for it to finish so the file is fully on disk before we
211+
// record the tombstone — this keeps the pair (bin + tombstone)
212+
// consistent from the moment the tombstone is created.
213+
$this->getDocumentStore()->waitForNode($nodeId);
214+
215+
// Write a tombstone instead of immediately removing the doc file.
216+
// The physical removal happens in save() AFTER the index files have
217+
// been updated, giving us crash-safety:
218+
// • crash before save() → open() finds the tombstone and
219+
// re-applies the deletion in memory.
220+
// • crash during save() → at worst the doc file is an orphan;
221+
// the indexes already reflect the deletion.
222+
$tombstone = $this->path . '/docs/' . $nodeId . '.tombstone';
223+
if (file_put_contents($tombstone, '') === false) {
224+
throw new \RuntimeException("Failed to write tombstone file: {$tombstone}");
216225
}
217226
}
218227

@@ -354,9 +363,11 @@ public function hybridSearch(
354363
* 2. `meta.json` — distance code, dimension, nextId, docIdToNodeId.
355364
* 3. `hnsw.bin` — HNSW graph (vectors + connections).
356365
* 4. `bm25.bin` — BM25 inverted index.
366+
* 5. Removes `docs/{n}.bin` + `docs/{n}.tombstone` for every pending deletion.
357367
*
358368
* Individual `docs/{n}.bin` files are written incrementally by `addDocument()`
359-
* and are NOT re-written by this method.
369+
* and are NOT re-written by this method. Deletion of doc files is deferred
370+
* to this method so the on-disk state is always consistent.
360371
*
361372
* @throws \RuntimeException if no path was configured or on I/O failure.
362373
*/
@@ -398,6 +409,18 @@ public function save(): void
398409
$serializer = new IndexSerializer();
399410
$serializer->writeHnsw($this->path . '/hnsw.bin', $hnswState);
400411
$serializer->writeBm25($this->path . '/bm25.bin', $this->bm25Index->exportState());
412+
413+
// Now that all index files reflect the current state, it is safe to
414+
// physically remove doc files for pending tombstone deletions.
415+
$docsDir = $this->path . '/docs';
416+
foreach (glob($docsDir . '/*.tombstone') ?: [] as $tombstoneFile) {
417+
$nodeId = (int) basename($tombstoneFile, '.tombstone');
418+
$binFile = $docsDir . '/' . $nodeId . '.bin';
419+
if (file_exists($binFile)) {
420+
@unlink($binFile);
421+
}
422+
@unlink($tombstoneFile);
423+
}
401424
}
402425

403426
/**
@@ -484,6 +507,32 @@ public static function open(
484507

485508
// $db->nodeIdToDoc intentionally starts EMPTY — documents are lazy-loaded.
486509

510+
// ── Reconcile crash-interrupted deletions ─────────────────────────
511+
// A tombstone file docs/{nodeId}.tombstone is written by deleteDocument()
512+
// before save() is called. If the process crashed between those two
513+
// steps the tombstone survives but the indexes were not yet updated.
514+
// Re-apply the pending deletion now so the loaded state is consistent.
515+
$docsDir = $path . '/docs';
516+
if (is_dir($docsDir)) {
517+
foreach (glob($docsDir . '/*.tombstone') ?: [] as $tombstoneFile) {
518+
$nodeId = (int) basename($tombstoneFile, '.tombstone');
519+
520+
// Apply the deletion only when the node is still present in the
521+
// loaded indexes (i.e., save() had not yet been called).
522+
if (isset($nodeIdToDocId[$nodeId])) {
523+
$docId = $nodeIdToDocId[$nodeId];
524+
$db->hnswIndex->delete($nodeId);
525+
$db->bm25Index->removeDocument($nodeId);
526+
unset($db->docIdToNodeId[$docId]);
527+
}
528+
529+
// Always clean up — covers the edge case where the process
530+
// crashed after indexes were written but before file removal.
531+
@unlink($docsDir . '/' . $nodeId . '.bin');
532+
@unlink($tombstoneFile);
533+
}
534+
}
535+
487536
return $db;
488537
}
489538

tests/PersistenceTest.php

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -421,16 +421,26 @@ public function testDeletedDocumentFileIsRemoved(): void
421421
$db->addDocument(new Document(id: 2, vector: [0.0, 1.0], text: 'to keep'));
422422
$db->save();
423423

424-
// Verify doc files exist (do not rely on specific nodeId-to-filename mapping)
425-
$beforeFiles = glob($this->tmpDir . '/docs/*.bin') ?: [];
426-
self::assertCount(2, $beforeFiles);
424+
// Verify doc files exist after initial save.
425+
self::assertFileExists($this->tmpDir . '/docs/0.bin');
426+
self::assertFileExists($this->tmpDir . '/docs/1.bin');
427427

428-
// Delete one document
428+
// Delete document 1 (nodeId 0).
429+
// Physical removal is deferred to save() for crash-safety; a tombstone
430+
// is written immediately so the deletion survives a crash.
429431
$db->deleteDocument(1);
430432

431-
// One doc file should be removed immediately
432-
$afterFiles = glob($this->tmpDir . '/docs/*.bin') ?: [];
433-
self::assertCount(1, $afterFiles);
433+
self::assertFileExists($this->tmpDir . '/docs/0.tombstone', 'Tombstone must be created by deleteDocument().');
434+
self::assertFileExists($this->tmpDir . '/docs/0.bin', 'Doc file must NOT be removed before save().');
435+
self::assertFileExists($this->tmpDir . '/docs/1.bin');
436+
437+
// After save() both the doc file and the tombstone must be gone.
438+
$db->save();
439+
440+
self::assertFileDoesNotExist($this->tmpDir . '/docs/0.bin');
441+
self::assertFileDoesNotExist($this->tmpDir . '/docs/0.tombstone');
442+
self::assertFileExists($this->tmpDir . '/docs/1.bin');
443+
434444
}
435445

436446
public function testUpdateDocumentPersistsCorrectly(): void

0 commit comments

Comments
 (0)