Skip to content

Commit 5ae150a

Browse files
authored
Merge pull request #3 from danielebarbaro/feat/update-delete
Add document deletion and update functionality
2 parents 8aad8c9 + c4c4b40 commit 5ae150a

File tree

7 files changed

+526
-15
lines changed

7 files changed

+526
-15
lines changed

README.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,26 @@ Available providers:
347347
- `ItalianStopWords` - Italian stop words
348348
- `FileStopWords` - Load from file
349349
350+
## Deleting and updating documents
351+
352+
```php
353+
// Delete a document by ID
354+
$deleted = $db->deleteDocument(1); // returns true if found, false otherwise
355+
356+
// Update a document (delete + insert with same ID)
357+
$updated = $db->updateDocument(new Document(
358+
id: 1,
359+
vector: [0.5, 0.5, 0.3, 0.2],
360+
text: 'Updated content here',
361+
metadata: ['version' => 2],
362+
));
363+
364+
// After modifications, call save() to persist
365+
$db->save();
366+
```
367+
368+
Deleting a document soft-deletes its node in the HNSW graph (the node is kept for connectivity but excluded from search results) and fully removes it from the BM25 index. The document's file is deleted from disk immediately; call `save()` afterwards to persist the index changes.
369+
350370
## Custom tokenizer
351371

352372
Implement `TokenizerInterface` to plug in stemming, lemmatization, or any language-specific logic.

src/BM25/Index.php

Lines changed: 61 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,13 @@ final class Index
3838
/** @var array<int, Document> nodeId → Document */
3939
private array $documents = [];
4040

41+
/**
42+
* Per-document term list (unique terms only).
43+
* Enables O(|terms in doc|) deletion instead of O(|vocabulary|).
44+
* @var array<int, string[]>
45+
*/
46+
private array $docTerms = [];
47+
4148
public function __construct(
4249
private readonly Config $config = new Config(),
4350
private readonly TokenizerInterface $tokenizer = new SimpleTokenizer(),
@@ -73,6 +80,9 @@ public function addDocument(int $nodeId, Document $document): void
7380
foreach ($termFreqs as $term => $tf) {
7481
$this->invertedIndex[$term][$nodeId] = $tf;
7582
}
83+
84+
// Track which terms this document contributed so removal is O(|terms in doc|).
85+
$this->docTerms[$nodeId] = array_keys($termFreqs);
7686
}
7787

7888
/**
@@ -192,6 +202,39 @@ public function count(): int
192202
return count($this->documents);
193203
}
194204

205+
/**
206+
* Remove a document from the index: its length entry, its postings, its term list and the stored Document.
207+
*
208+
* @param int $nodeId Internal node-ID of the document to remove.
209+
* @return bool True if the document was removed, false if it didn't exist.
210+
*/
211+
public function removeDocument(int $nodeId): bool
212+
{
213+
if (!isset($this->documents[$nodeId])) {
214+
return false;
215+
}
216+
217+
// Subtract this document's length from totalTokens and drop its length entry.
218+
if (isset($this->docLengths[$nodeId])) {
219+
$this->totalTokens -= $this->docLengths[$nodeId];
220+
unset($this->docLengths[$nodeId]);
221+
}
222+
223+
// Remove from the inverted index, touching only the terms recorded for this document (none if absent).
224+
foreach ($this->docTerms[$nodeId] ?? [] as $term) {
225+
unset($this->invertedIndex[$term][$nodeId]);
226+
// Drop posting lists that became empty to save memory.
227+
if (empty($this->invertedIndex[$term])) {
228+
unset($this->invertedIndex[$term]);
229+
}
230+
}
231+
unset($this->docTerms[$nodeId]);
232+
233+
unset($this->documents[$nodeId]);
234+
235+
return true;
236+
}
237+
195238
/** Vocabulary size (unique terms in the index). */
196239
public function vocabularySize(): int
197240
{
@@ -204,7 +247,8 @@ public function vocabularySize(): int
204247
* @return array{
205248
* totalTokens: int,
206249
* docLengths: array<int, int>,
207-
* invertedIndex: array<string, array<int, int>>
250+
* invertedIndex: array<string, array<int, int>>,
251+
* docTerms: array<int, string[]>
208252
* }
209253
*/
210254
public function exportState(): array
@@ -213,6 +257,7 @@ public function exportState(): array
213257
'totalTokens' => $this->totalTokens,
214258
'docLengths' => $this->docLengths,
215259
'invertedIndex' => $this->invertedIndex,
260+
'docTerms' => $this->docTerms,
216261
];
217262
}
218263

@@ -223,7 +268,8 @@ public function exportState(): array
223268
* @param array{
224269
* totalTokens: int,
225270
* docLengths: array<int, int>,
226-
* invertedIndex: array<string, array<int, int>>
271+
* invertedIndex: array<string, array<int, int>>,
272+
* docTerms?: array<int, string[]>
227273
* } $state
228274
* @param array<int, Document> $documents nodeId → Document (from HNSW index)
229275
*/
@@ -233,5 +279,18 @@ public function importState(array $state, array $documents): void
233279
$this->docLengths = $state['docLengths'];
234280
$this->invertedIndex = $state['invertedIndex'];
235281
$this->documents = $documents;
282+
283+
// Rebuild docTerms from the inverted index when loading older snapshots
284+
// that were persisted before this field was introduced.
285+
if (isset($state['docTerms'])) {
286+
$this->docTerms = $state['docTerms'];
287+
} else {
288+
$this->docTerms = [];
289+
foreach ($this->invertedIndex as $term => $postings) {
290+
foreach (array_keys($postings) as $nId) {
291+
$this->docTerms[$nId][] = $term;
292+
}
293+
}
294+
}
236295
}
237296
}

src/HNSW/Index.php

Lines changed: 77 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,14 @@ final class Index
118118
/** Expected vector dimension (set on first insert). */
119119
private ?int $dimension = null;
120120

121+
/**
122+
* Set of soft-deleted node IDs.
123+
* Deleted nodes remain in the graph for connectivity but are excluded from results.
124+
*
125+
* @var array<int, true>
126+
*/
127+
private array $deleted = [];
128+
121129
/**
122130
* Resolved distance closure — built once in the constructor so the
123131
* per-call match() dispatch is removed from the hot path.
@@ -353,18 +361,69 @@ public function search(array $query, int $k = 10, ?int $ef = null): array
353361
[$epDist, $ep] = $this->searchLayerGreedy($qv, $ep, $epDist, $lc);
354362
}
355363

356-
// Full beam search at layer 0.
357-
$W = $this->searchLayer($qv, [[$epDist, $ep]], $ef, 0);
364+
// Full beam search at layer 0, retrying with a larger ef when soft-deleted
365+
// nodes shrink the active result set below $k. Doubling ef on each retry
366+
// costs at most O(log(totalNodes / ef)) extra passes in the worst case.
367+
$currentEf = $ef;
368+
$totalNodes = count($this->nodes);
369+
370+
do {
371+
$W = $this->searchLayer($qv, [[$epDist, $ep]], $currentEf, 0);
372+
373+
// Filter out soft-deleted nodes.
374+
if (!empty($this->deleted)) {
375+
$W = array_values(array_filter(
376+
$W,
377+
fn(array $pair) => !isset($this->deleted[$pair[1]])
378+
));
379+
}
380+
381+
// Stop when we have enough active results, or ef already spans all nodes
382+
// (further expansion cannot surface new candidates).
383+
if (count($W) >= $k || $currentEf >= $totalNodes) {
384+
break;
385+
}
386+
387+
$currentEf = min($currentEf * 2, $totalNodes);
388+
} while (true);
358389

359-
// Take the k nearest and convert to SearchResult.
360390
$topK = array_slice($W, 0, $k);
361391
return $this->toSearchResults($topK);
362392
}
363393

364-
/** Total number of documents in the index. */
394+
/**
395+
* Total number of active (non-deleted) documents in the index.
396+
*/
365397
public function count(): int
366398
{
367-
return count($this->nodes);
399+
return count($this->nodes) - count($this->deleted);
400+
}
401+
402+
/**
403+
* Soft-delete a node by its internal ID (marks it in the deleted set; the graph itself is untouched).
404+
*
405+
* The node remains in the graph (for connectivity) but is excluded from
406+
* search results. This is the standard approach for HNSW deletion as
407+
* physically removing nodes would require expensive graph repairs.
408+
*
409+
* @return bool True if the node was newly marked as deleted, false if it doesn't exist or was already deleted.
410+
*/
411+
public function delete(int $nodeId): bool
412+
{
413+
if (!isset($this->nodes[$nodeId]) || isset($this->deleted[$nodeId])) {
414+
return false;
415+
}
416+
417+
$this->deleted[$nodeId] = true;
418+
return true;
419+
}
420+
421+
/**
422+
* Report whether a node ID is recorded in the soft-deleted set (does not check that the node exists).
423+
*/
424+
public function isDeleted(int $nodeId): bool
425+
{
426+
return isset($this->deleted[$nodeId]);
368427
}
369428

370429
/**
@@ -396,7 +455,8 @@ public function getDocuments(): array
396455
* maxLayer: int,
397456
* dimension: int|null,
398457
* nodes: array<int, array{maxLayer: int, vector: float[], connections: array<int, int[]>}>,
399-
* documents: array<int, array{id: string|int, text: string|null, metadata: array}>
458+
* documents: array<int, array{id: string|int, text: string|null, metadata: array}>,
459+
* deleted: int[]
400460
* }
401461
*/
402462
public function exportState(): array
@@ -425,6 +485,7 @@ public function exportState(): array
425485
'dimension' => $this->dimension,
426486
'nodes' => $nodes,
427487
'documents' => $documents,
488+
'deleted' => array_keys($this->deleted),
428489
];
429490
}
430491

@@ -437,7 +498,8 @@ public function exportState(): array
437498
* maxLayer: int,
438499
* dimension: int|null,
439500
* nodes: array<int, array{maxLayer: int, vector: float[], connections: array<int, int[]>}>,
440-
* documents: array<int, array{id: string|int, text: string|null, metadata: array}>
501+
* documents: array<int, array{id: string|int, text: string|null, metadata: array}>,
502+
* deleted?: int[]
441503
* } $state
442504
*/
443505
public function importState(array $state): void
@@ -450,6 +512,14 @@ public function importState(array $state): void
450512

451513
$this->nodes = [];
452514
$this->documents = [];
515+
$this->deleted = [];
516+
517+
// Restore deleted set.
518+
if (!empty($state['deleted'])) {
519+
foreach ($state['deleted'] as $deletedId) {
520+
$this->deleted[(int) $deletedId] = true;
521+
}
522+
}
453523

454524
foreach ($state['nodes'] as $nodeId => $nodeData) {
455525
$node = new Node((int) $nodeId, $nodeData['vector'], $nodeData['maxLayer']);

src/Persistence/DocumentStore.php

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,13 @@
2020
*/
2121
final class DocumentStore
2222
{
23-
/** @var int[] PIDs of outstanding async child processes. */
23+
/**
24+
* PIDs of outstanding async child processes, keyed by nodeId.
25+
* Keying by nodeId lets waitForNode() drain exactly one write without
26+
* blocking every other in-flight write.
27+
*
28+
* @var array<int, int> nodeId → PID
29+
*/
2430
private array $pendingPids = [];
2531

2632
public function __construct(private readonly string $docsDir) {}
@@ -59,8 +65,8 @@ public function write(
5965
$this->writeSync($nodeId, $docId, $text, $metadata);
6066
exit(0);
6167
} else {
62-
// Parent: record PID and return.
63-
$this->pendingPids[] = $pid;
68+
// Parent: record PID keyed by nodeId and return.
69+
$this->pendingPids[$nodeId] = $pid;
6470
return;
6571
}
6672
}
@@ -69,6 +75,25 @@ public function write(
6975
$this->writeSync($nodeId, $docId, $text, $metadata);
7076
}
7177

78+
/**
79+
* Block until the pending async write for $nodeId (if any) has completed; no-op when none is recorded.
80+
*
81+
* Use this before deleting a node's file so a late child write cannot
82+
* recreate {nodeId}.bin after the unlink(). Without pcntl_waitpid() the entry is dropped without waiting.
83+
*/
84+
public function waitForNode(int $nodeId): void
85+
{
86+
if (!isset($this->pendingPids[$nodeId])) {
87+
return;
88+
}
89+
90+
if (function_exists('pcntl_waitpid')) {
91+
pcntl_waitpid($this->pendingPids[$nodeId], $status);
92+
}
93+
94+
unset($this->pendingPids[$nodeId]);
95+
}
96+
7297
/**
7398
* Block until every outstanding async write has completed.
7499
* Must be called before index files are written (see VectorDatabase::save()).

0 commit comments

Comments
 (0)