-
Notifications
You must be signed in to change notification settings - Fork 2
Add document deletion and update functionality #3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
4d1b4cd
5f1489b
0e6e25f
a201eb1
c79b9ca
65b16da
8ee8e96
c4c4b40
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -192,6 +192,39 @@ public function count(): int | |
| return count($this->documents); | ||
| } | ||
|
|
||
| /** | ||
| * Remove a document from the index. | ||
| * | ||
| * Subtracts the document's recorded length from totalTokens, removes its | ||
| * entry from every posting list (dropping posting lists that end up | ||
| * empty), and finally removes it from the documents map. | ||
| * | ||
| * Every term in the inverted index is visited, so the cost of a single | ||
| * removal grows with the vocabulary size rather than with the number of | ||
| * terms in the removed document. | ||
| * | ||
| * @param int $nodeId Internal node-ID of the document to remove. | ||
| * @return bool True if the document was removed, false if it didn't exist. | ||
| */ | ||
| public function removeDocument(int $nodeId): bool | ||
| { | ||
| if (!isset($this->documents[$nodeId])) { | ||
| return false; | ||
| } | ||
|
|
||
| // Update totalTokens: subtract this document's length (if one was recorded) and forget it. | ||
| if (isset($this->docLengths[$nodeId])) { | ||
| $this->totalTokens -= $this->docLengths[$nodeId]; | ||
| unset($this->docLengths[$nodeId]); | ||
| } | ||
|
|
||
| // Remove from inverted index. $postings is bound by reference so the unset() edits the stored list; the reference is released after the loop. | ||
| foreach ($this->invertedIndex as $term => &$postings) { | ||
| unset($postings[$nodeId]); | ||
| // Remove empty posting lists to save memory. | ||
| if (empty($postings)) { | ||
| unset($this->invertedIndex[$term]); | ||
|
||
| } | ||
| } | ||
| unset($postings); | ||
|
|
||
| unset($this->documents[$nodeId]); | ||
|
|
||
| return true; | ||
| } | ||
|
|
||
| /** Vocabulary size (unique terms in the index). */ | ||
| public function vocabularySize(): int | ||
| { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -118,6 +118,14 @@ final class Index | |
| /** Expected vector dimension (set on first insert). */ | ||
| private ?int $dimension = null; | ||
|
|
||
| /** | ||
| * Set of soft-deleted node IDs. | ||
| * Deleted nodes remain in the graph for connectivity but are excluded from results. | ||
| * | ||
| * @var array<int, true> | ||
| */ | ||
| private array $deleted = []; | ||
|
|
||
| /** | ||
| * Resolved distance closure — built once in the constructor so the | ||
| * per-call match() dispatch is removed from the hot path. | ||
|
|
@@ -356,15 +364,51 @@ public function search(array $query, int $k = 10, ?int $ef = null): array | |
| // Full beam search at layer 0. | ||
| $W = $this->searchLayer($qv, [[$epDist, $ep]], $ef, 0); | ||
|
|
||
| // Take the k nearest and convert to SearchResult. | ||
| // Filter out soft-deleted nodes and take the k nearest. | ||
| if (!empty($this->deleted)) { | ||
| $W = array_values(array_filter( | ||
| $W, | ||
| fn(array $pair) => !isset($this->deleted[$pair[1]]) | ||
|
||
| )); | ||
| } | ||
|
|
||
| $topK = array_slice($W, 0, $k); | ||
| return $this->toSearchResults($topK); | ||
| } | ||
|
|
||
| /** Total number of documents in the index. */ | ||
| /** | ||
| * Total number of active (non-deleted) documents in the index. | ||
| */ | ||
| public function count(): int | ||
| { | ||
| return count($this->nodes); | ||
| return count($this->nodes) - count($this->deleted); | ||
| } | ||
|
|
||
| /** | ||
| * Soft-delete a node by its internal ID. | ||
| * | ||
| * The node remains in the graph (for connectivity) but is excluded from | ||
| * search results. This is the standard approach for HNSW deletion as | ||
| * physically removing nodes would require expensive graph repairs. | ||
| * | ||
| * Only the ID is recorded in the deleted set; the nodes map itself is | ||
| * not modified by this method. | ||
| * | ||
| * @param int $nodeId Internal node ID to mark as deleted. | ||
| * @return bool True if the node was deleted, false if it didn't exist or was already deleted. | ||
| */ | ||
| public function delete(int $nodeId): bool | ||
| { | ||
| if (!isset($this->nodes[$nodeId]) || isset($this->deleted[$nodeId])) { | ||
| return false; | ||
| } | ||
|
|
||
| $this->deleted[$nodeId] = true; | ||
| return true; | ||
| } | ||
|
|
||
| /** | ||
| * Check if a node has been soft-deleted. | ||
| * | ||
| * Only the deleted set is consulted; an ID that was never inserted | ||
| * simply yields false. | ||
| * | ||
| * @param int $nodeId Internal node ID to check. | ||
| * @return bool True if the ID is present in the soft-deleted set. | ||
| */ | ||
| public function isDeleted(int $nodeId): bool | ||
| { | ||
| return isset($this->deleted[$nodeId]); | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -396,7 +440,8 @@ public function getDocuments(): array | |
| * maxLayer: int, | ||
| * dimension: int|null, | ||
| * nodes: array<int, array{maxLayer: int, vector: float[], connections: array<int, int[]>}>, | ||
| * documents: array<int, array{id: string|int, text: string|null, metadata: array}> | ||
| * documents: array<int, array{id: string|int, text: string|null, metadata: array}>, | ||
| * deleted: int[] | ||
| * } | ||
| */ | ||
| public function exportState(): array | ||
|
|
@@ -425,6 +470,7 @@ public function exportState(): array | |
| 'dimension' => $this->dimension, | ||
| 'nodes' => $nodes, | ||
| 'documents' => $documents, | ||
| 'deleted' => array_keys($this->deleted), | ||
| ]; | ||
| } | ||
|
|
||
|
|
@@ -437,7 +483,8 @@ public function exportState(): array | |
| * maxLayer: int, | ||
| * dimension: int|null, | ||
| * nodes: array<int, array{maxLayer: int, vector: float[], connections: array<int, int[]>}>, | ||
| * documents: array<int, array{id: string|int, text: string|null, metadata: array}> | ||
| * documents: array<int, array{id: string|int, text: string|null, metadata: array}>, | ||
| * deleted?: int[] | ||
| * } $state | ||
| */ | ||
| public function importState(array $state): void | ||
|
|
@@ -450,6 +497,14 @@ public function importState(array $state): void | |
|
|
||
| $this->nodes = []; | ||
| $this->documents = []; | ||
| $this->deleted = []; | ||
|
|
||
| // Restore deleted set. | ||
| if (!empty($state['deleted'])) { | ||
| foreach ($state['deleted'] as $deletedId) { | ||
| $this->deleted[(int) $deletedId] = true; | ||
| } | ||
| } | ||
|
|
||
| foreach ($state['nodes'] as $nodeId => $nodeData) { | ||
| $node = new Node((int) $nodeId, $nodeData['vector'], $nodeData['maxLayer']); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -167,6 +167,77 @@ public function addDocuments(array $documents): void | |
| } | ||
| } | ||
|
|
||
| /** | ||
| * Delete a document by its user-visible ID. | ||
| * | ||
| * The document is soft-deleted from HNSW (excluded from results but kept | ||
| * for graph connectivity) and fully removed from the BM25 index. | ||
| * | ||
| * When persistence is enabled, the document file is also deleted from disk. | ||
| * Call `save()` afterward to persist the updated index state. | ||
| * | ||
| * The file removal happens inside this call and is not deferred to | ||
| * `save()`. The return values of the HNSW delete, the BM25 removal and | ||
| * unlink() are not inspected here, so a failure in any of them does not | ||
| * change the result of this method. | ||
| * | ||
| * @param string|int $id The document ID to delete. | ||
| * @return bool True if the document was deleted, false if it didn't exist. | ||
| */ | ||
| public function deleteDocument(string|int $id): bool | ||
| { | ||
| if (!isset($this->docIdToNodeId[$id])) { | ||
| return false; | ||
| } | ||
|
|
||
| $nodeId = $this->docIdToNodeId[$id]; | ||
|
|
||
| // Soft-delete from HNSW (node stays for connectivity, excluded from results). | ||
| $this->hnswIndex->delete($nodeId); | ||
|
ezimuel marked this conversation as resolved.
Outdated
|
||
|
|
||
| // Fully remove from BM25. | ||
| $this->bm25Index->removeDocument($nodeId); | ||
|
|
||
| // Remove from local caches (both directions of the ID mapping). | ||
| unset($this->nodeIdToDoc[$nodeId]); | ||
| unset($this->docIdToNodeId[$id]); | ||
|
|
||
| // Delete document file from disk if persistence is enabled (path is non-null); files are named by node ID. | ||
| if ($this->path !== null) { | ||
| $docFile = $this->path . '/docs/' . $nodeId . '.bin'; | ||
| if (file_exists($docFile)) { | ||
| unlink($docFile); | ||
|
ezimuel marked this conversation as resolved.
Outdated
|
||
| } | ||
|
Comment on lines
+208
to
+225
|
||
| } | ||
|
|
||
| return true; | ||
| } | ||
|
|
||
| /** | ||
| * Update a document by replacing it entirely. | ||
| * | ||
| * This is equivalent to deleteDocument() followed by addDocument() with the | ||
| * same ID. The document gets a new internal nodeId, so this is effectively | ||
| * a delete + insert operation. | ||
| * | ||
| * NOTE(review): the old version is removed before the new one is inserted, | ||
| * so the two steps are not atomic. If addDocument() were to fail, the old | ||
| * document would already be gone. Confirm whether callers need a rollback. | ||
| * | ||
| * @param Document $document The updated document. Must have the same ID as an existing document. | ||
| * @return bool True if the document was updated, false if it didn't exist. | ||
| * @throws \RuntimeException if the document has no ID. | ||
| */ | ||
| public function updateDocument(Document $document): bool | ||
| { | ||
| if ($document->id === null) { | ||
| throw new \RuntimeException('Cannot update a document without an ID.'); | ||
| } | ||
|
|
||
| if (!isset($this->docIdToNodeId[$document->id])) { | ||
| return false; | ||
| } | ||
|
|
||
| // Delete the old document (its return value is not checked; existence was verified above). | ||
| $this->deleteDocument($document->id); | ||
|
|
||
| // Insert the new version under the same user-visible ID. | ||
| $this->addDocument($document); | ||
|
|
||
| return true; | ||
| } | ||
|
|
||
| // ------------------------------------------------------------------ | ||
| // Search | ||
| // ------------------------------------------------------------------ | ||
|
|
@@ -307,6 +378,7 @@ public function save(): void | |
| 'docIdToNodeId' => $this->docIdToNodeId, | ||
| 'entryPoint' => $hnswState['entryPoint'], | ||
| 'maxLayer' => $hnswState['maxLayer'], | ||
| 'deleted' => $hnswState['deleted'], | ||
| ]; | ||
| if (file_put_contents($this->path . '/meta.json', json_encode($meta, JSON_PRETTY_PRINT | JSON_THROW_ON_ERROR)) === false) { | ||
| throw new \RuntimeException("Failed to write meta.json in: {$this->path}"); | ||
|
|
@@ -376,6 +448,7 @@ public static function open( | |
| // HNSW needs these in $documents[] to return SearchResult objects. | ||
| $hnswState = $hnswData; | ||
| $hnswState['documents'] = []; | ||
| $hnswState['deleted'] = $meta['deleted'] ?? []; | ||
| foreach ($hnswData['nodes'] as $nodeId => $nodeData) { | ||
| $docId = $nodeIdToDocId[$nodeId] ?? $nodeId; | ||
| $hnswState['documents'][$nodeId] = [ | ||
|
|
@@ -407,10 +480,10 @@ public static function open( | |
| // Utilities | ||
| // ------------------------------------------------------------------ | ||
|
|
||
| /** Total number of documents stored. */ | ||
| /** Total number of active (non-deleted) documents stored. */ | ||
| public function count(): int | ||
| { | ||
| return $this->nextId; | ||
| return $this->hnswIndex->count(); | ||
| } | ||
|
|
||
| // ------------------------------------------------------------------ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -387,4 +387,73 @@ public function testIncrementalSave(): void | |
| $ids = array_map(fn($r) => $r->document->id, $results); | ||
| self::assertContains(1, $ids); | ||
| } | ||
|
|
||
| // ------------------------------------------------------------------ | ||
| // Delete persistence | ||
| // ------------------------------------------------------------------ | ||
|
|
||
| public function testDeletedDocumentsArePersistedAndExcluded(): void | ||
| { | ||
| $db = $this->makeDb(); | ||
| $db->addDocument(new Document(id: 1, vector: [1.0, 0.0], text: 'first')); | ||
| $db->addDocument(new Document(id: 2, vector: [0.9, 0.1], text: 'second')); | ||
| $db->addDocument(new Document(id: 3, vector: [0.0, 1.0], text: 'third')); | ||
|
|
||
| // Delete document 1, then save so the deletion is part of the persisted state | ||
| $db->deleteDocument(1); | ||
| $db->save(); | ||
|
|
||
| // Reload and verify that only the two remaining documents are counted | ||
| $loaded = $this->openDb(); | ||
| self::assertSame(2, $loaded->count()); | ||
|
|
||
| // Document 1 should not appear in results, even though the query vector equals its own vector | ||
| $results = $loaded->vectorSearch([1.0, 0.0], k: 3); | ||
| $ids = array_map(fn($r) => $r->document->id, $results); | ||
| self::assertNotContains(1, $ids); | ||
| self::assertContains(2, $ids); | ||
| } | ||
|
Comment on lines
+411
to
+415
|
||
|
|
||
| public function testDeletedDocumentFileIsRemoved(): void | ||
| { | ||
| $db = $this->makeDb(); | ||
| $db->addDocument(new Document(id: 1, vector: [1.0, 0.0], text: 'to delete')); | ||
| $db->addDocument(new Document(id: 2, vector: [0.0, 1.0], text: 'to keep')); | ||
| $db->save(); | ||
|
|
||
| // Verify both doc files exist after save (named by node ID: 0 and 1) | ||
| self::assertFileExists($this->tmpDir . '/docs/0.bin'); | ||
| self::assertFileExists($this->tmpDir . '/docs/1.bin'); | ||
|
|
||
| // Delete document 1 (which is nodeId 0) | ||
| $db->deleteDocument(1); | ||
|
|
||
| // Doc file should be removed immediately, without another save(); the other file must remain | ||
| self::assertFileDoesNotExist($this->tmpDir . '/docs/0.bin'); | ||
| self::assertFileExists($this->tmpDir . '/docs/1.bin'); | ||
| } | ||
|
|
||
| public function testUpdateDocumentPersistsCorrectly(): void | ||
| { | ||
| $db = $this->makeDb(); | ||
| $db->addDocument(new Document(id: 1, vector: [1.0, 0.0], text: 'original')); | ||
| $db->save(); | ||
|
|
||
| // Update the document: same ID, but new vector, text and metadata | ||
| $db->updateDocument(new Document( | ||
| id: 1, | ||
| vector: [0.0, 1.0], | ||
| text: 'updated content', | ||
| metadata: ['version' => 2], | ||
| )); | ||
| $db->save(); | ||
|
|
||
| // Reload and verify that the nearest hit for the new vector carries the updated content | ||
| $loaded = $this->openDb(); | ||
| $results = $loaded->vectorSearch([0.0, 1.0], k: 1); | ||
|
|
||
| self::assertSame(1, $results[0]->document->id); | ||
| self::assertSame('updated content', $results[0]->document->text); | ||
| self::assertSame(['version' => 2], $results[0]->document->metadata); | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`removeDocument()` scans the entire `$invertedIndex` vocabulary to remove a single nodeId, which is O(|V|) per delete and can become expensive as the index grows (delete/update are now public APIs). A more scalable approach is to track per-document term lists on insert so deletion only touches terms that were present in the removed document.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed in c79b9ca