diff --git a/README.md b/README.md index 81f5cda..1483574 100644 --- a/README.md +++ b/README.md @@ -347,6 +347,26 @@ Available providers: - `ItalianStopWords` - Italian stop words - `FileStopWords` - Load from file +## Deleting and updating documents + +```php +// Delete a document by ID +$deleted = $db->deleteDocument(1); // returns true if found, false otherwise + +// Update a document (delete + insert with same ID) +$updated = $db->updateDocument(new Document( + id: 1, + vector: [0.5, 0.5, 0.3, 0.2], + text: 'Updated content here', + metadata: ['version' => 2], +)); + +// After modifications, call save() to persist +$db->save(); +``` + +Deleted documents are soft-deleted from the HNSW graph (kept for connectivity but excluded from results) and fully removed from the BM25 index. When persistence is enabled, a tombstone marker is written immediately and the document file is removed from disk on the next `save()`. + ## Custom tokenizer Implement `TokenizerInterface` to plug in stemming, lemmatization, or any language-specific logic. diff --git a/src/BM25/Index.php b/src/BM25/Index.php index 35050b7..6230c55 100644 --- a/src/BM25/Index.php +++ b/src/BM25/Index.php @@ -38,6 +38,13 @@ final class Index /** @var array nodeId → Document */ private array $documents = []; + /** + * Per-document term list (unique terms only). + * Enables O(|terms in doc|) deletion instead of O(|vocabulary|). + * @var array + */ + private array $docTerms = []; + public function __construct( private readonly Config $config = new Config(), private readonly TokenizerInterface $tokenizer = new SimpleTokenizer(), @@ -73,6 +80,9 @@ public function addDocument(int $nodeId, Document $document): void foreach ($termFreqs as $term => $tf) { $this->invertedIndex[$term][$nodeId] = $tf; } + + // Track which terms this document contributed so removal is O(|terms in doc|). + $this->docTerms[$nodeId] = array_keys($termFreqs); } /** @@ -192,6 +202,39 @@ public function count(): int return count($this->documents); } + /** + * Remove a document from the index. 
+ * + * @param int $nodeId Internal node-ID of the document to remove. + * @return bool True if the document was removed, false if it didn't exist. + */ + public function removeDocument(int $nodeId): bool + { + if (!isset($this->documents[$nodeId])) { + return false; + } + + // Update totalTokens. + if (isset($this->docLengths[$nodeId])) { + $this->totalTokens -= $this->docLengths[$nodeId]; + unset($this->docLengths[$nodeId]); + } + + // Remove from inverted index — only touch terms this document contained. + foreach ($this->docTerms[$nodeId] ?? [] as $term) { + unset($this->invertedIndex[$term][$nodeId]); + // Remove empty posting lists to save memory. + if (empty($this->invertedIndex[$term])) { + unset($this->invertedIndex[$term]); + } + } + unset($this->docTerms[$nodeId]); + + unset($this->documents[$nodeId]); + + return true; + } + /** Vocabulary size (unique terms in the index). */ public function vocabularySize(): int { @@ -204,7 +247,8 @@ public function vocabularySize(): int * @return array{ * totalTokens: int, * docLengths: array, - * invertedIndex: array> + * invertedIndex: array>, + * docTerms: array * } */ public function exportState(): array @@ -213,6 +257,7 @@ public function exportState(): array 'totalTokens' => $this->totalTokens, 'docLengths' => $this->docLengths, 'invertedIndex' => $this->invertedIndex, + 'docTerms' => $this->docTerms, ]; } @@ -223,7 +268,8 @@ public function exportState(): array * @param array{ * totalTokens: int, * docLengths: array, - * invertedIndex: array> + * invertedIndex: array>, + * docTerms?: array * } $state * @param array $documents nodeId → Document (from HNSW index) */ @@ -233,5 +279,18 @@ public function importState(array $state, array $documents): void $this->docLengths = $state['docLengths']; $this->invertedIndex = $state['invertedIndex']; $this->documents = $documents; + + // Rebuild docTerms from the inverted index when loading older snapshots + // that were persisted before this field was introduced. 
+ if (isset($state['docTerms'])) { + $this->docTerms = $state['docTerms']; + } else { + $this->docTerms = []; + foreach ($this->invertedIndex as $term => $postings) { + foreach (array_keys($postings) as $nId) { + $this->docTerms[$nId][] = $term; + } + } + } } } diff --git a/src/HNSW/Index.php b/src/HNSW/Index.php index 74217e7..ca0cd6f 100644 --- a/src/HNSW/Index.php +++ b/src/HNSW/Index.php @@ -118,6 +118,14 @@ final class Index /** Expected vector dimension (set on first insert). */ private ?int $dimension = null; + /** + * Set of soft-deleted node IDs. + * Deleted nodes remain in the graph for connectivity but are excluded from results. + * + * @var array + */ + private array $deleted = []; + /** * Resolved distance closure — built once in the constructor so the * per-call match() dispatch is removed from the hot path. @@ -353,18 +361,69 @@ public function search(array $query, int $k = 10, ?int $ef = null): array [$epDist, $ep] = $this->searchLayerGreedy($qv, $ep, $epDist, $lc); } - // Full beam search at layer 0. - $W = $this->searchLayer($qv, [[$epDist, $ep]], $ef, 0); + // Full beam search at layer 0, retrying with a larger ef when soft-deleted + // nodes shrink the active result set below $k. Doubling ef on each retry + // costs at most O(log(totalNodes / ef)) extra passes in the worst case. + $currentEf = $ef; + $totalNodes = count($this->nodes); + + do { + $W = $this->searchLayer($qv, [[$epDist, $ep]], $currentEf, 0); + + // Filter out soft-deleted nodes. + if (!empty($this->deleted)) { + $W = array_values(array_filter( + $W, + fn(array $pair) => !isset($this->deleted[$pair[1]]) + )); + } + + // Stop when we have enough active results, or ef already spans all nodes + // (further expansion cannot surface new candidates). + if (count($W) >= $k || $currentEf >= $totalNodes) { + break; + } + + $currentEf = min($currentEf * 2, $totalNodes); + } while (true); - // Take the k nearest and convert to SearchResult. 
$topK = array_slice($W, 0, $k); return $this->toSearchResults($topK); } - /** Total number of documents in the index. */ + /** + * Total number of active (non-deleted) documents in the index. + */ public function count(): int { - return count($this->nodes); + return count($this->nodes) - count($this->deleted); + } + + /** + * Soft-delete a node by its internal ID. + * + * The node remains in the graph (for connectivity) but is excluded from + * search results. This is the standard approach for HNSW deletion as + * physically removing nodes would require expensive graph repairs. + * + * @return bool True if the node was deleted, false if it didn't exist or was already deleted. + */ + public function delete(int $nodeId): bool + { + if (!isset($this->nodes[$nodeId]) || isset($this->deleted[$nodeId])) { + return false; + } + + $this->deleted[$nodeId] = true; + return true; + } + + /** + * Check if a node has been soft-deleted. + */ + public function isDeleted(int $nodeId): bool + { + return isset($this->deleted[$nodeId]); } /** @@ -396,7 +455,8 @@ public function getDocuments(): array * maxLayer: int, * dimension: int|null, * nodes: array}>, - * documents: array + * documents: array, + * deleted: int[] * } */ public function exportState(): array @@ -425,6 +485,7 @@ public function exportState(): array 'dimension' => $this->dimension, 'nodes' => $nodes, 'documents' => $documents, + 'deleted' => array_keys($this->deleted), ]; } @@ -437,7 +498,8 @@ public function exportState(): array * maxLayer: int, * dimension: int|null, * nodes: array}>, - * documents: array + * documents: array, + * deleted?: int[] * } $state */ public function importState(array $state): void @@ -450,6 +512,14 @@ public function importState(array $state): void $this->nodes = []; $this->documents = []; + $this->deleted = []; + + // Restore deleted set. 
+ if (!empty($state['deleted'])) { + foreach ($state['deleted'] as $deletedId) { + $this->deleted[(int) $deletedId] = true; + } + } foreach ($state['nodes'] as $nodeId => $nodeData) { $node = new Node((int) $nodeId, $nodeData['vector'], $nodeData['maxLayer']); diff --git a/src/Persistence/DocumentStore.php b/src/Persistence/DocumentStore.php index 5ca6560..475c61b 100644 --- a/src/Persistence/DocumentStore.php +++ b/src/Persistence/DocumentStore.php @@ -20,7 +20,13 @@ */ final class DocumentStore { - /** @var int[] PIDs of outstanding async child processes. */ + /** + * PIDs of outstanding async child processes, keyed by nodeId. + * Keying by nodeId lets waitForNode() drain exactly one write without + * blocking every other in-flight write. + * + * @var array nodeId → PID + */ private array $pendingPids = []; public function __construct(private readonly string $docsDir) {} @@ -59,8 +65,8 @@ public function write( $this->writeSync($nodeId, $docId, $text, $metadata); exit(0); } else { - // Parent: record PID and return. - $this->pendingPids[] = $pid; + // Parent: record PID keyed by nodeId and return. + $this->pendingPids[$nodeId] = $pid; return; } } @@ -69,6 +75,25 @@ public function write( $this->writeSync($nodeId, $docId, $text, $metadata); } + /** + * Block until the async write for a specific node has completed. + * + * Use this before deleting a node's file so a late child write cannot + * recreate {nodeId}.bin after the unlink(). + */ + public function waitForNode(int $nodeId): void + { + if (!isset($this->pendingPids[$nodeId])) { + return; + } + + if (function_exists('pcntl_waitpid')) { + pcntl_waitpid($this->pendingPids[$nodeId], $status); + } + + unset($this->pendingPids[$nodeId]); + } + /** * Block until every outstanding async write has completed. * Must be called before index files are written (see VectorDatabase::save()). 
diff --git a/src/VectorDatabase.php b/src/VectorDatabase.php index 271b0d2..825c20c 100644 --- a/src/VectorDatabase.php +++ b/src/VectorDatabase.php @@ -167,6 +167,97 @@ public function addDocuments(array $documents): void } } + /** + * Delete a document by its user-visible ID. + * + * The document is soft-deleted from HNSW (excluded from results but kept + * for graph connectivity) and fully removed from the BM25 index. + * + * When persistence is enabled, a tombstone marker is written immediately so + * the deletion survives a crash. The physical doc file is removed during + * the next `save()` call, after the indexes are fully updated on disk. + * Call `save()` afterward to persist the updated index state. + * + * @param string|int $id The document ID to delete. + * @return bool True if the document was deleted, false if it didn't exist. + */ + public function deleteDocument(string|int $id): bool + { + if (!isset($this->docIdToNodeId[$id])) { + return false; + } + + $nodeId = $this->docIdToNodeId[$id]; + + // Soft-delete from HNSW (node stays for connectivity, excluded from results). + $deletedFromHnsw = $this->hnswIndex->delete($nodeId); + if ($deletedFromHnsw !== true) { + throw new \RuntimeException( + sprintf('Failed to delete node "%s" from HNSW index.', (string) $nodeId) + ); + } + + // Fully remove from BM25. + $this->bm25Index->removeDocument($nodeId); + + // Remove from local caches. + unset($this->nodeIdToDoc[$nodeId]); + unset($this->docIdToNodeId[$id]); + + // Mark the document for physical deletion when persistence is enabled. + if ($this->path !== null) { + // An async pcntl_fork child may still be writing {nodeId}.bin. + // Wait for it to finish so the file is fully on disk before we + // record the tombstone — this keeps the pair (bin + tombstone) + // consistent from the moment the tombstone is created. + $this->getDocumentStore()->waitForNode($nodeId); + + // Write a tombstone instead of immediately removing the doc file. 
+ // The physical removal happens in save() AFTER the index files have + // been updated, giving us crash-safety: + // • crash before save() → open() finds the tombstone and + // re-applies the deletion in memory. + // • crash during save() → at worst the doc file is an orphan; + // the indexes already reflect the deletion. + $tombstone = $this->path . '/docs/' . $nodeId . '.tombstone'; + if (file_put_contents($tombstone, '') === false) { + throw new \RuntimeException("Failed to write tombstone file: {$tombstone}"); + } + } + + return true; + } + + /** + * Update a document by replacing it entirely. + * + * This is equivalent to deleteDocument() followed by addDocument() with the + * same ID. The document gets a new internal nodeId, so this is effectively + * a delete + insert operation. + * + * @param Document $document The updated document. Must have the same ID as an existing document. + * @return bool True if the document was updated, false if it didn't exist. + * @throws \RuntimeException if the document has no ID. + */ + public function updateDocument(Document $document): bool + { + if ($document->id === null) { + throw new \RuntimeException('Cannot update a document without an ID.'); + } + + if (!isset($this->docIdToNodeId[$document->id])) { + return false; + } + + // Delete the old document. + $this->deleteDocument($document->id); + + // Insert the new version. + $this->addDocument($document); + + return true; + } + // ------------------------------------------------------------------ // Search // ------------------------------------------------------------------ @@ -272,9 +363,11 @@ public function hybridSearch( * 2. `meta.json` — distance code, dimension, nextId, docIdToNodeId. * 3. `hnsw.bin` — HNSW graph (vectors + connections). * 4. `bm25.bin` — BM25 inverted index. + * 5. Removes `docs/{n}.bin` + `docs/{n}.tombstone` for every pending deletion. 
* * Individual `docs/{n}.bin` files are written incrementally by `addDocument()` - * and are NOT re-written by this method. + * and are NOT re-written by this method. Deletion of doc files is deferred + * to this method so the on-disk state is always consistent. * * @throws \RuntimeException if no path was configured or on I/O failure. */ @@ -307,6 +400,7 @@ public function save(): void 'docIdToNodeId' => $this->docIdToNodeId, 'entryPoint' => $hnswState['entryPoint'], 'maxLayer' => $hnswState['maxLayer'], + 'deleted' => $hnswState['deleted'], ]; if (file_put_contents($this->path . '/meta.json', json_encode($meta, JSON_PRETTY_PRINT | JSON_THROW_ON_ERROR)) === false) { throw new \RuntimeException("Failed to write meta.json in: {$this->path}"); @@ -315,6 +409,18 @@ public function save(): void $serializer = new IndexSerializer(); $serializer->writeHnsw($this->path . '/hnsw.bin', $hnswState); $serializer->writeBm25($this->path . '/bm25.bin', $this->bm25Index->exportState()); + + // Now that all index files reflect the current state, it is safe to + // physically remove doc files for pending tombstone deletions. + $docsDir = $this->path . '/docs'; + foreach (glob($docsDir . '/*.tombstone') ?: [] as $tombstoneFile) { + $nodeId = (int) basename($tombstoneFile, '.tombstone'); + $binFile = $docsDir . '/' . $nodeId . '.bin'; + if (file_exists($binFile)) { + @unlink($binFile); + } + @unlink($tombstoneFile); + } } /** @@ -376,6 +482,7 @@ public static function open( // HNSW needs these in $documents[] to return SearchResult objects. $hnswState = $hnswData; $hnswState['documents'] = []; + $hnswState['deleted'] = $meta['deleted'] ?? []; foreach ($hnswData['nodes'] as $nodeId => $nodeData) { $docId = $nodeIdToDocId[$nodeId] ?? $nodeId; $hnswState['documents'][$nodeId] = [ @@ -400,6 +507,32 @@ public static function open( // $db->nodeIdToDoc intentionally starts EMPTY — documents are lazy-loaded. 
+ // ── Reconcile crash-interrupted deletions ───────────────────────── + // A tombstone file docs/{nodeId}.tombstone is written by deleteDocument() + // before save() is called. If the process crashed between those two + // steps the tombstone survives but the indexes were not yet updated. + // Re-apply the pending deletion now so the loaded state is consistent. + $docsDir = $path . '/docs'; + if (is_dir($docsDir)) { + foreach (glob($docsDir . '/*.tombstone') ?: [] as $tombstoneFile) { + $nodeId = (int) basename($tombstoneFile, '.tombstone'); + + // Node still indexed on disk (save() never ran): re-apply the deletion in + // memory only and keep tombstone + doc file until save() persists the indexes. + if (isset($nodeIdToDocId[$nodeId])) { + $docId = $nodeIdToDocId[$nodeId]; + $db->hnswIndex->delete($nodeId); + $db->bm25Index->removeDocument($nodeId); + unset($db->docIdToNodeId[$docId]); + continue; + } + // Indexes already reflect the deletion (crash after the index write + // but before file removal), so finishing the cleanup is safe. + @unlink($docsDir . '/' . $nodeId . '.bin'); + @unlink($tombstoneFile); + } + } + return $db; } @@ -407,10 +540,10 @@ public static function open( // Utilities // ------------------------------------------------------------------ - /** Total number of documents stored. */ + /** Total number of active (non-deleted) documents stored. 
*/ public function count(): int { - return $this->nextId; + return $this->hnswIndex->count(); } // ------------------------------------------------------------------ diff --git a/tests/PersistenceTest.php b/tests/PersistenceTest.php index 2166017..f3875a0 100644 --- a/tests/PersistenceTest.php +++ b/tests/PersistenceTest.php @@ -387,4 +387,82 @@ public function testIncrementalSave(): void $ids = array_map(fn($r) => $r->document->id, $results); self::assertContains(1, $ids); } + + // ------------------------------------------------------------------ + // Delete persistence + // ------------------------------------------------------------------ + + public function testDeletedDocumentsArePersistedAndExcluded(): void + { + $db = $this->makeDb(); + $db->addDocument(new Document(id: 1, vector: [1.0, 0.0], text: 'first')); + $db->addDocument(new Document(id: 2, vector: [0.9, 0.1], text: 'second')); + $db->addDocument(new Document(id: 3, vector: [0.0, 1.0], text: 'third')); + + // Delete document 1 + $db->deleteDocument(1); + $db->save(); + + // Reload and verify + $loaded = $this->openDb(); + self::assertSame(2, $loaded->count()); + + // Document 1 should not appear in results + $results = $loaded->vectorSearch([1.0, 0.0], k: 3); + $ids = array_map(fn($r) => $r->document->id, $results); + self::assertNotContains(1, $ids); + self::assertContains(2, $ids); + } + + public function testDeletedDocumentFileIsRemoved(): void + { + $db = $this->makeDb(); + $db->addDocument(new Document(id: 1, vector: [1.0, 0.0], text: 'to delete')); + $db->addDocument(new Document(id: 2, vector: [0.0, 1.0], text: 'to keep')); + $db->save(); + + // Verify doc files exist after initial save. + self::assertFileExists($this->tmpDir . '/docs/0.bin'); + self::assertFileExists($this->tmpDir . '/docs/1.bin'); + + // Delete document 1 (nodeId 0). + // Physical removal is deferred to save() for crash-safety; a tombstone + // is written immediately so the deletion survives a crash. 
+ $db->deleteDocument(1); + + self::assertFileExists($this->tmpDir . '/docs/0.tombstone', 'Tombstone must be created by deleteDocument().'); + self::assertFileExists($this->tmpDir . '/docs/0.bin', 'Doc file must NOT be removed before save().'); + self::assertFileExists($this->tmpDir . '/docs/1.bin'); + + // After save() both the doc file and the tombstone must be gone. + $db->save(); + + self::assertFileDoesNotExist($this->tmpDir . '/docs/0.bin'); + self::assertFileDoesNotExist($this->tmpDir . '/docs/0.tombstone'); + self::assertFileExists($this->tmpDir . '/docs/1.bin'); + } + + public function testUpdateDocumentPersistsCorrectly(): void + { + $db = $this->makeDb(); + $db->addDocument(new Document(id: 1, vector: [1.0, 0.0], text: 'original')); + $db->save(); + + // Update the document + $db->updateDocument(new Document( + id: 1, + vector: [0.0, 1.0], + text: 'updated content', + metadata: ['version' => 2], + )); + $db->save(); + + // Reload and verify + $loaded = $this->openDb(); + $results = $loaded->vectorSearch([0.0, 1.0], k: 1); + + self::assertSame(1, $results[0]->document->id); + self::assertSame('updated content', $results[0]->document->text); + self::assertSame(['version' => 2], $results[0]->document->metadata); + } } diff --git a/tests/VectorDatabaseTest.php b/tests/VectorDatabaseTest.php index 4330009..35c9f62 100644 --- a/tests/VectorDatabaseTest.php +++ b/tests/VectorDatabaseTest.php @@ -236,4 +236,130 @@ public function testMetadataIsPreservedInResults(): void self::assertSame(['color' => 'red', 'year' => 2024], $results[0]->document->metadata); } + + // ------------------------------------------------------------------ + // Delete document + // ------------------------------------------------------------------ + + public function testDeleteDocumentReducesCount(): void + { + $db = $this->makeDb(); + $db->addDocument(new Document(id: 1, vector: [1.0, 0.0], text: 'first')); + $db->addDocument(new Document(id: 2, vector: [0.0, 1.0], text: 'second')); + + 
self::assertSame(2, $db->count()); + + $result = $db->deleteDocument(1); + self::assertTrue($result); + self::assertSame(1, $db->count()); + } + + public function testDeleteDocumentExcludesFromVectorSearch(): void + { + $db = $this->makeDb(); + $db->addDocument(new Document(id: 1, vector: [1.0, 0.0], text: 'first')); + $db->addDocument(new Document(id: 2, vector: [0.99, 0.01], text: 'second')); + $db->addDocument(new Document(id: 3, vector: [0.0, 1.0], text: 'third')); + + // Before delete, doc 1 should be closest to [1.0, 0.0] + $results = $db->vectorSearch([1.0, 0.0], 1); + self::assertSame(1, $results[0]->document->id); + + // Delete doc 1 + $db->deleteDocument(1); + + // Now doc 2 should be closest + $results = $db->vectorSearch([1.0, 0.0], 1); + self::assertSame(2, $results[0]->document->id); + } + + public function testDeleteDocumentExcludesFromTextSearch(): void + { + $db = $this->makeDb(); + $db->addDocument(new Document(id: 1, vector: [1.0, 0.0], text: 'unique keyword here')); + $db->addDocument(new Document(id: 2, vector: [0.0, 1.0], text: 'something else')); + + // Before delete + $results = $db->textSearch('unique keyword', 5); + self::assertCount(1, $results); + self::assertSame(1, $results[0]->document->id); + + // Delete doc 1 + $db->deleteDocument(1); + + // After delete: no results for that keyword + $results = $db->textSearch('unique keyword', 5); + self::assertCount(0, $results); + } + + public function testDeleteNonexistentDocumentReturnsFalse(): void + { + $db = $this->makeDb(); + $db->addDocument(new Document(id: 1, vector: [1.0, 0.0], text: 'hello')); + + self::assertFalse($db->deleteDocument(999)); + self::assertFalse($db->deleteDocument('nonexistent')); + } + + public function testDeleteAllowsReinsertWithSameId(): void + { + $db = $this->makeDb(); + $db->addDocument(new Document(id: 'doc', vector: [1.0, 0.0], text: 'original')); + + $db->deleteDocument('doc'); + + // Should not throw - ID is now available + $db->addDocument(new Document(id: 
'doc', vector: [0.0, 1.0], text: 'replacement')); + + $results = $db->textSearch('replacement', 1); + self::assertCount(1, $results); + self::assertSame('doc', $results[0]->document->id); + } + + // ------------------------------------------------------------------ + // Update document + // ------------------------------------------------------------------ + + public function testUpdateDocumentChangesContent(): void + { + $db = $this->makeDb(); + $db->addDocument(new Document(id: 1, vector: [1.0, 0.0], text: 'original content')); + + $result = $db->updateDocument(new Document( + id: 1, + vector: [0.0, 1.0], + text: 'updated content', + metadata: ['version' => 2], + )); + + self::assertTrue($result); + self::assertSame(1, $db->count()); + + // Text search should find updated content + $results = $db->textSearch('updated content', 1); + self::assertCount(1, $results); + self::assertSame(1, $results[0]->document->id); + self::assertSame(['version' => 2], $results[0]->document->metadata); + + // Vector search should use new vector + $results = $db->vectorSearch([0.0, 1.0], 1); + self::assertSame(1, $results[0]->document->id); + } + + public function testUpdateNonexistentDocumentReturnsFalse(): void + { + $db = $this->makeDb(); + + $result = $db->updateDocument(new Document(id: 999, vector: [1.0, 0.0], text: 'new')); + self::assertFalse($result); + } + + public function testUpdateDocumentWithoutIdThrows(): void + { + $db = $this->makeDb(); + $db->addDocument(new Document(id: 1, vector: [1.0, 0.0], text: 'hello')); + + $this->expectException(\RuntimeException::class); + $db->updateDocument(new Document(vector: [0.0, 1.0], text: 'no id')); + } }