Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,26 @@ Available providers:
- `ItalianStopWords` - Italian stop words
- `FileStopWords` - Load from file

## Deleting and updating documents

```php
// Delete a document by ID
$deleted = $db->deleteDocument(1); // returns true if found, false otherwise

// Update a document (delete + insert with same ID)
$updated = $db->updateDocument(new Document(
id: 1,
vector: [0.5, 0.5, 0.3, 0.2],
text: 'Updated content here',
metadata: ['version' => 2],
));

// After modifications, call save() to persist
$db->save();
```

Deleted documents are soft-deleted in the HNSW graph (their nodes are kept for graph connectivity but excluded from search results) and fully removed from the BM25 index. Document files are deleted from disk immediately; index changes are persisted the next time you call `save()`.

## Custom tokenizer

Implement `TokenizerInterface` to plug in stemming, lemmatization, or any language-specific logic.
Expand Down
63 changes: 61 additions & 2 deletions src/BM25/Index.php
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,13 @@ final class Index
/** @var array<int, Document> nodeId → Document */
private array $documents = [];

/**
* Per-document term list (unique terms only).
* Enables O(|terms in doc|) deletion instead of O(|vocabulary|).
* @var array<int, string[]>
*/
private array $docTerms = [];

public function __construct(
private readonly Config $config = new Config(),
private readonly TokenizerInterface $tokenizer = new SimpleTokenizer(),
Expand Down Expand Up @@ -73,6 +80,9 @@ public function addDocument(int $nodeId, Document $document): void
foreach ($termFreqs as $term => $tf) {
$this->invertedIndex[$term][$nodeId] = $tf;
}

// Track which terms this document contributed so removal is O(|terms in doc|).
$this->docTerms[$nodeId] = array_keys($termFreqs);
}

/**
Expand Down Expand Up @@ -192,6 +202,39 @@ public function count(): int
return count($this->documents);
}

/**
 * Drop a document, its length bookkeeping and all of its postings from the index.
 *
 * Only the terms recorded for this document in $docTerms are visited, so the
 * inverted index is not scanned as a whole.
 *
 * @param int $nodeId Internal node-ID of the document to remove.
 * @return bool True when the document existed and was removed, false otherwise.
 */
public function removeDocument(int $nodeId): bool
{
    $exists = isset($this->documents[$nodeId]);

    if ($exists) {
        // Give back this document's tokens to the corpus-wide total.
        $length = $this->docLengths[$nodeId] ?? null;
        if ($length !== null) {
            $this->totalTokens -= $length;
            unset($this->docLengths[$nodeId]);
        }

        // Strip the document from every posting list it appears in.
        $terms = $this->docTerms[$nodeId] ?? [];
        foreach ($terms as $term) {
            unset($this->invertedIndex[$term][$nodeId]);

            if (!empty($this->invertedIndex[$term])) {
                continue;
            }

            // The posting list is now empty: drop the term itself to save memory.
            unset($this->invertedIndex[$term]);
        }

        unset($this->docTerms[$nodeId], $this->documents[$nodeId]);
    }

    return $exists;
}

/** Vocabulary size (unique terms in the index). */
public function vocabularySize(): int
{
Expand All @@ -204,7 +247,8 @@ public function vocabularySize(): int
* @return array{
* totalTokens: int,
* docLengths: array<int, int>,
* invertedIndex: array<string, array<int, int>>,
* docTerms: array<int, string[]>
* }
*/
public function exportState(): array
Expand All @@ -213,6 +257,7 @@ public function exportState(): array
'totalTokens' => $this->totalTokens,
'docLengths' => $this->docLengths,
'invertedIndex' => $this->invertedIndex,
'docTerms' => $this->docTerms,
];
}

Expand All @@ -223,7 +268,8 @@ public function exportState(): array
* @param array{
* totalTokens: int,
* docLengths: array<int, int>,
* invertedIndex: array<string, array<int, int>>,
* docTerms?: array<int, string[]>
* } $state
* @param array<int, Document> $documents nodeId → Document (from HNSW index)
*/
Expand All @@ -233,5 +279,18 @@ public function importState(array $state, array $documents): void
$this->docLengths = $state['docLengths'];
$this->invertedIndex = $state['invertedIndex'];
$this->documents = $documents;

// Rebuild docTerms from the inverted index when loading older snapshots
// that were persisted before this field was introduced.
if (isset($state['docTerms'])) {
$this->docTerms = $state['docTerms'];
} else {
$this->docTerms = [];
foreach ($this->invertedIndex as $term => $postings) {
foreach (array_keys($postings) as $nId) {
$this->docTerms[$nId][] = $term;
}
}
}
}
}
84 changes: 77 additions & 7 deletions src/HNSW/Index.php
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,14 @@ final class Index
/** Expected vector dimension (set on first insert). */
private ?int $dimension = null;

/**
* Set of soft-deleted node IDs.
* Deleted nodes remain in the graph for connectivity but are excluded from results.
*
* @var array<int, true>
*/
private array $deleted = [];

/**
* Resolved distance closure — built once in the constructor so the
* per-call match() dispatch is removed from the hot path.
Expand Down Expand Up @@ -353,18 +361,69 @@ public function search(array $query, int $k = 10, ?int $ef = null): array
[$epDist, $ep] = $this->searchLayerGreedy($qv, $ep, $epDist, $lc);
}

// Full beam search at layer 0.
$W = $this->searchLayer($qv, [[$epDist, $ep]], $ef, 0);
// Full beam search at layer 0, retrying with a larger ef when soft-deleted
// nodes shrink the active result set below $k. Doubling ef on each retry
// costs at most O(log(totalNodes / ef)) extra passes in the worst case.
$currentEf = $ef;
$totalNodes = count($this->nodes);

do {
$W = $this->searchLayer($qv, [[$epDist, $ep]], $currentEf, 0);

// Filter out soft-deleted nodes.
if (!empty($this->deleted)) {
$W = array_values(array_filter(
$W,
fn(array $pair) => !isset($this->deleted[$pair[1]])
));
}

// Stop when we have enough active results, or ef already spans all nodes
// (further expansion cannot surface new candidates).
if (count($W) >= $k || $currentEf >= $totalNodes) {
break;
}

$currentEf = min($currentEf * 2, $totalNodes);
} while (true);

// Take the k nearest and convert to SearchResult.
$topK = array_slice($W, 0, $k);
return $this->toSearchResults($topK);
}

/**
 * Total number of active (non-deleted) documents in the index.
 *
 * Soft-deleted nodes are kept in $this->nodes for graph connectivity, so the
 * size of the deleted set is subtracted from the raw node count.
 *
 * @return int Number of nodes minus the number of soft-deleted nodes.
 */
public function count(): int
{
    return count($this->nodes) - count($this->deleted);
}

/**
 * Soft-delete a node by its internal ID.
 *
 * The node is only flagged in the deleted set; it stays in the graph for
 * connectivity and is excluded from search results. Physically removing
 * nodes would require expensive graph repairs, which is why HNSW deletion
 * is handled this way.
 *
 * @return bool True if the node was deleted, false if it didn't exist or was already deleted.
 */
public function delete(int $nodeId): bool
{
    // Only a node that exists and is not yet flagged can be deleted.
    $deletable = isset($this->nodes[$nodeId]) && !isset($this->deleted[$nodeId]);

    if ($deletable) {
        $this->deleted[$nodeId] = true;
    }

    return $deletable;
}

/**
 * Report whether the given node is flagged as soft-deleted.
 *
 * @return bool True when $nodeId is present in the deleted set.
 */
public function isDeleted(int $nodeId): bool
{
    $marker = $this->deleted[$nodeId] ?? null;

    return $marker !== null;
}

/**
Expand Down Expand Up @@ -396,7 +455,8 @@ public function getDocuments(): array
* maxLayer: int,
* dimension: int|null,
* nodes: array<int, array{maxLayer: int, vector: float[], connections: array<int, int[]>}>,
* documents: array<int, array{id: string|int, text: string|null, metadata: array}>,
* deleted: int[]
* }
*/
public function exportState(): array
Expand Down Expand Up @@ -425,6 +485,7 @@ public function exportState(): array
'dimension' => $this->dimension,
'nodes' => $nodes,
'documents' => $documents,
'deleted' => array_keys($this->deleted),
];
}

Expand All @@ -437,7 +498,8 @@ public function exportState(): array
* maxLayer: int,
* dimension: int|null,
* nodes: array<int, array{maxLayer: int, vector: float[], connections: array<int, int[]>}>,
* documents: array<int, array{id: string|int, text: string|null, metadata: array}>,
* deleted?: int[]
* } $state
*/
public function importState(array $state): void
Expand All @@ -450,6 +512,14 @@ public function importState(array $state): void

$this->nodes = [];
$this->documents = [];
$this->deleted = [];

// Restore deleted set.
if (!empty($state['deleted'])) {
foreach ($state['deleted'] as $deletedId) {
$this->deleted[(int) $deletedId] = true;
}
}

foreach ($state['nodes'] as $nodeId => $nodeData) {
$node = new Node((int) $nodeId, $nodeData['vector'], $nodeData['maxLayer']);
Expand Down
31 changes: 28 additions & 3 deletions src/Persistence/DocumentStore.php
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,13 @@
*/
final class DocumentStore
{
/**
* PIDs of outstanding async child processes, keyed by nodeId.
* Keying by nodeId lets waitForNode() drain exactly one write without
* blocking every other in-flight write.
*
* @var array<int, int> nodeId → PID
*/
private array $pendingPids = [];

public function __construct(private readonly string $docsDir) {}
Expand Down Expand Up @@ -59,8 +65,8 @@ public function write(
$this->writeSync($nodeId, $docId, $text, $metadata);
exit(0);
} else {
// Parent: record PID and return.
$this->pendingPids[] = $pid;
// Parent: record PID keyed by nodeId and return.
$this->pendingPids[$nodeId] = $pid;
return;
}
}
Expand All @@ -69,6 +75,25 @@ public function write(
$this->writeSync($nodeId, $docId, $text, $metadata);
}

/**
 * Wait for the pending async write of one node, if there is one.
 *
 * Call this before deleting a node's file so a late child write cannot
 * recreate {nodeId}.bin after the unlink(). Returns immediately when no
 * PID is recorded for the node.
 *
 * @param int $nodeId Node whose outstanding write should be drained.
 */
public function waitForNode(int $nodeId): void
{
    $childPid = $this->pendingPids[$nodeId] ?? null;

    if ($childPid !== null) {
        // Without the pcntl extension there is nothing to wait on; the entry is still cleared.
        if (function_exists('pcntl_waitpid')) {
            pcntl_waitpid($childPid, $status);
        }

        unset($this->pendingPids[$nodeId]);
    }
}

/**
* Block until every outstanding async write has completed.
* Must be called before index files are written (see VectorDatabase::save()).
Expand Down
Loading
Loading