@@ -173,7 +173,9 @@ public function addDocuments(array $documents): void
173173 * The document is soft-deleted from HNSW (excluded from results but kept
174174 * for graph connectivity) and fully removed from the BM25 index.
175175 *
176- * When persistence is enabled, the document file is also deleted from disk.
176+ * When persistence is enabled, a tombstone marker is written immediately so
177+ * the deletion survives a crash. The physical doc file is removed during
178+ * the next `save()` call, after the indexes are fully updated on disk.
177179 * Call `save()` afterward to persist the updated index state.
178180 *
179181 * @param string|int $id The document ID to delete.
@@ -202,17 +204,24 @@ public function deleteDocument(string|int $id): bool
202204 unset($ this ->nodeIdToDoc [$ nodeId ]);
203205 unset($ this ->docIdToNodeId [$ id ]);
204206
205- // Delete document file from disk if persistence is enabled.
207+ // Mark the document for physical deletion when persistence is enabled.
206208 if ($ this ->path !== null ) {
207- $ docFile = $ this ->path . '/docs/ ' . $ nodeId . '.bin ' ;
208- if (file_exists ($ docFile )) {
209- // Suppress PHP warning and handle failure explicitly to keep
210- // on-disk state consistent with in-memory indexes.
211- if (!@unlink ($ docFile ) && file_exists ($ docFile )) {
212- throw new \RuntimeException (
213- "Failed to delete persisted document file: {$ docFile }"
214- );
215- }
209+ // An async pcntl_fork child may still be writing {nodeId}.bin.
210+ // Wait for it to finish so the file is fully on disk before we
211+ // record the tombstone — this keeps the pair (bin + tombstone)
212+ // consistent from the moment the tombstone is created.
213+ $ this ->getDocumentStore ()->waitForNode ($ nodeId );
214+
215+ // Write a tombstone instead of immediately removing the doc file.
216+ // The physical removal happens in save() AFTER the index files have
217+ // been updated, giving us crash-safety:
218+ // • crash before save() → open() finds the tombstone and
219+ // re-applies the deletion in memory.
220+ // • crash during save() → at worst the doc file is an orphan;
221+ // the indexes already reflect the deletion.
222+ $ tombstone = $ this ->path . '/docs/ ' . $ nodeId . '.tombstone ' ;
223+ if (file_put_contents ($ tombstone , '' ) === false ) {
224+ throw new \RuntimeException ("Failed to write tombstone file: {$ tombstone }" );
216225 }
217226 }
218227
@@ -354,9 +363,11 @@ public function hybridSearch(
354363 * 2. `meta.json` — distance code, dimension, nextId, docIdToNodeId.
355364 * 3. `hnsw.bin` — HNSW graph (vectors + connections).
356365 * 4. `bm25.bin` — BM25 inverted index.
366+ * 5. Removes `docs/{n}.bin` + `docs/{n}.tombstone` for every pending deletion.
357367 *
358368 * Individual `docs/{n}.bin` files are written incrementally by `addDocument()`
359- * and are NOT re-written by this method.
369+ * and are NOT re-written by this method. Deletion of doc files is deferred
370+ * to this method so the on-disk state is always consistent.
360371 *
361372 * @throws \RuntimeException if no path was configured or on I/O failure.
362373 */
@@ -398,6 +409,18 @@ public function save(): void
398409 $ serializer = new IndexSerializer ();
399410 $ serializer ->writeHnsw ($ this ->path . '/hnsw.bin ' , $ hnswState );
400411 $ serializer ->writeBm25 ($ this ->path . '/bm25.bin ' , $ this ->bm25Index ->exportState ());
412+
413+ // Now that all index files reflect the current state, it is safe to
414+ // physically remove doc files for pending tombstone deletions.
415+ $ docsDir = $ this ->path . '/docs ' ;
416+ foreach (glob ($ docsDir . '/*.tombstone ' ) ?: [] as $ tombstoneFile ) {
417+ $ nodeId = (int ) basename ($ tombstoneFile , '.tombstone ' );
418+ $ binFile = $ docsDir . '/ ' . $ nodeId . '.bin ' ;
419+ if (file_exists ($ binFile )) {
420+ @unlink ($ binFile );
421+ }
422+ @unlink ($ tombstoneFile );
423+ }
401424 }
402425
403426 /**
@@ -484,6 +507,32 @@ public static function open(
484507
485508 // $db->nodeIdToDoc intentionally starts EMPTY — documents are lazy-loaded.
486509
510+ // ── Reconcile crash-interrupted deletions ─────────────────────────
511+ // A tombstone file docs/{nodeId}.tombstone is written by deleteDocument()
512+ // before save() is called. If the process crashed between those two
513+ // steps the tombstone survives but the indexes were not yet updated.
514+ // Re-apply the pending deletion now so the loaded state is consistent.
515+ $ docsDir = $ path . '/docs ' ;
516+ if (is_dir ($ docsDir )) {
517+ foreach (glob ($ docsDir . '/*.tombstone ' ) ?: [] as $ tombstoneFile ) {
518+ $ nodeId = (int ) basename ($ tombstoneFile , '.tombstone ' );
519+
520+ // Apply the deletion only when the node is still present in the
521+ // loaded indexes (i.e., save() had not yet been called).
522+ if (isset ($ nodeIdToDocId [$ nodeId ])) {
523+ $ docId = $ nodeIdToDocId [$ nodeId ];
524+ $ db ->hnswIndex ->delete ($ nodeId );
525+ $ db ->bm25Index ->removeDocument ($ nodeId );
526+ unset($ db ->docIdToNodeId [$ docId ]);
527+ }
528+
529+ // Always clean up — covers the edge case where the process
530+ // crashed after indexes were written but before file removal.
531+ @unlink ($ docsDir . '/ ' . $ nodeId . '.bin ' );
532+ @unlink ($ tombstoneFile );
533+ }
534+ }
535+
487536 return $ db ;
488537 }
489538
0 commit comments