@@ -201,13 +201,143 @@ def prune_corrupt(palace_path=None, confirm=False):
201201 print (f" Collection size: { before :,} → { after :,} " )
202202
203203
204- def rebuild_index (palace_path = None ):
# ChromaDB's ``collection.get()`` enforces an internal default ``limit``
# of 10 000 rows when the caller does not pass one. We pass an explicit
# ``limit=batch_size`` during extraction, but the underlying segment also
# caps reads during stale/quarantined-HNSW recovery flows: extraction
# silently stops at exactly 10 000 rows even on palaces that hold many
# more. Refusing to overwrite the palace when exactly this many rows come
# back is the simplest signal we can detect without depending on chromadb
# internals.
CHROMADB_DEFAULT_GET_LIMIT = 10_000
212+
213+
class TruncationDetected(Exception):
    """Signal that a drawer extraction appears to have come back short.

    Raised by :func:`check_extraction_safety`. The exception already holds
    the printable abort text, so callers (the CLI ``cmd_repair`` and
    ``rebuild_index``) can print it and stop without rebuilding the wording
    themselves.

    Instance fields:
        message: the human-readable abort message (also ``str(exc)``).
        sqlite_count: the row count read from SQLite, or ``None`` when
            that count was not available.
        extracted: the number of drawers that were actually extracted.
    """

    def __init__(self, message: str, sqlite_count: "int | None", extracted: int):
        # Record the numbers first, then hand the text to ``Exception`` so
        # ``str(exc)`` and ``exc.args`` carry only the message.
        self.extracted = extracted
        self.sqlite_count = sqlite_count
        self.message = message
        super().__init__(message)
227+
228+
def check_extraction_safety(
    palace_path: str, extracted: int, confirm_truncation_ok: bool = False
) -> None:
    """Refuse to continue when the extraction looks shorter than the palace.

    The number of extracted drawers is compared with the row count that
    :func:`sqlite_drawer_count` reads for ``palace_path``. Two situations
    abort the operation:

    * **SQLite count available and larger than ``extracted``** — the strong
      signal (the #1208 report: 67 580 rows on disk, 10 000 extracted).
    * **SQLite count unavailable and ``extracted`` equals
      ``CHROMADB_DEFAULT_GET_LIMIT`` exactly** — the weak signal; landing
      precisely on the cap is treated as suspicious when nothing can
      confirm it.

    Args:
        palace_path: directory passed through to :func:`sqlite_drawer_count`.
        extracted: number of drawers the extraction produced.
        confirm_truncation_ok: when true, skip every check and return.

    Raises:
        TruncationDetected: carrying the printable abort message, the SQLite
            count (``None`` for the weak signal) and ``extracted``.
    """
    if confirm_truncation_ok:
        return

    sqlite_count = sqlite_drawer_count(palace_path)

    if sqlite_count is None:
        # No ground truth available: only the exact-cap coincidence is
        # treated as a reason to stop.
        if extracted == CHROMADB_DEFAULT_GET_LIMIT:
            message = (
                f"\n ABORT: extracted exactly { CHROMADB_DEFAULT_GET_LIMIT :,} drawers, which matches\n "
                " ChromaDB's internal default get() limit. The on-disk SQLite count couldn't\n "
                " be cross-checked from this Python context, so we can't tell whether the\n "
                f" palace genuinely holds { CHROMADB_DEFAULT_GET_LIMIT :,} rows or whether extraction was\n "
                " silently capped. Refusing to overwrite the palace.\n "
                "\n "
                " If you have independently confirmed (e.g. via direct sqlite3 query) that\n "
                f" the palace really contains exactly { CHROMADB_DEFAULT_GET_LIMIT :,} drawers, re-run with\n "
                " --confirm-truncation-ok.\n "
            )
            raise TruncationDetected(message, sqlite_count, extracted)
        return

    if sqlite_count > extracted:
        # Ground truth says rows exist that the extraction never returned.
        loss = sqlite_count - extracted
        pct = 100 * loss / sqlite_count
        message = (
            f"\n ABORT: chroma.sqlite3 reports { sqlite_count :,} drawers but only { extracted :,} \n "
            " came back through the chromadb collection layer. The segment metadata is\n "
            " stale (often after manual HNSW quarantine) — proceeding would silently\n "
            f" destroy { loss :,} drawers (~{ pct :.0f} %).\n "
            "\n "
            " Recovery options:\n "
            " 1. Restore from your most recent palace backup, then re-mine.\n "
            " 2. Direct-extract from chroma.sqlite3 (rows are still on disk) and\n "
            " rebuild the palace from source files.\n "
            " 3. If you have independently confirmed the palace really contains only\n "
            f" { extracted :,} drawers, re-run with --confirm-truncation-ok.\n "
        )
        raise TruncationDetected(message, sqlite_count, extracted)
286+
287+
def sqlite_drawer_count(palace_path: str) -> "int | None":
    """Count the ``embeddings`` rows ``chroma.sqlite3`` holds for the drawers collection.

    This is an independent ground-truth check against the chromadb
    collection-layer ``count()`` / ``get()``: when the on-disk row count
    exceeds the extraction count, the segment metadata is stale and a
    repair would destroy the difference.

    The database is opened read-only through a SQLite ``file:`` URI. The
    path is percent-encoded first, so a palace directory whose name
    contains ``#``, ``?`` or ``%`` is not misread as a URI fragment, query
    string or escape sequence.

    Args:
        palace_path: directory expected to contain ``chroma.sqlite3``.

    Returns:
        The number of ``embeddings`` rows joined through ``segments`` to the
        collection named ``COLLECTION_NAME``, or ``None`` when the file is
        missing or the query cannot be run (schema drift, missing tables,
        locked file). Callers treat ``None`` as "unknown" and fall back to
        the cap-detection check.
    """
    sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
    if not os.path.exists(sqlite_path):
        return None
    try:
        import sqlite3
        from urllib.parse import quote

        # SQLite decodes %HH escapes in the path part of a URI, so quoting
        # keeps the real filename intact while stopping '?' and '#' from
        # ending the path early.
        db_uri = f"file:{quote(sqlite_path)}?mode=ro"
        conn = sqlite3.connect(db_uri, uri=True)
        try:
            row = conn.execute(
                """
                SELECT COUNT(*)
                FROM embeddings e
                JOIN segments s ON e.segment_id = s.id
                JOIN collections c ON s.collection = c.id
                WHERE c.name = ?
                """,
                (COLLECTION_NAME,),
            ).fetchone()
            return int(row[0]) if row and row[0] is not None else None
        finally:
            conn.close()
    except Exception:
        # chromadb schema differs by version (segments / collections column
        # names occasionally rename). Silent fallback is deliberate here —
        # the cap-detection check still catches the user-reported case.
        return None
326+
327+
328+ def rebuild_index (palace_path = None , confirm_truncation_ok : bool = False ):
205329 """Rebuild the HNSW index from scratch.
206330
207331 1. Extract all drawers via ChromaDB get()
208- 2. Back up ONLY chroma.sqlite3 (not the bloated HNSW files)
209- 3. Delete and recreate the collection with hnsw:space=cosine
210- 4. Upsert all drawers back
332+ 2. Cross-check against the SQLite ground truth (#1208 guard)
333+ 3. Back up ONLY chroma.sqlite3 (not the bloated HNSW files)
334+ 4. Delete and recreate the collection with hnsw:space=cosine
335+ 5. Upsert all drawers back
336+
337+ ``confirm_truncation_ok`` overrides the safety guard from step 2.
338+ Set to ``True`` only when you have independently verified that the
339+ palace genuinely contains exactly the extracted number of drawers
340+ (typically only a concern for palaces sized at exactly 10 000 rows).
211341 """
212342 palace_path = palace_path or _get_palace_path ()
213343
@@ -252,6 +382,16 @@ def rebuild_index(palace_path=None):
252382 offset += len (batch ["ids" ])
253383 print (f" Extracted { len (all_ids )} drawers" )
254384
385+ # ── #1208 guard ──────────────────────────────────────────────────
386+ # Refuse to ``delete_collection`` + rebuild when extraction looks
387+ # short of the SQLite ground truth (or when extraction == chromadb
388+ # default get() cap and the SQLite check couldn't run).
389+ try :
390+ check_extraction_safety (palace_path , len (all_ids ), confirm_truncation_ok )
391+ except TruncationDetected as e :
392+ print (e .message )
393+ return
394+
255395 # Back up ONLY the SQLite database, not the bloated HNSW files
256396 sqlite_path = os .path .join (palace_path , "chroma.sqlite3" )
257397 backup_path = sqlite_path + ".backup"
0 commit comments