Skip to content

Commit 452630e

Browse files
committed
fix(repair): refuse to overwrite when extraction looks truncated (#1208)
The user-reported case in #1208: a palace with 67,580 drawers had its HNSW files manually quarantined to recover from corruption. ``mempalace repair`` then ran cleanly and reported "Drawers found: 10000 ... Repair complete. 10000 drawers rebuilt." Backup was the v3.3.3 chroma.sqlite3 that did contain the full 67,580 — but the rebuilt collection only had the first 10K. 85% data loss, no warning. Root cause: ChromaDB's collection-layer get() silently caps at ``CHROMADB_DEFAULT_GET_LIMIT = 10_000`` rows when reading from a collection whose segment metadata is stale (typical post-quarantine state). col.count() returns the same capped value, so neither the loop bound nor the extraction count flagged the truncation. Fix is defense-in-depth, not a recovery mechanism. Repair now: 1. After extraction, queries chroma.sqlite3 directly via a read-only sqlite3 connection: COUNT(*) FROM embeddings JOIN segments JOIN collections WHERE name='mempalace_drawers'. If that count exceeds the extracted count, abort with a clear message before any destructive operation. 2. Falls back to a weaker check when the SQLite query can't run (chromadb schema drift, locked file): if extracted exactly equals CHROMADB_DEFAULT_GET_LIMIT, that's a strong-enough cap signal to refuse without explicit acknowledgement. 3. Adds ``--confirm-truncation-ok`` (CLI) and ``confirm_truncation_ok`` (rebuild_index kwarg) to override after independent verification. Useful for the rare case of a palace genuinely sized at exactly 10,000 drawers. The guard logic lives in ``repair.check_extraction_safety()`` so the two extraction paths (CLI ``cmd_repair`` and the lower-level ``rebuild_index``) share a single implementation. Raises ``TruncationDetected`` carrying the printable message. Tests: 9 new cases covering the safe path (counts match, SQLite unreadable but well under cap), both abort paths (SQLite higher than extracted, unreadable + at cap), the override flag, and end-to-end behavior of ``rebuild_index`` with the guard wired in. Plus two ``sqlite_drawer_count`` tests for the missing-file and bad-schema cases. What's NOT in this PR: actually recovering the missing 57,580 drawers from the user's case. The on-disk SQLite still holds them; recovery is a separate flow (direct-extract from chroma.sqlite3, bypass the chromadb collection layer entirely). This PR's job is to stop repair from making it worse. Refs #1208.
1 parent 5e57404 commit 452630e

3 files changed

Lines changed: 297 additions & 6 deletions

File tree

mempalace/cli.py

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -410,6 +410,7 @@ def cmd_repair(args):
410410
import shutil
411411
from .backends.chroma import ChromaBackend
412412
from .migrate import confirm_destructive_action, contains_palace_database
413+
from .repair import TruncationDetected, check_extraction_safety
413414

414415
palace_path = os.path.abspath(
415416
os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path
@@ -466,6 +467,23 @@ def cmd_repair(args):
466467
offset += len(batch["ids"])
467468
print(f" Extracted {len(all_ids)} drawers")
468469

470+
# ── #1208 guard ──────────────────────────────────────────────────
471+
# Cross-check against the SQLite ground truth before doing anything
472+
# destructive. Catches the user-reported case where chromadb's
473+
# collection-layer get() silently caps at 10,000 rows even on much
474+
# larger palaces (e.g. after manual HNSW quarantine). Override with
475+
# --confirm-truncation-ok only after independently verifying the
476+
# extraction count is real.
477+
try:
478+
check_extraction_safety(
479+
palace_path,
480+
len(all_ids),
481+
confirm_truncation_ok=getattr(args, "confirm_truncation_ok", False),
482+
)
483+
except TruncationDetected as e:
484+
print(e.message)
485+
return
486+
469487
# Backup and rebuild
470488
palace_path = os.path.normpath(palace_path)
471489
backup_path = palace_path + ".backup"
@@ -868,10 +886,23 @@ def main():
868886
instructions_sub.add_parser(instr_name, help=f"Output {instr_name} instructions")
869887

870888
# repair
871-
sub.add_parser(
889+
p_repair = sub.add_parser(
872890
"repair",
873891
help="Rebuild palace vector index from stored data (fixes segfaults after corruption)",
874-
).add_argument("--yes", action="store_true", help="Skip confirmation for destructive changes")
892+
)
893+
p_repair.add_argument(
894+
"--yes", action="store_true", help="Skip confirmation for destructive changes"
895+
)
896+
p_repair.add_argument(
897+
"--confirm-truncation-ok",
898+
action="store_true",
899+
help=(
900+
"Override the #1208 safety guard. Required when chromadb's collection-layer "
901+
"extraction returns exactly 10,000 drawers and the SQLite ground-truth check "
902+
"either matches or can't be read. Use only after independently confirming "
903+
"the palace really contains that count."
904+
),
905+
)
875906

876907
# mcp
877908
sub.add_parser(

mempalace/repair.py

Lines changed: 144 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -201,13 +201,143 @@ def prune_corrupt(palace_path=None, confirm=False):
201201
print(f" Collection size: {before:,}{after:,}")
202202

203203

204-
def rebuild_index(palace_path=None):
204+
# ChromaDB's ``collection.get()`` enforces an internal default ``limit``
205+
# of 10 000 rows when the caller does not pass one. We pass an explicit
206+
# ``limit=batch_size`` below, but the underlying segment also caps reads
207+
# during stale/quarantined-HNSW recovery flows: extraction silently stops
208+
# at exactly 10 000 even on palaces with many more rows. Refusing to
209+
# overwrite when this exact value comes back is the simplest signal we
210+
# can detect without depending on chromadb internals.
211+
CHROMADB_DEFAULT_GET_LIMIT = 10_000
212+
213+
214+
class TruncationDetected(Exception):
215+
"""Raised by :func:`check_extraction_safety` when extraction looks short.
216+
217+
Carries the human-readable abort message so callers (CLI ``cmd_repair``,
218+
``rebuild_index``) can print and exit consistently without re-deriving
219+
the wording.
220+
"""
221+
222+
def __init__(self, message: str, sqlite_count: "int | None", extracted: int):
223+
super().__init__(message)
224+
self.message = message
225+
self.sqlite_count = sqlite_count
226+
self.extracted = extracted
227+
228+
229+
def check_extraction_safety(
230+
palace_path: str, extracted: int, confirm_truncation_ok: bool = False
231+
) -> None:
232+
"""Cross-check that ``extracted`` matches the SQLite ground truth.
233+
234+
Two signals trip the guard:
235+
236+
1. **Strong** — ``chroma.sqlite3`` reports more drawers than were
237+
extracted. This is the user-reported #1208 case: 67 580 on disk,
238+
10 000 came back through the chromadb collection layer, repair
239+
would have destroyed the difference.
240+
2. **Weak** — extracted count equals exactly ``CHROMADB_DEFAULT_GET_LIMIT``
241+
AND the SQLite check couldn't run (schema drift, locked file).
242+
Hitting the chromadb default ``get()`` cap exactly is suspicious
243+
enough to refuse without explicit acknowledgement.
244+
245+
Raises :class:`TruncationDetected` with a printable message when the
246+
guard fires. Does nothing on safe extractions or when
247+
``confirm_truncation_ok`` is set.
248+
"""
249+
if confirm_truncation_ok:
250+
return
251+
252+
sqlite_count = sqlite_drawer_count(palace_path)
253+
cap_signal = extracted == CHROMADB_DEFAULT_GET_LIMIT
254+
255+
if sqlite_count is not None and sqlite_count > extracted:
256+
loss = sqlite_count - extracted
257+
pct = 100 * loss / sqlite_count
258+
message = (
259+
f"\n ABORT: chroma.sqlite3 reports {sqlite_count:,} drawers but only {extracted:,}\n"
260+
" came back through the chromadb collection layer. The segment metadata is\n"
261+
" stale (often after manual HNSW quarantine) — proceeding would silently\n"
262+
f" destroy {loss:,} drawers (~{pct:.0f}%).\n"
263+
"\n"
264+
" Recovery options:\n"
265+
" 1. Restore from your most recent palace backup, then re-mine.\n"
266+
" 2. Direct-extract from chroma.sqlite3 (rows are still on disk) and\n"
267+
" rebuild the palace from source files.\n"
268+
" 3. If you have independently confirmed the palace really contains only\n"
269+
f" {extracted:,} drawers, re-run with --confirm-truncation-ok.\n"
270+
)
271+
raise TruncationDetected(message, sqlite_count, extracted)
272+
273+
if cap_signal and sqlite_count is None:
274+
message = (
275+
f"\n ABORT: extracted exactly {CHROMADB_DEFAULT_GET_LIMIT:,} drawers, which matches\n"
276+
" ChromaDB's internal default get() limit. The on-disk SQLite count couldn't\n"
277+
" be cross-checked from this Python context, so we can't tell whether the\n"
278+
f" palace genuinely holds {CHROMADB_DEFAULT_GET_LIMIT:,} rows or whether extraction was\n"
279+
" silently capped. Refusing to overwrite the palace.\n"
280+
"\n"
281+
" If you have independently confirmed (e.g. via direct sqlite3 query) that\n"
282+
f" the palace really contains exactly {CHROMADB_DEFAULT_GET_LIMIT:,} drawers, re-run with\n"
283+
" --confirm-truncation-ok.\n"
284+
)
285+
raise TruncationDetected(message, sqlite_count, extracted)
286+
287+
288+
def sqlite_drawer_count(palace_path: str) -> "int | None":
289+
"""Count rows in ``chroma.sqlite3.embeddings`` for the drawers collection.
290+
291+
Used as an independent ground-truth check against the chromadb
292+
collection-layer ``count()`` / ``get()``: when the on-disk SQLite
293+
row count exceeds the extraction count, the segment metadata is
294+
stale and repair would destroy the difference.
295+
296+
Returns ``None`` when the schema isn't readable (chromadb version
297+
drift, missing tables, locked file). Callers treat ``None`` as
298+
"unknown" and fall back to the cap-detection check.
299+
"""
300+
sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
301+
if not os.path.exists(sqlite_path):
302+
return None
303+
try:
304+
import sqlite3
305+
306+
conn = sqlite3.connect(f"file:{sqlite_path}?mode=ro", uri=True)
307+
try:
308+
row = conn.execute(
309+
"""
310+
SELECT COUNT(*)
311+
FROM embeddings e
312+
JOIN segments s ON e.segment_id = s.id
313+
JOIN collections c ON s.collection = c.id
314+
WHERE c.name = ?
315+
""",
316+
(COLLECTION_NAME,),
317+
).fetchone()
318+
return int(row[0]) if row and row[0] is not None else None
319+
finally:
320+
conn.close()
321+
except Exception:
322+
# chromadb schema differs by version (segments / collections column
323+
# names occasionally rename). Silent fallback is correct here —
324+
# the cap-detection check still catches the user-reported case.
325+
return None
326+
327+
328+
def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
205329
"""Rebuild the HNSW index from scratch.
206330
207331
1. Extract all drawers via ChromaDB get()
208-
2. Back up ONLY chroma.sqlite3 (not the bloated HNSW files)
209-
3. Delete and recreate the collection with hnsw:space=cosine
210-
4. Upsert all drawers back
332+
2. Cross-check against the SQLite ground truth (#1208 guard)
333+
3. Back up ONLY chroma.sqlite3 (not the bloated HNSW files)
334+
4. Delete and recreate the collection with hnsw:space=cosine
335+
5. Upsert all drawers back
336+
337+
``confirm_truncation_ok`` overrides the safety guard from step 2.
338+
Set to ``True`` only when you have independently verified that the
339+
palace genuinely contains exactly the extracted number of drawers
340+
(typically only a concern for palaces sized at exactly 10 000 rows).
211341
"""
212342
palace_path = palace_path or _get_palace_path()
213343

@@ -252,6 +382,16 @@ def rebuild_index(palace_path=None):
252382
offset += len(batch["ids"])
253383
print(f" Extracted {len(all_ids)} drawers")
254384

385+
# ── #1208 guard ──────────────────────────────────────────────────
386+
# Refuse to ``delete_collection`` + rebuild when extraction looks
387+
# short of the SQLite ground truth (or when extraction == chromadb
388+
# default get() cap and the SQLite check couldn't run).
389+
try:
390+
check_extraction_safety(palace_path, len(all_ids), confirm_truncation_ok)
391+
except TruncationDetected as e:
392+
print(e.message)
393+
return
394+
255395
# Back up ONLY the SQLite database, not the bloated HNSW files
256396
sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
257397
backup_path = sqlite_path + ".backup"

tests/test_repair.py

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,3 +254,123 @@ def test_rebuild_index_error_reading(mock_backend_cls, mock_shutil, tmp_path):
254254

255255
repair.rebuild_index(palace_path=str(tmp_path))
256256
mock_backend.delete_collection.assert_not_called()
257+
258+
259+
# ── #1208 truncation safety ───────────────────────────────────────────
260+
261+
262+
def test_check_extraction_safety_passes_when_counts_match(tmp_path):
263+
"""SQLite reports same count as extracted → no exception."""
264+
with patch("mempalace.repair.sqlite_drawer_count", return_value=500):
265+
repair.check_extraction_safety(str(tmp_path), 500)
266+
267+
268+
def test_check_extraction_safety_passes_when_sqlite_unreadable_and_under_cap(tmp_path):
269+
"""SQLite check fails (None) but extraction is well under the cap → safe."""
270+
with patch("mempalace.repair.sqlite_drawer_count", return_value=None):
271+
repair.check_extraction_safety(str(tmp_path), 5_000)
272+
273+
274+
def test_check_extraction_safety_aborts_when_sqlite_higher(tmp_path):
275+
"""SQLite reports more than extracted — the user-reported #1208 case."""
276+
with patch("mempalace.repair.sqlite_drawer_count", return_value=67_580):
277+
try:
278+
repair.check_extraction_safety(str(tmp_path), 10_000)
279+
except repair.TruncationDetected as e:
280+
assert e.sqlite_count == 67_580
281+
assert e.extracted == 10_000
282+
assert "67,580" in e.message
283+
assert "10,000" in e.message
284+
assert "57,580" in e.message # the loss number
285+
else:
286+
raise AssertionError("expected TruncationDetected")
287+
288+
289+
def test_check_extraction_safety_aborts_when_unreadable_and_at_cap(tmp_path):
290+
"""SQLite unreadable but extraction == default get() cap → suspicious."""
291+
with patch("mempalace.repair.sqlite_drawer_count", return_value=None):
292+
try:
293+
repair.check_extraction_safety(str(tmp_path), repair.CHROMADB_DEFAULT_GET_LIMIT)
294+
except repair.TruncationDetected as e:
295+
assert e.sqlite_count is None
296+
assert e.extracted == repair.CHROMADB_DEFAULT_GET_LIMIT
297+
assert "10,000" in e.message
298+
else:
299+
raise AssertionError("expected TruncationDetected")
300+
301+
302+
def test_check_extraction_safety_override_skips_check(tmp_path):
303+
"""``confirm_truncation_ok=True`` short-circuits both signals."""
304+
with patch("mempalace.repair.sqlite_drawer_count", return_value=99_999):
305+
# Would normally abort — override allows through
306+
repair.check_extraction_safety(str(tmp_path), 10_000, confirm_truncation_ok=True)
307+
308+
309+
def test_sqlite_drawer_count_returns_none_on_missing_file(tmp_path):
310+
"""Palace dir exists but no chroma.sqlite3 → None, not crash."""
311+
assert repair.sqlite_drawer_count(str(tmp_path)) is None
312+
313+
314+
def test_sqlite_drawer_count_returns_none_on_unreadable_schema(tmp_path):
315+
"""File exists but isn't a chromadb sqlite → None, not crash."""
316+
sqlite_path = os.path.join(str(tmp_path), "chroma.sqlite3")
317+
with open(sqlite_path, "wb") as f:
318+
f.write(b"not a sqlite file at all")
319+
assert repair.sqlite_drawer_count(str(tmp_path)) is None
320+
321+
322+
@patch("mempalace.repair.shutil")
323+
@patch("mempalace.repair.ChromaBackend")
324+
def test_rebuild_index_aborts_on_truncation_signal(mock_backend_cls, mock_shutil, tmp_path):
325+
"""rebuild_index honors the safety guard: SQLite says 67k, get() returns
326+
10k → no delete_collection, no upsert, no backup."""
327+
mock_backend = MagicMock()
328+
mock_col = MagicMock()
329+
mock_col.count.return_value = 10_000
330+
# Single page comes back with 10_000 ids
331+
mock_col.get.side_effect = [
332+
{
333+
"ids": [f"id{i}" for i in range(10_000)],
334+
"documents": ["x"] * 10_000,
335+
"metadatas": [{}] * 10_000,
336+
},
337+
{"ids": [], "documents": [], "metadatas": []},
338+
]
339+
mock_backend.get_collection.return_value = mock_col
340+
mock_backend_cls.return_value = mock_backend
341+
342+
with patch("mempalace.repair.sqlite_drawer_count", return_value=67_580):
343+
repair.rebuild_index(palace_path=str(tmp_path))
344+
345+
# Guard fired: nothing destructive happened
346+
mock_backend.delete_collection.assert_not_called()
347+
mock_backend.create_collection.assert_not_called()
348+
mock_shutil.copy2.assert_not_called()
349+
350+
351+
@patch("mempalace.repair.shutil")
352+
@patch("mempalace.repair.ChromaBackend")
353+
def test_rebuild_index_proceeds_with_override(mock_backend_cls, mock_shutil, tmp_path):
354+
"""Override flag lets repair proceed even when the guard would fire."""
355+
mock_backend = MagicMock()
356+
mock_col = MagicMock()
357+
mock_col.count.return_value = 10_000
358+
mock_col.get.side_effect = [
359+
{
360+
"ids": [f"id{i}" for i in range(10_000)],
361+
"documents": ["x"] * 10_000,
362+
"metadatas": [{}] * 10_000,
363+
},
364+
{"ids": [], "documents": [], "metadatas": []},
365+
]
366+
mock_new_col = MagicMock()
367+
mock_backend.get_collection.return_value = mock_col
368+
mock_backend.create_collection.return_value = mock_new_col
369+
mock_backend_cls.return_value = mock_backend
370+
371+
with patch("mempalace.repair.sqlite_drawer_count", return_value=67_580):
372+
repair.rebuild_index(palace_path=str(tmp_path), confirm_truncation_ok=True)
373+
374+
mock_backend.delete_collection.assert_called_once()
375+
mock_backend.create_collection.assert_called_once()
376+
mock_new_col.upsert.assert_called()

0 commit comments

Comments
 (0)