Skip to content

Commit ef6d21e

Browse files
committed
feat(repair): add --mode reconcile to re-embed SQL-only orphan rows
Some chromadb crash modes (chroma-core/chroma#6979) commit embeddings/embedding_metadata rows transactionally to SQL but lose the corresponding HNSW additions, leaving drawers visible to metadata queries but unreachable to vector search. Hit on a real 533k-drawer palace 2026-04-29: a single 04-28 mining run produced 10,404 such SQL-only orphans. PR #1271's pickle-counts gate would not have caught this — the pickle was internally consistent for what landed; the failure mode is a partial-flush ingest, not pickle corruption. `mempalace repair --mode reconcile --segment <uuid>` finds those orphans, embeds their `chroma:document` payloads with the palace's configured EF, extracts existing vectors from `data_level0.bin` (no re-embed of healthy rows), builds a fresh persistent HNSW index containing both, and atomic-swaps it in. Same safety profile as `--mode hnsw`: single-threaded writes, pickle persisted exactly once at end, rollback on any failure. Verified end-to-end against a live 533,342-drawer palace: 533,342 == 533,342 strict zero-diff post-run, self-query passes, search returns reconciled drawers as top hits. Tests: 6 new in tests/test_repair.py — METADATA-sibling auto-detection, dry-run no-mutation, happy-path append + pickle update, no-orphans short-circuit, off-by-one pickle abort, missing-METADATA abort. 54/54 in test_repair.py and 1349/1349 in the full suite pass. ruff clean. No new dependencies.
1 parent 3c0e980 commit ef6d21e

3 files changed

Lines changed: 559 additions & 4 deletions

File tree

mempalace/cli.py

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -689,6 +689,23 @@ def cmd_repair(args):
689689
)
690690
return
691691

692+
if getattr(args, "mode", "legacy") == "reconcile":
693+
if not getattr(args, "segment", None):
694+
print(" --mode reconcile requires --segment <uuid>")
695+
return
696+
from .repair import reconcile_orphan_sql_rows
697+
698+
reconcile_orphan_sql_rows(
699+
palace_path,
700+
segment=args.segment,
701+
metadata_segment=getattr(args, "metadata_segment", None),
702+
max_elements=getattr(args, "max_elements", None),
703+
backup=getattr(args, "backup", True),
704+
dry_run=getattr(args, "dry_run", False),
705+
assume_yes=getattr(args, "yes", False),
706+
)
707+
return
708+
692709
db_path = os.path.join(palace_path, "chroma.sqlite3")
693710

694711
if not os.path.isdir(palace_path):
@@ -1212,20 +1229,21 @@ def main():
12121229
)
12131230
p_repair.add_argument(
12141231
"--mode",
1215-
choices=["legacy", "max-seq-id", "hnsw"],
1232+
choices=["legacy", "max-seq-id", "hnsw", "reconcile"],
12161233
default="legacy",
12171234
help=(
12181235
"legacy: full-palace rebuild (default). "
12191236
"max-seq-id: un-poison max_seq_id rows corrupted by the legacy 0.6.x shim. "
1220-
"hnsw: rebuild one segment from data_level0.bin (issue #1046)."
1237+
"hnsw: rebuild one segment from data_level0.bin (issue #1046). "
1238+
"reconcile: re-embed SQL-only orphan rows into the HNSW segment."
12211239
),
12221240
)
12231241
p_repair.add_argument(
12241242
"--segment",
12251243
default=None,
12261244
help=(
12271245
"Segment UUID. For --mode max-seq-id: repair only that segment. "
1228-
"For --mode hnsw: required, points to <palace>/<uuid>/."
1246+
"For --mode hnsw or reconcile: required, points to <palace>/<uuid>/."
12291247
),
12301248
)
12311249
p_repair.add_argument(
@@ -1236,6 +1254,14 @@ def main():
12361254
"clean values are copied from its max_seq_id table verbatim."
12371255
),
12381256
)
1257+
p_repair.add_argument(
1258+
"--metadata-segment",
1259+
default=None,
1260+
help=(
1261+
"METADATA segment UUID for --mode reconcile "
1262+
"(auto-detected from sibling-segment lookup when omitted)"
1263+
),
1264+
)
12391265
p_repair.add_argument(
12401266
"--max-elements",
12411267
type=int,

mempalace/repair.py

Lines changed: 303 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@
5454
from datetime import datetime
5555
from typing import Optional
5656

57-
from .backends.chroma import ChromaBackend
57+
from .backends.chroma import ChromaBackend, hnsw_capacity_status
5858

5959
logger = logging.getLogger(__name__)
6060

@@ -1354,6 +1354,308 @@ def rebuild_hnsw_segment(
13541354
return result
13551355

13561356

1357+
# ---------------------------------------------------------------------------
1358+
# reconcile mode: re-embed SQL-only rows that never landed an HNSW label
1359+
# ---------------------------------------------------------------------------
1360+
1361+
1362+
def _resolve_metadata_segment(db_path: str, vector_segment: str) -> Optional[str]:
1363+
"""Find the METADATA segment that shares a collection with ``vector_segment``."""
1364+
if not os.path.isfile(db_path):
1365+
return None
1366+
with sqlite3.connect(db_path) as conn:
1367+
row = conn.execute(
1368+
"SELECT collection FROM segments WHERE id = ?", (vector_segment,)
1369+
).fetchone()
1370+
if not row:
1371+
return None
1372+
rows = conn.execute(
1373+
"SELECT id FROM segments WHERE collection = ? AND scope = 'METADATA'",
1374+
(row[0],),
1375+
).fetchall()
1376+
if len(rows) != 1:
1377+
return None
1378+
return str(rows[0][0])
1379+
1380+
1381+
def _fetch_sql_only_docs(db_path: str, metadata_segment: str, hnsw_uuids: set) -> list:
1382+
"""Return ``[(embedding_id, document), ...]`` for SQL rows missing from HNSW.
1383+
1384+
Stable order by ``embedding_id`` so reconciles are reproducible.
1385+
"""
1386+
with sqlite3.connect(f"file:{db_path}?mode=ro", uri=True) as conn:
1387+
rows = conn.execute(
1388+
"""
1389+
SELECT e.embedding_id, em.string_value
1390+
FROM embeddings e
1391+
JOIN embedding_metadata em ON e.id = em.id
1392+
WHERE e.segment_id = ? AND em.key = 'chroma:document'
1393+
ORDER BY e.embedding_id
1394+
""",
1395+
(metadata_segment,),
1396+
).fetchall()
1397+
return [(uid, doc) for uid, doc in rows if uid not in hnsw_uuids]
1398+
1399+
1400+
def _embed_in_batches(ef, docs, *, dim: int, batch: int = 64):
1401+
"""Encode ``docs`` with ``ef`` in fixed-size batches; return ``np.ndarray``."""
1402+
import numpy as np
1403+
1404+
out = np.empty((len(docs), dim), dtype=np.float32)
1405+
for i in range(0, len(docs), batch):
1406+
j = min(i + batch, len(docs))
1407+
out[i:j] = np.asarray(ef(docs[i:j]), dtype=np.float32)
1408+
return out
1409+
1410+
1411+
def reconcile_orphan_sql_rows(
    palace_path: str,
    *,
    segment: str,
    metadata_segment: Optional[str] = None,
    max_elements: Optional[int] = None,
    backup: bool = True,
    dry_run: bool = False,
    assume_yes: bool = False,
    embedding_function=None,
) -> dict:
    """Re-embed SQL-only embeddings into the HNSW vector ``segment``.

    Some chromadb crash modes (e.g. issue #6979) commit ``embeddings``/
    ``embedding_metadata`` rows transactionally to SQL but lose the
    corresponding HNSW additions, leaving a subset of drawers visible to
    metadata queries but unreachable to vector search. This mode finds
    those orphans, embeds their ``chroma:document`` payloads with the
    palace's configured embedding function, and writes a fresh persistent
    index containing both the existing labels (extracted from
    ``data_level0.bin``) and freshly-allocated labels for the SQL-only
    rows. Atomic swap with rollback on failure — same safety profile as
    ``--mode hnsw``.

    ``metadata_segment`` is auto-detected from the sibling METADATA
    segment in the ``segments`` table when omitted. ``embedding_function``
    is injectable for tests; production callers should leave it as
    ``None`` so the palace's configured EF is resolved.

    Returns a ``dict`` report; ``result["aborted"]`` is True with a
    machine-readable ``result["reason"]`` on every early-exit path.
    """
    from .migrate import confirm_destructive_action, contains_palace_database

    palace_path = os.path.abspath(os.path.expanduser(palace_path))
    seg_dir = os.path.join(palace_path, segment)
    db_path = os.path.join(palace_path, "chroma.sqlite3")
    pickle_path = os.path.join(seg_dir, "index_metadata.pickle")
    data_path = os.path.join(seg_dir, "data_level0.bin")
    header_path = os.path.join(seg_dir, "header.bin")

    # Report skeleton — every return path (abort, dry-run, success) goes
    # through this dict so callers/tests get a uniform shape.
    result: dict = {
        "palace_path": palace_path,
        "segment": segment,
        "dry_run": dry_run,
        "aborted": False,
    }

    print(f"\n{'=' * 55}")
    print(" MemPalace Repair — SQL/HNSW Reconcile")
    print(f"{'=' * 55}\n")
    print(f" Palace: {palace_path}")
    print(f" Segment: {segment}")

    # Preflight: fail fast, before any side effects, on missing inputs.
    if not os.path.isdir(palace_path):
        print(f" No palace found at {palace_path}")
        result["aborted"] = True
        result["reason"] = "palace-missing"
        return result
    if not contains_palace_database(palace_path):
        print(f" No palace database at {palace_path}")
        result["aborted"] = True
        result["reason"] = "db-missing"
        return result
    if not os.path.isdir(seg_dir):
        print(f" Segment directory not found: {seg_dir}")
        result["aborted"] = True
        result["reason"] = "segment-missing"
        return result
    if not os.path.isfile(pickle_path):
        print(f" index_metadata.pickle not found in {seg_dir}")
        result["aborted"] = True
        result["reason"] = "pickle-missing"
        return result

    # Lazy dependency probe: abort with a clean message instead of a
    # traceback on installs without the vector stack.
    try:
        import hnswlib  # noqa: F401
        import numpy  # noqa: F401
    except ImportError as e:
        print(f" Required dependency missing: {e}")
        result["aborted"] = True
        result["reason"] = "deps-missing"
        return result

    # Resolve the sibling METADATA segment unless the caller pinned one;
    # an empty/failed resolution is a hard abort rather than a guess.
    if metadata_segment is None:
        metadata_segment = _resolve_metadata_segment(db_path, segment)
    if not metadata_segment:
        print(" Could not resolve sibling METADATA segment — pass --metadata-segment")
        result["aborted"] = True
        result["reason"] = "metadata-segment-unresolved"
        return result
    print(f" Metadata segment: {metadata_segment}")

    with open(pickle_path, "rb") as f:
        meta = pickle.load(f)
    id_to_label = dict(_meta_get(meta, "id_to_label") or {})
    label_to_id = dict(_meta_get(meta, "label_to_id") or {})
    pickle_total = int(_meta_get(meta, "total_elements_added") or 0)
    # Sanity gate: reconcile appends on top of the pickle's mappings, so an
    # internally inconsistent pickle must be repaired (--mode hnsw) first.
    if len(id_to_label) != pickle_total:
        print(
            f" Pickle inconsistent: id_to_label={len(id_to_label)} "
            f"vs total_elements_added={pickle_total}. Run --mode hnsw first."
        )
        result["aborted"] = True
        result["reason"] = "pickle-inconsistent"
        return result

    # Prefer header.bin when present — presumably a layout where the header
    # lives outside data_level0.bin; TODO(review): confirm against backend.
    src = header_path if os.path.isfile(header_path) else data_path
    with open(src, "rb") as f:
        hdr = _parse_hnsw_header(f.read(100))
    space = _detect_space(palace_path, segment)

    # Orphans = rows in the METADATA segment's SQL tables whose ids the
    # pickle (and hence the HNSW index) has never seen.
    missing = _fetch_sql_only_docs(db_path, metadata_segment, set(id_to_label.keys()))
    print()
    print(" Report")
    print(f" existing labels {len(id_to_label):>10,}")
    print(f" sql-only orphans {len(missing):>10,}")
    print(f" space {space}")
    print(f" dim {hdr.dim}")

    result.update(
        {
            "existing_labels": len(id_to_label),
            "sql_only_orphans": len(missing),
            "space": space,
            "dim": hdr.dim,
        }
    )

    # Nothing to do — short-circuit before any confirmation/mutation.
    if not missing:
        print(" Nothing to reconcile.")
        print(f"\n{'=' * 55}\n")
        return result

    # Dry run ends here: report printed, zero mutation.
    if dry_run:
        print("\n DRY RUN — no embedding, no HNSW write, no swap.\n" + "=" * 55 + "\n")
        return result

    if not confirm_destructive_action("Reconcile HNSW segment", palace_path, assume_yes=assume_yes):
        result["aborted"] = True
        result["reason"] = "user-aborted"
        return result

    # Resolve the palace's configured EF only now (it may be expensive to
    # construct); tests inject a stub via the parameter instead.
    if embedding_function is None:
        from .embedding import get_embedding_function

        embedding_function = get_embedding_function()

    import numpy as np

    with open(data_path, "rb") as f:
        data_bytes = f.read()
    raw_labels, raw_vectors = _extract_vectors(data_bytes, hdr)
    # Drop the raw blob promptly to keep peak RSS down before the big
    # concatenations below.
    del data_bytes
    # Keep only labels the pickle knows about: stale/deleted labels present
    # in the raw file are excluded rather than resurrected.
    keep_set = set(id_to_label.values())
    keep_mask = np.array([int(x) in keep_set for x in raw_labels], dtype=bool)
    existing_labels = raw_labels[keep_mask]
    existing_vectors = raw_vectors[keep_mask]
    if len(existing_labels) != len(id_to_label):
        print(f" WARN: extracted healthy ({len(existing_labels)}) != pickle ({len(id_to_label)}).")

    docs = [d for _uid, d in missing]
    uuids = [u for u, _d in missing]
    new_vectors = _embed_in_batches(embedding_function, docs, dim=hdr.dim)
    # New labels are allocated contiguously above the current maximum so
    # they can never collide with healthy rows.
    max_label = max(id_to_label.values()) if id_to_label else 0
    new_labels = np.arange(max_label + 1, max_label + 1 + len(missing), dtype=np.int64)
    new_total = len(id_to_label) + len(missing)
    new_max = _compute_max_elements(new_total, max_elements)
    # Labels act as hnswlib element ids, so capacity must exceed the highest
    # label value — not just the element count.
    if int(new_labels[-1]) >= new_max:
        new_max = max(new_max, int(new_labels[-1]) + 1)

    all_vectors = np.concatenate([existing_vectors, new_vectors])
    all_labels = np.concatenate([existing_labels.astype(np.int64), new_labels])

    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    backup_dir: Optional[str] = None
    if backup:
        backup_dir = _backup_segment(palace_path, segment, timestamp)
        print(f" Backup: {backup_dir}")

    # Ensure no live chroma handle still maps the segment files we are about
    # to replace.
    _close_chroma_handles(palace_path)

    # Build in a tempdir inside the palace (same filesystem → the later swap
    # can be atomic); any failure removes the tempdir and re-raises, leaving
    # the original segment untouched.
    tmpdir = tempfile.mkdtemp(prefix="mempalace_reconcile_", dir=palace_path)
    t0 = time.time()
    try:
        idx = _build_persistent_index(
            all_vectors,
            all_labels,
            space=space,
            dim=hdr.dim,
            max_elements=new_max,
            persistence_location=tmpdir,
            # Fall back to hnswlib-style defaults when the header lacks them.
            M=hdr.M or 16,
            ef_construction=hdr.ef_construction or 100,
        )
    except Exception:
        shutil.rmtree(tmpdir, ignore_errors=True)
        raise
    build_seconds = time.time() - t0

    # Self-query spot check on a small sample of old + new vectors; a
    # failure rolls back (tempdir removed) before anything is swapped in.
    try:
        sample_n = min(3, len(missing))
        existing_n = min(3, len(existing_vectors))
        sv = np.concatenate([existing_vectors[:existing_n], new_vectors[:sample_n]])
        sl = np.concatenate(
            [
                existing_labels[:existing_n].astype(np.int64),
                new_labels[:sample_n],
            ]
        )
        _self_query_verify(idx, sv, sl, k=min(10, len(all_labels)))
    except Exception:
        shutil.rmtree(tmpdir, ignore_errors=True)
        raise
    finally:
        # Drop the index handle either way so its file handles/memory are
        # released before the swap.
        del idx
        gc.collect()

    # Extend copies of the mappings — originals stay untouched until the
    # single pickle write below.
    new_id_to_label = dict(id_to_label)
    new_label_to_id = dict(label_to_id)
    for uid, lbl in zip(uuids, new_labels):
        new_id_to_label[uid] = int(lbl)
        new_label_to_id[int(lbl)] = uid
    _meta_set(meta, "id_to_label", new_id_to_label)
    _meta_set(meta, "label_to_id", new_label_to_id)
    _meta_set(meta, "total_elements_added", new_total)
    # Pickle is persisted exactly once, into the tempdir, so the atomic swap
    # installs index files and metadata together.
    with open(os.path.join(tmpdir, "index_metadata.pickle"), "wb") as f:
        pickle.dump(meta, f, protocol=pickle.HIGHEST_PROTOCOL)

    _atomic_swap_segment(tmpdir, seg_dir)

    peak_mb = _peak_memory_mb()
    print(
        f"\n Reconcile complete: {len(missing):,} new labels appended in "
        f"{build_seconds:.1f}s (peak RSS ≈ {peak_mb:.0f} MB)"
    )
    print(f" Backup: {backup_dir or '(skipped)'}")
    print(f"\n{'=' * 55}\n")

    result.update(
        {
            "new_labels": len(missing),
            "total_elements_added": new_total,
            "build_seconds": build_seconds,
            "peak_rss_mb": peak_mb,
            "backup": backup_dir,
        }
    )
    return result
1657+
1658+
13571659
if __name__ == "__main__":
13581660
p = argparse.ArgumentParser(description="MemPalace repair tools")
13591661
p.add_argument("--palace", default=None, help="Palace directory path")

0 commit comments

Comments
 (0)