|
54 | 54 | from datetime import datetime |
55 | 55 | from typing import Optional |
56 | 56 |
|
57 | | -from .backends.chroma import ChromaBackend |
| 57 | +from .backends.chroma import ChromaBackend, hnsw_capacity_status |
58 | 58 |
|
59 | 59 | logger = logging.getLogger(__name__) |
60 | 60 |
|
@@ -1354,6 +1354,308 @@ def rebuild_hnsw_segment( |
1354 | 1354 | return result |
1355 | 1355 |
|
1356 | 1356 |
|
| 1357 | +# --------------------------------------------------------------------------- |
| 1358 | +# reconcile mode: re-embed SQL-only rows that never landed an HNSW label |
| 1359 | +# --------------------------------------------------------------------------- |
| 1360 | + |
| 1361 | + |
| 1362 | +def _resolve_metadata_segment(db_path: str, vector_segment: str) -> Optional[str]: |
| 1363 | + """Find the METADATA segment that shares a collection with ``vector_segment``.""" |
| 1364 | + if not os.path.isfile(db_path): |
| 1365 | + return None |
| 1366 | + with sqlite3.connect(db_path) as conn: |
| 1367 | + row = conn.execute( |
| 1368 | + "SELECT collection FROM segments WHERE id = ?", (vector_segment,) |
| 1369 | + ).fetchone() |
| 1370 | + if not row: |
| 1371 | + return None |
| 1372 | + rows = conn.execute( |
| 1373 | + "SELECT id FROM segments WHERE collection = ? AND scope = 'METADATA'", |
| 1374 | + (row[0],), |
| 1375 | + ).fetchall() |
| 1376 | + if len(rows) != 1: |
| 1377 | + return None |
| 1378 | + return str(rows[0][0]) |
| 1379 | + |
| 1380 | + |
| 1381 | +def _fetch_sql_only_docs(db_path: str, metadata_segment: str, hnsw_uuids: set) -> list: |
| 1382 | + """Return ``[(embedding_id, document), ...]`` for SQL rows missing from HNSW. |
| 1383 | +
|
| 1384 | + Stable order by ``embedding_id`` so reconciles are reproducible. |
| 1385 | + """ |
| 1386 | + with sqlite3.connect(f"file:{db_path}?mode=ro", uri=True) as conn: |
| 1387 | + rows = conn.execute( |
| 1388 | + """ |
| 1389 | + SELECT e.embedding_id, em.string_value |
| 1390 | + FROM embeddings e |
| 1391 | + JOIN embedding_metadata em ON e.id = em.id |
| 1392 | + WHERE e.segment_id = ? AND em.key = 'chroma:document' |
| 1393 | + ORDER BY e.embedding_id |
| 1394 | + """, |
| 1395 | + (metadata_segment,), |
| 1396 | + ).fetchall() |
| 1397 | + return [(uid, doc) for uid, doc in rows if uid not in hnsw_uuids] |
| 1398 | + |
| 1399 | + |
| 1400 | +def _embed_in_batches(ef, docs, *, dim: int, batch: int = 64): |
| 1401 | + """Encode ``docs`` with ``ef`` in fixed-size batches; return ``np.ndarray``.""" |
| 1402 | + import numpy as np |
| 1403 | + |
| 1404 | + out = np.empty((len(docs), dim), dtype=np.float32) |
| 1405 | + for i in range(0, len(docs), batch): |
| 1406 | + j = min(i + batch, len(docs)) |
| 1407 | + out[i:j] = np.asarray(ef(docs[i:j]), dtype=np.float32) |
| 1408 | + return out |
| 1409 | + |
| 1410 | + |
def reconcile_orphan_sql_rows(
    palace_path: str,
    *,
    segment: str,
    metadata_segment: Optional[str] = None,
    max_elements: Optional[int] = None,
    backup: bool = True,
    dry_run: bool = False,
    assume_yes: bool = False,
    embedding_function=None,
) -> dict:
    """Re-embed SQL-only embeddings into the HNSW vector ``segment``.

    Some chromadb crash modes (e.g. issue #6979) commit ``embeddings``/
    ``embedding_metadata`` rows transactionally to SQL but lose the
    corresponding HNSW additions, leaving a subset of drawers visible to
    metadata queries but unreachable to vector search. This mode finds
    those orphans, embeds their ``chroma:document`` payloads with the
    palace's configured embedding function, and writes a fresh persistent
    index containing both the existing labels (extracted from
    ``data_level0.bin``) and freshly-allocated labels for the SQL-only
    rows. Atomic swap with rollback on failure — same safety profile as
    ``--mode hnsw``.

    ``metadata_segment`` is auto-detected from the sibling METADATA
    segment in the ``segments`` table when omitted. ``embedding_function``
    is injectable for tests; production callers should leave it as
    ``None`` so the palace's configured EF is resolved.
    """
    # NOTE(review): imported locally, presumably to avoid an import cycle
    # with .migrate — confirm before hoisting to module level.
    from .migrate import confirm_destructive_action, contains_palace_database

    palace_path = os.path.abspath(os.path.expanduser(palace_path))
    seg_dir = os.path.join(palace_path, segment)
    db_path = os.path.join(palace_path, "chroma.sqlite3")
    pickle_path = os.path.join(seg_dir, "index_metadata.pickle")
    data_path = os.path.join(seg_dir, "data_level0.bin")
    header_path = os.path.join(seg_dir, "header.bin")

    # Returned on every path; ``aborted``/``reason`` give callers and tests a
    # machine-readable outcome without parsing the printed report.
    result: dict = {
        "palace_path": palace_path,
        "segment": segment,
        "dry_run": dry_run,
        "aborted": False,
    }

    print(f"\n{'=' * 55}")
    print(" MemPalace Repair — SQL/HNSW Reconcile")
    print(f"{'=' * 55}\n")
    print(f" Palace: {palace_path}")
    print(f" Segment: {segment}")

    # --- preflight: bail out early with a specific reason string -----------
    if not os.path.isdir(palace_path):
        print(f" No palace found at {palace_path}")
        result["aborted"] = True
        result["reason"] = "palace-missing"
        return result
    if not contains_palace_database(palace_path):
        print(f" No palace database at {palace_path}")
        result["aborted"] = True
        result["reason"] = "db-missing"
        return result
    if not os.path.isdir(seg_dir):
        print(f" Segment directory not found: {seg_dir}")
        result["aborted"] = True
        result["reason"] = "segment-missing"
        return result
    if not os.path.isfile(pickle_path):
        print(f" index_metadata.pickle not found in {seg_dir}")
        result["aborted"] = True
        result["reason"] = "pickle-missing"
        return result

    # Optional heavy deps: checked up front so we fail before any backup or
    # destructive step, not halfway through the rebuild.
    try:
        import hnswlib  # noqa: F401
        import numpy  # noqa: F401
    except ImportError as e:
        print(f" Required dependency missing: {e}")
        result["aborted"] = True
        result["reason"] = "deps-missing"
        return result

    # Resolve the sibling METADATA segment holding the chroma:document rows.
    if metadata_segment is None:
        metadata_segment = _resolve_metadata_segment(db_path, segment)
        if not metadata_segment:
            print(" Could not resolve sibling METADATA segment — pass --metadata-segment")
            result["aborted"] = True
            result["reason"] = "metadata-segment-unresolved"
            return result
    print(f" Metadata segment: {metadata_segment}")

    # Load the uuid<->label maps; an inconsistent pickle means the segment is
    # already damaged in a way this mode does not repair — `--mode hnsw` first.
    with open(pickle_path, "rb") as f:
        meta = pickle.load(f)
    id_to_label = dict(_meta_get(meta, "id_to_label") or {})
    label_to_id = dict(_meta_get(meta, "label_to_id") or {})
    pickle_total = int(_meta_get(meta, "total_elements_added") or 0)
    if len(id_to_label) != pickle_total:
        print(
            f" Pickle inconsistent: id_to_label={len(id_to_label)} "
            f"vs total_elements_added={pickle_total}. Run --mode hnsw first."
        )
        result["aborted"] = True
        result["reason"] = "pickle-inconsistent"
        return result

    # Prefer header.bin when present, else the data file's leading bytes.
    # Reads 100 bytes — assumes the fields _parse_hnsw_header needs (dim, M,
    # ef_construction) fit in that prefix; TODO confirm against the helper.
    src = header_path if os.path.isfile(header_path) else data_path
    with open(src, "rb") as f:
        hdr = _parse_hnsw_header(f.read(100))
    space = _detect_space(palace_path, segment)

    # SQL rows whose uuid never got an HNSW label — the orphans to re-embed.
    missing = _fetch_sql_only_docs(db_path, metadata_segment, set(id_to_label.keys()))
    print()
    print(" Report")
    print(f" existing labels {len(id_to_label):>10,}")
    print(f" sql-only orphans {len(missing):>10,}")
    print(f" space {space}")
    print(f" dim {hdr.dim}")

    result.update(
        {
            "existing_labels": len(id_to_label),
            "sql_only_orphans": len(missing),
            "space": space,
            "dim": hdr.dim,
        }
    )

    if not missing:
        print(" Nothing to reconcile.")
        print(f"\n{'=' * 55}\n")
        return result

    # Dry run stops before anything destructive (and before embedding cost).
    if dry_run:
        print("\n DRY RUN — no embedding, no HNSW write, no swap.\n" + "=" * 55 + "\n")
        return result

    if not confirm_destructive_action("Reconcile HNSW segment", palace_path, assume_yes=assume_yes):
        result["aborted"] = True
        result["reason"] = "user-aborted"
        return result

    # Resolve the palace's configured EF unless a test injected one.
    if embedding_function is None:
        from .embedding import get_embedding_function

        embedding_function = get_embedding_function()

    import numpy as np

    # Pull the healthy vectors straight out of data_level0.bin, keeping only
    # labels the pickle knows about; `del` drops the raw bytes early to cap RSS.
    with open(data_path, "rb") as f:
        data_bytes = f.read()
    raw_labels, raw_vectors = _extract_vectors(data_bytes, hdr)
    del data_bytes
    keep_set = set(id_to_label.values())
    keep_mask = np.array([int(x) in keep_set for x in raw_labels], dtype=bool)
    existing_labels = raw_labels[keep_mask]
    existing_vectors = raw_vectors[keep_mask]
    if len(existing_labels) != len(id_to_label):
        print(f" WARN: extracted healthy ({len(existing_labels)}) != pickle ({len(id_to_label)}).")

    # Embed the orphan documents and allocate fresh labels above the current
    # maximum, so existing labels are never reused.
    docs = [d for _uid, d in missing]
    uuids = [u for u, _d in missing]
    new_vectors = _embed_in_batches(embedding_function, docs, dim=hdr.dim)
    max_label = max(id_to_label.values()) if id_to_label else 0
    new_labels = np.arange(max_label + 1, max_label + 1 + len(missing), dtype=np.int64)
    new_total = len(id_to_label) + len(missing)
    new_max = _compute_max_elements(new_total, max_elements)
    # Capacity must exceed the highest label, not just the element count
    # (labels can be sparse when old deletions left gaps).
    if int(new_labels[-1]) >= new_max:
        new_max = max(new_max, int(new_labels[-1]) + 1)

    all_vectors = np.concatenate([existing_vectors, new_vectors])
    all_labels = np.concatenate([existing_labels.astype(np.int64), new_labels])

    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    backup_dir: Optional[str] = None
    if backup:
        backup_dir = _backup_segment(palace_path, segment, timestamp)
        print(f" Backup: {backup_dir}")

    # Release any live chroma file handles before touching segment files.
    _close_chroma_handles(palace_path)

    # Build the replacement index in a sibling tempdir (same filesystem as the
    # palace, so the later swap can be atomic); rmtree on any failure.
    tmpdir = tempfile.mkdtemp(prefix="mempalace_reconcile_", dir=palace_path)
    t0 = time.time()
    try:
        idx = _build_persistent_index(
            all_vectors,
            all_labels,
            space=space,
            dim=hdr.dim,
            max_elements=new_max,
            persistence_location=tmpdir,
            M=hdr.M or 16,
            ef_construction=hdr.ef_construction or 100,
        )
    except Exception:
        shutil.rmtree(tmpdir, ignore_errors=True)
        raise
    build_seconds = time.time() - t0

    # Self-query a small sample (up to 3 existing + 3 new vectors) to verify
    # the fresh index actually answers before we commit to the swap.
    try:
        sample_n = min(3, len(missing))
        existing_n = min(3, len(existing_vectors))
        sv = np.concatenate([existing_vectors[:existing_n], new_vectors[:sample_n]])
        sl = np.concatenate(
            [
                existing_labels[:existing_n].astype(np.int64),
                new_labels[:sample_n],
            ]
        )
        _self_query_verify(idx, sv, sl, k=min(10, len(all_labels)))
    except Exception:
        shutil.rmtree(tmpdir, ignore_errors=True)
        raise
    finally:
        # Drop the index handle either way so its files are closed pre-swap.
        del idx
        gc.collect()

    # Extend the uuid<->label maps with the new allocations and write the
    # pickle into the tempdir so the swap replaces index + metadata together.
    new_id_to_label = dict(id_to_label)
    new_label_to_id = dict(label_to_id)
    for uid, lbl in zip(uuids, new_labels):
        new_id_to_label[uid] = int(lbl)
        new_label_to_id[int(lbl)] = uid
    _meta_set(meta, "id_to_label", new_id_to_label)
    _meta_set(meta, "label_to_id", new_label_to_id)
    _meta_set(meta, "total_elements_added", new_total)
    with open(os.path.join(tmpdir, "index_metadata.pickle"), "wb") as f:
        pickle.dump(meta, f, protocol=pickle.HIGHEST_PROTOCOL)

    # Point of no return: tempdir replaces the live segment directory.
    _atomic_swap_segment(tmpdir, seg_dir)

    peak_mb = _peak_memory_mb()
    print(
        f"\n Reconcile complete: {len(missing):,} new labels appended in "
        f"{build_seconds:.1f}s (peak RSS ≈ {peak_mb:.0f} MB)"
    )
    print(f" Backup: {backup_dir or '(skipped)'}")
    print(f"\n{'=' * 55}\n")

    result.update(
        {
            "new_labels": len(missing),
            "total_elements_added": new_total,
            "build_seconds": build_seconds,
            "peak_rss_mb": peak_mb,
            "backup": backup_dir,
        }
    )
    return result
| 1657 | + |
| 1658 | + |
1357 | 1659 | if __name__ == "__main__": |
1358 | 1660 | p = argparse.ArgumentParser(description="MemPalace repair tools") |
1359 | 1661 | p.add_argument("--palace", default=None, help="Palace directory path") |
|
0 commit comments