Skip to content

Commit 366a9ad

Browse files
jpheinclaude
andcommitted
refactor(cli): collection.delete(where=) instead of nuke-and-rebuild
Addresses @igorls's review on MemPalace#1087: 1. **Premise was wrong (MemPalace#521 doesn't apply to delete-by-where).** The original implementation extracted survivors, rmtree'd the palace, and re-inserted into a fresh collection — predicated on the idea that collection.delete(where=...) would trigger the same updatePoint / repairConnectionsForUpdate race as the upsert path in MemPalace#521. It doesn't: chromadb's filter-delete bypasses the HNSW update codepath that races. The simpler approach is correct. 2. **Embedding function preserved.** No more bypassing ChromaBackend.create_collection / _resolve_embedding_function — we never recreate the collection at all. 3. **No data loss on interrupt.** No rmtree means no window where a crash leaves the palace empty. 4. **Routes through the backend.** Uses ChromaBackend.get_collection instead of `import chromadb; PersistentClient(...)` directly. No more `del col, client` reliance on refcount finalization. 5. **Reuses confirm_destructive_action.** Replaces the custom `input(...)` prompt with the migrate.py helper. End-to-end test added: real chromadb palace, real backend, real delete, verify count + surviving ids. Catches the embedding-function regression that was the load-bearing concern in the review. Closes MemPalace#848 narrowly (wing/room delete). Source-file / query / dry-run modes from MemPalace#848's broader ask are deliberately NOT in scope here — filing as follow-ups if there's appetite. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 5d8a9de commit 366a9ad

2 files changed

Lines changed: 169 additions & 86 deletions

File tree

mempalace/cli.py

Lines changed: 57 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -419,29 +419,29 @@ def cmd_migrate(args):
419419
def cmd_purge(args):
420420
"""Delete drawers by wing and/or room.
421421
422-
Extracts the drawers to *keep*, nukes the palace directory, and
423-
re-inserts them into a fresh ChromaDB. This avoids HNSW ghost entries
424-
that ChromaDB's in-place ``collection.delete()`` leaves behind, which
425-
cause segfaults on subsequent queries or inserts.
426-
427-
Note: ``--room`` without ``--wing`` purges that room across ALL wings.
428-
429-
Not idempotent — running purge twice on the same criteria will fail the
430-
second time if the first run completed (nothing left to match). The
431-
backup directory is preserved for recovery.
422+
Uses ``collection.delete(where=...)`` — chromadb's filter-delete path
423+
doesn't go through ``updatePoint`` / ``repairConnectionsForUpdate``,
424+
which is the upsert-only race from #521 that an earlier draft of this
425+
command tried to side-step with a nuke-and-rebuild. The simpler path
426+
works without losing drawers if the process is interrupted, without
427+
re-embedding the survivors under a default model, and without
428+
bypassing the backend abstraction.
429+
430+
``--room`` without ``--wing`` purges that room across ALL wings.
431+
Not idempotent — running purge twice on the same criteria prints
432+
"No drawers found" the second time.
432433
"""
433-
import chromadb
434-
import shutil
434+
from .backends.chroma import ChromaBackend
435+
from .migrate import confirm_destructive_action, contains_palace_database
435436

436-
palace_path = os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path
437-
try:
438-
client = chromadb.PersistentClient(path=palace_path)
439-
col = client.get_collection("mempalace_drawers")
440-
except Exception:
437+
palace_path = os.path.abspath(
438+
os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path
439+
)
440+
441+
if not os.path.isdir(palace_path) or not contains_palace_database(palace_path):
441442
print(f"\n No palace found at {palace_path}")
442443
return
443444

444-
where = {}
445445
if args.wing and args.room:
446446
where = {"$and": [{"wing": args.wing}, {"room": args.room}]}
447447
elif args.wing:
@@ -452,83 +452,51 @@ def cmd_purge(args):
452452
print(" Error: specify --wing and/or --room")
453453
return
454454

455-
total = col.count()
455+
backend = ChromaBackend()
456+
try:
457+
col = backend.get_collection(palace_path, "mempalace_drawers")
458+
except Exception as e:
459+
print(f"\n Error reading palace: {e}")
460+
return
456461

457-
# Count matching drawers
458-
match_ids = set()
459-
offset = 0
460-
while True:
461-
batch = col.get(limit=10000, offset=offset, where=where, include=[])
462-
if not batch["ids"]:
463-
break
464-
match_ids.update(batch["ids"])
465-
offset += len(batch["ids"])
462+
# Probe match count via a `where=`-filtered get with no payload.
463+
# ChromaCollection.get returns the typed result; we only need ids.
464+
try:
465+
matched = col.get(where=where, include=[])
466+
except Exception as e:
467+
print(f"\n Error querying drawers: {e}")
468+
return
469+
470+
match_ids = matched.get("ids") if isinstance(matched, dict) else getattr(matched, "ids", [])
471+
match_ids = match_ids or []
472+
match_count = len(match_ids)
473+
474+
label_parts = []
475+
if args.wing:
476+
label_parts.append(f"wing={args.wing}")
477+
if args.room:
478+
label_parts.append(f"room={args.room}")
479+
label = " ".join(label_parts)
466480

467-
if not match_ids:
468-
label = f"wing={args.wing}" if args.wing else ""
469-
if args.room:
470-
label = f"{label} room={args.room}" if label else f"room={args.room}"
481+
if match_count == 0:
471482
print(f"\n No drawers found matching {label}\n")
472483
return
473484

474-
label = f"wing={args.wing}" if args.wing else ""
475-
if args.room:
476-
label = f"{label} room={args.room}" if label else f"room={args.room}"
477-
keep_count = total - len(match_ids)
478-
print(f"\n Found {len(match_ids):,} drawers matching {label}")
479-
print(f" Will keep {keep_count:,} drawers, rebuild index")
485+
print(f"\n Found {match_count:,} drawers matching {label}")
480486

481487
if not args.yes:
482-
confirm = input(f" Purge {len(match_ids):,} drawers? [y/N] ").strip().lower()
483-
if confirm not in ("y", "yes"):
484-
print(" Aborted.")
488+
if not confirm_destructive_action(f"Purge of {match_count:,} drawers", palace_path):
485489
return
486490

487-
# Extract drawers to keep (everything NOT matching the filter)
488-
print(" Extracting drawers to keep...")
489-
keep_ids, keep_docs, keep_metas = [], [], []
490-
offset = 0
491-
batch_size = 5000
492-
while offset < total:
493-
batch = col.get(limit=batch_size, offset=offset, include=["documents", "metadatas"])
494-
if not batch["ids"]:
495-
break
496-
for i, doc_id in enumerate(batch["ids"]):
497-
if doc_id not in match_ids:
498-
keep_ids.append(doc_id)
499-
keep_docs.append(batch["documents"][i])
500-
keep_metas.append(batch["metadatas"][i])
501-
offset += len(batch["ids"])
502-
print(f" Extracted {len(keep_ids):,} drawers to keep")
503-
504-
# Release client before nuking — ChromaDB holds open file handles
505-
# (WAL journal, HNSW mmap) that block rmtree on Windows and some Linux FS.
506-
del col, client
507-
508-
# Nuke and rebuild with clean HNSW index
509-
palace_path = palace_path.rstrip(os.sep)
510-
print(" Rebuilding palace...")
511-
shutil.rmtree(palace_path)
512-
os.makedirs(palace_path, mode=0o700)
513-
514-
new_client = chromadb.PersistentClient(path=palace_path)
515-
new_col = new_client.create_collection(
516-
"mempalace_drawers",
517-
metadata={"hnsw:space": "cosine", "hnsw:num_threads": 1},
518-
)
519-
520-
filed = 0
521-
for i in range(0, len(keep_ids), batch_size):
522-
end = min(i + batch_size, len(keep_ids))
523-
new_col.add(
524-
documents=keep_docs[i:end],
525-
ids=keep_ids[i:end],
526-
metadatas=keep_metas[i:end],
527-
)
528-
filed += end - i
529-
print(f" Re-filed {filed:,} / {len(keep_ids):,}...", flush=True)
491+
print(" Deleting matching drawers...")
492+
try:
493+
col.delete(where=where)
494+
except Exception as e:
495+
print(f"\n Delete failed: {e}\n")
496+
return
530497

531-
print(f"\n Purged {len(match_ids):,} drawers. Remaining: {new_col.count():,}\n")
498+
remaining = col.count()
499+
print(f"\n Purged {match_count:,} drawers. Remaining: {remaining:,}\n")
532500

533501

534502
def cmd_status(args):
@@ -1078,7 +1046,10 @@ def main():
10781046
"--yes", action="store_true", help="Skip confirmation for destructive changes"
10791047
)
10801048

1081-
p_purge = sub.add_parser("purge", help="Delete drawers by wing and/or room (rebuilds index)")
1049+
p_purge = sub.add_parser(
1050+
"purge",
1051+
help="Delete drawers by wing and/or room (filtered delete via chromadb)",
1052+
)
10821053
p_purge.add_argument("--wing", help="Wing to purge")
10831054
p_purge.add_argument("--room", help="Room to purge (without --wing, purges across ALL wings)")
10841055
p_purge.add_argument("--yes", "-y", action="store_true", help="Skip confirmation prompt")

tests/test_cli.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
cmd_init,
1515
cmd_instructions,
1616
cmd_mine,
17+
cmd_purge,
1718
cmd_repair,
1819
cmd_search,
1920
cmd_split,
@@ -48,6 +49,117 @@ def test_cmd_status_custom_palace(mock_config_cls):
4849
mock_miner.status.assert_called_once_with(palace_path=expected)
4950

5051

52+
# ── cmd_purge ──────────────────────────────────────────────────────────
53+
54+
55+
def _make_purge_args(**overrides):
56+
"""Build a Namespace with all purge args set."""
57+
defaults = {"palace": None, "wing": None, "room": None, "yes": True}
58+
defaults.update(overrides)
59+
return argparse.Namespace(**defaults)
60+
61+
62+
@patch("mempalace.cli.MempalaceConfig")
63+
def test_cmd_purge_no_palace_found(mock_config_cls, capsys, tmp_path):
64+
"""Purge prints a clear message when the palace doesn't exist."""
65+
missing = tmp_path / "nonexistent"
66+
mock_config_cls.return_value.palace_path = str(missing)
67+
args = _make_purge_args(wing="any", palace=str(missing))
68+
cmd_purge(args)
69+
out = capsys.readouterr().out
70+
assert "No palace found" in out
71+
72+
73+
@patch("mempalace.cli.MempalaceConfig")
74+
def test_cmd_purge_requires_filter(mock_config_cls, capsys, tmp_path):
75+
"""Purge refuses to run without --wing or --room (no mass-delete safety valve)."""
76+
palace = tmp_path / "palace"
77+
palace.mkdir()
78+
(palace / "chroma.sqlite3").write_text("")
79+
mock_config_cls.return_value.palace_path = str(palace)
80+
args = _make_purge_args(palace=str(palace)) # no wing, no room
81+
cmd_purge(args)
82+
out = capsys.readouterr().out
83+
assert "Error: specify --wing and/or --room" in out
84+
85+
86+
@patch("mempalace.cli.MempalaceConfig")
87+
def test_cmd_purge_no_matches(mock_config_cls, capsys, tmp_path):
88+
"""When the filter matches zero drawers, purge exits cleanly."""
89+
palace = tmp_path / "palace"
90+
palace.mkdir()
91+
(palace / "chroma.sqlite3").write_text("")
92+
mock_config_cls.return_value.palace_path = str(palace)
93+
args = _make_purge_args(wing="empty-wing", palace=str(palace))
94+
95+
mock_col = MagicMock()
96+
mock_col.get.return_value = {"ids": []}
97+
mock_backend = MagicMock()
98+
mock_backend.return_value.get_collection.return_value = mock_col
99+
with patch("mempalace.backends.chroma.ChromaBackend", mock_backend):
100+
cmd_purge(args)
101+
out = capsys.readouterr().out
102+
assert "No drawers found matching" in out
103+
mock_col.delete.assert_not_called()
104+
105+
106+
@patch("mempalace.cli.MempalaceConfig")
107+
def test_cmd_purge_wing_and_room_uses_and_filter(mock_config_cls, tmp_path):
108+
"""Purge builds a $and filter when both --wing and --room are set."""
109+
palace = tmp_path / "palace"
110+
palace.mkdir()
111+
(palace / "chroma.sqlite3").write_text("")
112+
mock_config_cls.return_value.palace_path = str(palace)
113+
args = _make_purge_args(wing="myproj", room="drafts", palace=str(palace))
114+
115+
mock_col = MagicMock()
116+
mock_col.get.return_value = {"ids": []}
117+
mock_backend = MagicMock()
118+
mock_backend.return_value.get_collection.return_value = mock_col
119+
with patch("mempalace.backends.chroma.ChromaBackend", mock_backend):
120+
cmd_purge(args)
121+
first_call = mock_col.get.call_args_list[0]
122+
assert first_call.kwargs["where"] == {"$and": [{"wing": "myproj"}, {"room": "drafts"}]}
123+
124+
125+
def test_cmd_purge_deletes_via_where_clause(tmp_path):
126+
"""End-to-end: purge filters via collection.delete(where=...) — preserves
127+
embedding function, no rmtree, no rebuild. Regression for igorls' #1087
128+
review concerns 1, 2, 3."""
129+
palace = tmp_path / "palace"
130+
palace.mkdir()
131+
# Real chromadb palace via the backend so the embedding function is
132+
# whatever the backend resolves (the actual concern from the review).
133+
from mempalace.backends.chroma import ChromaBackend
134+
135+
backend = ChromaBackend()
136+
col = backend.get_collection(str(palace), "mempalace_drawers", create=True)
137+
col.add(
138+
ids=["k1", "k2", "p1", "p2"],
139+
documents=["keep one", "keep two", "purge one", "purge two"],
140+
metadatas=[
141+
{"wing": "keep", "room": "r"},
142+
{"wing": "keep", "room": "r"},
143+
{"wing": "purge-me", "room": "r"},
144+
{"wing": "purge-me", "room": "r"},
145+
],
146+
)
147+
assert col.count() == 4
148+
149+
args = _make_purge_args(wing="purge-me", palace=str(palace))
150+
with patch("mempalace.cli.MempalaceConfig") as mock_config_cls:
151+
mock_config_cls.return_value.palace_path = str(palace)
152+
cmd_purge(args)
153+
154+
# Re-open through the backend to confirm survivors and that the index
155+
# still works (no nuke, embedding function intact).
156+
col2 = backend.get_collection(str(palace), "mempalace_drawers", create=False)
157+
assert col2.count() == 2
158+
surviving = col2.get(include=["metadatas"])
159+
surviving_ids = surviving.get("ids") if isinstance(surviving, dict) else surviving.ids
160+
assert set(surviving_ids) == {"k1", "k2"}
161+
162+
51163
# ── cmd_search ─────────────────────────────────────────────────────────
52164

53165

0 commit comments

Comments
 (0)