Skip to content

Commit 7538299

Browse files
funguf and robot-rocket-science
committed
fix: prevent HNSW index bloat via batch_size + sync_threshold metadata
Sets `hnsw:batch_size` and `hnsw:sync_threshold` to 50_000 on collection creation in both `get_collection(..., create=True)` and the legacy `create_collection()` path. Preserves existing `hnsw:space` and `hnsw:num_threads=1` (race fix from MemPalace#976) and the `**ef_kwargs` plumbing for embedding-function injection (perf fix from MemPalace#1148/a4868a3).

Without these defaults, mining ~10K+ drawers triggers ~30 HNSW index resizes and hundreds of persistDirty() calls. persistDirty uses relative seek positioning in link_lists.bin; accumulated seek drift across resize cycles causes the OS to extend the sparse file with zero-filled regions, each cycle compounding the next. Result: link_lists.bin grows into hundreds of GB sparse, after which `status`, `search`, and `repair` all segfault and the palace is unrecoverable.

Empirical: rebuilt a palace from scratch on 39,792 drawers across 5 wings with this fix applied. Final palace 376 MB, link_lists.bin stays at 0 bytes across both Chroma collection dirs, status and search both return cleanly. Same workload without the fix bloated the palace to 565 GB sparse (30 GB on disk) and segfaulted at ~15K drawers.

Migration note: chromadb treats HNSW config as immutable post-creation, so existing bloated palaces still need to be nuked and re-mined; this only protects fresh collections.

Tests assert both keys land on the persisted collection metadata in both code paths, which also covers the MemPalace#1161 "config silently dropped" concern at CI time.

Closes MemPalace#344
Supersedes MemPalace#346

Co-authored-by: robot-rocket-science <robot-rocket-science@users.noreply.github.com>
1 parent 0d9929c commit 7538299

2 files changed

Lines changed: 69 additions & 2 deletions

File tree

mempalace/backends/chroma.py

Lines changed: 33 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,29 @@
2727
_OPTIONAL_OPERATORS = frozenset({"$gt", "$gte", "$lt", "$lte"})
2828
_SUPPORTED_OPERATORS = _REQUIRED_OPERATORS | _OPTIONAL_OPERATORS
2929

30+
# HNSW tuning to prevent link_lists.bin bloat on large mines (#344).
31+
#
32+
# With default params (batch_size=100, sync_threshold=1000, initial capacity
33+
# 1000), inserting tens of thousands of drawers triggers ~30 index resizes
34+
# and hundreds of persistDirty() calls. persistDirty uses relative seek
35+
# positioning in link_lists.bin; accumulated seek drift across resize cycles
36+
# causes the OS to extend the sparse file with zero-filled regions, each
37+
# cycle compounding the next. Result: link_lists.bin grows into hundreds of
38+
# GB sparse, after which `status`/`search`/`repair` segfault.
39+
#
40+
# Setting large batch and sync thresholds at collection creation defers
41+
# persistence until a single large batch completes, breaking the resize+
42+
# persist feedback loop. Empirically validated on a 39,792-drawer rebuild
43+
# (palace 376 MB, link_lists.bin 0 bytes, no segfault) in 2026-04.
44+
#
45+
# Note: chromadb treats HNSW config as immutable post-creation. Existing
46+
# bloated palaces still need to be nuked and re-mined; this only protects
47+
# fresh collections.
48+
_HNSW_BLOAT_GUARD = {
49+
"hnsw:batch_size": 50_000,
50+
"hnsw:sync_threshold": 50_000,
51+
}
52+
3053

3154
def _validate_where(where: Optional[dict]) -> None:
3255
"""Scan a where-clause for unknown operators and raise ``UnsupportedFilterError``.
@@ -596,7 +619,11 @@ def get_collection(
596619
if create:
597620
collection = client.get_or_create_collection(
598621
collection_name,
599-
metadata={"hnsw:space": hnsw_space, "hnsw:num_threads": 1},
622+
metadata={
623+
"hnsw:space": hnsw_space,
624+
"hnsw:num_threads": 1,
625+
**_HNSW_BLOAT_GUARD,
626+
},
600627
**ef_kwargs,
601628
)
602629
else:
@@ -646,7 +673,11 @@ def create_collection(
646673
ef_kwargs = {"embedding_function": ef} if ef is not None else {}
647674
collection = self._client(palace_path).create_collection(
648675
collection_name,
649-
metadata={"hnsw:space": hnsw_space, "hnsw:num_threads": 1},
676+
metadata={
677+
"hnsw:space": hnsw_space,
678+
"hnsw:num_threads": 1,
679+
**_HNSW_BLOAT_GUARD,
680+
},
650681
**ef_kwargs,
651682
)
652683
return ChromaCollection(collection)

tests/test_backends.py

Lines changed: 36 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -335,6 +335,42 @@ def test_chroma_backend_creates_collection_with_cosine_distance(tmp_path):
335335
assert col.metadata.get("hnsw:space") == "cosine"
336336

337337

338+
def test_chroma_backend_sets_hnsw_bloat_guard_on_creation(tmp_path):
339+
"""The HNSW guard from #344 must land on freshly-created collection metadata.
340+
341+
Without batch_size + sync_threshold, mining ~10K+ drawers triggers the
342+
resize+persist drift that bloats link_lists.bin into hundreds of GB sparse
343+
and segfaults `status` / `search` / `repair`. ChromaDB treats HNSW config
344+
as immutable post-creation, so the only place to set this is at create
345+
time. Asserting both keys land on the persisted metadata also covers the
346+
#1161 "config silently dropped" concern at CI time.
347+
"""
348+
palace_path = tmp_path / "palace"
349+
350+
ChromaBackend().get_collection(
351+
str(palace_path),
352+
collection_name="mempalace_drawers",
353+
create=True,
354+
)
355+
356+
client = chromadb.PersistentClient(path=str(palace_path))
357+
col = client.get_collection("mempalace_drawers")
358+
assert col.metadata.get("hnsw:batch_size") == 50_000
359+
assert col.metadata.get("hnsw:sync_threshold") == 50_000
360+
361+
362+
def test_chroma_backend_create_collection_sets_hnsw_bloat_guard(tmp_path):
363+
"""Same guard must apply via the legacy create_collection() path."""
364+
palace_path = tmp_path / "palace"
365+
366+
ChromaBackend().create_collection(str(palace_path), "mempalace_drawers")
367+
368+
client = chromadb.PersistentClient(path=str(palace_path))
369+
col = client.get_collection("mempalace_drawers")
370+
assert col.metadata.get("hnsw:batch_size") == 50_000
371+
assert col.metadata.get("hnsw:sync_threshold") == 50_000
372+
373+
338374
def test_fix_blob_seq_ids_converts_blobs_to_integers(tmp_path):
339375
"""Simulate a ChromaDB 0.6.x database with BLOB seq_ids and verify repair."""
340376
db_path = tmp_path / "chroma.sqlite3"

0 commit comments

Comments
 (0)