Skip to content

Commit c8b8a46

Browse files
author
Scorpion
committed
fix(chroma): split get_or_create_collection to prevent reopen SIGSEGV (upstream MemPalace#1262)
1 parent 74eb75e commit c8b8a46

2 files changed

Lines changed: 75 additions & 6 deletions

File tree

mempalace/backends/chroma.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from typing import Any, Optional
99

1010
import chromadb
11+
from chromadb.errors import NotFoundError as _ChromaNotFoundError
1112

1213
from .base import (
1314
BaseBackend,
@@ -759,13 +760,19 @@ def get_collection(
759760
ef_kwargs = {}
760761
if embedding_function is not None:
761762
ef_kwargs["embedding_function"] = embedding_function
762-
763763
if create:
764-
collection = client.get_or_create_collection(
765-
collection_name,
766-
metadata={"hnsw:space": hnsw_space, **_HNSW_BLOAT_GUARD},
767-
**ef_kwargs,
768-
)
764+
try:
765+
collection = client.get_collection(collection_name, **ef_kwargs)
766+
except _ChromaNotFoundError:
767+
collection = client.create_collection(
768+
collection_name,
769+
metadata={
770+
"hnsw:space": hnsw_space,
771+
"hnsw:num_threads": 1,
772+
**_HNSW_BLOAT_GUARD,
773+
},
774+
**ef_kwargs,
775+
)
769776
else:
770777
collection = client.get_collection(collection_name, **ef_kwargs)
771778
return ChromaCollection(

tests/test_backends.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,68 @@ def test_chroma_backend_creates_collection_with_cosine_distance(tmp_path):
334334
assert col.metadata.get("hnsw:space") == "cosine"
335335

336336

337+
def test_chroma_backend_sets_hnsw_bloat_guard_on_creation(tmp_path):
338+
"""The HNSW guard from #344 must land on freshly-created collection metadata.
339+
340+
Without batch_size + sync_threshold, mining ~10K+ drawers triggers the
341+
resize+persist drift that bloats link_lists.bin into hundreds of GB sparse
342+
and segfaults `status` / `search` / `repair`. The guard belongs at
343+
collection-creation time so every fresh palace gets it without needing
344+
a runtime retrofit. Asserting both keys land on the persisted metadata
345+
also covers the #1161 "config silently dropped" concern at CI time.
346+
"""
347+
palace_path = tmp_path / "palace"
348+
349+
ChromaBackend().get_collection(
350+
str(palace_path),
351+
collection_name="mempalace_drawers",
352+
create=True,
353+
)
354+
355+
client = chromadb.PersistentClient(path=str(palace_path))
356+
col = client.get_collection("mempalace_drawers")
357+
assert col.metadata.get("hnsw:batch_size") == 50_000
358+
assert col.metadata.get("hnsw:sync_threshold") == 50_000
359+
360+
361+
def test_chroma_backend_create_collection_sets_hnsw_bloat_guard(tmp_path):
362+
"""Same guard must apply via the legacy create_collection() path."""
363+
palace_path = tmp_path / "palace"
364+
365+
ChromaBackend().create_collection(str(palace_path), "mempalace_drawers")
366+
367+
client = chromadb.PersistentClient(path=str(palace_path))
368+
col = client.get_collection("mempalace_drawers")
369+
assert col.metadata.get("hnsw:batch_size") == 50_000
370+
assert col.metadata.get("hnsw:sync_threshold") == 50_000
371+
372+
373+
def test_get_collection_create_true_is_idempotent(tmp_path):
374+
"""Calling get_collection(create=True) twice on the same name must not crash.
375+
376+
ChromaDB 1.5.x's Rust bindings SIGSEGV when get_or_create_collection is
377+
called with metadata that differs from the stored collection metadata. The
378+
fix splits the call into get_collection -> fallback create_collection so the
379+
metadata-comparison codepath in chromadb_rust_bindings is never reached for
380+
existing collections. Regression guard for issue #1089.
381+
"""
382+
palace = str(tmp_path / "palace")
383+
backend = ChromaBackend()
384+
backend.get_collection(palace, collection_name="mempalace_drawers", create=True)
385+
col2 = backend.get_collection(palace, collection_name="mempalace_drawers", create=True)
386+
assert isinstance(col2, ChromaCollection)
387+
388+
389+
def test_get_collection_create_true_preserves_existing_metadata(tmp_path):
390+
"""Existing collection metadata is not overwritten when reopened with create=True."""
391+
palace = str(tmp_path / "palace")
392+
backend = ChromaBackend()
393+
backend.get_collection(palace, collection_name="mempalace_drawers", create=True)
394+
col = backend.get_collection(palace, collection_name="mempalace_drawers", create=True)
395+
assert col._collection.metadata["hnsw:space"] == "cosine"
396+
assert col._collection.metadata.get("hnsw:batch_size") == 50_000
397+
398+
337399
def test_fix_blob_seq_ids_converts_blobs_to_integers(tmp_path):
338400
"""Simulate a ChromaDB 0.6.x database with BLOB seq_ids and verify repair."""
339401
db_path = tmp_path / "chroma.sqlite3"

0 commit comments

Comments
 (0)