@@ -334,6 +334,68 @@ def test_chroma_backend_creates_collection_with_cosine_distance(tmp_path):
334334 assert col .metadata .get ("hnsw:space" ) == "cosine"
335335
336336
337+ def test_chroma_backend_sets_hnsw_bloat_guard_on_creation (tmp_path ):
338+ """The HNSW guard from #344 must land on freshly-created collection metadata.
339+
340+ Without batch_size + sync_threshold, mining ~10K+ drawers triggers the
341+ resize+persist drift that bloats link_lists.bin into hundreds of GB sparse
342+ and segfaults `status` / `search` / `repair`. The guard belongs at
343+ collection-creation time so every fresh palace gets it without needing
344+ a runtime retrofit. Asserting both keys land on the persisted metadata
345+ also covers the #1161 "config silently dropped" concern at CI time.
346+ """
347+ palace_path = tmp_path / "palace"
348+
349+ ChromaBackend ().get_collection (
350+ str (palace_path ),
351+ collection_name = "mempalace_drawers" ,
352+ create = True ,
353+ )
354+
355+ client = chromadb .PersistentClient (path = str (palace_path ))
356+ col = client .get_collection ("mempalace_drawers" )
357+ assert col .metadata .get ("hnsw:batch_size" ) == 50_000
358+ assert col .metadata .get ("hnsw:sync_threshold" ) == 50_000
359+
360+
361+ def test_chroma_backend_create_collection_sets_hnsw_bloat_guard (tmp_path ):
362+ """Same guard must apply via the legacy create_collection() path."""
363+ palace_path = tmp_path / "palace"
364+
365+ ChromaBackend ().create_collection (str (palace_path ), "mempalace_drawers" )
366+
367+ client = chromadb .PersistentClient (path = str (palace_path ))
368+ col = client .get_collection ("mempalace_drawers" )
369+ assert col .metadata .get ("hnsw:batch_size" ) == 50_000
370+ assert col .metadata .get ("hnsw:sync_threshold" ) == 50_000
371+
372+
373+ def test_get_collection_create_true_is_idempotent (tmp_path ):
374+ """Calling get_collection(create=True) twice on the same name must not crash.
375+
376+ ChromaDB 1.5.x's Rust bindings SIGSEGV when get_or_create_collection is
377+ called with metadata that differs from the stored collection metadata. The
378+ fix splits the call into get_collection -> fallback create_collection so the
379+ metadata-comparison codepath in chromadb_rust_bindings is never reached for
380+ existing collections. Regression guard for issue #1089.
381+ """
382+ palace = str (tmp_path / "palace" )
383+ backend = ChromaBackend ()
384+ backend .get_collection (palace , collection_name = "mempalace_drawers" , create = True )
385+ col2 = backend .get_collection (palace , collection_name = "mempalace_drawers" , create = True )
386+ assert isinstance (col2 , ChromaCollection )
387+
388+
389+ def test_get_collection_create_true_preserves_existing_metadata (tmp_path ):
390+ """Existing collection metadata is not overwritten when reopened with create=True."""
391+ palace = str (tmp_path / "palace" )
392+ backend = ChromaBackend ()
393+ backend .get_collection (palace , collection_name = "mempalace_drawers" , create = True )
394+ col = backend .get_collection (palace , collection_name = "mempalace_drawers" , create = True )
395+ assert col ._collection .metadata ["hnsw:space" ] == "cosine"
396+ assert col ._collection .metadata .get ("hnsw:batch_size" ) == 50_000
397+
398+
337399def test_fix_blob_seq_ids_converts_blobs_to_integers (tmp_path ):
338400 """Simulate a ChromaDB 0.6.x database with BLOB seq_ids and verify repair."""
339401 db_path = tmp_path / "chroma.sqlite3"
0 commit comments