Skip to content

Commit 43d728d

Browse files
jpheinclaude
andcommitted
refactor(backends/chroma): coerce None metadatas at the boundary (cherry-pick of MemPalace#1094)
Fork main was carrying the per-site `meta = meta or {}` guards from MemPalace#999 in eight read paths but didn't have the boundary coercion that closes the issue once for all callers. Cherry-picked MemPalace#1094 (open upstream, jp-authored) so the typed QueryResult/GetResult contract — both declare `metadatas: list[dict]`, never `list[Optional[dict]]` — is honored at the chromadb adapter boundary. Three same-family fork-ahead PRs we filed since (MemPalace#1198 _tokenize None-doc, MemPalace#1201 palace_graph None metadata, MemPalace#1199 review) all pointed at gaps that would have been impossible if this pattern had been in place. Future None-metadata bugs of the same shape are now catchable at one site instead of N. Per-site guards are kept as belt-and-suspenders for paths that might bypass the typed wrappers (e.g. legacy `import chromadb` direct calls). A future cleanup PR can remove them once we're confident no caller reaches chromadb directly. 6 new tests in test_backends.py (MemPalace#1094 originals); composes cleanly with fork main's _segment_appears_healthy + _quarantined_paths from earlier today. Full suite 1383/1383. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 366a9ad commit 43d728d

2 files changed

Lines changed: 123 additions & 1 deletion

File tree

mempalace/backends/chroma.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -480,10 +480,21 @@ def query(
480480
def _none_list_to_empty(outer):
481481
return [(inner or []) for inner in outer]
482482

483+
# Coerce individual None metadata dicts to {} at the backend
484+
# boundary. ChromaDB 1.5.x can return None inside metadatas[i][j]
485+
# even when the write stored a dict — #999, #1013, and other
486+
# fixes added per-site `meta = meta or {}` guards scattered
487+
# across searcher.py, layers.py, and miner.py to tolerate this.
488+
# Handling the coercion once here means every downstream caller
489+
# receives a guaranteed list[dict], matching the type contract
490+
# QueryResult.metadatas declares.
491+
def _coerce_none_metas(outer):
492+
return [[(m if m is not None else {}) for m in (inner or [])] for inner in outer]
493+
483494
return QueryResult(
484495
ids=_none_list_to_empty(ids),
485496
documents=_none_list_to_empty(documents),
486-
metadatas=_none_list_to_empty(metadatas),
497+
metadatas=_coerce_none_metas(metadatas),
487498
distances=_none_list_to_empty(distances),
488499
embeddings=(
489500
[list(inner) for inner in embeddings_raw]
@@ -538,6 +549,16 @@ def get(
538549
if spec.metadatas and len(out_metas) < len(out_ids):
539550
out_metas = out_metas + [{}] * (len(out_ids) - len(out_metas))
540551

552+
# Coerce any individual None metadata dict to {} at the backend
553+
# boundary. Same rationale as the parallel path in query():
554+
# chromadb 1.5.x can return None inside the metadatas list even
555+
# when writes stored a dict, and per-site `meta = meta or {}`
556+
# guards across #999 / #1013 compensated. Doing this once here
557+
# means callers receive a guaranteed list[dict], matching the
558+
# type contract GetResult.metadatas declares.
559+
if spec.metadatas:
560+
out_metas = [(m if m is not None else {}) for m in out_metas]
561+
541562
return GetResult(
542563
ids=out_ids,
543564
documents=out_docs,

tests/test_backends.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,107 @@ def test_chroma_collection_rejects_unknown_where_operator():
129129
collection.query(query_texts=["q"], where={"$regex": "foo"})
130130

131131

132+
def test_chroma_collection_query_coerces_none_metadatas_to_empty_dict():
133+
"""ChromaDB 1.5.x can return None inside metadatas[i][j] even when
134+
the write stored a dict. The backend boundary coerces each None to
135+
{} so downstream callers receive a guaranteed list[dict]. Prevents
136+
`AttributeError: 'NoneType' object has no attribute 'get'` at every
137+
read site (searcher.py, layers.py, miner.status, etc.)."""
138+
raw = {
139+
"ids": [["a", "b", "c"]],
140+
"documents": [["da", "db", "dc"]],
141+
"metadatas": [[{"wing": "w1"}, None, {"wing": "w3"}]],
142+
"distances": [[0.1, 0.2, 0.3]],
143+
}
144+
collection = ChromaCollection(_FakeCollection(query_response=raw))
145+
146+
result = collection.query(query_texts=["q"])
147+
148+
assert result.metadatas == [[{"wing": "w1"}, {}, {"wing": "w3"}]]
149+
# Ids/docs/distances unaffected — their inner None handling is already
150+
# covered by the outer-list coercion.
151+
assert result.ids == [["a", "b", "c"]]
152+
assert result.distances == [[0.1, 0.2, 0.3]]
153+
154+
155+
def test_chroma_collection_query_coerces_all_none_inner_metadatas():
156+
"""Degenerate case: every metadata dict in the batch is None. Each
157+
position still yields {} rather than leaving Nones for callers."""
158+
raw = {
159+
"ids": [["a", "b"]],
160+
"documents": [["da", "db"]],
161+
"metadatas": [[None, None]],
162+
"distances": [[0.1, 0.2]],
163+
}
164+
collection = ChromaCollection(_FakeCollection(query_response=raw))
165+
166+
result = collection.query(query_texts=["q"])
167+
168+
assert result.metadatas == [[{}, {}]]
169+
170+
171+
def test_chroma_collection_query_coerces_none_across_multiple_queries():
172+
"""Two queries in one call, None dicts mixed through both."""
173+
raw = {
174+
"ids": [["a", "b"], ["c", "d"]],
175+
"documents": [["da", "db"], ["dc", "dd"]],
176+
"metadatas": [[{"wing": "w1"}, None], [None, {"wing": "w4"}]],
177+
"distances": [[0.1, 0.2], [0.3, 0.4]],
178+
}
179+
collection = ChromaCollection(_FakeCollection(query_response=raw))
180+
181+
result = collection.query(query_texts=["q1", "q2"])
182+
183+
assert result.metadatas == [[{"wing": "w1"}, {}], [{}, {"wing": "w4"}]]
184+
185+
186+
def test_chroma_collection_get_coerces_none_metadatas_to_empty_dict():
187+
"""Same None-coercion guarantee on the get() path. Same rationale as
188+
query(): callers zip ids with metadatas and reach `.get("wing")`
189+
without first checking for None."""
190+
raw = {
191+
"ids": ["a", "b", "c"],
192+
"documents": ["da", "db", "dc"],
193+
"metadatas": [{"wing": "w1"}, None, {"wing": "w3"}],
194+
}
195+
collection = ChromaCollection(_FakeCollection(get_response=raw))
196+
197+
result = collection.get()
198+
199+
assert result.metadatas == [{"wing": "w1"}, {}, {"wing": "w3"}]
200+
201+
202+
def test_chroma_collection_get_coerces_padding_remains_dict():
203+
"""Padding short metadata lists to match ids length already uses {}.
204+
Regression guard: the new None-coercion step must not convert those
205+
padded {} into something else or re-introduce Nones."""
206+
raw = {
207+
"ids": ["a", "b", "c"],
208+
"documents": ["da", "db", "dc"],
209+
"metadatas": [{"wing": "w1"}], # short — needs 2 padded {}
210+
}
211+
collection = ChromaCollection(_FakeCollection(get_response=raw))
212+
213+
result = collection.get()
214+
215+
assert result.metadatas == [{"wing": "w1"}, {}, {}]
216+
217+
218+
def test_chroma_collection_get_without_metadatas_requested_stays_empty():
219+
"""When the caller omits metadatas from include=, the boundary must
220+
not fabricate coerced dicts — empty list in, empty list out."""
221+
raw = {
222+
"ids": ["a"],
223+
"documents": ["da"],
224+
"metadatas": [{"wing": "w1"}], # present in raw but not requested
225+
}
226+
collection = ChromaCollection(_FakeCollection(get_response=raw))
227+
228+
result = collection.get(include=["documents"])
229+
230+
assert result.metadatas == []
231+
232+
132233
def test_chroma_collection_delegates_writes():
133234
fake = _FakeCollection()
134235
collection = ChromaCollection(fake)

0 commit comments

Comments
 (0)