Skip to content

Commit db333f2

Browse files
committed
fix(searcher): wire i18n stop_words through BM25 fallback / union path
MemPalace#1306 added a second BM25 scoring site inside _bm25_only_via_sqlite that this PR did not cover -- the original wiring landed before MemPalace#1306 existed. Without the plumbing, two paths silently bypass locale stop_words even when the palace has MEMPALACE_LANG set: - vector_disabled=True (MemPalace#1222 fallback): BM25-only scoring runs without filtering. - candidate_strategy="union": BM25 candidates merged into the rerank pool come from a tokenizer that ignored the configured locale, so the merged-in entries fight the lang-aware _hybrid_rank rerank. Resolution: stop_words = _resolve_stop_words(lang) is resolved once, hoisted before the vector_disabled branch, and threaded through _apply_candidate_strategy, _merge_bm25_union_candidates, and _bm25_only_via_sqlite into _bm25_scores. The FTS5 candidate-selection _tokenize at the top of _bm25_only_via_sqlite is left untouched -- chromadb's FTS5 index is built with a trigram tokenizer that already mismatches our \w{2,} regex, so dropping further tokens there would change selection semantics in ways outside this PR's scope. Three tests pin the propagation: a sqlite-backed test for _bm25_only_via_sqlite -> _bm25_scores, a unit spy on _apply_candidate_strategy -> merger, and an end-to-end test on search_memories(vector_disabled=True) -> _bm25_only_via_sqlite.
1 parent 30138b5 commit db333f2

2 files changed

Lines changed: 127 additions & 4 deletions

File tree

mempalace/searcher.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -433,6 +433,7 @@ def _bm25_only_via_sqlite(
433433
max_candidates: int = 500,
434434
_include_internal: bool = False,
435435
collection_name: str = None,
436+
stop_words: frozenset = frozenset(),
436437
) -> dict:
437438
"""BM25-only search reading drawers directly from chroma.sqlite3.
438439
@@ -637,7 +638,7 @@ def _metadata_filter_sql(row_id_expr: str) -> tuple[str, list[str]]:
637638

638639
# Local BM25 over the candidate set.
639640
docs = [c["text"] for c in candidates]
640-
bm25_raw = _bm25_scores(query, docs)
641+
bm25_raw = _bm25_scores(query, docs, stop_words=stop_words)
641642
max_bm25 = max(bm25_raw) if bm25_raw else 0.0
642643
for c, raw in zip(candidates, bm25_raw):
643644
c["bm25_score"] = round(raw, 3)
@@ -671,6 +672,7 @@ def _merge_bm25_union_candidates(
671672
room: str,
672673
n_results: int,
673674
max_distance: float = 0.0,
675+
stop_words: frozenset = frozenset(),
674676
) -> None:
675677
"""Append top-K BM25-only candidates from sqlite into ``hits`` in place.
676678
@@ -705,6 +707,7 @@ def _merge_bm25_union_candidates(
705707
room=room,
706708
n_results=n_results * 3,
707709
_include_internal=True,
710+
stop_words=stop_words,
708711
).get("results", [])
709712
except Exception:
710713
logger.debug("candidate_strategy=union: BM25 fetch failed", exc_info=True)
@@ -763,6 +766,7 @@ def _apply_candidate_strategy(
763766
room: str,
764767
n_results: int,
765768
max_distance: float = 0.0,
769+
stop_words: frozenset = frozenset(),
766770
) -> None:
767771
"""Dispatch to the registered merger for ``strategy``.
768772
@@ -771,7 +775,16 @@ def _apply_candidate_strategy(
771775
"""
772776
merger = _CANDIDATE_MERGERS[strategy]
773777
if merger is not None:
774-
merger(hits, query, palace_path, wing, room, n_results, max_distance=max_distance)
778+
merger(
779+
hits,
780+
query,
781+
palace_path,
782+
wing,
783+
room,
784+
n_results,
785+
max_distance=max_distance,
786+
stop_words=stop_words,
787+
)
775788

776789

777790
def search_memories(
@@ -834,6 +847,11 @@ def search_memories(
834847
# the BM25-only fallback below.
835848
_validate_candidate_strategy(candidate_strategy)
836849

850+
# Resolve stop words once up-front so every BM25 site (the vector path's
851+
# `_hybrid_rank`, the `vector_disabled` fallback, and the union-merge
852+
# candidate gather) tokenizes against the same locale.
853+
stop_words = _resolve_stop_words(lang)
854+
837855
if vector_disabled:
838856
return _bm25_only_via_sqlite(
839857
query,
@@ -842,10 +860,9 @@ def search_memories(
842860
room=room,
843861
n_results=n_results,
844862
collection_name=collection_name,
863+
stop_words=stop_words,
845864
)
846865

847-
stop_words = _resolve_stop_words(lang)
848-
849866
try:
850867
drawers_col = get_collection(palace_path, collection_name=collection_name, create=False)
851868
except Exception as e:
@@ -1036,6 +1053,7 @@ def search_memories(
10361053
room,
10371054
n_results,
10381055
max_distance=max_distance,
1056+
stop_words=stop_words,
10391057
)
10401058

10411059
# BM25 hybrid re-rank within the final candidate set, then trim back

tests/test_searcher.py

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
plus mock-based tests for error paths.
66
"""
77

8+
import sqlite3
89
from unittest.mock import MagicMock, patch
910

1011
import pytest
@@ -522,3 +523,107 @@ class FakeCfgJa:
522523
assert (
523524
"した" in second
524525
), f"cache pinned stale empty set for None after config change; got {second!r}"
526+
527+
528+
# ── stop_words propagation through BM25-only / union-merge paths (post-#1306) ──
529+
#
530+
# #1306 added a second BM25 scoring site inside `_bm25_only_via_sqlite` that
531+
# the original PR didn't cover (it landed on develop after this branch was
532+
# opened). These tests pin the propagation chain so the BM25 fallback and the
533+
# `candidate_strategy="union"` merge tokenize at the same locale as
534+
# `_hybrid_rank`.
535+
536+
537+
def test_bm25_only_via_sqlite_forwards_stop_words_to_bm25_scores(monkeypatch, tmp_path):
538+
"""`_bm25_only_via_sqlite` must pass `stop_words` into `_bm25_scores`.
539+
540+
Without this, vector-disabled (#1222) palaces silently lose stop-word
541+
filtering on the BM25 fallback path.
542+
"""
543+
from mempalace import searcher
544+
545+
captured = {}
546+
real_bm25 = searcher._bm25_scores
547+
548+
def _spy(query, docs, **kwargs):
549+
captured["stop_words"] = kwargs.get("stop_words")
550+
return real_bm25(query, docs, **kwargs)
551+
552+
monkeypatch.setattr(searcher, "_bm25_scores", _spy)
553+
554+
# Build a minimal chroma.sqlite3 with one matching drawer so the scoring
555+
# path is reached. Schema mirrors what `_bm25_only_via_sqlite` reads.
556+
db = tmp_path / "chroma.sqlite3"
557+
conn = sqlite3.connect(db)
558+
conn.executescript(
559+
"""
560+
CREATE VIRTUAL TABLE embedding_fulltext_search USING fts5(string_value, tokenize='trigram');
561+
CREATE TABLE embedding_metadata (id INTEGER, key TEXT, string_value TEXT, int_value INTEGER);
562+
INSERT INTO embedding_fulltext_search (rowid, string_value) VALUES (1, 'the cat sat');
563+
INSERT INTO embedding_metadata VALUES (1, 'chroma:document', 'the cat sat', NULL);
564+
INSERT INTO embedding_metadata VALUES (1, 'wing', 'general', NULL);
565+
INSERT INTO embedding_metadata VALUES (1, 'room', 'inbox', NULL);
566+
INSERT INTO embedding_metadata VALUES (1, 'source_file', '/x/cat.md', NULL);
567+
INSERT INTO embedding_metadata VALUES (1, 'filed_at', '2026-05-03', NULL);
568+
"""
569+
)
570+
conn.commit()
571+
conn.close()
572+
573+
searcher._bm25_only_via_sqlite(
574+
"the cat",
575+
str(tmp_path),
576+
n_results=5,
577+
stop_words=frozenset({"the"}),
578+
)
579+
580+
assert captured["stop_words"] == frozenset({"the"})
581+
582+
583+
def test_apply_candidate_strategy_forwards_stop_words_to_merger(monkeypatch):
584+
"""`_apply_candidate_strategy` must forward `stop_words` to the merger."""
585+
from mempalace import searcher
586+
587+
captured = {}
588+
589+
def _stub_merger(hits, query, palace, wing, room, n, **kwargs):
590+
captured.update(kwargs)
591+
592+
monkeypatch.setitem(searcher._CANDIDATE_MERGERS, "union", _stub_merger)
593+
594+
searcher._apply_candidate_strategy(
595+
"union",
596+
[],
597+
"q",
598+
"/p",
599+
None,
600+
None,
601+
n_results=5,
602+
max_distance=0.0,
603+
stop_words=frozenset({"a", "an"}),
604+
)
605+
606+
assert captured["stop_words"] == frozenset({"a", "an"})
607+
608+
609+
def test_search_memories_vector_disabled_uses_resolved_stop_words(monkeypatch, tmp_path):
610+
"""`vector_disabled=True` must route `_resolve_stop_words(lang)` into the
611+
BM25 fallback, not skip stop-word resolution as it did pre-fix."""
612+
from mempalace import searcher
613+
614+
captured = {}
615+
616+
def _stub(*args, **kwargs):
617+
captured["stop_words"] = kwargs.get("stop_words")
618+
return {"results": [], "fallback": "bm25_only_via_sqlite"}
619+
620+
monkeypatch.setattr(searcher, "_bm25_only_via_sqlite", _stub)
621+
monkeypatch.setattr(searcher, "_resolve_stop_words", lambda lang: frozenset({"de", "es"}))
622+
623+
searcher.search_memories(
624+
query="q",
625+
palace_path=str(tmp_path),
626+
vector_disabled=True,
627+
)
628+
629+
assert captured["stop_words"] == frozenset({"de", "es"})

0 commit comments

Comments
 (0)