Skip to content

Commit b51261b

Browse files
committed
fix: make entity_registry.research() local-only by default
research() previously called _wikipedia_lookup() unconditionally, sending entity names to en.wikipedia.org on every uncached lookup. This violates the project's local-first and privacy-by-architecture principles documented in CLAUDE.md.

Changes:
- research() now returns "unknown" for uncached words by default
- New allow_network=True parameter required for Wikipedia lookups
- Wikipedia 404 now returns "unknown" instead of asserting "person" with 0.70 confidence, preventing entity registry poisoning
- Added privacy warning docstring to _wikipedia_lookup()
- Added tests for local-only default, opt-in network, 404 handling, and cache-not-persisted-on-local-only behaviour

Refs: #809
1 parent b060171 commit b51261b

2 files changed

Lines changed: 100 additions & 11 deletions

File tree

mempalace/entity_registry.py

Lines changed: 34 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,12 @@ def _wikipedia_lookup(word: str) -> dict:
178178
Look up a word via Wikipedia REST API.
179179
Returns inferred type (person/place/concept/unknown) + confidence + summary.
180180
Free, no API key, handles disambiguation pages.
181+
182+
**Privacy warning:** This function makes an outbound HTTPS request to
183+
en.wikipedia.org, sending the queried word over the network. It should
184+
only be called when the caller has explicitly opted in via
185+
``allow_network=True`` in :meth:`EntityRegistry.research`. The default
186+
behaviour of ``research()`` is local-only (no network calls).
181187
"""
182188
try:
183189
url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{urllib.parse.quote(word)}"
@@ -244,13 +250,14 @@ def _wikipedia_lookup(word: str) -> dict:
244250

245251
except urllib.error.HTTPError as e:
246252
if e.code == 404:
247-
# Not in Wikipedia — strong signal it's a proper noun (unusual name, nickname)
253+
# Not in Wikipedia — this tells us nothing definitive about
254+
# the word. Return "unknown" so the caller can decide.
248255
return {
249-
"inferred_type": "person",
250-
"confidence": 0.70,
256+
"inferred_type": "unknown",
257+
"confidence": 0.3,
251258
"wiki_summary": None,
252259
"wiki_title": None,
253-
"note": "not found in Wikipedia — likely a proper noun or unusual name",
260+
"note": "not found in Wikipedia",
254261
}
255262
return {"inferred_type": "unknown", "confidence": 0.0, "wiki_summary": None}
256263
except (urllib.error.URLError, OSError, json.JSONDecodeError, KeyError):
@@ -502,17 +509,36 @@ def _disambiguate(self, word: str, context: str, person_info: dict) -> Optional[
502509

503510
# ── Research unknown words ───────────────────────────────────────────────
504511

505-
def research(self, word: str, auto_confirm: bool = False) -> dict:
512+
def research(self, word: str, auto_confirm: bool = False, allow_network: bool = False) -> dict:
506513
"""
507-
Research an unknown word via Wikipedia.
508-
Caches result. If auto_confirm=False, marks as unconfirmed (needs user review).
509-
Returns the lookup result.
514+
Research an unknown word.
515+
516+
By default this is **local-only**: it checks the wiki cache and
517+
returns ``"unknown"`` for uncached words. Pass
518+
``allow_network=True`` to explicitly opt in to an outbound
519+
Wikipedia lookup. This design honours the project's
520+
*local-first, zero API* and *privacy by architecture* principles
521+
— no data leaves the machine unless the caller requests it.
522+
523+
Caches result. If *auto_confirm* is ``False``, marks the entry
524+
as unconfirmed (needs user review).
510525
"""
511526
# Already cached?
512527
cache = self._data.setdefault("wiki_cache", {})
513528
if word in cache:
514529
return cache[word]
515530

531+
if not allow_network:
532+
return {
533+
"inferred_type": "unknown",
534+
"confidence": 0.0,
535+
"wiki_summary": None,
536+
"wiki_title": None,
537+
"word": word,
538+
"confirmed": False,
539+
"note": "network lookup disabled — pass allow_network=True to query Wikipedia",
540+
}
541+
516542
result = _wikipedia_lookup(word)
517543
result["word"] = word
518544
result["confirmed"] = auto_confirm

tests/test_entity_registry.py

Lines changed: 66 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -213,10 +213,45 @@ def test_lookup_ambiguous_word_as_concept(tmp_path):
213213
assert result["type"] == "concept"
214214

215215

216-
# ── research (Wikipedia) — mocked ──────────────────────────────────────
216+
# ── research — local-only by default ───────────────────────────────────
217+
218+
219+
def test_research_local_only_by_default(tmp_path):
220+
"""research() must NOT call Wikipedia unless allow_network=True."""
221+
registry = EntityRegistry.load(config_dir=tmp_path)
222+
registry.seed(mode="personal", people=[], projects=[])
223+
224+
with patch(
225+
"mempalace.entity_registry._wikipedia_lookup",
226+
side_effect=AssertionError("network call should not happen"),
227+
):
228+
result = registry.research("Saoirse")
229+
230+
assert result["inferred_type"] == "unknown"
231+
assert result["confidence"] == 0.0
232+
assert result["word"] == "Saoirse"
233+
assert "network lookup disabled" in result.get("note", "")
234+
235+
236+
def test_research_with_allow_network(tmp_path):
237+
"""research(allow_network=True) calls Wikipedia and caches result."""
238+
registry = EntityRegistry.load(config_dir=tmp_path)
239+
registry.seed(mode="personal", people=[], projects=[])
240+
241+
mock_result = {
242+
"inferred_type": "person",
243+
"confidence": 0.80,
244+
"wiki_summary": "Saoirse is an Irish given name.",
245+
"wiki_title": "Saoirse",
246+
}
247+
248+
with patch("mempalace.entity_registry._wikipedia_lookup", return_value=mock_result):
249+
result = registry.research("Saoirse", auto_confirm=True, allow_network=True)
250+
assert result["inferred_type"] == "person"
217251

218252

219253
def test_research_caches_result(tmp_path):
254+
"""Once cached via allow_network, subsequent calls use cache without network."""
220255
registry = EntityRegistry.load(config_dir=tmp_path)
221256
registry.seed(mode="personal", people=[], projects=[])
222257

@@ -228,7 +263,7 @@ def test_research_caches_result(tmp_path):
228263
}
229264

230265
with patch("mempalace.entity_registry._wikipedia_lookup", return_value=mock_result):
231-
result = registry.research("Saoirse", auto_confirm=True)
266+
result = registry.research("Saoirse", auto_confirm=True, allow_network=True)
232267
assert result["inferred_type"] == "person"
233268

234269
# Second call should use cache, not call Wikipedia again
@@ -240,6 +275,15 @@ def test_research_caches_result(tmp_path):
240275
assert cached["inferred_type"] == "person"
241276

242277

278+
def test_research_local_only_not_cached(tmp_path):
279+
"""Local-only result for uncached word should NOT be persisted to cache."""
280+
registry = EntityRegistry.load(config_dir=tmp_path)
281+
registry.seed(mode="personal", people=[], projects=[])
282+
283+
registry.research("Xander") # local-only, no network
284+
assert "Xander" not in registry._data.get("wiki_cache", {})
285+
286+
243287
def test_confirm_research_adds_to_people(tmp_path):
244288
registry = EntityRegistry.load(config_dir=tmp_path)
245289
registry.seed(mode="personal", people=[], projects=[])
@@ -251,13 +295,32 @@ def test_confirm_research_adds_to_people(tmp_path):
251295
"wiki_title": "Saoirse",
252296
}
253297
with patch("mempalace.entity_registry._wikipedia_lookup", return_value=mock_result):
254-
registry.research("Saoirse", auto_confirm=False)
298+
registry.research("Saoirse", auto_confirm=False, allow_network=True)
255299

256300
registry.confirm_research("Saoirse", entity_type="person", relationship="friend")
257301
assert "Saoirse" in registry.people
258302
assert registry.people["Saoirse"]["source"] == "wiki"
259303

260304

305+
def test_wikipedia_404_returns_unknown(tmp_path):
306+
"""A 404 from Wikipedia should return 'unknown', not assert 'person'."""
307+
registry = EntityRegistry.load(config_dir=tmp_path)
308+
registry.seed(mode="personal", people=[], projects=[])
309+
310+
mock_result = {
311+
"inferred_type": "unknown",
312+
"confidence": 0.3,
313+
"wiki_summary": None,
314+
"wiki_title": None,
315+
"note": "not found in Wikipedia",
316+
}
317+
with patch("mempalace.entity_registry._wikipedia_lookup", return_value=mock_result):
318+
result = registry.research("Zzxqy", auto_confirm=False, allow_network=True)
319+
320+
assert result["inferred_type"] == "unknown"
321+
assert result["confidence"] < 0.5
322+
323+
261324
# ── extract_people_from_query ───────────────────────────────────────────
262325

263326

0 commit comments

Comments
 (0)