Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 40 additions & 12 deletions mempalace/entity_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,12 @@ def _wikipedia_lookup(word: str) -> dict:
Look up a word via Wikipedia REST API.
Returns inferred type (person/place/concept/unknown) + confidence + summary.
Free, no API key, handles disambiguation pages.

**Privacy warning:** This function makes an outbound HTTPS request to
en.wikipedia.org, sending the queried word over the network. It should
only be called when the caller has explicitly opted in via
``allow_network=True`` in :meth:`EntityRegistry.research`. The default
behaviour of ``research()`` is local-only (no network calls).
"""
try:
url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{urllib.parse.quote(word)}"
Expand Down Expand Up @@ -244,13 +250,14 @@ def _wikipedia_lookup(word: str) -> dict:

except urllib.error.HTTPError as e:
if e.code == 404:
# Not in Wikipedia — strong signal it's a proper noun (unusual name, nickname)
# Not in Wikipedia — this tells us nothing definitive about
# the word. Return "unknown" so the caller can decide.
return {
"inferred_type": "person",
"confidence": 0.70,
"inferred_type": "unknown",
"confidence": 0.3,
"wiki_summary": None,
"wiki_title": None,
"note": "not found in Wikipedia — likely a proper noun or unusual name",
"note": "not found in Wikipedia",
}
return {"inferred_type": "unknown", "confidence": 0.0, "wiki_summary": None}
except (urllib.error.URLError, OSError, json.JSONDecodeError, KeyError):
Expand Down Expand Up @@ -502,20 +509,41 @@ def _disambiguate(self, word: str, context: str, person_info: dict) -> Optional[

# ── Research unknown words ───────────────────────────────────────────────

def research(self, word: str, auto_confirm: bool = False) -> dict:
def research(self, word: str, auto_confirm: bool = False, allow_network: bool = False) -> dict:
"""
Research an unknown word via Wikipedia.
Caches result. If auto_confirm=False, marks as unconfirmed (needs user review).
Returns the lookup result.
Research an unknown word.

By default this is **local-only**: it checks the wiki cache and
returns ``"unknown"`` for uncached words. Pass
``allow_network=True`` to explicitly opt in to an outbound
Wikipedia lookup. This design honours the project's
*local-first, zero API* and *privacy by architecture* principles
— no data leaves the machine unless the caller requests it.

Caches result. If *auto_confirm* is ``False``, marks the entry
as unconfirmed (needs user review).
"""
# Already cached?
cache = self._data.setdefault("wiki_cache", {})
# Check cache (read-only — no mutation when allow_network is False)
cache = self._data.get("wiki_cache", {})
if word in cache:
return cache[word]

if not allow_network:
return {
"inferred_type": "unknown",
"confidence": 0.0,
"wiki_summary": None,
"wiki_title": None,
"word": word,
"confirmed": False,
"note": "network lookup disabled — pass allow_network=True to query Wikipedia",
}

# Network path — ensure wiki_cache key exists before writing
cache = self._data.setdefault("wiki_cache", {})
result = _wikipedia_lookup(word)
result["word"] = word
result["confirmed"] = auto_confirm
result.setdefault("word", word)
result.setdefault("confirmed", auto_confirm)

cache[word] = result
self.save()
Expand Down
96 changes: 78 additions & 18 deletions tests/test_entity_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,14 @@
EntityRegistry,
)

# Shared mock result for Wikipedia person lookup tests
_MOCK_SAOIRSE_PERSON = {
"inferred_type": "person",
"confidence": 0.80,
"wiki_summary": "Saoirse is an Irish given name.",
"wiki_title": "Saoirse",
}


# ── COMMON_ENGLISH_WORDS ────────────────────────────────────────────────

Expand Down Expand Up @@ -213,22 +221,49 @@ def test_lookup_ambiguous_word_as_concept(tmp_path):
assert result["type"] == "concept"


# ── research (Wikipedia) — mocked ──────────────────────────────────────
# ── research — local-only by default ───────────────────────────────────


def test_research_caches_result(tmp_path):
def test_research_local_only_by_default(tmp_path):
"""research() must NOT call Wikipedia unless allow_network=True."""
registry = EntityRegistry.load(config_dir=tmp_path)
registry.seed(mode="personal", people=[], projects=[])

mock_result = {
"inferred_type": "person",
"confidence": 0.80,
"wiki_summary": "Saoirse is an Irish given name.",
"wiki_title": "Saoirse",
}
with patch(
"mempalace.entity_registry._wikipedia_lookup",
side_effect=AssertionError("network call should not happen"),
):
result = registry.research("Saoirse")

with patch("mempalace.entity_registry._wikipedia_lookup", return_value=mock_result):
result = registry.research("Saoirse", auto_confirm=True)
assert result["inferred_type"] == "unknown"
assert result["confidence"] == 0.0
assert result["word"] == "Saoirse"
assert "network lookup disabled" in result.get("note", "")


def test_research_with_allow_network(tmp_path):
"""research(allow_network=True) calls Wikipedia and caches result."""
registry = EntityRegistry.load(config_dir=tmp_path)
registry.seed(mode="personal", people=[], projects=[])

with patch(
"mempalace.entity_registry._wikipedia_lookup",
return_value=dict(_MOCK_SAOIRSE_PERSON),
):
result = registry.research("Saoirse", auto_confirm=True, allow_network=True)
assert result["inferred_type"] == "person"


def test_research_caches_result(tmp_path):
"""Once cached via allow_network, subsequent calls use cache without network."""
registry = EntityRegistry.load(config_dir=tmp_path)
registry.seed(mode="personal", people=[], projects=[])

with patch(
"mempalace.entity_registry._wikipedia_lookup",
return_value=dict(_MOCK_SAOIRSE_PERSON),
):
result = registry.research("Saoirse", auto_confirm=True, allow_network=True)
assert result["inferred_type"] == "person"

# Second call should use cache, not call Wikipedia again
Expand All @@ -240,24 +275,49 @@ def test_research_caches_result(tmp_path):
assert cached["inferred_type"] == "person"


def test_research_local_only_not_cached(tmp_path):
"""Local-only result for uncached word should NOT be persisted to cache."""
registry = EntityRegistry.load(config_dir=tmp_path)
registry.seed(mode="personal", people=[], projects=[])

registry.research("Xander") # local-only, no network
assert "Xander" not in registry._data.get("wiki_cache", {})


def test_confirm_research_adds_to_people(tmp_path):
registry = EntityRegistry.load(config_dir=tmp_path)
registry.seed(mode="personal", people=[], projects=[])

mock_result = {
"inferred_type": "person",
"confidence": 0.80,
"wiki_summary": "Saoirse is a name",
"wiki_title": "Saoirse",
}
with patch("mempalace.entity_registry._wikipedia_lookup", return_value=mock_result):
registry.research("Saoirse", auto_confirm=False)
with patch(
"mempalace.entity_registry._wikipedia_lookup",
return_value=dict(_MOCK_SAOIRSE_PERSON),
):
registry.research("Saoirse", auto_confirm=False, allow_network=True)

registry.confirm_research("Saoirse", entity_type="person", relationship="friend")
assert "Saoirse" in registry.people
assert registry.people["Saoirse"]["source"] == "wiki"


def test_wikipedia_404_returns_unknown(tmp_path):
"""A 404 from Wikipedia should return 'unknown', not assert 'person'."""
registry = EntityRegistry.load(config_dir=tmp_path)
registry.seed(mode="personal", people=[], projects=[])

mock_result = {
"inferred_type": "unknown",
"confidence": 0.3,
"wiki_summary": None,
"wiki_title": None,
"note": "not found in Wikipedia",
}
with patch("mempalace.entity_registry._wikipedia_lookup", return_value=mock_result):
result = registry.research("Zzxqy", auto_confirm=False, allow_network=True)

assert result["inferred_type"] == "unknown"
assert result["confidence"] < 0.5


# ── extract_people_from_query ───────────────────────────────────────────


Expand Down