Skip to content

Commit 4d33f14

Browse files
authored
Merge pull request #1224 from MemPalace/feat/privacy-warn-external-llm
feat(privacy): warn when LLM tier sends content to external API
2 parents 414aa3e + 4400734 commit 4d33f14

4 files changed

Lines changed: 248 additions & 0 deletions

File tree

mempalace/cli.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,19 @@ def cmd_init(args):
250250
if ok:
251251
llm_provider = candidate
252252
print(f" LLM enabled: {provider_name}/{provider_model}")
253+
# Privacy warning (issue #24): if the configured endpoint
254+
# sends data off the user's machine/network, surface that
255+
# before init proceeds. URL-based — Ollama on localhost,
256+
# LM Studio on LAN, etc. won't trigger; Anthropic /
257+
# cloud OpenAI-compat / any non-local endpoint will.
258+
if candidate.is_external_service:
259+
print(
260+
f" ⚠ {provider_name} is an EXTERNAL API. Your folder "
261+
f"content will be sent to the provider during init. "
262+
f"MemPalace does not control how the provider logs, "
263+
f"retains, or uses your data. Pass --no-llm to keep "
264+
f"init fully local."
265+
)
253266
else:
254267
print(
255268
f" No LLM provider reachable ({msg}). "

mempalace/llm_client.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,65 @@
2828
from dataclasses import dataclass
2929
from typing import Optional
3030
from urllib.error import HTTPError, URLError
31+
from urllib.parse import urlparse
3132
from urllib.request import Request, urlopen
3233

3334

35+
# ── External-service heuristic (issue #24 — privacy warning support) ─────
36+
# Used by ``LLMProvider.is_external_service`` to decide whether the
37+
# provider's configured endpoint will send user content off the local
38+
# machine/network. Single source of truth so all three providers share
39+
# identical "local vs external" semantics.
40+
41+
_LOCALHOST_HOSTS = frozenset({"localhost", "127.0.0.1", "::1"})
42+
43+
44+
def _endpoint_is_local(url: Optional[str]) -> bool:
45+
"""Return True if ``url``'s hostname is on the user's machine or
46+
private network.
47+
48+
Local includes:
49+
- localhost, 127.0.0.1, ::1
50+
- hostnames ending in .local (mDNS/Bonjour)
51+
- IPv4 RFC1918: 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16
52+
- IPv6 unique-local addresses (fc00::/7) — fc.../fd... prefixes
53+
54+
None / empty / unparseable URLs are treated as local (defensive default —
55+
no endpoint means no external request can happen yet).
56+
57+
Anything else (including public IPs and FQDNs) is external.
58+
"""
59+
if not url:
60+
return True
61+
try:
62+
host = (urlparse(url).hostname or "").lower()
63+
except (ValueError, AttributeError):
64+
return False
65+
if not host:
66+
return True
67+
if host in _LOCALHOST_HOSTS:
68+
return True
69+
if host.endswith(".local"):
70+
return True
71+
if host.startswith("10."):
72+
return True
73+
if host.startswith("192.168."):
74+
return True
75+
if host.startswith("172."):
76+
# 172.16.0.0 - 172.31.255.255
77+
parts = host.split(".")
78+
if len(parts) >= 2:
79+
try:
80+
if 16 <= int(parts[1]) <= 31:
81+
return True
82+
except ValueError:
83+
pass
84+
# IPv6 unique-local addresses fc00::/7 — match leading hex chars
85+
if host.startswith("fc") or host.startswith("fd"):
86+
return True
87+
return False
88+
89+
3490
class LLMError(RuntimeError):
3591
"""Raised for any provider failure — transport, parse, auth, missing model."""
3692

@@ -68,6 +124,20 @@ def check_available(self) -> tuple[bool, str]:
68124
"""Return ``(ok, message)``. Fast probe that the provider is reachable."""
69125
raise NotImplementedError
70126

127+
@property
def is_external_service(self) -> bool:
    """Return True if this provider's endpoint will send user content
    off the local machine/network.

    ``mempalace init`` consults this to decide whether a privacy
    warning should be printed before first use (issue #24). The check
    is purely URL-based — the configured endpoint decides, regardless
    of which provider class owns it. The default implementation covers
    the three in-tree providers (Ollama / OpenAI-compat / Anthropic);
    subclasses that resolve their endpoint dynamically should override
    this if needed.
    """
    endpoint_is_local = _endpoint_is_local(self.endpoint)
    return not endpoint_is_local
140+
71141

72142
def _http_post_json(url: str, body: dict, headers: dict, timeout: int) -> dict:
73143
"""POST JSON and return the parsed response. Raises LLMError on any failure."""

tests/test_corpus_origin_integration.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1629,3 +1629,115 @@ def test_merge_tier_fields_no_llm_provider_returns_heuristic_only():
16291629
assert res["agent_persona_names"] == []
16301630
assert res["user_name"] is None
16311631
assert res["primary_platform"] is None
1632+
1633+
1634+
# ─────────────────────────────────────────────────────────────────────────
1635+
# External-API privacy warning (issue #24).
1636+
#
1637+
# When mempalace init resolves an LLM provider whose endpoint will send
1638+
# user content off the local machine/network, init MUST print a clear
1639+
# warning naming the provider, stating that MemPalace doesn't control
1640+
# how the provider logs/retains/uses the data, and pointing at --no-llm.
1641+
# Local providers (Ollama on localhost, LM Studio on LAN, etc.) MUST NOT
1642+
# trigger the warning.
1643+
# ─────────────────────────────────────────────────────────────────────────
1644+
1645+
1646+
def test_init_prints_privacy_warning_when_provider_is_external(
    ai_dialogue_corpus: Path, tmp_path: Path, capsys
):
    """If cmd_init acquires a provider reporting is_external_service
    True, the output must carry the privacy warning, including the
    EXTERNAL marker.
    """
    from mempalace.cli import cmd_init

    palace_dir = tmp_path / "palace"
    init_args = _init_args(ai_dialogue_corpus)  # default = LLM ON

    # Fake provider: reachable AND external — the warning must fire.
    provider = MagicMock()
    provider.check_available.return_value = (True, "ok")
    provider.is_external_service = True
    provider.classify.return_value = MagicMock(text='{"classifications": []}')

    with (
        patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace_dir)),
        patch("mempalace.cli.get_provider", return_value=provider),
        patch("mempalace.cli._maybe_run_mine_after_init"),
        patch("mempalace.room_detector_local.detect_rooms_local"),
    ):
        cmd_init(init_args)

    out = capsys.readouterr().out
    assert "EXTERNAL API" in out, (
        f"Privacy warning must mention 'EXTERNAL API' when provider is external. Got: {out!r}"
    )
    assert "--no-llm" in out, (
        f"Privacy warning must point users at --no-llm to opt out. Got: {out!r}"
    )
    # The warning should also tell users MemPalace isn't responsible
    # for downstream provider behavior.
    lowered = out.lower()
    disclaimer_phrases = ("does not control", "not responsible", "logs", "retains")
    assert any(phrase in lowered for phrase in disclaimer_phrases), (
        f"Privacy warning must clarify MemPalace doesn't control how the "
        f"provider handles the data. Got: {out!r}"
    )
1689+
1690+
1691+
def test_init_no_privacy_warning_when_provider_is_local(
    ai_dialogue_corpus: Path, tmp_path: Path, capsys
):
    """A LOCAL provider (Ollama on localhost, LM Studio on LAN, …)
    must NOT trigger the privacy warning — nothing is leaving the
    user's machine/network.
    """
    from mempalace.cli import cmd_init

    palace_dir = tmp_path / "palace"
    init_args = _init_args(ai_dialogue_corpus)  # default = LLM ON

    # Fake provider: reachable but local — warning must stay silent.
    provider = MagicMock()
    provider.check_available.return_value = (True, "ok")
    provider.is_external_service = False  # Local provider — no warning
    provider.classify.return_value = MagicMock(text='{"classifications": []}')

    with (
        patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace_dir)),
        patch("mempalace.cli.get_provider", return_value=provider),
        patch("mempalace.cli._maybe_run_mine_after_init"),
        patch("mempalace.room_detector_local.detect_rooms_local"),
    ):
        cmd_init(init_args)

    out = capsys.readouterr().out
    assert "EXTERNAL API" not in out, (
        f"Privacy warning fired for a LOCAL provider — should not have. Got: {out!r}"
    )
1720+
1721+
1722+
def test_init_no_privacy_warning_with_no_llm_flag(ai_dialogue_corpus: Path, tmp_path: Path, capsys):
    """With --no-llm, no provider is acquired at all, so the privacy
    warning has nothing to fire on. Output must not contain it.
    """
    from mempalace.cli import cmd_init

    palace = tmp_path / "palace"
    args = _init_args(ai_dialogue_corpus, no_llm=True)

    with (
        patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace)),
        patch("mempalace.cli.get_provider") as mock_get,
        patch("mempalace.cli._maybe_run_mine_after_init"),
        patch("mempalace.room_detector_local.detect_rooms_local"),
    ):
        cmd_init(args)

    # --no-llm must short-circuit before provider acquisition.
    # (Bug fix: this used to read `mock_get.assert_not_called(), "..."`,
    # which built a discarded (None, str) tuple — the trailing string
    # was never an assertion message. assert_not_called raises its own
    # AssertionError and takes no message argument.)
    mock_get.assert_not_called()
    out = capsys.readouterr().out
    assert (
        "EXTERNAL API" not in out
    ), f"Privacy warning fired on --no-llm path — should not have. Got: {out!r}"

tests/test_llm_client.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -325,3 +325,56 @@ def test_anthropic_no_key_raises_on_classify(monkeypatch):
325325
p = AnthropicProvider(model="claude-haiku")
326326
with pytest.raises(LLMError, match="requires ANTHROPIC_API_KEY"):
327327
p.classify("s", "u")
328+
329+
330+
# ── is_external_service property (issue #24 — privacy warning support) ──
331+
#
332+
# `is_external_service` is True when this provider's endpoint sends data
333+
# off the user's machine/network. Used by mempalace init to print a
334+
# privacy warning before first run when an external API will receive
335+
# folder content. URL-based heuristic: localhost, 127.x, ::1, .local,
336+
# RFC1918 (10/8, 192.168/16, 172.16-31/12), and IPv6 ULA (fc/fd::) are
337+
# all treated as local. Everything else is treated as external.
338+
339+
340+
def test_ollama_provider_default_endpoint_is_local():
    """The OllamaProvider default endpoint (http://localhost:11434)
    must classify as local, so the typical user running Ollama on
    their own machine never sees the privacy warning."""
    provider = OllamaProvider(model="gemma4:e4b")
    external = provider.is_external_service
    assert external is False, (
        f"Default OllamaProvider endpoint must be local; got "
        f"is_external_service={external} for endpoint={provider.endpoint}"
    )
349+
350+
351+
def test_openai_compat_provider_localhost_endpoint_is_local():
    """LM Studio / llama.cpp server / vLLM commonly bind to loopback;
    such setups must NOT trigger the external-API warning, and neither
    must a LAN-hosted server."""
    for loopback_endpoint in ("http://localhost:1234", "http://127.0.0.1:8000"):
        provider = OpenAICompatProvider(model="any", endpoint=loopback_endpoint)
        assert provider.is_external_service is False
    lan_provider = OpenAICompatProvider(model="any", endpoint="http://192.168.1.50:1234")
    assert lan_provider.is_external_service is False, "LAN (RFC1918) endpoints must be local"
360+
361+
362+
def test_openai_compat_provider_cloud_endpoint_is_external():
    """Pointing openai-compat at OpenAI's hosted API (or any other
    non-local endpoint) MUST trigger the external warning."""
    cloud_provider = OpenAICompatProvider(model="gpt-4o", endpoint="https://api.openai.com")
    is_external = cloud_provider.is_external_service
    assert is_external is True, (
        f"https://api.openai.com must be classified external; got "
        f"is_external_service={is_external}"
    )
370+
371+
372+
def test_anthropic_provider_default_endpoint_is_external():
    """AnthropicProvider defaults to https://api.anthropic.com —
    external by definition, so users passing --llm-provider anthropic
    MUST get the privacy warning by default."""
    provider = AnthropicProvider(model="claude-haiku-4-5", api_key="sk-test")
    is_external = provider.is_external_service
    assert is_external is True, (
        f"Default AnthropicProvider endpoint must be external; got "
        f"is_external_service={is_external} for endpoint={provider.endpoint}"
    )
380+
)

0 commit comments

Comments
 (0)