diff --git a/mempalace/cli.py b/mempalace/cli.py index b9776d8..b6e48df 100644 --- a/mempalace/cli.py +++ b/mempalace/cli.py @@ -250,6 +250,19 @@ def cmd_init(args): if ok: llm_provider = candidate print(f" LLM enabled: {provider_name}/{provider_model}") + # Privacy warning (issue #24): if the configured endpoint + # sends data off the user's machine/network, surface that + # before init proceeds. URL-based — Ollama on localhost, + # LM Studio on LAN, etc. won't trigger; Anthropic / + # cloud OpenAI-compat / any non-local endpoint will. + if candidate.is_external_service: + print( + f" ⚠ {provider_name} is an EXTERNAL API. Your folder " + f"content will be sent to the provider during init. " + f"MemPalace does not control how the provider logs, " + f"retains, or uses your data. Pass --no-llm to keep " + f"init fully local." + ) else: print( f" No LLM provider reachable ({msg}). " diff --git a/mempalace/llm_client.py b/mempalace/llm_client.py index 74982ce..837247b 100644 --- a/mempalace/llm_client.py +++ b/mempalace/llm_client.py @@ -28,9 +28,65 @@ import os from dataclasses import dataclass from typing import Optional from urllib.error import HTTPError, URLError +from urllib.parse import urlparse from urllib.request import Request, urlopen +# ── External-service heuristic (issue #24 — privacy warning support) ───── +# Used by ``LLMProvider.is_external_service`` to decide whether the +# provider's configured endpoint will send user content off the local +# machine/network. Single source of truth so all three providers share +# identical "local vs external" semantics. + +_LOCALHOST_HOSTS = frozenset({"localhost", "127.0.0.1", "::1"}) + + +def _endpoint_is_local(url: Optional[str]) -> bool: + """Return True if ``url``'s hostname is on the user's machine or + private network. 
+ + Local includes: + - localhost, 127.0.0.1, ::1 + - hostnames ending in .local (mDNS/Bonjour) + - IPv4 RFC1918: 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 + - IPv6 unique-local addresses (fc00::/7) — fc.../fd... prefixes + + None / empty / unparseable URLs are treated as local (defensive default — + no endpoint means no external request can happen yet). + + Anything else (including public IPs and FQDNs) is external. + """ + if not url: + return True + try: + host = (urlparse(url).hostname or "").lower() + except (ValueError, AttributeError): + return False + if not host: + return True + if host in _LOCALHOST_HOSTS: + return True + if host.endswith(".local"): + return True + if host.startswith("10."): + return True + if host.startswith("192.168."): + return True + if host.startswith("172."): + # 172.16.0.0 - 172.31.255.255 + parts = host.split(".") + if len(parts) >= 2: + try: + if 16 <= int(parts[1]) <= 31: + return True + except ValueError: + pass + # IPv6 unique-local addresses fc00::/7 — match leading hex chars + if host.startswith("fc") or host.startswith("fd"): + return True + return False + + class LLMError(RuntimeError): """Raised for any provider failure — transport, parse, auth, missing model.""" @@ -68,6 +124,20 @@ class LLMProvider: """Return ``(ok, message)``. Fast probe that the provider is reachable.""" raise NotImplementedError + @property + def is_external_service(self) -> bool: + """Return True if this provider's endpoint will send user content + off the local machine/network. + + Used by ``mempalace init`` to decide whether to print a privacy + warning before first use (issue #24). URL-based heuristic only — + the endpoint determines, regardless of which provider class. + Subclasses that resolve their endpoint dynamically should override + if needed; the default works for the three in-tree providers + (Ollama / OpenAI-compat / Anthropic). 
# ─────────────────────────────────────────────────────────────────────────
# External-API privacy warning (issue #24).
#
# When mempalace init resolves an LLM provider whose endpoint will send
# user content off the local machine/network, init MUST print a clear
# warning naming the provider, stating that MemPalace doesn't control
# how the provider logs/retains/uses the data, and pointing at --no-llm.
# Local providers (Ollama on localhost, LM Studio on LAN, etc.) MUST NOT
# trigger the warning.
# ─────────────────────────────────────────────────────────────────────────


def test_init_prints_privacy_warning_when_provider_is_external(
    ai_dialogue_corpus: Path, tmp_path: Path, capsys
):
    """An external provider makes ``cmd_init`` print the privacy warning.

    The provider is a mock whose ``check_available`` succeeds and whose
    ``is_external_service`` is True. Captured stdout must carry the
    ``EXTERNAL API`` marker, the ``--no-llm`` opt-out hint, and at least
    one phrase about the provider's handling of the data.
    """
    from mempalace.cli import cmd_init

    palace_dir = tmp_path / "palace"
    init_args = _init_args(ai_dialogue_corpus)  # default = LLM ON

    external_provider = MagicMock()
    external_provider.check_available.return_value = (True, "ok")
    external_provider.is_external_service = True
    external_provider.classify.return_value = MagicMock(text='{"classifications": []}')

    with (
        patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace_dir)),
        patch("mempalace.cli.get_provider", return_value=external_provider),
        patch("mempalace.cli._maybe_run_mine_after_init"),
        patch("mempalace.room_detector_local.detect_rooms_local"),
    ):
        cmd_init(init_args)

    out = capsys.readouterr().out
    lowered = out.lower()

    assert "EXTERNAL API" in out, (
        f"Privacy warning must mention 'EXTERNAL API' when provider is external. " f"Got: {out!r}"
    )
    assert (
        "--no-llm" in out
    ), f"Privacy warning must point users at --no-llm to opt out. Got: {out!r}"

    # Any one of these phrases is accepted as the "we don't control the
    # provider" disclaimer.
    disclaimer_phrases = ("does not control", "not responsible", "logs", "retains")
    assert any(phrase in lowered for phrase in disclaimer_phrases), (
        f"Privacy warning must clarify MemPalace doesn't control how the "
        f"provider handles the data. Got: {out!r}"
    )


def test_init_no_privacy_warning_when_provider_is_local(
    ai_dialogue_corpus: Path, tmp_path: Path, capsys
):
    """A local provider must not make ``cmd_init`` print the privacy warning.

    Same setup as the external case, but the mock provider reports
    ``is_external_service = False``; captured stdout must not contain the
    ``EXTERNAL API`` marker.
    """
    from mempalace.cli import cmd_init

    palace_dir = tmp_path / "palace"
    init_args = _init_args(ai_dialogue_corpus)  # default = LLM ON

    local_provider = MagicMock()
    local_provider.check_available.return_value = (True, "ok")
    local_provider.is_external_service = False  # Local provider — no warning
    local_provider.classify.return_value = MagicMock(text='{"classifications": []}')

    with (
        patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace_dir)),
        patch("mempalace.cli.get_provider", return_value=local_provider),
        patch("mempalace.cli._maybe_run_mine_after_init"),
        patch("mempalace.room_detector_local.detect_rooms_local"),
    ):
        cmd_init(init_args)

    out = capsys.readouterr().out
    assert "EXTERNAL API" not in out, (
        f"Privacy warning fired for a LOCAL provider — should not have. " f"Got: {out!r}"
    )
def test_init_no_privacy_warning_with_no_llm_flag(ai_dialogue_corpus: Path, tmp_path: Path, capsys):
    """With ``--no-llm`` no provider is acquired, so no warning can fire.

    Asserts both that ``get_provider`` is never called and that captured
    stdout does not contain the ``EXTERNAL API`` marker.
    """
    from mempalace.cli import cmd_init

    palace = tmp_path / "palace"
    args = _init_args(ai_dialogue_corpus, no_llm=True)

    with (
        patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace)),
        patch("mempalace.cli.get_provider") as mock_get,
        patch("mempalace.cli._maybe_run_mine_after_init"),
        patch("mempalace.room_detector_local.detect_rooms_local"),
    ):
        cmd_init(args)

    # A real assert, so the explanation is actually reported on failure.
    # (``mock.assert_not_called(), "msg"`` only builds and discards a tuple.)
    assert not mock_get.called, "--no-llm must short-circuit before provider acquisition"
    out = capsys.readouterr().out
    assert (
        "EXTERNAL API" not in out
    ), f"Privacy warning fired on --no-llm path — should not have. Got: {out!r}"


# ── is_external_service property (issue #24 — privacy warning support) ──
#
# `is_external_service` is True when this provider's endpoint sends data
# off the user's machine/network. Used by mempalace init to print a
# privacy warning before first run when an external API will receive
# folder content. URL-based heuristic: localhost, 127.x, ::1, .local,
# RFC1918 (10/8, 192.168/16, 172.16-31/12), and IPv6 ULA (fc/fd::) are
# all treated as local. Everything else is treated as external.


def test_ollama_provider_default_endpoint_is_local():
    """An ``OllamaProvider`` built with only a model must report local.

    No endpoint is passed, so this exercises the provider's default
    endpoint; no privacy warning should fire for that setup.
    """
    p = OllamaProvider(model="gemma4:e4b")
    assert p.is_external_service is False, (
        f"Default OllamaProvider endpoint must be local; got "
        f"is_external_service={p.is_external_service} for endpoint={p.endpoint}"
    )


def test_openai_compat_provider_localhost_endpoint_is_local():
    """OpenAI-compatible servers on localhost, 127.0.0.1 or a LAN address
    (192.168.x.x) must NOT be reported as external."""
    p = OpenAICompatProvider(model="any", endpoint="http://localhost:1234")
    assert p.is_external_service is False
    p_127 = OpenAICompatProvider(model="any", endpoint="http://127.0.0.1:8000")
    assert p_127.is_external_service is False
    p_lan = OpenAICompatProvider(model="any", endpoint="http://192.168.1.50:1234")
    assert p_lan.is_external_service is False, "LAN (RFC1918) endpoints must be local"
def test_openai_compat_provider_cloud_endpoint_is_external():
    """An OpenAI-compatible provider aimed at ``https://api.openai.com``
    must be reported as external so the privacy warning fires."""
    cloud_provider = OpenAICompatProvider(model="gpt-4o", endpoint="https://api.openai.com")
    flagged = cloud_provider.is_external_service
    assert flagged is True, (
        f"https://api.openai.com must be classified external; got "
        f"is_external_service={cloud_provider.is_external_service}"
    )


def test_anthropic_provider_default_endpoint_is_external():
    """An ``AnthropicProvider`` built without an explicit endpoint must be
    reported as external, so the privacy warning fires by default."""
    anthropic_provider = AnthropicProvider(model="claude-haiku-4-5", api_key="sk-test")
    flagged = anthropic_provider.is_external_service
    assert flagged is True, (
        f"Default AnthropicProvider endpoint must be external; got "
        f"is_external_service={anthropic_provider.is_external_service} "
        f"for endpoint={anthropic_provider.endpoint}"
    )