feat(privacy): warn when LLM tier sends content to external API
4 files changed, 248 insertions, 0 deletions. 7 new tests (4 unit + 3 integration), all RED-first. Per @milla-jovovich's question to @igorls during PR #1221 review: users running `mempalace init` with an external LLM provider (Anthropic API, OpenAI hosted, etc.) need a clear, explicit warning that their folder content will be sent to the provider, that MemPalace doesn't control how the provider logs/retains/uses that data, and how to opt out. @igorls confirmed this should be a small follow-up PR scoped to the warning itself, before the v3.3.4 tag. This PR adds: - `_endpoint_is_local(url)` helper in `mempalace/llm_client.py` — URL-based heuristic returning True if the hostname is on the user's machine or private network. Covers: localhost, 127.0.0.1, ::1, hostnames ending in .local (mDNS/Bonjour), IPv4 RFC1918 ranges (10/8, 172.16-31/12, 192.168/16), and IPv6 unique-local addresses (fc00::/7). - `is_external_service` property on the `LLMProvider` base class. Subclasses inherit; the URL determines (no provider-specific hardcoding). This means: Ollama on localhost = local. LM Studio on LAN = local. Anthropic with default `https://api.anthropic.com` = external. A user proxying Anthropic through localhost (advanced setup) = local, no false-positive warning. - One-line warning print in `cmd_init` after successful provider acquisition, gated on `is_external_service`: ⚠ {provider_name} is an EXTERNAL API. Your folder content will be sent to the provider during init. MemPalace does not control how the provider logs, retains, or uses your data. Pass --no-llm to keep init fully local. The warning fires AFTER `LLM enabled: ...` so users see both that the LLM is engaged AND the privacy implications of where it lives, before Pass 0 / entity detection actually runs. LOCAL providers (Ollama on localhost, LM Studio on localhost or LAN, llama.cpp on localhost, vLLM on localhost) DO NOT trigger the warning — nothing leaves the user's machine/network in those configurations. 
TDD: 7 tests added across 2 files. Unit tests in `tests/test_llm_client.py` (4 tests, all RED-first): 1. test_ollama_provider_default_endpoint_is_local — pins that the default `http://localhost:11434` is classified local. 2. test_openai_compat_provider_localhost_endpoint_is_local — covers the LM Studio / llama.cpp / vLLM common case (localhost, 127.0.0.1, and 192.168.x LAN). 3. test_openai_compat_provider_cloud_endpoint_is_external — pins that pointing openai-compat at https://api.openai.com (or any non-local URL) classifies as external. 4. test_anthropic_provider_default_endpoint_is_external — pins that AnthropicProvider's default endpoint is external (the dominant user-facing case for `--llm-provider anthropic`). Integration tests in `tests/test_corpus_origin_integration.py` (3 tests, RED-first; 1 was the critical RED — the other 2 passed by accident since nothing printed "EXTERNAL API" before this PR): 5. test_init_prints_privacy_warning_when_provider_is_external — captures stdout from cmd_init with a mocked external provider, asserts the warning text contains "EXTERNAL API" + "--no-llm" + language about MemPalace not controlling provider behavior. 6. test_init_no_privacy_warning_when_provider_is_local — same flow with a mocked local provider, asserts the warning text does NOT appear. 7. test_init_no_privacy_warning_with_no_llm_flag — pins the --no-llm path: no provider acquisition attempted, no warning fires. Tests: 1382 total mempalace tests pass. 2 pre-existing environmental failures unrelated to this change (chromadb optional dep). Ruff check + format both clean. Backwards compatible: `is_external_service` is a new property; existing callers don't reference it. The warning is a new print statement that fires only when an external endpoint is acquired. The `--no-llm` opt-out existed before this PR and continues to work identically. 
Out of scope here and deferred to a follow-up (deliberately not in this PR per Igor's "small PR" guidance): Tailscale CGNAT (100.64.0.0/10) treatment, pre-init confirmation prompt, persistent privacy-mode config flag, explicit cloud-provider name detection. Tracked for future iteration.
This commit is contained in:
@@ -1629,3 +1629,115 @@ def test_merge_tier_fields_no_llm_provider_returns_heuristic_only():
|
||||
assert res["agent_persona_names"] == []
|
||||
assert res["user_name"] is None
|
||||
assert res["primary_platform"] is None
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
# External-API privacy warning (issue #24).
|
||||
#
|
||||
# When mempalace init resolves an LLM provider whose endpoint will send
|
||||
# user content off the local machine/network, init MUST print a clear
|
||||
# warning naming the provider, stating that MemPalace doesn't control
|
||||
# how the provider logs/retains/uses the data, and pointing at --no-llm.
|
||||
# Local providers (Ollama on localhost, LM Studio on LAN, etc.) MUST NOT
|
||||
# trigger the warning.
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_init_prints_privacy_warning_when_provider_is_external(
    ai_dialogue_corpus: Path, tmp_path: Path, capsys
):
    """An external provider must make ``cmd_init`` print the privacy warning.

    ``get_provider`` is patched to return a mock whose
    ``is_external_service`` is True. The captured stdout must then contain
    the "EXTERNAL API" marker, the ``--no-llm`` opt-out hint, and at least
    one phrase about MemPalace not controlling how the provider handles
    the data.
    """
    from mempalace.cli import cmd_init

    init_args = _init_args(ai_dialogue_corpus)  # default = LLM ON
    palace_dir = tmp_path / "palace"

    # Mocked provider: reports itself available and flagged as external.
    external_provider = MagicMock()
    external_provider.is_external_service = True
    external_provider.check_available.return_value = (True, "ok")
    external_provider.classify.return_value = MagicMock(text='{"classifications": []}')

    with (
        patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace_dir)),
        patch("mempalace.cli.get_provider", return_value=external_provider),
        patch("mempalace.cli._maybe_run_mine_after_init"),
        patch("mempalace.room_detector_local.detect_rooms_local"),
    ):
        cmd_init(init_args)

    out = capsys.readouterr().out
    lowered = out.lower()

    assert "EXTERNAL API" in out, (
        "Privacy warning must mention 'EXTERNAL API' when provider is external. "
        f"Got: {out!r}"
    )
    assert "--no-llm" in out, (
        f"Privacy warning must point users at --no-llm to opt out. Got: {out!r}"
    )

    # The warning should also tell users MemPalace isn't responsible for
    # downstream provider behavior; any one of these phrases satisfies that.
    disclaimer_markers = ("does not control", "not responsible", "logs", "retains")
    assert any(marker in lowered for marker in disclaimer_markers), (
        "Privacy warning must clarify MemPalace doesn't control how the "
        f"provider handles the data. Got: {out!r}"
    )
|
||||
|
||||
|
||||
def test_init_no_privacy_warning_when_provider_is_local(
    ai_dialogue_corpus: Path, tmp_path: Path, capsys
):
    """A LOCAL provider must not trigger the privacy warning.

    ``get_provider`` is patched to return a mock whose
    ``is_external_service`` is False (e.g. Ollama on localhost, LM Studio
    on LAN). Nothing leaves the user's machine/network in that case, so the
    captured stdout must not contain the "EXTERNAL API" marker.
    """
    from mempalace.cli import cmd_init

    init_args = _init_args(ai_dialogue_corpus)  # default = LLM ON
    palace_dir = tmp_path / "palace"

    # Mocked provider: available, and flagged local so no warning is expected.
    local_provider = MagicMock()
    local_provider.is_external_service = False
    local_provider.check_available.return_value = (True, "ok")
    local_provider.classify.return_value = MagicMock(text='{"classifications": []}')

    with (
        patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace_dir)),
        patch("mempalace.cli.get_provider", return_value=local_provider),
        patch("mempalace.cli._maybe_run_mine_after_init"),
        patch("mempalace.room_detector_local.detect_rooms_local"),
    ):
        cmd_init(init_args)

    out = capsys.readouterr().out
    warning_present = "EXTERNAL API" in out
    assert not warning_present, (
        "Privacy warning fired for a LOCAL provider — should not have. "
        f"Got: {out!r}"
    )
|
||||
|
||||
|
||||
def test_init_no_privacy_warning_with_no_llm_flag(ai_dialogue_corpus: Path, tmp_path: Path, capsys):
    """With ``--no-llm``, no provider is acquired and no warning is printed.

    ``get_provider`` is patched so the test can verify it is never called,
    then the captured stdout is checked for the absence of the
    "EXTERNAL API" marker.
    """
    from mempalace.cli import cmd_init

    palace = tmp_path / "palace"
    args = _init_args(ai_dialogue_corpus, no_llm=True)

    with (
        patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace)),
        patch("mempalace.cli.get_provider") as mock_get,
        patch("mempalace.cli._maybe_run_mine_after_init"),
        patch("mempalace.room_detector_local.detect_rooms_local"),
    ):
        cmd_init(args)

    # A real ``assert`` so the explanatory message is attached to the failure.
    # The previous form, ``mock_get.assert_not_called(), "..."``, built a
    # throwaway tuple and silently discarded the message.
    assert not mock_get.called, "--no-llm must short-circuit before provider acquisition"

    out = capsys.readouterr().out
    assert (
        "EXTERNAL API" not in out
    ), f"Privacy warning fired on --no-llm path — should not have. Got: {out!r}"
|
||||
|
||||
Reference in New Issue
Block a user