feat(llm): interactive entity refinement with batching and cancellation

Takes the candidate set produced by phase-1 detection (manifests, git authors, regex on prose) and asks an LLM to reclassify each candidate as PERSON / PROJECT / TOPIC / COMMON_WORD / AMBIGUOUS. Scale approach: never feed the raw corpus to the LLM. For each candidate, collect up to 3 context lines from sampled prose, cap each at 240 chars, and batch 25 candidates per call. This keeps total input around 50-100K tokens even on large corpora and completes in a few minutes on a 4B local model. Interactive UX: - Stderr progress bar with the current candidate name, updated per batch. - Ctrl-C interrupts cleanly: returns a RefineResult with `cancelled=True` and whatever was classified before the interrupt. The partial result is safe to pass straight to confirm_entities. - Per-batch errors (transport, parse) are recorded in `errors` and don't abort the whole run. Refinement scope: only `uncertain` and low-confidence `projects` entries are sent. Manifest-backed projects (conf >= 0.95) and git-authored people are already authoritative and skip the LLM. The response parser is defensive — it accepts `label` or `type` keys, lowercase/uppercase variants, a top-level list or a wrapped object, and strips markdown code fences. Unknown labels become AMBIGUOUS so the user reviews them rather than silently accepting a bad classification. `collect_corpus_text` provides a simple stratified prose sampler (recent first, capped per-file) so callers don't need to build their own corpus window. 28 tests with a FakeProvider (no network). They cover context collection, prompt building, response parsing variants, classification apply, end-to-end refine, and Ctrl-C partial-result behavior.
2026-04-24 00:46:59 -03:00
parent df6c7d0dc3
commit 10a743d5d8
2 changed files with 814 additions and 0 deletions
@@ -0,0 +1,446 @@
+"""Tests for mempalace.llm_refine.
+
+Uses a fake provider for deterministic, offline tests. No network.
+"""
+
+from dataclasses import dataclass
+from typing import Optional
+
+from mempalace.llm_client import LLMError, LLMResponse
+from mempalace.llm_refine import (
+    _apply_classifications,
+    _build_user_prompt,
+    _collect_contexts,
+    _parse_response,
+    collect_corpus_text,
+    refine_entities,
+)
+
+
+# ── fake provider ───────────────────────────────────────────────────────
+
+
+@dataclass
+class FakeProvider:
+    """Returns a caller-supplied JSON string on every classify call."""
+
+    response_text: str = ""
+    should_raise: Exception = None
+    call_count: int = 0
+    interrupt_on_call: int = -1
+
+    def classify(self, system, user, json_mode=True):
+        self.call_count += 1
+        if self.call_count == self.interrupt_on_call:
+            raise KeyboardInterrupt()
+        if self.should_raise is not None:
+            raise self.should_raise
+        return LLMResponse(text=self.response_text, model="fake", provider="fake", raw={})
+
+    def check_available(self):
+        return True, "ok"
+
+
+# ── _collect_contexts ───────────────────────────────────────────────────
+
+
+def test_collect_contexts_finds_matches():
+    lines = [
+        "Something about Alice",
+        "Bob said hello",
+        "Alice was here",
+        "Alice walked by",
+    ]
+    out = _collect_contexts(lines, "Alice", max_lines=2)
+    assert len(out) == 2
+    assert all("alice" in line.lower() for line in out)
+
+
+def test_collect_contexts_case_insensitive():
+    lines = ["lowercase alice mention"]
+    out = _collect_contexts(lines, "Alice")
+    assert out == ["lowercase alice mention"]
+
+
+def test_collect_contexts_dedupes_identical_lines():
+    lines = ["Alice", "Alice", "Alice was here"]
+    out = _collect_contexts(lines, "Alice", max_lines=5)
+    # two unique lines, not three
+    assert len(out) == 2
+
+
+def test_collect_contexts_truncates_long_lines():
+    lines = ["Alice " + ("x" * 1000)]
+    out = _collect_contexts(lines, "Alice")
+    assert len(out[0]) <= 240
+
+
+def test_collect_contexts_no_matches():
+    assert _collect_contexts(["nothing here"], "Alice") == []
+
+
+# ── _build_user_prompt ──────────────────────────────────────────────────
+
+
+def test_build_user_prompt_numbers_and_includes_contexts():
+    prompt = _build_user_prompt(
+        [
+            ("Alice", "uncertain", ["Alice said hi"]),
+            ("Bob", "project", []),
+        ]
+    )
+    assert "1. Alice" in prompt
+    assert "2. Bob" in prompt
+    assert "Alice said hi" in prompt
+    assert "(no context available)" in prompt
+
+
+# ── _parse_response ─────────────────────────────────────────────────────
+
+
+def test_parse_response_canonicalizes_label():
+    text = '{"classifications": [{"name": "Alice", "label": "person", "reason": "x"}]}'
+    out = _parse_response(text, ["Alice"])
+    assert out["Alice"] == ("PERSON", "x")
+
+
+def test_parse_response_accepts_type_alias():
+    """LLMs may return 'type' instead of 'label'."""
+    text = '{"classifications": [{"name": "Bob", "type": "PROJECT"}]}'
+    out = _parse_response(text, ["Bob"])
+    assert out["Bob"][0] == "PROJECT"
+
+
+def test_parse_response_maps_unknown_label_to_ambiguous():
+    text = '{"classifications": [{"name": "X", "label": "WEIRD"}]}'
+    out = _parse_response(text, ["X"])
+    assert out["X"][0] == "AMBIGUOUS"
+
+
+def test_parse_response_restores_canonical_casing():
+    """Model may lowercase the name; we restore against the expected set."""
+    text = '{"classifications": [{"name": "mempalace", "label": "PROJECT"}]}'
+    out = _parse_response(text, ["MemPalace"])
+    assert "MemPalace" in out
+    assert out["MemPalace"][0] == "PROJECT"
+
+
+def test_parse_response_strips_code_fences():
+    text = '```json\n{"classifications": [{"name": "X", "label": "TOPIC"}]}\n```'
+    out = _parse_response(text, ["X"])
+    assert out["X"][0] == "TOPIC"
+
+
+def test_parse_response_malformed_returns_empty():
+    out = _parse_response("not json at all", ["X"])
+    assert out == {}
+
+
+def test_parse_response_accepts_top_level_list():
+    """Some models skip the wrapping object and return the list directly."""
+    text = '[{"name": "Y", "label": "PERSON"}]'
+    out = _parse_response(text, ["Y"])
+    assert out["Y"][0] == "PERSON"
+
+
+# ── _apply_classifications ──────────────────────────────────────────────
+
+
+def test_apply_classifications_moves_to_correct_bucket():
+    detected = {
+        "people": [],
+        "projects": [
+            {
+                "name": "Foo",
+                "type": "project",
+                "confidence": 0.8,
+                "frequency": 3,
+                "signals": ["old"],
+            }
+        ],
+        "uncertain": [
+            {"name": "Alice", "type": "uncertain", "confidence": 0.4, "frequency": 5, "signals": []}
+        ],
+    }
+    decisions = {
+        "Foo": ("PROJECT", "real project name"),
+        "Alice": ("PERSON", "clearly a person"),
+    }
+    new, reclass, dropped = _apply_classifications(detected, decisions)
+    assert len(new["people"]) == 1
+    assert new["people"][0]["name"] == "Alice"
+    assert new["people"][0]["type"] == "person"
+    assert reclass == 1  # Alice moved uncertain -> people
+    assert dropped == 0
+
+
+def test_apply_classifications_drops_common_word():
+    detected = {
+        "people": [],
+        "projects": [],
+        "uncertain": [
+            {
+                "name": "Never",
+                "type": "uncertain",
+                "confidence": 0.4,
+                "frequency": 20,
+                "signals": [],
+            }
+        ],
+    }
+    decisions = {"Never": ("COMMON_WORD", "adverb")}
+    new, _, dropped = _apply_classifications(detected, decisions)
+    assert dropped == 1
+    assert new["uncertain"] == []
+
+
+def test_apply_classifications_keeps_unvisited_entries():
+    detected = {
+        "people": [
+            {
+                "name": "Igor",
+                "type": "person",
+                "confidence": 0.99,
+                "frequency": 100,
+                "signals": ["git"],
+            }
+        ],
+        "projects": [],
+        "uncertain": [],
+    }
+    # No decision for Igor — should stay untouched
+    new, reclass, dropped = _apply_classifications(detected, {})
+    assert new["people"][0]["name"] == "Igor"
+    assert reclass == 0
+    assert dropped == 0
+
+
+def test_apply_classifications_appends_reason_signal():
+    detected = {
+        "people": [],
+        "projects": [],
+        "uncertain": [
+            {
+                "name": "Foo",
+                "type": "uncertain",
+                "confidence": 0.4,
+                "frequency": 5,
+                "signals": ["regex"],
+            }
+        ],
+    }
+    decisions = {"Foo": ("PERSON", "spoken of by name")}
+    new, _, _ = _apply_classifications(detected, decisions)
+    assert any("LLM: person" in s for s in new["people"][0]["signals"])
+    assert any("spoken of by name" in s for s in new["people"][0]["signals"])
+
+
+def test_apply_classifications_topic_goes_to_uncertain():
+    detected = {
+        "people": [],
+        "projects": [
+            {
+                "name": "Paris",
+                "type": "project",
+                "confidence": 0.7,
+                "frequency": 5,
+                "signals": ["regex"],
+            }
+        ],
+        "uncertain": [],
+    }
+    decisions = {"Paris": ("TOPIC", "city, not a project")}
+    new, reclass, _ = _apply_classifications(detected, decisions)
+    assert len(new["projects"]) == 0
+    assert len(new["uncertain"]) == 1
+    assert new["uncertain"][0]["name"] == "Paris"
+    assert reclass == 1
+
+
+# ── refine_entities ─────────────────────────────────────────────────────
+
+
+def _sample_detected():
+    return {
+        "people": [
+            {
+                "name": "Igor",
+                "type": "person",
+                "confidence": 0.99,
+                "frequency": 100,
+                "signals": ["git"],
+            }
+        ],
+        "projects": [
+            {
+                "name": "Foo",
+                "type": "project",
+                "confidence": 0.7,
+                "frequency": 5,
+                "signals": ["regex"],
+            }
+        ],
+        "uncertain": [
+            {
+                "name": "Never",
+                "type": "uncertain",
+                "confidence": 0.4,
+                "frequency": 10,
+                "signals": [],
+            },
+            {
+                "name": "Alice",
+                "type": "uncertain",
+                "confidence": 0.4,
+                "frequency": 5,
+                "signals": [],
+            },
+        ],
+    }
+
+
+def test_refine_entities_end_to_end_with_fake_provider():
+    provider = FakeProvider(
+        response_text=(
+            '{"classifications": ['
+            '{"name": "Foo", "label": "PROJECT", "reason": "real"},'
+            '{"name": "Never", "label": "COMMON_WORD"},'
+            '{"name": "Alice", "label": "PERSON", "reason": "name"}'
+            "]}"
+        )
+    )
+    result = refine_entities(
+        _sample_detected(),
+        corpus_text="Alice said hi. Foo was shipped. Never gonna.",
+        provider=provider,
+        show_progress=False,
+    )
+    assert result.batches_total == 1
+    assert result.batches_completed == 1
+    assert not result.cancelled
+    # Alice → people, Never → dropped, Foo stays in projects
+    names_in_people = [e["name"] for e in result.merged["people"]]
+    assert "Alice" in names_in_people
+    assert "Igor" in names_in_people  # untouched
+    assert "Never" not in [e["name"] for e in result.merged["uncertain"]]
+    assert result.dropped == 1
+
+
+def test_refine_entities_skips_high_confidence_projects():
+    """Manifest-backed projects (conf >= 0.95) aren't sent to the LLM."""
+    detected = {
+        "people": [],
+        "projects": [
+            {
+                "name": "manifest-backed",
+                "type": "project",
+                "confidence": 0.99,
+                "frequency": 50,
+                "signals": ["pyproject.toml"],
+            }
+        ],
+        "uncertain": [],
+    }
+    provider = FakeProvider(response_text='{"classifications": []}')
+    refine_entities(detected, "", provider, show_progress=False)
+    # Should not have called the LLM at all
+    assert provider.call_count == 0
+
+
+def test_refine_entities_empty_candidates_returns_noop():
+    detected = {"people": [], "projects": [], "uncertain": []}
+    provider = FakeProvider()
+    result = refine_entities(detected, "", provider, show_progress=False)
+    assert result.batches_total == 0
+    assert result.reclassified == 0
+    assert result.merged == detected
+
+
+def test_refine_entities_handles_batch_error_gracefully():
+    provider = FakeProvider(should_raise=LLMError("transport broke"))
+    result = refine_entities(
+        _sample_detected(),
+        corpus_text="",
+        provider=provider,
+        show_progress=False,
+    )
+    assert result.errors
+    assert "transport broke" in result.errors[0]
+    # Detected unchanged (no successful decisions)
+    assert result.reclassified == 0
+    assert result.cancelled is False
+
+
+def test_refine_entities_ctrl_c_returns_partial():
+    """Ctrl-C during refinement marks cancelled=True and returns partial result."""
+    # Two batches' worth of candidates
+    detected = {
+        "people": [],
+        "projects": [],
+        "uncertain": [
+            {
+                "name": f"Cand{i}",
+                "type": "uncertain",
+                "confidence": 0.4,
+                "frequency": 3,
+                "signals": [],
+            }
+            for i in range(50)
+        ],
+    }
+    provider = FakeProvider(
+        response_text='{"classifications": []}',
+        interrupt_on_call=2,  # interrupt on second batch
+    )
+    result = refine_entities(detected, "", provider, batch_size=25, show_progress=False)
+    assert result.cancelled is True
+    assert result.batches_completed == 1  # first batch finished; second interrupted
+    assert result.batches_total == 2
+
+
+def test_refine_entities_malformed_response_recorded_as_error():
+    provider = FakeProvider(response_text="not json")
+    result = refine_entities(_sample_detected(), "", provider, show_progress=False)
+    assert any("could not parse" in e for e in result.errors)
+
+
+# ── collect_corpus_text ─────────────────────────────────────────────────
+
+
+def test_collect_corpus_text_reads_prose_files(tmp_path):
+    (tmp_path / "a.md").write_text("hello world")
+    (tmp_path / "b.txt").write_text("more prose")
+    (tmp_path / "c.py").write_text("import os")  # not prose, skipped
+    text = collect_corpus_text(str(tmp_path))
+    assert "hello world" in text
+    assert "more prose" in text
+    assert "import os" not in text
+
+
+def test_collect_corpus_text_prefers_recent(tmp_path):
+    import os
+    import time
+
+    old = tmp_path / "old.md"
+    old.write_text("OLD_CONTENT")
+    time.sleep(0.01)
+    new = tmp_path / "new.md"
+    new.write_text("NEW_CONTENT")
+    # Force old to be older still
+    old_mtime = old.stat().st_mtime - 3600
+    os.utime(old, (old_mtime, old_mtime))
+
+    text = collect_corpus_text(str(tmp_path), max_files=1)
+    assert "NEW_CONTENT" in text
+    assert "OLD_CONTENT" not in text
+
+
+def test_collect_corpus_text_missing_dir_returns_empty(tmp_path):
+    assert collect_corpus_text(str(tmp_path / "nope")) == ""
+
+
+def test_collect_corpus_text_caps_bytes_per_file(tmp_path):
+    big = tmp_path / "big.md"
+    big.write_text("x" * 100_000)
+    text = collect_corpus_text(str(tmp_path), max_files=1, max_bytes_per_file=500)
+    assert len(text) <= 600  # 500 + newlines