From 10a743d5d83ef561f8dfccd5079eac9cdd6fe014 Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Fri, 24 Apr 2026 00:46:59 -0300 Subject: [PATCH] feat(llm): interactive entity refinement with batching and cancellation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Takes the candidate set produced by phase-1 detection (manifests, git authors, regex on prose) and asks an LLM to reclassify each candidate as PERSON / PROJECT / TOPIC / COMMON_WORD / AMBIGUOUS. Scale approach: never feed the raw corpus to the LLM. For each candidate, collect up to 3 context lines from sampled prose, cap each at 240 chars, batch 25 candidates per call. Keeps total input around 50-100K tokens even on large corpora and completes in a few minutes on a 4B local model. Interactive UX: - Stderr progress bar with the current candidate name, updates per-batch. - Ctrl-C interrupts cleanly: returns a RefineResult with `cancelled=True` and whatever was classified before the interrupt. The partial result is safe to pass straight to confirm_entities. - Per-batch errors (transport, parse) are recorded in `errors` and don't abort the whole run. Refinement scope: only `uncertain` and low-confidence `projects` entries are sent. Manifest-backed projects (conf >= 0.95) and git-authored people are already authoritative and skip the LLM. Response parser is defensive — accepts `label` or `type` keys, lowercase/uppercase variants, top-level list or wrapped object, and strips markdown code fences. Unknown labels become AMBIGUOUS so the user reviews them rather than silently accepting a bad classification. `collect_corpus_text` provides a simple stratified prose sampler (recent first, capped per-file) so callers don't need to build their own corpus window. 28 tests with a FakeProvider (no network). 
Covers context collection, prompt building, response parsing variants, classification apply, end-to-end refine, and Ctrl-C partial-result behavior. --- mempalace/llm_refine.py | 368 ++++++++++++++++++++++++++++++++ tests/test_llm_refine.py | 446 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 814 insertions(+) create mode 100644 mempalace/llm_refine.py create mode 100644 tests/test_llm_refine.py diff --git a/mempalace/llm_refine.py b/mempalace/llm_refine.py new file mode 100644 index 0000000..91a950c --- /dev/null +++ b/mempalace/llm_refine.py @@ -0,0 +1,368 @@ +""" +llm_refine.py — Optional LLM refinement of regex-detected entities. + +Takes the candidate set produced by phase-1 detection (manifests, git +authors, regex on prose) and asks an LLM to reclassify each candidate as +PERSON / PROJECT / TOPIC / COMMON_WORD / AMBIGUOUS. + +Design constraints: +- Opt-in. Default init path never imports this module. +- Local-first by default (Ollama). +- Interactive UX: visible progress, clean cancellation (Ctrl-C returns + whatever was classified before the interrupt). +- Don't feed the raw corpus to the LLM — feed candidates + a few sampled + context lines each. Keeps total input to ~50-100K tokens even for huge + prose corpora. + +Public: + refine_entities(detected, corpus_text, provider, ...) -> dict +""" + +from __future__ import annotations + +import json +import re +import sys +from dataclasses import dataclass + +from mempalace.llm_client import LLMError, LLMProvider + + +BATCH_SIZE = 25 # candidates per LLM call; tuned for 4B local models +CONTEXT_LINES_PER_CANDIDATE = 3 +CONTEXT_WINDOW_CHARS = 240 # max chars per context line to keep tokens bounded + +# Valid labels the LLM is allowed to return. Anything else is treated as +# AMBIGUOUS so the user reviews it. 
+VALID_LABELS = {"PERSON", "PROJECT", "TOPIC", "COMMON_WORD", "AMBIGUOUS"} + + +SYSTEM_PROMPT = """You are helping organize a user's memory palace by classifying capitalized tokens found in their files. + +For each candidate, pick exactly ONE label: +- PERSON: a specific real person the user knows (colleague, family, character they write about) +- PROJECT: a named product, codebase, or effort the user works on +- TOPIC: a recurring theme or subject (not a person, not a project) — cities, technologies, concepts +- COMMON_WORD: an English word, verb, or fragment that isn't a named entity at all (e.g. "Created", "Before", "Never") +- AMBIGUOUS: context is insufficient to decide between two of the above + +Use the provided context lines to disambiguate. A capitalized word that only appears in metadata ("Created: 2026-04-24") is COMMON_WORD. A name that appears with pronouns and dialogue is PERSON. + +Respond with JSON only. Schema: +{"classifications": [{"name": "", "label": "