diff --git a/mempalace/llm_refine.py b/mempalace/llm_refine.py new file mode 100644 index 0000000..91a950c --- /dev/null +++ b/mempalace/llm_refine.py @@ -0,0 +1,368 @@ +""" +llm_refine.py — Optional LLM refinement of regex-detected entities. + +Takes the candidate set produced by phase-1 detection (manifests, git +authors, regex on prose) and asks an LLM to reclassify each candidate as +PERSON / PROJECT / TOPIC / COMMON_WORD / AMBIGUOUS. + +Design constraints: +- Opt-in. Default init path never imports this module. +- Local-first by default (Ollama). +- Interactive UX: visible progress, clean cancellation (Ctrl-C returns + whatever was classified before the interrupt). +- Don't feed the raw corpus to the LLM — feed candidates + a few sampled + context lines each. Keeps total input to ~50-100K tokens even for huge + prose corpora. + +Public: + refine_entities(detected, corpus_text, provider, ...) -> dict +""" + +from __future__ import annotations + +import json +import re +import sys +from dataclasses import dataclass + +from mempalace.llm_client import LLMError, LLMProvider + + +BATCH_SIZE = 25 # candidates per LLM call; tuned for 4B local models +CONTEXT_LINES_PER_CANDIDATE = 3 +CONTEXT_WINDOW_CHARS = 240 # max chars per context line to keep tokens bounded + +# Valid labels the LLM is allowed to return. Anything else is treated as +# AMBIGUOUS so the user reviews it. +VALID_LABELS = {"PERSON", "PROJECT", "TOPIC", "COMMON_WORD", "AMBIGUOUS"} + + +SYSTEM_PROMPT = """You are helping organize a user's memory palace by classifying capitalized tokens found in their files. + +For each candidate, pick exactly ONE label: +- PERSON: a specific real person the user knows (colleague, family, character they write about) +- PROJECT: a named product, codebase, or effort the user works on +- TOPIC: a recurring theme or subject (not a person, not a project) — cities, technologies, concepts +- COMMON_WORD: an English word, verb, or fragment that isn't a named entity at all (e.g. "Created", "Before", "Never") +- AMBIGUOUS: context is insufficient to decide between two of the above + +Use the provided context lines to disambiguate. A capitalized word that only appears in metadata ("Created: 2026-04-24") is COMMON_WORD. A name that appears with pronouns and dialogue is PERSON. + +Respond with JSON only. Schema: +{"classifications": [{"name": "", "label": "