feat(init): wire --llm flag and convo_scanner into discover_entities
Extends the init orchestrator to consume two new signal sources:
1. Claude Code conversation dirs: when the target is a
`~/.claude/projects/` root, convo_scanner contributes ProjectInfo
entries alongside the git/manifest projects. Entries are deduplicated
by name, keeping the one with the higher user_commits value (the
earlier entry wins ties).
2. Optional LLM refinement: when --llm is passed, cmd_init constructs
the provider and validates availability, then hands it to
discover_entities, which runs llm_refine.refine_entities on the
merged candidates. A status summary (reclassified / dropped /
cancelled / batch errors) is printed to stderr when show_progress is
on and there is something to report.
New init flags (opt-in, default remains zero-API):
- --llm: enable refinement
- --llm-provider: ollama (default) | openai-compat | anthropic
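- --llm-model: model name; the default is gemma4:e4b (an Ollama model
tag) regardless of the selected provider, so pass it explicitly when
using openai-compat or anthropic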
- --llm-endpoint: URL (required for openai-compat)
- --llm-api-key: falls back to env ($ANTHROPIC_API_KEY or
$OPENAI_API_KEY depending on provider)
Provider check_available runs before the scan, so the user sees an
immediate error ("Run: ollama pull <model>" or "ANTHROPIC_API_KEY not
set") rather than a mid-scan failure.
This commit is contained in:
+64
-2
@@ -86,12 +86,37 @@ def cmd_init(args):
|
|||||||
languages = cfg.entity_languages
|
languages = cfg.entity_languages
|
||||||
languages_tuple = tuple(languages)
|
languages_tuple = tuple(languages)
|
||||||
|
|
||||||
|
# Optional phase-2 LLM provider (opt-in via --llm).
|
||||||
|
llm_provider = None
|
||||||
|
if getattr(args, "llm", False):
|
||||||
|
from .llm_client import LLMError, get_provider
|
||||||
|
|
||||||
|
try:
|
||||||
|
llm_provider = get_provider(
|
||||||
|
name=args.llm_provider,
|
||||||
|
model=args.llm_model,
|
||||||
|
endpoint=args.llm_endpoint,
|
||||||
|
api_key=args.llm_api_key,
|
||||||
|
)
|
||||||
|
except LLMError as e:
|
||||||
|
print(f" ERROR: {e}", file=sys.stderr)
|
||||||
|
sys.exit(2)
|
||||||
|
ok, msg = llm_provider.check_available()
|
||||||
|
if not ok:
|
||||||
|
print(
|
||||||
|
f" ERROR: LLM provider '{args.llm_provider}' unavailable: {msg}",
|
||||||
|
file=sys.stderr,
|
||||||
|
)
|
||||||
|
sys.exit(2)
|
||||||
|
print(f" LLM refinement enabled: {args.llm_provider}/{args.llm_model}")
|
||||||
|
|
||||||
# Pass 1: discover entities — manifests + git authors first, prose detection
|
# Pass 1: discover entities — manifests + git authors first, prose detection
|
||||||
# as supplement for names mentioned only in docs/notes.
|
# as supplement for names mentioned only in docs/notes. Optional phase-2
|
||||||
|
# LLM refinement runs inside discover_entities when llm_provider is given.
|
||||||
print(f"\n Scanning for entities in: {args.dir}")
|
print(f"\n Scanning for entities in: {args.dir}")
|
||||||
if languages_tuple != ("en",):
|
if languages_tuple != ("en",):
|
||||||
print(f" Languages: {', '.join(languages_tuple)}")
|
print(f" Languages: {', '.join(languages_tuple)}")
|
||||||
detected = discover_entities(args.dir, languages=languages_tuple)
|
detected = discover_entities(args.dir, languages=languages_tuple, llm_provider=llm_provider)
|
||||||
total = len(detected["people"]) + len(detected["projects"]) + len(detected["uncertain"])
|
total = len(detected["people"]) + len(detected["projects"]) + len(detected["uncertain"])
|
||||||
if total > 0:
|
if total > 0:
|
||||||
confirmed = confirm_entities(detected, yes=getattr(args, "yes", False))
|
confirmed = confirm_entities(detected, yes=getattr(args, "yes", False))
|
||||||
@@ -550,6 +575,43 @@ def main():
|
|||||||
"When given, the value is also persisted to config.json."
|
"When given, the value is also persisted to config.json."
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
p_init.add_argument(
|
||||||
|
"--llm",
|
||||||
|
action="store_true",
|
||||||
|
help=(
|
||||||
|
"Enable LLM-assisted entity refinement (opt-in, local-first). "
|
||||||
|
"Runs after manifest/git/regex detection, asking the configured "
|
||||||
|
"provider to reclassify ambiguous candidates. "
|
||||||
|
"Ctrl-C during refinement returns partial results."
|
||||||
|
),
|
||||||
|
)
|
||||||
|
p_init.add_argument(
|
||||||
|
"--llm-provider",
|
||||||
|
default="ollama",
|
||||||
|
choices=["ollama", "openai-compat", "anthropic"],
|
||||||
|
help="LLM provider (default: ollama). Use --llm to enable.",
|
||||||
|
)
|
||||||
|
p_init.add_argument(
|
||||||
|
"--llm-model",
|
||||||
|
default="gemma4:e4b",
|
||||||
|
help="Model name for the chosen provider (default: gemma4:e4b for Ollama).",
|
||||||
|
)
|
||||||
|
p_init.add_argument(
|
||||||
|
"--llm-endpoint",
|
||||||
|
default=None,
|
||||||
|
help=(
|
||||||
|
"Provider endpoint URL. Default for Ollama: http://localhost:11434. "
|
||||||
|
"Required for openai-compat."
|
||||||
|
),
|
||||||
|
)
|
||||||
|
p_init.add_argument(
|
||||||
|
"--llm-api-key",
|
||||||
|
default=None,
|
||||||
|
help=(
|
||||||
|
"API key for the provider. For anthropic, defaults to $ANTHROPIC_API_KEY; "
|
||||||
|
"for openai-compat, defaults to $OPENAI_API_KEY."
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
# mine
|
# mine
|
||||||
p_mine = sub.add_parser("mine", help="Mine files into the palace")
|
p_mine = sub.add_parser("mine", help="Mine files into the palace")
|
||||||
|
|||||||
@@ -574,6 +574,8 @@ def discover_entities(
|
|||||||
prose_file_cap: int = 10,
|
prose_file_cap: int = 10,
|
||||||
project_cap: int = 15,
|
project_cap: int = 15,
|
||||||
people_cap: int = 15,
|
people_cap: int = 15,
|
||||||
|
llm_provider: object = None,
|
||||||
|
show_progress: bool = True,
|
||||||
) -> dict:
|
) -> dict:
|
||||||
"""Top-level entity discovery: real signals first, prose detection second.
|
"""Top-level entity discovery: real signals first, prose detection second.
|
||||||
|
|
||||||
@@ -584,10 +586,39 @@ def discover_entities(
|
|||||||
1. Package manifests (package.json, pyproject.toml, Cargo.toml, go.mod)
|
1. Package manifests (package.json, pyproject.toml, Cargo.toml, go.mod)
|
||||||
→ canonical project names
|
→ canonical project names
|
||||||
2. Git commit authors → real people with real commit counts
|
2. Git commit authors → real people with real commit counts
|
||||||
3. Regex entity detection on prose files → supplementary names only
|
3. Claude Code conversation dirs (~/.claude/projects/) → per-session
|
||||||
|
project names (pulled from each session's ``cwd`` metadata)
|
||||||
|
4. Regex entity detection on prose files → supplementary names only
|
||||||
mentioned in docs/notes (not code)
|
mentioned in docs/notes (not code)
|
||||||
|
5. Optional LLM refinement pass — reclassifies ambiguous candidates
|
||||||
|
using the caller-supplied provider
|
||||||
|
|
||||||
|
Passing ``llm_provider`` enables phase-2 refinement. The caller is
|
||||||
|
responsible for constructing the provider (``llm_client.get_provider``)
|
||||||
|
and confirming availability. Refinement is blocking-interactive:
|
||||||
|
progress prints to stderr; Ctrl-C returns partial results.
|
||||||
"""
|
"""
|
||||||
projects, people = scan(project_dir)
|
projects, people = scan(project_dir)
|
||||||
|
|
||||||
|
# If the target is a Claude Code conversations root, extract per-project
|
||||||
|
# entries from there too. Same ProjectInfo shape, so dedup logic works.
|
||||||
|
from mempalace.convo_scanner import is_claude_projects_root, scan_claude_projects
|
||||||
|
|
||||||
|
root_path = Path(project_dir).expanduser().resolve()
|
||||||
|
if is_claude_projects_root(root_path):
|
||||||
|
convo_projects = scan_claude_projects(root_path)
|
||||||
|
# Dedup by name against the git-manifest list, preferring entries with
|
||||||
|
# more user_commits as signal strength.
|
||||||
|
by_name: dict[str, ProjectInfo] = {p.name: p for p in projects}
|
||||||
|
for cp in convo_projects:
|
||||||
|
existing = by_name.get(cp.name)
|
||||||
|
if existing is None or cp.user_commits > existing.user_commits:
|
||||||
|
by_name[cp.name] = cp
|
||||||
|
projects = sorted(
|
||||||
|
by_name.values(),
|
||||||
|
key=lambda p: (not p.is_mine, -p.user_commits, -p.total_commits, p.name),
|
||||||
|
)
|
||||||
|
|
||||||
real_signal = to_detected_dict(projects, people, project_cap=project_cap, people_cap=people_cap)
|
real_signal = to_detected_dict(projects, people, project_cap=project_cap, people_cap=people_cap)
|
||||||
|
|
||||||
# Secondary pass: prose-only extraction catches names mentioned in docs
|
# Secondary pass: prose-only extraction catches names mentioned in docs
|
||||||
@@ -605,7 +636,31 @@ def discover_entities(
|
|||||||
# That bucket is mostly noise (common words, CamelCase tech terms, etc.) and
|
# That bucket is mostly noise (common words, CamelCase tech terms, etc.) and
|
||||||
# adding it to the review flow just makes the user do triage we can skip.
|
# adding it to the review flow just makes the user do triage we can skip.
|
||||||
has_real_signal = bool(projects) or bool(people)
|
has_real_signal = bool(projects) or bool(people)
|
||||||
return _merge_detected(real_signal, prose_detected, drop_secondary_uncertain=has_real_signal)
|
merged = _merge_detected(real_signal, prose_detected, drop_secondary_uncertain=has_real_signal)
|
||||||
|
|
||||||
|
# Optional phase 2: LLM refinement.
|
||||||
|
if llm_provider is not None:
|
||||||
|
from mempalace.llm_refine import collect_corpus_text, refine_entities
|
||||||
|
|
||||||
|
corpus = collect_corpus_text(str(project_dir))
|
||||||
|
result = refine_entities(merged, corpus, llm_provider, show_progress=show_progress)
|
||||||
|
if show_progress:
|
||||||
|
status_bits = []
|
||||||
|
if result.cancelled:
|
||||||
|
status_bits.append("cancelled")
|
||||||
|
if result.reclassified:
|
||||||
|
status_bits.append(f"reclassified {result.reclassified}")
|
||||||
|
if result.dropped:
|
||||||
|
status_bits.append(f"dropped {result.dropped}")
|
||||||
|
if result.errors:
|
||||||
|
status_bits.append(f"{len(result.errors)} batch error(s)")
|
||||||
|
if status_bits:
|
||||||
|
import sys as _sys
|
||||||
|
|
||||||
|
print(f" LLM refine: {', '.join(status_bits)}", file=_sys.stderr)
|
||||||
|
merged = result.merged
|
||||||
|
|
||||||
|
return merged
|
||||||
|
|
||||||
|
|
||||||
# ==================== CLI ====================
|
# ==================== CLI ====================
|
||||||
|
|||||||
@@ -1174,6 +1174,7 @@ source = { editable = "." }
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "chromadb" },
|
{ name = "chromadb" },
|
||||||
{ name = "pyyaml" },
|
{ name = "pyyaml" },
|
||||||
|
{ name = "tomli", marker = "python_full_version < '3.11'" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.optional-dependencies]
|
[package.optional-dependencies]
|
||||||
@@ -1206,6 +1207,7 @@ requires-dist = [
|
|||||||
{ name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=4.0" },
|
{ name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=4.0" },
|
||||||
{ name = "pyyaml", specifier = ">=6.0,<7" },
|
{ name = "pyyaml", specifier = ">=6.0,<7" },
|
||||||
{ name = "ruff", marker = "extra == 'dev'", specifier = ">=0.4.0" },
|
{ name = "ruff", marker = "extra == 'dev'", specifier = ">=0.4.0" },
|
||||||
|
{ name = "tomli", marker = "python_full_version < '3.11'", specifier = ">=2.0.0" },
|
||||||
]
|
]
|
||||||
provides-extras = ["dev", "spellcheck"]
|
provides-extras = ["dev", "spellcheck"]
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user