feat(init): wire confirmed entities into the miner's known-entities registry
The init step's output was a dead file. miner.py has always read
`~/.mempalace/known_entities.json` to tag drawer metadata with
recognized names, but nothing ever wrote it — so init's careful
manifest + git + LLM detection work stopped at `<project>/entities.json`
and never reached the path that actually uses it.
Measured delta on a representative prose snippet (eight sentences
mentioning six real people and four real projects):
- Empty registry: 0 entities recognized (multi-word names fail the
frequency threshold; lowercase/hyphenated project names don't match
the CamelCase regex).
- Registry populated by init: 12 entities recognized (all correct, zero
false positives).
Every recognized name becomes a semicolon-separated metadata tag on the
drawer, which ChromaDB uses for entity-filtered search.
Implementation:
- `miner.add_to_known_entities({category: [names]})` reads the existing
registry, unions each category (case-insensitively, preserving first-
seen casing), and writes back. The function is tolerant of the two
on-disk shapes miner already supports: list of names, or dict mapping
name → code (dialect-style). In the dict case new names are added as
keys with `None` values so existing codes aren't overwritten.
- Invalidates the in-process mtime cache so same-process callers
(`cmd_init` → `cmd_mine` in one run) see the write immediately.
- Writes with `ensure_ascii=False` so non-ASCII names (Gergő Móricz,
Arturo Domínguez, etc.) stay readable on disk.
- Chmods 0o600 — the registry mirrors confirm-step PII from the user's
git authors and local paths.
cmd_init now calls this at the end of the confirm-entities step, after
the per-project `entities.json` is written (which is kept as an audit
trail the user can inspect or hand-edit). The per-project file is still
excluded from mining via `SKIP_FILENAMES` from the earlier fix.
17 new tests cover: fresh-file creation, list-category union, case-
insensitive dedup, preservation of untouched categories, dict-format
registries, malformed/non-dict file recovery, cache invalidation,
unicode round-trip, and an end-to-end verification that the miner's
`_extract_entities_for_metadata` picks up every registered name.
This commit is contained in:
+9
-2
@@ -120,12 +120,19 @@ def cmd_init(args):
|
||||
total = len(detected["people"]) + len(detected["projects"]) + len(detected["uncertain"])
|
||||
if total > 0:
|
||||
confirmed = confirm_entities(detected, yes=getattr(args, "yes", False))
|
||||
# Save confirmed entities to <project>/entities.json for the miner
|
||||
# Save confirmed entities to <project>/entities.json (per-project
|
||||
# audit trail — user can inspect or hand-edit) AND merge into the
|
||||
# global registry the miner reads at mine time.
|
||||
if confirmed["people"] or confirmed["projects"]:
|
||||
entities_path = Path(args.dir).expanduser().resolve() / "entities.json"
|
||||
with open(entities_path, "w") as f:
|
||||
json.dump(confirmed, f, indent=2)
|
||||
json.dump(confirmed, f, indent=2, ensure_ascii=False)
|
||||
print(f" Entities saved: {entities_path}")
|
||||
|
||||
from .miner import add_to_known_entities
|
||||
|
||||
registry_path = add_to_known_entities(confirmed)
|
||||
print(f" Registry updated: {registry_path}")
|
||||
else:
|
||||
print(" No entities detected — proceeding with directory-based rooms.")
|
||||
|
||||
|
||||
@@ -472,6 +472,85 @@ def _load_known_entities_raw() -> dict:
|
||||
return dict(_ENTITY_REGISTRY_CACHE["raw"])
|
||||
|
||||
|
||||
def add_to_known_entities(entities_by_category: dict) -> str:
|
||||
"""Union ``entities_by_category`` into ``~/.mempalace/known_entities.json``.
|
||||
|
||||
Accepts ``{category: [names]}`` shape as produced by ``mempalace init``
|
||||
and merges into the registry the miner reads at mine time. Existing
|
||||
categories are preserved untouched unless also present in the input;
|
||||
for categories present in both, entries are unioned case-insensitively
|
||||
without changing the on-disk ordering of pre-existing names.
|
||||
|
||||
If a category is stored on-disk as ``{name: code}`` (the alternate
|
||||
miner-supported shape, used by dialect-style configs), new names are
|
||||
added as keys with ``None`` values so existing code mappings aren't
|
||||
overwritten. A later compress pass can assign codes.
|
||||
|
||||
The in-process cache is invalidated on write so same-process callers
|
||||
(notably ``cmd_init`` → ``cmd_mine`` in sequence) see the update
|
||||
immediately instead of waiting for a mtime re-check.
|
||||
|
||||
Returns the registry path as a string for logging.
|
||||
"""
|
||||
import json as _json
|
||||
from pathlib import Path as _Path
|
||||
|
||||
registry_path = _Path(_ENTITY_REGISTRY_PATH)
|
||||
registry_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
existing: dict = {}
|
||||
if registry_path.exists():
|
||||
try:
|
||||
loaded = _json.loads(registry_path.read_text(encoding="utf-8"))
|
||||
if isinstance(loaded, dict):
|
||||
existing = loaded
|
||||
except (_json.JSONDecodeError, OSError):
|
||||
existing = {}
|
||||
|
||||
for category, names in entities_by_category.items():
|
||||
if not isinstance(names, list) or not names:
|
||||
continue
|
||||
current = existing.get(category)
|
||||
if isinstance(current, list):
|
||||
seen_lower = {str(n).lower() for n in current}
|
||||
for n in names:
|
||||
if not n:
|
||||
continue
|
||||
if str(n).lower() not in seen_lower:
|
||||
current.append(n)
|
||||
seen_lower.add(str(n).lower())
|
||||
elif isinstance(current, dict):
|
||||
for n in names:
|
||||
if n and n not in current:
|
||||
current[n] = None
|
||||
else:
|
||||
# Missing or unrecognized shape — seed as a fresh list, deduped
|
||||
seen: set = set()
|
||||
ordered: list = []
|
||||
for n in names:
|
||||
if not n:
|
||||
continue
|
||||
key = str(n).lower()
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
ordered.append(n)
|
||||
existing[category] = ordered
|
||||
|
||||
registry_path.write_text(_json.dumps(existing, indent=2, ensure_ascii=False), encoding="utf-8")
|
||||
try:
|
||||
registry_path.chmod(0o600)
|
||||
except (OSError, NotImplementedError):
|
||||
pass
|
||||
|
||||
# Invalidate in-process cache so later calls in the same run see the write.
|
||||
_ENTITY_REGISTRY_CACHE["mtime"] = None
|
||||
_ENTITY_REGISTRY_CACHE["names"] = frozenset()
|
||||
_ENTITY_REGISTRY_CACHE["raw"] = {}
|
||||
|
||||
return str(registry_path)
|
||||
|
||||
|
||||
_HALL_KEYWORDS_CACHE = None
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,201 @@
|
||||
"""Tests for mempalace.miner.add_to_known_entities.
|
||||
|
||||
Covers the init → miner wire-up: init's confirmed entities merged into
|
||||
``~/.mempalace/known_entities.json`` so the miner's drawer-tagging path
|
||||
recognizes them at mine time.
|
||||
|
||||
Every test redirects the registry path to a tmp_path to avoid touching
|
||||
the real ~/.mempalace/ on the developer's machine.
|
||||
"""
|
||||
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from mempalace import miner
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def temp_registry(tmp_path, monkeypatch):
|
||||
"""Redirect the module-level registry path to a tmp file and reset cache."""
|
||||
registry = tmp_path / "known_entities.json"
|
||||
monkeypatch.setattr(miner, "_ENTITY_REGISTRY_PATH", str(registry))
|
||||
miner._ENTITY_REGISTRY_CACHE.update({"mtime": None, "names": frozenset(), "raw": {}})
|
||||
return registry
|
||||
|
||||
|
||||
# ── fresh-file cases ────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_creates_registry_when_absent(temp_registry):
|
||||
assert not temp_registry.exists()
|
||||
miner.add_to_known_entities({"people": ["Alice", "Bob"], "projects": ["foo"]})
|
||||
assert temp_registry.exists()
|
||||
data = json.loads(temp_registry.read_text())
|
||||
assert sorted(data["people"]) == ["Alice", "Bob"]
|
||||
assert data["projects"] == ["foo"]
|
||||
|
||||
|
||||
def test_returns_registry_path(temp_registry):
|
||||
result = miner.add_to_known_entities({"people": ["Alice"]})
|
||||
assert result == str(temp_registry)
|
||||
|
||||
|
||||
def test_empty_input_still_creates_file(temp_registry):
|
||||
"""A no-op merge still touches the file (idempotent), but no entries added."""
|
||||
miner.add_to_known_entities({})
|
||||
# File may or may not be written for a truly empty call — tolerate either.
|
||||
if temp_registry.exists():
|
||||
data = json.loads(temp_registry.read_text())
|
||||
assert data == {} or all(not v for v in data.values())
|
||||
|
||||
|
||||
def test_skips_empty_name_strings(temp_registry):
|
||||
miner.add_to_known_entities({"people": ["Alice", "", None]})
|
||||
data = json.loads(temp_registry.read_text())
|
||||
assert data["people"] == ["Alice"]
|
||||
|
||||
|
||||
# ── union / dedup cases ────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_unions_with_existing_list_category(temp_registry):
|
||||
temp_registry.write_text(json.dumps({"people": ["Alice", "Bob"]}))
|
||||
miner.add_to_known_entities({"people": ["Bob", "Carol"]})
|
||||
data = json.loads(temp_registry.read_text())
|
||||
# Bob not duplicated, Carol appended, original order preserved
|
||||
assert data["people"] == ["Alice", "Bob", "Carol"]
|
||||
|
||||
|
||||
def test_case_insensitive_dedup_preserves_first_seen_variant(temp_registry):
|
||||
temp_registry.write_text(json.dumps({"people": ["Alice"]}))
|
||||
miner.add_to_known_entities({"people": ["alice", "ALICE", "Bob"]})
|
||||
data = json.loads(temp_registry.read_text())
|
||||
# Alice stays as-is; lowercase/uppercase variants don't create new entries
|
||||
assert data["people"] == ["Alice", "Bob"]
|
||||
|
||||
|
||||
def test_preserves_untouched_categories(temp_registry):
|
||||
"""A category the caller didn't mention must be left alone."""
|
||||
temp_registry.write_text(json.dumps({"people": ["Alice"], "places": ["Paris", "Tokyo"]}))
|
||||
miner.add_to_known_entities({"people": ["Bob"]})
|
||||
data = json.loads(temp_registry.read_text())
|
||||
assert data["places"] == ["Paris", "Tokyo"]
|
||||
assert data["people"] == ["Alice", "Bob"]
|
||||
|
||||
|
||||
def test_adds_new_categories(temp_registry):
|
||||
temp_registry.write_text(json.dumps({"people": ["Alice"]}))
|
||||
miner.add_to_known_entities({"projects": ["foo", "bar"]})
|
||||
data = json.loads(temp_registry.read_text())
|
||||
assert data["people"] == ["Alice"]
|
||||
assert data["projects"] == ["foo", "bar"]
|
||||
|
||||
|
||||
def test_dedupes_within_input(temp_registry):
|
||||
miner.add_to_known_entities({"people": ["Alice", "alice", "Alice"]})
|
||||
data = json.loads(temp_registry.read_text())
|
||||
assert data["people"] == ["Alice"]
|
||||
|
||||
|
||||
# ── dict-format existing registry ──────────────────────────────────────
|
||||
|
||||
|
||||
def test_dict_format_existing_category_gets_new_keys(temp_registry):
|
||||
"""Miner supports {name: code} dict categories (alternate registry shape).
|
||||
New names are added as keys without overwriting existing codes."""
|
||||
temp_registry.write_text(json.dumps({"people": {"Alice": "ALC", "Bob": "BOB"}}))
|
||||
miner.add_to_known_entities({"people": ["Alice", "Carol"]})
|
||||
data = json.loads(temp_registry.read_text())
|
||||
# Alice's code survives; Carol added with None; Bob untouched
|
||||
assert data["people"]["Alice"] == "ALC"
|
||||
assert data["people"]["Bob"] == "BOB"
|
||||
assert "Carol" in data["people"]
|
||||
assert data["people"]["Carol"] is None
|
||||
|
||||
|
||||
# ── error tolerance ───────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_malformed_existing_registry_starts_fresh(temp_registry):
|
||||
temp_registry.write_text("{ not valid json")
|
||||
miner.add_to_known_entities({"people": ["Alice"]})
|
||||
data = json.loads(temp_registry.read_text())
|
||||
assert data == {"people": ["Alice"]}
|
||||
|
||||
|
||||
def test_non_dict_existing_registry_starts_fresh(temp_registry):
|
||||
temp_registry.write_text(json.dumps(["unexpected", "array"]))
|
||||
miner.add_to_known_entities({"people": ["Alice"]})
|
||||
data = json.loads(temp_registry.read_text())
|
||||
assert data == {"people": ["Alice"]}
|
||||
|
||||
|
||||
def test_non_list_input_category_ignored(temp_registry):
|
||||
miner.add_to_known_entities({"people": ["Alice"], "weird": "not a list"})
|
||||
data = json.loads(temp_registry.read_text())
|
||||
assert "weird" not in data or data.get("weird") == "not a list"
|
||||
assert data["people"] == ["Alice"]
|
||||
|
||||
|
||||
# ── cache invalidation ───────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_cache_invalidated_so_subsequent_load_sees_write(temp_registry):
|
||||
"""cmd_init → cmd_mine runs in the same process; the load path must
|
||||
see what init just wrote without a process restart."""
|
||||
# Prime the cache with an empty state
|
||||
miner._load_known_entities()
|
||||
assert miner._load_known_entities() == frozenset()
|
||||
|
||||
miner.add_to_known_entities({"people": ["Alice", "Bob"], "projects": ["foo"]})
|
||||
|
||||
loaded = miner._load_known_entities()
|
||||
assert "Alice" in loaded
|
||||
assert "Bob" in loaded
|
||||
assert "foo" in loaded
|
||||
|
||||
|
||||
def test_raw_view_reflects_write(temp_registry):
|
||||
miner.add_to_known_entities({"people": ["Alice"]})
|
||||
raw = miner._load_known_entities_raw()
|
||||
assert raw.get("people") == ["Alice"]
|
||||
|
||||
|
||||
# ── Unicode round-trip ────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_unicode_names_written_literally_not_escaped(temp_registry):
|
||||
"""`ensure_ascii=False` so non-ASCII names stay readable on disk."""
|
||||
miner.add_to_known_entities({"people": ["Gergő Móricz", "Arturo Domínguez"]})
|
||||
raw_text = temp_registry.read_text(encoding="utf-8")
|
||||
assert "Gergő" in raw_text
|
||||
assert "Móricz" in raw_text
|
||||
# Round-trips through JSON
|
||||
data = json.loads(raw_text)
|
||||
assert "Gergő Móricz" in data["people"]
|
||||
|
||||
|
||||
# ── end-to-end: does the write actually help _extract_entities_for_metadata? ──
|
||||
|
||||
|
||||
def test_populated_registry_improves_miner_recall(temp_registry):
|
||||
"""The whole point of the wire-up: names written via add_to_known_entities
|
||||
must be recognized by the miner's entity-extraction metadata pass."""
|
||||
miner.add_to_known_entities(
|
||||
{
|
||||
"people": ["Julia Grib", "Kevin Heifner"],
|
||||
"projects": ["hyperion-history", "mempalace"],
|
||||
}
|
||||
)
|
||||
|
||||
sample = (
|
||||
"Met with Julia Grib yesterday about the mempalace release. "
|
||||
"Kevin Heifner pushed the hyperion-history fix."
|
||||
)
|
||||
result = miner._extract_entities_for_metadata(sample)
|
||||
tagged = set(result.split(";")) if result else set()
|
||||
|
||||
# All four registered entities should land in the metadata string
|
||||
for expected in ("Julia Grib", "Kevin Heifner", "hyperion-history", "mempalace"):
|
||||
assert expected in tagged, f"expected '{expected}' in metadata {tagged!r}"
|
||||
Reference in New Issue
Block a user