feat: entity metadata + diary ingest + BM25 hybrid search

Three features that close the gap between the architecture docs
and the actual codebase:

1. Entity metadata on drawers and closets
   - _extract_entities_for_metadata() pulls names from known_entities.json
     + proper nouns appearing 2+ times
   - Stamped as "entities" field in ChromaDB metadata
   - Enables filterable search by person/project name

2. Day-based diary ingest (diary_ingest.py)
   - ONE drawer per day, upserted as the day grows
   - Closets pack topics atomically, never split mid-topic
   - Tracks entry count in state file, only processes new entries
   - Usage: python -m mempalace.diary_ingest --dir ~/summaries

3. BM25 hybrid search in searcher.py
   - _bm25_score() keyword matching complements vector similarity
   - _hybrid_rank() combines both signals (60% vector, 40% BM25)
   - Catches exact name/term matches that embeddings miss
   - Applied to both closet-first and direct drawer search paths

689/689 tests pass.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
MSL
2026-04-13 01:47:19 -07:00
committed by Igor Lins e Silva
parent ee60cad652
commit f935e85ead
3 changed files with 282 additions and 4 deletions
+173
View File
@@ -0,0 +1,173 @@
"""
diary_ingest.py — Ingest daily summary files into the palace.
Architecture:
- ONE drawer per day — full verbatim content, upserted as the day grows
- Closets pack topics up to 1500 chars, never split mid-topic
- Only new entries are processed (tracks entry count in state file)
- Entities extracted and stamped on metadata for filterable search
Usage:
python -m mempalace.diary_ingest --dir ~/daily_summaries --palace ~/.mempalace/palace
python -m mempalace.diary_ingest --dir ~/daily_summaries --palace ~/.mempalace/palace --force
"""
import hashlib
import json
import os
import re
from datetime import datetime, timezone
from pathlib import Path
from .palace import (
get_collection,
get_closets_collection,
build_closet_lines,
upsert_closet_lines,
CLOSET_CHAR_LIMIT,
)
from .miner import _extract_entities_for_metadata
# A diary entry starts at every level-2 markdown heading line ("## ...").
DIARY_ENTRY_RE = re.compile(r"^## .+", re.MULTILINE)


def _split_entries(text):
    """Return the ``## `` sections of a diary as ``(header, body)`` tuples.

    Anything before the first header is discarded. Header and body are
    both stripped of surrounding whitespace; a header with nothing under
    it yields an empty body string.
    """
    heading_matches = list(DIARY_ENTRY_RE.finditer(text))
    # A body runs from the end of its own header line up to the start of
    # the next header, or to the end of the text for the last entry.
    body_ends = [nxt.start() for nxt in heading_matches[1:]] + [len(text)]
    return [
        (match.group(0).strip(), text[match.end():stop].strip())
        for match, stop in zip(heading_matches, body_ends)
    ]
def ingest_diaries(
    diary_dir,
    palace_path,
    wing="diary",
    force=False,
):
    """Ingest daily summary files into the palace.

    Every ``*.md`` file in ``diary_dir`` whose stem starts with
    ``YYYY-MM-DD`` gets ONE drawer (id ``drawer_diary_<date>``) that is
    upserted with the full file text, plus closet lines built from the
    ``## `` entries that were not closeted by a previous run.

    Progress is kept in ``<diary_dir>/.diary_ingest_state.json``, keyed by
    file name, with the character count, a SHA-256 digest of the text, the
    number of entries seen and the ingest timestamp. A file is skipped when
    its digest matches the stored one. State written before digests were
    recorded falls back to the character-count comparison.

    Args:
        diary_dir: Directory holding the daily ``.md`` files (``~`` is
            expanded).
        palace_path: Handed to ``get_collection`` / ``get_closets_collection``.
        wing: Value stored under the ``"wing"`` metadata key.
        force: Ignore any saved state and re-process every file and entry.

    Returns:
        ``{"days_updated": int, "closets_created": int}``, or ``None`` when
        the directory does not exist or contains no ``.md`` files.
    """
    diary_dir = Path(diary_dir).expanduser().resolve()
    if not diary_dir.exists():
        print(f"Diary directory not found: {diary_dir}")
        return None

    diary_files = sorted(diary_dir.glob("*.md"))
    if not diary_files:
        print(f"No .md files in {diary_dir}")
        return None

    # State tracks which entries have been closeted per file.
    state_file = diary_dir / ".diary_ingest_state.json"
    state = {}
    if not force and state_file.exists():
        try:
            loaded = json.loads(state_file.read_text(encoding="utf-8"))
        except (OSError, ValueError) as err:
            # A truncated or hand-edited state file must not abort the run;
            # starting from empty state just re-processes everything once.
            print(f"Ignoring unreadable state file {state_file}: {err}")
        else:
            if isinstance(loaded, dict):
                state = loaded
            else:
                print(f"Ignoring malformed state file {state_file}")

    drawers_col = get_collection(palace_path)
    closets_col = get_closets_collection(palace_path)

    days_updated = 0
    closets_created = 0

    for diary_path in diary_files:
        # Check the name first so files that are not dated diaries are
        # never read at all.
        date_match = re.match(r"(\d{4}-\d{2}-\d{2})", diary_path.stem)
        if not date_match:
            continue
        date_str = date_match.group(1)

        text = diary_path.read_text(encoding="utf-8", errors="replace")
        if len(text.strip()) < 50:
            continue

        prev = state.get(diary_path.name, {})
        curr_size = len(text)
        curr_hash = hashlib.sha256(text.encode("utf-8")).hexdigest()

        # Skip if content hasn't changed. The digest catches same-length
        # edits that a size comparison alone would miss.
        if not force:
            prev_hash = prev.get("hash")
            if prev_hash is not None:
                unchanged = prev_hash == curr_hash
            else:
                # State written before digests existed: size is all we have.
                unchanged = curr_size == prev.get("size", 0)
                if unchanged:
                    # Backfill so later runs compare digests for this file.
                    prev["hash"] = curr_hash
            if unchanged:
                continue

        now_iso = datetime.now(timezone.utc).isoformat()
        drawer_id = f"drawer_diary_{date_str}"

        # Extract entities from full day text
        entities = _extract_entities_for_metadata(text)

        # UPSERT the day's drawer (full verbatim, replaces as day grows)
        drawer_meta = {
            "date": date_str,
            "wing": wing,
            "room": "daily",
            "source_file": str(diary_path),
            "source_session": "daily_diary",
            "filed_at": now_iso,
        }
        if entities:
            drawer_meta["entities"] = entities
        drawers_col.upsert(
            documents=[text],
            ids=[drawer_id],
            metadatas=[drawer_meta],
        )

        # Split into entries and find new ones. Slicing by the previous
        # count assumes the file only grows by appending entries.
        entries = _split_entries(text)
        prev_entry_count = prev.get("entry_count", 0)
        new_entries = entries if force else entries[prev_entry_count:]

        if new_entries:
            # Build closet lines from new entries
            all_lines = []
            for header, body in new_entries:
                entry_text = f"{header}\n{body}"
                all_lines.extend(
                    build_closet_lines(
                        str(diary_path), [drawer_id], entry_text, wing, "daily"
                    )
                )

            if all_lines:
                # NOTE(review): the id base is identical on every run for a
                # given date while only the new entries' lines are passed;
                # confirm upsert_closet_lines does not overwrite closets
                # written for earlier entries of the same day.
                closet_id_base = f"closet_diary_{date_str}"
                closet_meta = {
                    "date": date_str,
                    "wing": wing,
                    "room": "daily",
                    "source_file": str(diary_path),
                    "filed_at": now_iso,
                }
                if entities:
                    closet_meta["entities"] = entities
                closets_created += upsert_closet_lines(
                    closets_col, closet_id_base, all_lines, closet_meta
                )

        state[diary_path.name] = {
            "size": curr_size,
            "hash": curr_hash,
            "entry_count": len(entries),
            "ingested_at": now_iso,
        }
        days_updated += 1

    state_file.write_text(json.dumps(state, indent=2), encoding="utf-8")

    if days_updated:
        print(f"Diary: {days_updated} days updated, {closets_created} new closets")
    return {"days_updated": days_updated, "closets_created": closets_created}
if __name__ == "__main__":
    # Command-line entry point, e.g.
    #   python -m mempalace.diary_ingest --dir ~/daily_summaries [--palace P] [--wing W] [--force]
    import argparse

    default_palace = os.path.expanduser("~/.mempalace/palace")

    cli = argparse.ArgumentParser(description="Ingest daily summaries into the palace")
    cli.add_argument("--dir", required=True, help="Path to daily_summaries directory")
    cli.add_argument("--palace", default=default_palace)
    cli.add_argument("--wing", default="diary")
    cli.add_argument("--force", action="store_true")
    opts = cli.parse_args()

    ingest_diaries(opts.dir, opts.palace, wing=opts.wing, force=opts.force)