feat: entity metadata + diary ingest + BM25 hybrid search

Three features that close the gap between the architecture docs
and the actual codebase:

1. Entity metadata on drawers and closets
   - _extract_entities_for_metadata() pulls names from known_entities.json
     + proper nouns appearing 2+ times
   - Stamped as "entities" field in ChromaDB metadata
   - Enables filterable search by person/project name

2. Day-based diary ingest (diary_ingest.py)
   - ONE drawer per day, upserted as the day grows
   - Closets pack topics atomically, never split mid-topic
   - Tracks entry count in state file, only processes new entries
   - Usage: python -m mempalace.diary_ingest --dir ~/summaries

3. BM25 hybrid search in searcher.py
   - _bm25_score() keyword matching complements vector similarity
   - _hybrid_rank() combines both signals (60% vector, 40% BM25)
   - Catches exact name/term matches that embeddings miss
   - Applied to both closet-first and direct drawer search paths

689/689 tests pass.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
MSL
2026-04-13 01:47:19 -07:00
committed by Igor Lins e Silva
parent ee60cad652
commit f935e85ead
3 changed files with 282 additions and 4 deletions
+62 -2
View File
@@ -2,11 +2,14 @@
"""
searcher.py — Find anything. Exact words.
Semantic search against the palace.
Returns verbatim text — the actual words, never summaries.
Hybrid search: BM25 keyword matching + vector semantic similarity.
Searches closets first (fast index), then hydrates full drawer content.
Falls back to direct drawer search for palaces without closets.
"""
import logging
import math
import re
from pathlib import Path
from .palace import get_collection, get_closets_collection
@@ -18,6 +21,59 @@ class SearchError(Exception):
"""Raised when search cannot proceed (e.g. no palace found)."""
def _bm25_score(
    query: str,
    document: str,
    k1: float = 1.5,
    b: float = 0.75,
    avg_dl: float = 500,
) -> float:
    """Return a simplified BM25 keyword score of *document* for *query*.

    This is a lightweight keyword-matching signal that complements vector
    similarity. It catches exact matches that embeddings might miss
    (e.g., specific names, project codes, error messages).

    Both inputs are lowercased and tokenized into runs of two or more word
    characters. Query terms are de-duplicated, so repeating a word in the
    query does not increase the score.

    Args:
        query: Search text. ``None`` or empty is treated as "no terms".
        document: Text to score. ``None`` or empty is treated as "no terms".
        k1: Term-frequency saturation parameter.
        b: Strength of document-length normalization (0 disables it).
        avg_dl: Assumed average document length in tokens. A non-positive
            value disables length normalization instead of dividing by zero.

    Returns:
        The summed per-term score, or 0.0 when either side has no tokens or
        no query term occurs in the document.
    """
    query_terms = set(re.findall(r"\w{2,}", (query or "").lower()))
    doc_terms = re.findall(r"\w{2,}", (document or "").lower())
    if not query_terms or not doc_terms:
        return 0.0

    term_freq = {}
    for token in doc_terms:
        term_freq[token] = term_freq.get(token, 0) + 1

    # The length penalty is the same for every term, so compute it once.
    doc_len = len(doc_terms)
    if avg_dl > 0:
        length_norm = k1 * (1 - b + b * doc_len / avg_dl)
    else:
        # No usable average length: behave as if b == 0 (no length penalty).
        length_norm = k1

    # Simplified IDF — only one document is visible here, so there are no
    # corpus statistics; treat each query term as moderately rare.
    idf = math.log(2.0)

    score = 0.0
    for term in query_terms:
        tf = term_freq.get(term, 0)
        if tf > 0:
            score += idf * (tf * (k1 + 1)) / (tf + length_norm)
    return score
def _hybrid_rank(vector_results, query: str, vector_weight: float = 0.6, bm25_weight: float = 0.4):
    """Re-rank results using both vector distance and BM25 keyword score.

    Each result is a dict; its ``"distance"`` and ``"text"`` entries are read
    (a missing or ``None`` distance counts as 1.0, a missing or ``None`` text
    as an empty string). A rounded ``"bm25_score"`` entry is added to every
    result. The list is reordered in place and also returned.

    Args:
        vector_results: List of result dicts from the vector search.
        query: The user's search text, used for the keyword score.
        vector_weight: Weight of the normalized vector similarity.
        bm25_weight: Weight of the normalized BM25 score.

    Returns:
        The same list, sorted by combined score (higher = better). Results
        with equal scores keep their incoming relative order.
    """
    if not vector_results:
        return vector_results

    def _distance(result):
        # Tolerate results whose distance is absent or explicitly None.
        value = result.get("distance")
        return 1.0 if value is None else value

    # Normalize vector distances to 0-1 similarity, relative to the worst
    # distance in this result set (so the farthest result gets similarity 0).
    max_dist = max(_distance(r) for r in vector_results) or 1.0
    scale = max(max_dist, 0.001)

    # Scores live in a side list so no temporary key is ever written into
    # the caller's dicts, even if scoring fails part-way through.
    scores = []
    for r in vector_results:
        vec_sim = max(0.0, 1 - _distance(r) / scale)
        bm25 = _bm25_score(query, r.get("text") or "")
        # Normalize BM25 to roughly 0-1 range; with _bm25_score's defaults a
        # single term contributes less than ln(2) * 2.5 (about 1.73), so the
        # cap of 3.0 is reached only when several terms match strongly.
        bm25_norm = min(bm25 / 3.0, 1.0)
        r["bm25_score"] = round(bm25, 3)
        scores.append(vector_weight * vec_sim + bm25_weight * bm25_norm)

    # Stable descending sort by score, applied in place to the caller's list.
    order = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
    vector_results[:] = [vector_results[i] for i in order]
    return vector_results
def build_where_filter(wing: str = None, room: str = None) -> dict:
"""Build ChromaDB where filter for wing/room filtering."""
if wing and room:
@@ -186,6 +242,8 @@ def search_memories(
break
if hits:
# Re-rank with BM25 hybrid scoring
hits = _hybrid_rank(hits, query)
return {
"query": query,
"filters": {"wing": wing, "room": room},
@@ -227,6 +285,8 @@ def search_memories(
}
)
# Re-rank with BM25 hybrid scoring
hits = _hybrid_rank(hits, query)
return {
"query": query,
"filters": {"wing": wing, "room": room},