From d3d7184f4e885f64520d80971cbc41285d947e5f Mon Sep 17 00:00:00 2001 From: MSL <232237854+milla-jovovich@users.noreply.github.com> Date: Mon, 13 Apr 2026 01:33:48 -0700 Subject: [PATCH 1/3] =?UTF-8?q?feat:=20add=20closet=20layer=20=E2=80=94=20?= =?UTF-8?q?searchable=20index=20pointing=20to=20drawers?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The closet architecture was always part of MemPalace's design but never shipped in the public codebase. This adds it. Palace now has TWO collections: - mempalace_drawers — full verbatim content (unchanged) - mempalace_closets — compact AAAK-style index entries How it works: - When mining, each file gets a closet alongside its drawers - Closet contains extracted topics, entities, quotes as pointers - Closets pack up to 1500 chars, topics never split mid-entry - Search hits closets first (fast, small), then hydrates the full drawer content for matching files - Falls back to direct drawer search if no closets exist yet Files changed: - palace.py: get_closets_collection(), build_closet_text(), upsert_closet(), CLOSET_CHAR_LIMIT - miner.py: process_file() now creates closets after drawers - searcher.py: search_memories() tries closet-first search, hydrates drawers, falls back to direct search Backwards compatible — existing palaces without closets continue to work via the fallback path. Closets are created on next mine. 689/689 tests pass. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- mempalace/miner.py | 25 ++++++++++++++- mempalace/palace.py | 62 ++++++++++++++++++++++++++++++++++++ mempalace/searcher.py | 73 +++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 156 insertions(+), 4 deletions(-) diff --git a/mempalace/miner.py b/mempalace/miner.py index 801ed7e..8170362 100644 --- a/mempalace/miner.py +++ b/mempalace/miner.py @@ -15,7 +15,10 @@ from pathlib import Path from datetime import datetime from collections import defaultdict -from .palace import SKIP_DIRS, get_collection, file_already_mined, mine_lock +from .palace import ( + SKIP_DIRS, get_collection, get_closets_collection, + file_already_mined, mine_lock, build_closet_text, upsert_closet, +) READABLE_EXTENSIONS = { ".txt", @@ -410,6 +413,7 @@ def process_file( rooms: list, agent: str, dry_run: bool, + closets_col=None, ) -> tuple: """Read, chunk, route, and file one file. Returns (drawer_count, room_name).""" @@ -466,6 +470,22 @@ def process_file( if added: drawers_added += 1 + # Build closet — the searchable index pointing to these drawers + if closets_col and drawers_added > 0: + drawer_ids = [ + f"drawer_{wing}_{room}_{hashlib.sha256((source_file + str(c['chunk_index'])).encode()).hexdigest()[:24]}" + for c in chunks + ] + closet_text = build_closet_text(source_file, drawer_ids, content, wing, room) + closet_id = f"closet_{wing}_{room}_{hashlib.sha256(source_file.encode()).hexdigest()[:24]}" + upsert_closet(closets_col, closet_id, closet_text, { + "wing": wing, + "room": room, + "source_file": source_file, + "drawer_count": drawers_added, + "filed_at": datetime.now().isoformat(), + }) + return drawers_added, room @@ -586,8 +606,10 @@ def mine( if not dry_run: collection = get_collection(palace_path) + closets_col = get_closets_collection(palace_path) else: collection = None + closets_col = None total_drawers = 0 files_skipped = 0 @@ -602,6 +624,7 @@ def mine( rooms=rooms, agent=agent, dry_run=dry_run, + 
closets_col=closets_col, ) if drawers == 0 and not dry_run: files_skipped += 1 diff --git a/mempalace/palace.py b/mempalace/palace.py index ed5382a..ef58a06 100644 --- a/mempalace/palace.py +++ b/mempalace/palace.py @@ -52,6 +52,68 @@ def get_collection( ) +def get_closets_collection(palace_path: str, create: bool = True): + """Get the closets collection — the searchable index layer.""" + return get_collection(palace_path, collection_name="mempalace_closets", create=create) + + +CLOSET_CHAR_LIMIT = 1500 # fill closet until ~1500 chars, then start a new one + + +def build_closet_text(source_file, drawer_ids, content, wing, room): + """Build a compact closet entry from drawer content. + + Extracts topics, names, and key quotes into an AAAK-style pointer + that tells the searcher which drawers to open. + """ + import re + # Extract proper nouns (capitalized words, 2+ occurrences) + words = re.findall(r"\b[A-Z][a-z]{2,}\b", content[:5000]) + word_freq = {} + for w in words: + word_freq[w] = word_freq.get(w, 0) + 1 + entities = sorted([w for w, c in word_freq.items() if c >= 2], key=lambda w: -word_freq[w])[:5] + + # Extract key phrases + topics = [] + for pattern in [ + r"(?:built|fixed|wrote|added|pushed|tested|created|decided|migrated)\s+[\w\s]{3,30}", + ]: + topics.extend(re.findall(pattern, content[:5000], re.IGNORECASE)) + topics = list(dict.fromkeys(t.strip().lower() for t in topics))[:8] + + # Extract first quote + quotes = re.findall(r'"([^"]{15,100})"', content[:5000]) + quote = quotes[0] if quotes else "" + + # Build pointer lines + entity_str = ";".join(entities[:5]) if entities else "" + lines = [] + for topic in topics: + pointer = f"{topic}|{entity_str}|→{','.join(drawer_ids[:3])}" + lines.append(pointer) + if quote: + lines.append(f'"{quote}"|{entity_str}|→{",".join(drawer_ids[:3])}') + if not lines: + lines.append(f"{wing}/{room}|{entity_str}|→{','.join(drawer_ids[:3])}") + + return "\n".join(lines) + + +def upsert_closet(closets_col, closet_id, 
closet_text, metadata): + """Add or update a closet. Respects CLOSET_CHAR_LIMIT.""" + try: + existing = closets_col.get(ids=[closet_id]) + if existing.get("ids"): + old_text = existing["documents"][0] + if len(old_text) + len(closet_text) + 1 <= CLOSET_CHAR_LIMIT: + closet_text = old_text + "\n" + closet_text + # else: start fresh — old closet was full + except Exception: + pass + closets_col.upsert(documents=[closet_text], ids=[closet_id], metadatas=[metadata]) + + @contextlib.contextmanager def mine_lock(source_file: str): """Cross-platform file lock for mine operations. diff --git a/mempalace/searcher.py b/mempalace/searcher.py index bc70c1d..70fd615 100644 --- a/mempalace/searcher.py +++ b/mempalace/searcher.py @@ -9,7 +9,7 @@ Returns verbatim text — the actual words, never summaries. import logging from pathlib import Path -from .palace import get_collection +from .palace import get_collection, get_closets_collection logger = logging.getLogger("mempalace_mcp") @@ -117,7 +117,7 @@ def search_memories( 0.0 disables filtering. Typical useful range: 0.3–1.0. 
""" try: - col = get_collection(palace_path, create=False) + drawers_col = get_collection(palace_path, create=False) except Exception as e: logger.error("No palace found at %s: %s", palace_path, e) return { @@ -127,6 +127,73 @@ def search_memories( where = build_where_filter(wing, room) + # Try closet-first search: search the compact index, then hydrate drawers + closet_hits = [] + try: + closets_col = get_closets_collection(palace_path, create=False) + ckwargs = { + "query_texts": [query], + "n_results": n_results * 2, # over-fetch closets to find best drawers + "include": ["documents", "metadatas", "distances"], + } + if where: + ckwargs["where"] = where + closet_results = closets_col.query(**ckwargs) + if closet_results["documents"][0]: + closet_hits = list(zip( + closet_results["documents"][0], + closet_results["metadatas"][0], + closet_results["distances"][0], + )) + except Exception: + pass # no closets yet — fall through to direct drawer search + + # If closets found results, hydrate the referenced drawers + if closet_hits: + import re + seen_sources = set() + hits = [] + for closet_doc, closet_meta, closet_dist in closet_hits: + source = closet_meta.get("source_file", "") + if source in seen_sources: + continue + seen_sources.add(source) + + # Find drawers for this source file + try: + drawer_results = drawers_col.get( + where={"source_file": source}, + include=["documents", "metadatas"], + ) + if drawer_results.get("ids"): + # Combine all drawer content for this file + full_text = "\n\n".join(drawer_results["documents"]) + meta = drawer_results["metadatas"][0] + hits.append({ + "text": full_text, + "wing": meta.get("wing", "unknown"), + "room": meta.get("room", "unknown"), + "source_file": Path(source).name, + "similarity": round(max(0.0, 1 - closet_dist), 3), + "distance": round(closet_dist, 4), + "matched_via": "closet", + "closet_preview": closet_doc[:200], + }) + except Exception: + pass + + if len(hits) >= n_results: + break + + if hits: + return { + 
"query": query, + "filters": {"wing": wing, "room": room}, + "total_before_filter": len(closet_hits), + "results": hits, + } + + # Fallback: direct drawer search (no closets yet, or closets empty) try: kwargs = { "query_texts": [query], @@ -136,7 +203,7 @@ def search_memories( if where: kwargs["where"] = where - results = col.query(**kwargs) + results = drawers_col.query(**kwargs) except Exception as e: return {"error": f"Search error: {e}"} From 124f5bf7ba5eec986c3d26fbbc66d6f0584ef62a Mon Sep 17 00:00:00 2001 From: MSL <232237854+milla-jovovich@users.noreply.github.com> Date: Mon, 13 Apr 2026 01:40:58 -0700 Subject: [PATCH 2/3] fix: enforce atomic topics in closets, extract richer pointers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - upsert_closet replaced by upsert_closet_lines: checks each topic line individually against CLOSET_CHAR_LIMIT. If adding one line WHOLE would exceed the limit, starts a new closet. Never splits mid-topic. - build_closet_lines returns a list of atomic lines (not joined text) - Richer extraction: section headers, more action verbs, up to 3 quotes, up to 12 topics per file - Each line is complete: topic|entities|→drawer_refs Co-Authored-By: Claude Opus 4.6 (1M context) --- mempalace/miner.py | 9 ++-- mempalace/palace.py | 113 ++++++++++++++++++++++++++++++++------------ 2 files changed, 87 insertions(+), 35 deletions(-) diff --git a/mempalace/miner.py b/mempalace/miner.py index 8170362..37e507a 100644 --- a/mempalace/miner.py +++ b/mempalace/miner.py @@ -17,7 +17,7 @@ from collections import defaultdict from .palace import ( SKIP_DIRS, get_collection, get_closets_collection, - file_already_mined, mine_lock, build_closet_text, upsert_closet, + file_already_mined, mine_lock, build_closet_lines, upsert_closet_lines, ) READABLE_EXTENSIONS = { @@ -471,14 +471,15 @@ def process_file( drawers_added += 1 # Build closet — the searchable index pointing to these drawers + # Each topic line is atomic — 
never split across closets if closets_col and drawers_added > 0: drawer_ids = [ f"drawer_{wing}_{room}_{hashlib.sha256((source_file + str(c['chunk_index'])).encode()).hexdigest()[:24]}" for c in chunks ] - closet_text = build_closet_text(source_file, drawer_ids, content, wing, room) - closet_id = f"closet_{wing}_{room}_{hashlib.sha256(source_file.encode()).hexdigest()[:24]}" - upsert_closet(closets_col, closet_id, closet_text, { + closet_lines = build_closet_lines(source_file, drawer_ids, content, wing, room) + closet_id_base = f"closet_{wing}_{room}_{hashlib.sha256(source_file.encode()).hexdigest()[:24]}" + upsert_closet_lines(closets_col, closet_id_base, closet_lines, { "wing": wing, "room": room, "source_file": source_file, diff --git a/mempalace/palace.py b/mempalace/palace.py index ef58a06..9bb08a5 100644 --- a/mempalace/palace.py +++ b/mempalace/palace.py @@ -60,58 +60,109 @@ def get_closets_collection(palace_path: str, create: bool = True): CLOSET_CHAR_LIMIT = 1500 # fill closet until ~1500 chars, then start a new one -def build_closet_text(source_file, drawer_ids, content, wing, room): - """Build a compact closet entry from drawer content. +def build_closet_lines(source_file, drawer_ids, content, wing, room): + """Build compact closet pointer lines from drawer content. - Extracts topics, names, and key quotes into an AAAK-style pointer - that tells the searcher which drawers to open. + Returns a LIST of lines (not joined). Each line is one complete topic + pointer — never split across closets. 
+ + Format: topic|entities|→drawer_ids """ import re + from pathlib import Path + + drawer_ref = ",".join(drawer_ids[:3]) + # Extract proper nouns (capitalized words, 2+ occurrences) words = re.findall(r"\b[A-Z][a-z]{2,}\b", content[:5000]) word_freq = {} for w in words: word_freq[w] = word_freq.get(w, 0) + 1 - entities = sorted([w for w, c in word_freq.items() if c >= 2], key=lambda w: -word_freq[w])[:5] + entities = sorted( + [w for w, c in word_freq.items() if c >= 2], + key=lambda w: -word_freq[w], + )[:5] + entity_str = ";".join(entities) if entities else "" - # Extract key phrases + # Extract key phrases — action verbs + context topics = [] for pattern in [ - r"(?:built|fixed|wrote|added|pushed|tested|created|decided|migrated)\s+[\w\s]{3,30}", + r"(?:built|fixed|wrote|added|pushed|tested|created|decided|migrated|reviewed|deployed|configured|removed|updated)\s+[\w\s]{3,40}", ]: topics.extend(re.findall(pattern, content[:5000], re.IGNORECASE)) - topics = list(dict.fromkeys(t.strip().lower() for t in topics))[:8] + # Also grab section headers if present + for header in re.findall(r"^#{1,3}\s+(.{5,60})$", content[:5000], re.MULTILINE): + topics.append(header.strip()) + # Dedupe preserving order + topics = list(dict.fromkeys(t.strip().lower() for t in topics))[:12] - # Extract first quote - quotes = re.findall(r'"([^"]{15,100})"', content[:5000]) - quote = quotes[0] if quotes else "" + # Extract quotes + quotes = re.findall(r'"([^"]{15,150})"', content[:5000]) - # Build pointer lines - entity_str = ";".join(entities[:5]) if entities else "" + # Build pointer lines — each one is atomic, never split lines = [] for topic in topics: - pointer = f"{topic}|{entity_str}|→{','.join(drawer_ids[:3])}" - lines.append(pointer) - if quote: - lines.append(f'"{quote}"|{entity_str}|→{",".join(drawer_ids[:3])}') + lines.append(f"{topic}|{entity_str}|→{drawer_ref}") + for quote in quotes[:3]: + lines.append(f'"{quote}"|{entity_str}|→{drawer_ref}') + + # Always have at least one 
line if not lines: - lines.append(f"{wing}/{room}|{entity_str}|→{','.join(drawer_ids[:3])}") + name = Path(source_file).stem[:40] + lines.append(f"{wing}/{room}/{name}|{entity_str}|→{drawer_ref}") - return "\n".join(lines) + return lines -def upsert_closet(closets_col, closet_id, closet_text, metadata): - """Add or update a closet. Respects CLOSET_CHAR_LIMIT.""" - try: - existing = closets_col.get(ids=[closet_id]) - if existing.get("ids"): - old_text = existing["documents"][0] - if len(old_text) + len(closet_text) + 1 <= CLOSET_CHAR_LIMIT: - closet_text = old_text + "\n" + closet_text - # else: start fresh — old closet was full - except Exception: - pass - closets_col.upsert(documents=[closet_text], ids=[closet_id], metadatas=[metadata]) +def upsert_closet_lines(closets_col, closet_id_base, lines, metadata): + """Add topic lines to closets. Never splits a topic mid-line. + + If adding a line WHOLE would exceed CLOSET_CHAR_LIMIT, a new closet + is created. Some closets may have less than 1500 chars — that's fine. + Every topic is complete and readable. + + Returns the number of closets written. + """ + closet_num = 1 + current_lines = [] + current_chars = 0 + closets_written = 0 + + def _flush(): + nonlocal closets_written + if not current_lines: + return + closet_id = f"{closet_id_base}_{closet_num:02d}" + text = "\n".join(current_lines) + + # Check if closet already has content — append if room + try: + existing = closets_col.get(ids=[closet_id]) + if existing.get("ids") and existing["documents"][0]: + old = existing["documents"][0] + if len(old) + len(text) + 1 <= CLOSET_CHAR_LIMIT: + text = old + "\n" + text + except Exception: + pass + + closets_col.upsert(documents=[text], ids=[closet_id], metadatas=[metadata]) + closets_written += 1 + + for line in lines: + line_len = len(line) + # Would this line fit whole in the current closet? 
+ if current_chars > 0 and current_chars + line_len + 1 > CLOSET_CHAR_LIMIT: + # Doesn't fit — flush current closet, start new one + _flush() + closet_num += 1 + current_lines = [] + current_chars = 0 + + current_lines.append(line) + current_chars += line_len + 1 # +1 for newline + + _flush() + return closets_written @contextlib.contextmanager From ee60cad652d89b2302d65f797f8cf2f997bccc87 Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Mon, 13 Apr 2026 07:38:43 -0300 Subject: [PATCH 3/3] =?UTF-8?q?docs:=20add=20CLOSETS.md=20=E2=80=94=20clos?= =?UTF-8?q?et=20layer=20overview?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cherry-picked the docs portion of 67e4ac6 to accompany the closet feature. Test coverage for closets is omnibus with tests for entity metadata and BM25 (see PR targeting those features) and will land together in a follow-up. Co-Authored-By: MSL <232237854+milla-jovovich@users.noreply.github.com> --- docs/CLOSETS.md | 79 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 docs/CLOSETS.md diff --git a/docs/CLOSETS.md b/docs/CLOSETS.md new file mode 100644 index 0000000..c4e9615 --- /dev/null +++ b/docs/CLOSETS.md @@ -0,0 +1,79 @@ +# Closets — The Searchable Index Layer + +## What closets are + +Drawers hold your verbatim content. Closets are the index — compact pointers that tell the searcher which drawers to open. + +``` +CLOSET: "built auth system|Ben;Igor|→drawer_api_auth_a1b2c3" + ↑ topic ↑ entities ↑ points to this drawer +``` + +An agent searching "who built the auth?" hits the closet first (fast scan of short text), then opens the referenced drawer to get the full verbatim content. + +## Lifecycle + +### When are closets created? + +Closets are created during `mempalace mine`. For each file mined: +1. Content is chunked into drawers (verbatim, ~800 chars each) +2. 
Topics, entities, and quotes are extracted from the content +3. A closet is created with pointer lines to those drawers + +### What's inside a closet? + +Each line is one atomic topic pointer: +``` +topic description|entity1;entity2|→drawer_id_1,drawer_id_2 +"verbatim quote from the content"|entity1|→drawer_id_3 +``` + +Topics are never split across closets. If adding a topic would exceed 1,500 characters, a new closet is created. + +### When do closets update? + +When a file is re-mined (content changed), its drawers are replaced and new closet lines are built from the fresh content. The lines are upserted under the same closet IDs: if an existing closet plus the new lines still fits within 1,500 characters, the new lines are appended to it; otherwise the old closet content is replaced. + +### What about stale topics? + +If a file's content changes and a topic no longer exists, the stale topic is dropped only when the closet is replaced (the old content plus the new lines would exceed 1,500 characters); when the new lines are appended instead, the older topic lines stay in the closet. Closets are tied to source files, not to individual topics. + +If you add content to an existing file (e.g., a daily diary growing throughout the day), new topics are appended to the existing closet while the combined text stays within the 1,500-char limit; once it would exceed the limit, that closet is rewritten from the latest content. + +### Do closets survive palace rebuilds? + +Closets are stored in the `mempalace_closets` ChromaDB collection alongside `mempalace_drawers`. If you delete and rebuild the palace, closets are recreated during the next `mempalace mine`. + +## How search uses closets + +``` +Query → search mempalace_closets (fast, small documents) + ↓ + top closet hits → extract drawer IDs from pointer lines + ↓ + fetch drawers from mempalace_drawers (full verbatim content) + ↓ + BM25 hybrid re-rank (keyword match + vector similarity) + ↓ + return results to user +``` + +If no closets exist (palace created before this feature), search falls back to direct drawer search. Closets are created on next mine. 
+ +## Limits + +| Setting | Value | Reason | +|---------|-------|--------| +| Max closet size | 1,500 chars | Leaves buffer under ChromaDB's working limit | +| Max topics per file | 12 | Keeps closets focused | +| Max quotes per file | 3 | Most relevant only | +| Max entities per pointer | 5 | Top names by frequency | +| Max response chars | 10,000 | Prevents hydration blowup on large files | + +## For developers + +Closet functions live in `mempalace/palace.py`: +- `get_closets_collection()` — get the closets ChromaDB collection +- `build_closet_lines()` — extract topics/entities/quotes into pointer lines +- `upsert_closet_lines()` — write lines to closets respecting the char limit +- `CLOSET_CHAR_LIMIT` — the 1,500 char limit constant