From 9b99c136ee13c1dc97f3c20bb2f59f4464d7284d Mon Sep 17 00:00:00 2001 From: MSL <232237854+milla-jovovich@users.noreply.github.com> Date: Mon, 13 Apr 2026 01:55:25 -0700 Subject: [PATCH 1/8] fix: strip system tags, hook output, and Claude UI chrome from drawers normalize.py now strips before filing: - <system-reminder>, <command-message>, <command-name> tags - <task-notification>, <user-prompt-submit-hook>, <hook_output> tags - Hook status messages (CURRENT TIME, Checking verified facts, etc.) - Claude Code UI chrome (ctrl+o to expand, progress bars, etc.) - Collapsed runs of blank lines This noise was going straight into drawers, wasting storage space and polluting search results. strip_noise() runs on all normalized output regardless of input format (JSONL, JSON, plain text). 689/689 tests pass. Co-Authored-By: Claude Opus 4.6 (1M context) --- mempalace/normalize.py | 56 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 4 deletions(-) diff --git a/mempalace/normalize.py b/mempalace/normalize.py index e599df9..256a5e9 100644 --- a/mempalace/normalize.py +++ b/mempalace/normalize.py @@ -16,10 +16,54 @@ No API key. No internet. Everything local. import json import os +import re from pathlib import Path from typing import Optional +# ─── Noise stripping ───────────────────────────────────────────────────── +# Claude Code and other tools inject system tags, hook output, UI chrome, +# and tool-call JSON into transcripts. These waste drawer space and pollute +# search results. Strip them before filing. 
+ +_NOISE_TAG_PATTERNS = [ + re.compile(r"]*>.*?", re.DOTALL), + re.compile(r"]*>.*?", re.DOTALL), + re.compile(r"]*>.*?", re.DOTALL), + re.compile(r"]*>.*?", re.DOTALL), + re.compile(r"]*>.*?", re.DOTALL), + re.compile(r"]*>.*?", re.DOTALL), +] + +_NOISE_STRINGS = [ + "CURRENT TIME:", + "VERIFIED FACTS (do not contradict)", + "AGENT SPECIALIZATION:", + "Checking verified facts...", + "Injecting timestamp...", + "Starting background pipeline...", + "Checking emotional weights...", + "Auto-save reminder...", + "Checking pipeline...", + "MemPalace auto-save checkpoint.", +] + + +def strip_noise(text: str) -> str: + """Remove system tags, hook output, and Claude Code UI chrome from text.""" + for pat in _NOISE_TAG_PATTERNS: + text = pat.sub("", text) + for noise in _NOISE_STRINGS: + text = text.replace(noise, "") + # Strip Claude Code UI chrome + text = re.sub(r".*\(ctrl\+o to expand\).*\n?", "", text) + text = re.sub(r"Ran \d+ (?:stop|pre|post)\s*hook.*\n?", "", text, flags=re.IGNORECASE) + text = re.sub(r"…\s*\+\d+ lines.*\n?", "", text) + # Collapse runs of blank lines + text = re.sub(r"\n{4,}", "\n\n\n", text) + return text.strip() + + def normalize(filepath: str) -> str: """ Load a file and normalize to transcript format if it's a chat export. 
@@ -40,19 +84,23 @@ def normalize(filepath: str) -> str: if not content.strip(): return content - # Already has > markers — pass through + # Already has > markers — pass through (strip noise but preserve trailing newline) lines = content.split("\n") if sum(1 for line in lines if line.strip().startswith(">")) >= 3: - return content + cleaned = strip_noise(content) + # Preserve trailing newline if original had one + if content.endswith("\n") and not cleaned.endswith("\n"): + cleaned += "\n" + return cleaned # Try JSON normalization ext = Path(filepath).suffix.lower() if ext in (".json", ".jsonl") or content.strip()[:1] in ("{", "["): normalized = _try_normalize_json(content) if normalized: - return normalized + return strip_noise(normalized) - return content + return strip_noise(content) def _try_normalize_json(content: str) -> Optional[str]: From d3d7184f4e885f64520d80971cbc41285d947e5f Mon Sep 17 00:00:00 2001 From: MSL <232237854+milla-jovovich@users.noreply.github.com> Date: Mon, 13 Apr 2026 01:33:48 -0700 Subject: [PATCH 2/8] =?UTF-8?q?feat:=20add=20closet=20layer=20=E2=80=94=20?= =?UTF-8?q?searchable=20index=20pointing=20to=20drawers?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The closet architecture was always part of MemPalace's design but never shipped in the public codebase. This adds it. 
Palace now has TWO collections: - mempalace_drawers — full verbatim content (unchanged) - mempalace_closets — compact AAAK-style index entries How it works: - When mining, each file gets a closet alongside its drawers - Closet contains extracted topics, entities, quotes as pointers - Closets pack up to 1500 chars, topics never split mid-entry - Search hits closets first (fast, small), then hydrates the full drawer content for matching files - Falls back to direct drawer search if no closets exist yet Files changed: - palace.py: get_closets_collection(), build_closet_text(), upsert_closet(), CLOSET_CHAR_LIMIT - miner.py: process_file() now creates closets after drawers - searcher.py: search_memories() tries closet-first search, hydrates drawers, falls back to direct search Backwards compatible — existing palaces without closets continue to work via the fallback path. Closets are created on next mine. 689/689 tests pass. Co-Authored-By: Claude Opus 4.6 (1M context) --- mempalace/miner.py | 25 ++++++++++++++- mempalace/palace.py | 62 ++++++++++++++++++++++++++++++++++++ mempalace/searcher.py | 73 +++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 156 insertions(+), 4 deletions(-) diff --git a/mempalace/miner.py b/mempalace/miner.py index 801ed7e..8170362 100644 --- a/mempalace/miner.py +++ b/mempalace/miner.py @@ -15,7 +15,10 @@ from pathlib import Path from datetime import datetime from collections import defaultdict -from .palace import SKIP_DIRS, get_collection, file_already_mined, mine_lock +from .palace import ( + SKIP_DIRS, get_collection, get_closets_collection, + file_already_mined, mine_lock, build_closet_text, upsert_closet, +) READABLE_EXTENSIONS = { ".txt", @@ -410,6 +413,7 @@ def process_file( rooms: list, agent: str, dry_run: bool, + closets_col=None, ) -> tuple: """Read, chunk, route, and file one file. 
Returns (drawer_count, room_name).""" @@ -466,6 +470,22 @@ def process_file( if added: drawers_added += 1 + # Build closet — the searchable index pointing to these drawers + if closets_col and drawers_added > 0: + drawer_ids = [ + f"drawer_{wing}_{room}_{hashlib.sha256((source_file + str(c['chunk_index'])).encode()).hexdigest()[:24]}" + for c in chunks + ] + closet_text = build_closet_text(source_file, drawer_ids, content, wing, room) + closet_id = f"closet_{wing}_{room}_{hashlib.sha256(source_file.encode()).hexdigest()[:24]}" + upsert_closet(closets_col, closet_id, closet_text, { + "wing": wing, + "room": room, + "source_file": source_file, + "drawer_count": drawers_added, + "filed_at": datetime.now().isoformat(), + }) + return drawers_added, room @@ -586,8 +606,10 @@ def mine( if not dry_run: collection = get_collection(palace_path) + closets_col = get_closets_collection(palace_path) else: collection = None + closets_col = None total_drawers = 0 files_skipped = 0 @@ -602,6 +624,7 @@ def mine( rooms=rooms, agent=agent, dry_run=dry_run, + closets_col=closets_col, ) if drawers == 0 and not dry_run: files_skipped += 1 diff --git a/mempalace/palace.py b/mempalace/palace.py index ed5382a..ef58a06 100644 --- a/mempalace/palace.py +++ b/mempalace/palace.py @@ -52,6 +52,68 @@ def get_collection( ) +def get_closets_collection(palace_path: str, create: bool = True): + """Get the closets collection — the searchable index layer.""" + return get_collection(palace_path, collection_name="mempalace_closets", create=create) + + +CLOSET_CHAR_LIMIT = 1500 # fill closet until ~1500 chars, then start a new one + + +def build_closet_text(source_file, drawer_ids, content, wing, room): + """Build a compact closet entry from drawer content. + + Extracts topics, names, and key quotes into an AAAK-style pointer + that tells the searcher which drawers to open. 
+ """ + import re + # Extract proper nouns (capitalized words, 2+ occurrences) + words = re.findall(r"\b[A-Z][a-z]{2,}\b", content[:5000]) + word_freq = {} + for w in words: + word_freq[w] = word_freq.get(w, 0) + 1 + entities = sorted([w for w, c in word_freq.items() if c >= 2], key=lambda w: -word_freq[w])[:5] + + # Extract key phrases + topics = [] + for pattern in [ + r"(?:built|fixed|wrote|added|pushed|tested|created|decided|migrated)\s+[\w\s]{3,30}", + ]: + topics.extend(re.findall(pattern, content[:5000], re.IGNORECASE)) + topics = list(dict.fromkeys(t.strip().lower() for t in topics))[:8] + + # Extract first quote + quotes = re.findall(r'"([^"]{15,100})"', content[:5000]) + quote = quotes[0] if quotes else "" + + # Build pointer lines + entity_str = ";".join(entities[:5]) if entities else "" + lines = [] + for topic in topics: + pointer = f"{topic}|{entity_str}|→{','.join(drawer_ids[:3])}" + lines.append(pointer) + if quote: + lines.append(f'"{quote}"|{entity_str}|→{",".join(drawer_ids[:3])}') + if not lines: + lines.append(f"{wing}/{room}|{entity_str}|→{','.join(drawer_ids[:3])}") + + return "\n".join(lines) + + +def upsert_closet(closets_col, closet_id, closet_text, metadata): + """Add or update a closet. Respects CLOSET_CHAR_LIMIT.""" + try: + existing = closets_col.get(ids=[closet_id]) + if existing.get("ids"): + old_text = existing["documents"][0] + if len(old_text) + len(closet_text) + 1 <= CLOSET_CHAR_LIMIT: + closet_text = old_text + "\n" + closet_text + # else: start fresh — old closet was full + except Exception: + pass + closets_col.upsert(documents=[closet_text], ids=[closet_id], metadatas=[metadata]) + + @contextlib.contextmanager def mine_lock(source_file: str): """Cross-platform file lock for mine operations. diff --git a/mempalace/searcher.py b/mempalace/searcher.py index bc70c1d..70fd615 100644 --- a/mempalace/searcher.py +++ b/mempalace/searcher.py @@ -9,7 +9,7 @@ Returns verbatim text — the actual words, never summaries. 
import logging from pathlib import Path -from .palace import get_collection +from .palace import get_collection, get_closets_collection logger = logging.getLogger("mempalace_mcp") @@ -117,7 +117,7 @@ def search_memories( 0.0 disables filtering. Typical useful range: 0.3–1.0. """ try: - col = get_collection(palace_path, create=False) + drawers_col = get_collection(palace_path, create=False) except Exception as e: logger.error("No palace found at %s: %s", palace_path, e) return { @@ -127,6 +127,73 @@ def search_memories( where = build_where_filter(wing, room) + # Try closet-first search: search the compact index, then hydrate drawers + closet_hits = [] + try: + closets_col = get_closets_collection(palace_path, create=False) + ckwargs = { + "query_texts": [query], + "n_results": n_results * 2, # over-fetch closets to find best drawers + "include": ["documents", "metadatas", "distances"], + } + if where: + ckwargs["where"] = where + closet_results = closets_col.query(**ckwargs) + if closet_results["documents"][0]: + closet_hits = list(zip( + closet_results["documents"][0], + closet_results["metadatas"][0], + closet_results["distances"][0], + )) + except Exception: + pass # no closets yet — fall through to direct drawer search + + # If closets found results, hydrate the referenced drawers + if closet_hits: + import re + seen_sources = set() + hits = [] + for closet_doc, closet_meta, closet_dist in closet_hits: + source = closet_meta.get("source_file", "") + if source in seen_sources: + continue + seen_sources.add(source) + + # Find drawers for this source file + try: + drawer_results = drawers_col.get( + where={"source_file": source}, + include=["documents", "metadatas"], + ) + if drawer_results.get("ids"): + # Combine all drawer content for this file + full_text = "\n\n".join(drawer_results["documents"]) + meta = drawer_results["metadatas"][0] + hits.append({ + "text": full_text, + "wing": meta.get("wing", "unknown"), + "room": meta.get("room", "unknown"), + 
"source_file": Path(source).name, + "similarity": round(max(0.0, 1 - closet_dist), 3), + "distance": round(closet_dist, 4), + "matched_via": "closet", + "closet_preview": closet_doc[:200], + }) + except Exception: + pass + + if len(hits) >= n_results: + break + + if hits: + return { + "query": query, + "filters": {"wing": wing, "room": room}, + "total_before_filter": len(closet_hits), + "results": hits, + } + + # Fallback: direct drawer search (no closets yet, or closets empty) try: kwargs = { "query_texts": [query], @@ -136,7 +203,7 @@ def search_memories( if where: kwargs["where"] = where - results = col.query(**kwargs) + results = drawers_col.query(**kwargs) except Exception as e: return {"error": f"Search error: {e}"} From 124f5bf7ba5eec986c3d26fbbc66d6f0584ef62a Mon Sep 17 00:00:00 2001 From: MSL <232237854+milla-jovovich@users.noreply.github.com> Date: Mon, 13 Apr 2026 01:40:58 -0700 Subject: [PATCH 3/8] fix: enforce atomic topics in closets, extract richer pointers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - upsert_closet replaced by upsert_closet_lines: checks each topic line individually against CLOSET_CHAR_LIMIT. If adding one line WHOLE would exceed the limit, starts a new closet. Never splits mid-topic. 
- build_closet_lines returns a list of atomic lines (not joined text) - Richer extraction: section headers, more action verbs, up to 3 quotes, up to 12 topics per file - Each line is complete: topic|entities|→drawer_refs Co-Authored-By: Claude Opus 4.6 (1M context) --- mempalace/miner.py | 9 ++-- mempalace/palace.py | 113 ++++++++++++++++++++++++++++++++------------ 2 files changed, 87 insertions(+), 35 deletions(-) diff --git a/mempalace/miner.py b/mempalace/miner.py index 8170362..37e507a 100644 --- a/mempalace/miner.py +++ b/mempalace/miner.py @@ -17,7 +17,7 @@ from collections import defaultdict from .palace import ( SKIP_DIRS, get_collection, get_closets_collection, - file_already_mined, mine_lock, build_closet_text, upsert_closet, + file_already_mined, mine_lock, build_closet_lines, upsert_closet_lines, ) READABLE_EXTENSIONS = { @@ -471,14 +471,15 @@ def process_file( drawers_added += 1 # Build closet — the searchable index pointing to these drawers + # Each topic line is atomic — never split across closets if closets_col and drawers_added > 0: drawer_ids = [ f"drawer_{wing}_{room}_{hashlib.sha256((source_file + str(c['chunk_index'])).encode()).hexdigest()[:24]}" for c in chunks ] - closet_text = build_closet_text(source_file, drawer_ids, content, wing, room) - closet_id = f"closet_{wing}_{room}_{hashlib.sha256(source_file.encode()).hexdigest()[:24]}" - upsert_closet(closets_col, closet_id, closet_text, { + closet_lines = build_closet_lines(source_file, drawer_ids, content, wing, room) + closet_id_base = f"closet_{wing}_{room}_{hashlib.sha256(source_file.encode()).hexdigest()[:24]}" + upsert_closet_lines(closets_col, closet_id_base, closet_lines, { "wing": wing, "room": room, "source_file": source_file, diff --git a/mempalace/palace.py b/mempalace/palace.py index ef58a06..9bb08a5 100644 --- a/mempalace/palace.py +++ b/mempalace/palace.py @@ -60,58 +60,109 @@ def get_closets_collection(palace_path: str, create: bool = True): CLOSET_CHAR_LIMIT = 1500 # fill 
closet until ~1500 chars, then start a new one -def build_closet_text(source_file, drawer_ids, content, wing, room): - """Build a compact closet entry from drawer content. +def build_closet_lines(source_file, drawer_ids, content, wing, room): + """Build compact closet pointer lines from drawer content. - Extracts topics, names, and key quotes into an AAAK-style pointer - that tells the searcher which drawers to open. + Returns a LIST of lines (not joined). Each line is one complete topic + pointer — never split across closets. + + Format: topic|entities|→drawer_ids """ import re + from pathlib import Path + + drawer_ref = ",".join(drawer_ids[:3]) + # Extract proper nouns (capitalized words, 2+ occurrences) words = re.findall(r"\b[A-Z][a-z]{2,}\b", content[:5000]) word_freq = {} for w in words: word_freq[w] = word_freq.get(w, 0) + 1 - entities = sorted([w for w, c in word_freq.items() if c >= 2], key=lambda w: -word_freq[w])[:5] + entities = sorted( + [w for w, c in word_freq.items() if c >= 2], + key=lambda w: -word_freq[w], + )[:5] + entity_str = ";".join(entities) if entities else "" - # Extract key phrases + # Extract key phrases — action verbs + context topics = [] for pattern in [ - r"(?:built|fixed|wrote|added|pushed|tested|created|decided|migrated)\s+[\w\s]{3,30}", + r"(?:built|fixed|wrote|added|pushed|tested|created|decided|migrated|reviewed|deployed|configured|removed|updated)\s+[\w\s]{3,40}", ]: topics.extend(re.findall(pattern, content[:5000], re.IGNORECASE)) - topics = list(dict.fromkeys(t.strip().lower() for t in topics))[:8] + # Also grab section headers if present + for header in re.findall(r"^#{1,3}\s+(.{5,60})$", content[:5000], re.MULTILINE): + topics.append(header.strip()) + # Dedupe preserving order + topics = list(dict.fromkeys(t.strip().lower() for t in topics))[:12] - # Extract first quote - quotes = re.findall(r'"([^"]{15,100})"', content[:5000]) - quote = quotes[0] if quotes else "" + # Extract quotes + quotes = 
re.findall(r'"([^"]{15,150})"', content[:5000]) - # Build pointer lines - entity_str = ";".join(entities[:5]) if entities else "" + # Build pointer lines — each one is atomic, never split lines = [] for topic in topics: - pointer = f"{topic}|{entity_str}|→{','.join(drawer_ids[:3])}" - lines.append(pointer) - if quote: - lines.append(f'"{quote}"|{entity_str}|→{",".join(drawer_ids[:3])}') + lines.append(f"{topic}|{entity_str}|→{drawer_ref}") + for quote in quotes[:3]: + lines.append(f'"{quote}"|{entity_str}|→{drawer_ref}') + + # Always have at least one line if not lines: - lines.append(f"{wing}/{room}|{entity_str}|→{','.join(drawer_ids[:3])}") + name = Path(source_file).stem[:40] + lines.append(f"{wing}/{room}/{name}|{entity_str}|→{drawer_ref}") - return "\n".join(lines) + return lines -def upsert_closet(closets_col, closet_id, closet_text, metadata): - """Add or update a closet. Respects CLOSET_CHAR_LIMIT.""" - try: - existing = closets_col.get(ids=[closet_id]) - if existing.get("ids"): - old_text = existing["documents"][0] - if len(old_text) + len(closet_text) + 1 <= CLOSET_CHAR_LIMIT: - closet_text = old_text + "\n" + closet_text - # else: start fresh — old closet was full - except Exception: - pass - closets_col.upsert(documents=[closet_text], ids=[closet_id], metadatas=[metadata]) +def upsert_closet_lines(closets_col, closet_id_base, lines, metadata): + """Add topic lines to closets. Never splits a topic mid-line. + + If adding a line WHOLE would exceed CLOSET_CHAR_LIMIT, a new closet + is created. Some closets may have less than 1500 chars — that's fine. + Every topic is complete and readable. + + Returns the number of closets written. 
+ """ + closet_num = 1 + current_lines = [] + current_chars = 0 + closets_written = 0 + + def _flush(): + nonlocal closets_written + if not current_lines: + return + closet_id = f"{closet_id_base}_{closet_num:02d}" + text = "\n".join(current_lines) + + # Check if closet already has content — append if room + try: + existing = closets_col.get(ids=[closet_id]) + if existing.get("ids") and existing["documents"][0]: + old = existing["documents"][0] + if len(old) + len(text) + 1 <= CLOSET_CHAR_LIMIT: + text = old + "\n" + text + except Exception: + pass + + closets_col.upsert(documents=[text], ids=[closet_id], metadatas=[metadata]) + closets_written += 1 + + for line in lines: + line_len = len(line) + # Would this line fit whole in the current closet? + if current_chars > 0 and current_chars + line_len + 1 > CLOSET_CHAR_LIMIT: + # Doesn't fit — flush current closet, start new one + _flush() + closet_num += 1 + current_lines = [] + current_chars = 0 + + current_lines.append(line) + current_chars += line_len + 1 # +1 for newline + + _flush() + return closets_written @contextlib.contextmanager From ee60cad652d89b2302d65f797f8cf2f997bccc87 Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Mon, 13 Apr 2026 07:38:43 -0300 Subject: [PATCH 4/8] =?UTF-8?q?docs:=20add=20CLOSETS.md=20=E2=80=94=20clos?= =?UTF-8?q?et=20layer=20overview?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cherry-picked the docs portion of 67e4ac6 to accompany the closet feature. Test coverage for closets is omnibus with tests for entity metadata and BM25 (see PR targeting those features) and will land together in a follow-up. 
Co-Authored-By: MSL <232237854+milla-jovovich@users.noreply.github.com> --- docs/CLOSETS.md | 79 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 docs/CLOSETS.md diff --git a/docs/CLOSETS.md b/docs/CLOSETS.md new file mode 100644 index 0000000..c4e9615 --- /dev/null +++ b/docs/CLOSETS.md @@ -0,0 +1,79 @@ +# Closets — The Searchable Index Layer + +## What closets are + +Drawers hold your verbatim content. Closets are the index — compact pointers that tell the searcher which drawers to open. + +``` +CLOSET: "built auth system|Ben;Igor|→drawer_api_auth_a1b2c3" + ↑ topic ↑ entities ↑ points to this drawer +``` + +An agent searching "who built the auth?" hits the closet first (fast scan of short text), then opens the referenced drawer to get the full verbatim content. + +## Lifecycle + +### When are closets created? + +Closets are created during `mempalace mine`. For each file mined: +1. Content is chunked into drawers (verbatim, ~800 chars each) +2. Topics, entities, and quotes are extracted from the content +3. A closet is created with pointer lines to those drawers + +### What's inside a closet? + +Each line is one atomic topic pointer: +``` +topic description|entity1;entity2|→drawer_id_1,drawer_id_2 +"verbatim quote from the content"|entity1|→drawer_id_3 +``` + +Topics are never split across closets. If adding a topic would exceed 1,500 characters, a new closet is created. + +### When do closets update? + +When a file is re-mined (content changed), its drawers are replaced and new closets are built from the fresh content. The old closet content is replaced via upsert. + +### What about stale topics? + +If a file's content changes and a topic no longer exists, the closet is rebuilt entirely from the new content — stale topics are gone. Closets are tied to source files, not to individual topics. 
+ +If you add content to an existing file (e.g., a daily diary growing throughout the day), new topics are appended to the existing closet until the 1,500-char limit, then a new closet is created. + +### Do closets survive palace rebuilds? + +Closets are stored in the `mempalace_closets` ChromaDB collection alongside `mempalace_drawers`. If you delete and rebuild the palace, closets are recreated during the next `mempalace mine`. + +## How search uses closets + +``` +Query → search mempalace_closets (fast, small documents) + ↓ + top closet hits → extract drawer IDs from pointer lines + ↓ + fetch drawers from mempalace_drawers (full verbatim content) + ↓ + BM25 hybrid re-rank (keyword match + vector similarity) + ↓ + return results to user +``` + +If no closets exist (palace created before this feature), search falls back to direct drawer search. Closets are created on next mine. + +## Limits + +| Setting | Value | Reason | +|---------|-------|--------| +| Max closet size | 1,500 chars | Leaves buffer under ChromaDB's working limit | +| Max topics per file | 12 | Keeps closets focused | +| Max quotes per file | 3 | Most relevant only | +| Max entities per pointer | 5 | Top names by frequency | +| Max response chars | 10,000 | Prevents hydration blowup on large files | + +## For developers + +Closet functions live in `mempalace/palace.py`: +- `get_closets_collection()` — get the closets ChromaDB collection +- `build_closet_lines()` — extract topics/entities/quotes into pointer lines +- `upsert_closet_lines()` — write lines to closets respecting the char limit +- `CLOSET_CHAR_LIMIT` — the 1,500 char limit constant From 69d6e2f7f3a6703396b10e39a790b8aa5e193a0c Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Mon, 13 Apr 2026 15:46:27 -0300 Subject: [PATCH 5/8] fix: sync version.py to 3.2.0 Commit 6614b9b bumped pyproject.toml to 3.2.0 but missed mempalace/version.py, breaking test_version_consistency on every PR's CI. 
This syncs them. --- mempalace/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mempalace/version.py b/mempalace/version.py index 1eb21a2..45176bc 100644 --- a/mempalace/version.py +++ b/mempalace/version.py @@ -1,3 +1,3 @@ """Single source of truth for the MemPalace package version.""" -__version__ = "3.1.0" +__version__ = "3.2.0" From ca2598a9f69247429c367217eaf167c9d9c824da Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Mon, 13 Apr 2026 16:11:03 -0300 Subject: [PATCH 6/8] fix(normalize): make strip_noise verbatim-safe and scope it to Claude Code JSONL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The initial strip_noise() regressed on three fronts when audited against adversarial user content — each verified with executable repros against the cherry-picked code: 1. `.*?` with re.DOTALL span-ate across messages: one stray unclosed opening tag anywhere in a session merged with the next closing tag, silently deleting everything between them (including full assistant replies). 2. `.*\(ctrl\+o to expand\).*\n?` nuked entire lines of user prose whenever a user happened to document the TUI shortcut. 3. `Ran \d+ (?:stop|pre|post)\s*hook.*` with IGNORECASE ate the second sentence from "our CI has a stop hook ... Ran 2 stop hooks last week" — legitimate user commentary. These are unambiguous violations of the project's "Verbatim always" design principle. Fixes: - All tag patterns are now line-anchored (`(?m)^(?:> )?`) and their body forbids crossing a blank line (`(?:(?!\n\s*\n)[\s\S])*?`), so a dangling open tag cannot eat neighboring messages. - `_NOISE_LINE_PREFIXES` are line-anchored and case-sensitive — user prose mentioning "CURRENT TIME:" mid-sentence is preserved. - Hook-run chrome requires `(?m)^`, explicit hook names (Stop, PreCompact, PreToolUse, etc.), and no IGNORECASE. - "… +N lines" is line-anchored. 
- "(ctrl+o to expand)" only matches Claude Code's actual collapsed- output chrome shape `[N tokens] (ctrl+o to expand)`; a bare parenthetical in user prose stays intact. Scope: - `strip_noise()` is no longer called on every normalization path. Only `_try_claude_code_jsonl` invokes it, per-extracted-message — so Claude.ai exports, ChatGPT exports, Slack JSON, Codex JSONL, and plain text with `>` markers pass through fully verbatim. Per-message application also makes span-eating structurally impossible. Tests: - 15 new tests in test_normalize.py pin the boundary: 6 guard user content that must survive (each of the adversarial repros), 9 assert real system chrome is still stripped. All pass; full suite 702 pass (2 failures are the unrelated pre-existing version.py bug, cleared by #820). Known limitation (not fixed here): convo_miner.py does not delete drawers on re-mine, so transcripts mined before this PR keep noise- filled drawers until the user manually erases + re-mines. Proper fix needs a schema-version field on drawer metadata + re-mine trigger — out of scope for this PR. --- mempalace/normalize.py | 101 +++++++++++++++++++-------- tests/test_normalize.py | 146 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 218 insertions(+), 29 deletions(-) diff --git a/mempalace/normalize.py b/mempalace/normalize.py index 256a5e9..f2b8173 100644 --- a/mempalace/normalize.py +++ b/mempalace/normalize.py @@ -22,20 +22,40 @@ from typing import Optional # ─── Noise stripping ───────────────────────────────────────────────────── -# Claude Code and other tools inject system tags, hook output, UI chrome, -# and tool-call JSON into transcripts. These waste drawer space and pollute -# search results. Strip them before filing. +# Claude Code and other tools inject system tags, hook output, and UI chrome +# into transcripts. These waste drawer space and pollute search results. 
+# +# Verbatim is sacred — every pattern here is anchored to line boundaries and +# refuses to cross blank lines, so a stray unclosed tag in one message can +# never eat content from neighboring messages. When in doubt, leave text +# alone. -_NOISE_TAG_PATTERNS = [ - re.compile(r"]*>.*?", re.DOTALL), - re.compile(r"]*>.*?", re.DOTALL), - re.compile(r"]*>.*?", re.DOTALL), - re.compile(r"]*>.*?", re.DOTALL), - re.compile(r"]*>.*?", re.DOTALL), - re.compile(r"]*>.*?", re.DOTALL), -] +_NOISE_TAGS = ( + "system-reminder", + "command-message", + "command-name", + "task-notification", + "user-prompt-submit-hook", + "hook_output", +) -_NOISE_STRINGS = [ + +def _tag_pattern(name: str) -> "re.Pattern[str]": + # Opening tag must begin a line (optionally after a `> ` blockquote marker, + # since _messages_to_transcript prefixes lines with `> `). Body is lazy but + # forbidden from crossing a blank line, so a dangling open tag can't span + # multiple messages. Closing tag eats optional trailing whitespace + newline. + return re.compile( + rf"(?m)^(?:> )?<{name}(?:\s[^>]*)?>" rf"(?:(?!\n\s*\n)[\s\S])*?" rf"[ \t]*\n?" + ) + + +_NOISE_TAG_PATTERNS = [_tag_pattern(t) for t in _NOISE_TAGS] + +# Strings that identify an entire noise line when found at its start. +# Matched case-sensitively and anchored to line-start so user prose mentioning +# e.g. "current time:" in a sentence is untouched. +_NOISE_LINE_PREFIXES = ( "CURRENT TIME:", "VERIFIED FACTS (do not contradict)", "AGENT SPECIALIZATION:", @@ -46,20 +66,39 @@ _NOISE_STRINGS = [ "Auto-save reminder...", "Checking pipeline...", "MemPalace auto-save checkpoint.", +) + +_NOISE_LINE_PATTERNS = [ + re.compile(rf"(?m)^(?:> )?{re.escape(p)}.*\n?") for p in _NOISE_LINE_PREFIXES ] +# Claude Code TUI hook-run chrome, e.g. "Ran 2 Stop hook", "Ran 1 PreCompact hook". +# Line-anchored, case-sensitive, explicit hook names — prose like +# "our CI has a stop hook" stays intact. 
+_HOOK_LINE_RE = re.compile( + r"(?m)^(?:> )?Ran \d+ (?:Stop|PreCompact|PreToolUse|PostToolUse|UserPromptSubmit|Notification|SessionStart|SessionEnd) hook[s]?.*\n?" +) + +# "… +N lines" collapsed-output marker, line-anchored. +_COLLAPSED_LINES_RE = re.compile(r"(?m)^(?:> )?…\s*\+\d+ lines.*\n?") + def strip_noise(text: str) -> str: - """Remove system tags, hook output, and Claude Code UI chrome from text.""" + """Remove system tags, hook output, and Claude Code UI chrome from text. + + All patterns are line-anchored. User prose that happens to mention these + strings inline (e.g., documenting them) is preserved verbatim. + """ for pat in _NOISE_TAG_PATTERNS: text = pat.sub("", text) - for noise in _NOISE_STRINGS: - text = text.replace(noise, "") - # Strip Claude Code UI chrome - text = re.sub(r".*\(ctrl\+o to expand\).*\n?", "", text) - text = re.sub(r"Ran \d+ (?:stop|pre|post)\s*hook.*\n?", "", text, flags=re.IGNORECASE) - text = re.sub(r"…\s*\+\d+ lines.*\n?", "", text) - # Collapse runs of blank lines + for pat in _NOISE_LINE_PATTERNS: + text = pat.sub("", text) + text = _HOOK_LINE_RE.sub("", text) + text = _COLLAPSED_LINES_RE.sub("", text) + # Strip the Claude Code collapsed-output chrome "[N tokens] (ctrl+o to expand)". + # Narrow shape — a bare "(ctrl+o to expand)" in user prose stays intact. + text = re.sub(r"\s*\[\d+\s+tokens?\]\s*\(ctrl\+o to expand\)", "", text) + # Collapse runs of blank lines created by the removals text = re.sub(r"\n{4,}", "\n\n\n", text) return text.strip() @@ -84,23 +123,21 @@ def normalize(filepath: str) -> str: if not content.strip(): return content - # Already has > markers — pass through (strip noise but preserve trailing newline) + # Already has > markers — pass through unchanged. 
lines = content.split("\n") if sum(1 for line in lines if line.strip().startswith(">")) >= 3: - cleaned = strip_noise(content) - # Preserve trailing newline if original had one - if content.endswith("\n") and not cleaned.endswith("\n"): - cleaned += "\n" - return cleaned + return content - # Try JSON normalization + # Try JSON normalization. strip_noise is applied inside the Claude Code + # JSONL parser (the only format that injects system tags/hook chrome); + # other formats pass through verbatim. ext = Path(filepath).suffix.lower() if ext in (".json", ".jsonl") or content.strip()[:1] in ("{", "["): normalized = _try_normalize_json(content) if normalized: - return strip_noise(normalized) + return normalized - return strip_noise(content) + return content def _try_normalize_json(content: str) -> Optional[str]: @@ -160,6 +197,10 @@ def _try_claude_code_jsonl(content: str) -> Optional[str]: isinstance(b, dict) and b.get("type") == "tool_result" for b in msg_content ) text = _extract_content(msg_content, tool_use_map=tool_use_map) + # Strip Claude Code system-injected noise per message, never across + # message boundaries — prevents span-eating. 
+ if text: + text = strip_noise(text) if text: if is_tool_only and messages and messages[-1][0] == "assistant": # Append tool results to the previous assistant message @@ -169,6 +210,8 @@ def _try_claude_code_jsonl(content: str) -> Optional[str]: messages.append(("user", text)) elif msg_type == "assistant": text = _extract_content(msg_content, tool_use_map=tool_use_map) + if text: + text = strip_noise(text) if text: # If previous message is also assistant (multi-turn tool loop), # merge into the same assistant turn diff --git a/tests/test_normalize.py b/tests/test_normalize.py index 7f0652a..53fc933 100644 --- a/tests/test_normalize.py +++ b/tests/test_normalize.py @@ -13,6 +13,7 @@ from mempalace.normalize import ( _try_normalize_json, _try_slack_json, normalize, + strip_noise, ) @@ -1048,3 +1049,148 @@ def test_normalize_rejects_large_file(): assert False, "Should have raised IOError" except IOError as e: assert "too large" in str(e).lower() + + +# ── strip_noise() — verbatim-safety boundary tests ───────────────────── +# +# The "Verbatim always" design principle requires that we never delete +# user-authored text. These tests pin down the boundary between system +# noise (which we strip) and user prose that happens to mention the same +# strings (which must survive untouched). + + +class TestStripNoisePreservesUserContent: + """User prose that mentions noise strings inline must be preserved.""" + + def test_user_discusses_stop_hook_in_prose(self): + # Regression: original regex with IGNORECASE + `.*\n?` ate the second + # sentence from real user commentary. + text = ( + "> User:\n" + "> Our CI has a stop hook that rejects merges after 5pm. " + "Ran 2 stop hooks last week.\n" + "> Assistant:\n" + "> Got it." + ) + assert strip_noise(text) == text.strip() + + def test_user_mentions_system_reminder_inline(self): + # Inline tags inside user prose (e.g. documenting + # Claude Code behavior) must not be stripped. 
+ text = ( + "> User:\n" + "> Here is what Claude Code emits: " + "<system-reminder>Auto-save reminder...</system-reminder>" + " — I want to ignore it." + ) + assert strip_noise(text) == text.strip() + + def test_ctrl_o_hint_in_prose_preserved(self): + # Regression: original `.*\(ctrl\+o to expand\).*\n?` nuked the whole + # line whenever a user documented the TUI shortcut. + text = ( + "> User:\n" + "> In the TUI you hit (ctrl+o to expand) to see more. " + "That is the shortcut I want to document." + ) + assert strip_noise(text) == text.strip() + + def test_current_time_inline_in_prose(self): + text = "> User:\n> At CURRENT TIME: the meeting starts, not before." + assert strip_noise(text) == text.strip() + + def test_plus_n_lines_marker_inline(self): + text = "> User:\n> The log showed … +50 lines of stack trace, useful." + assert strip_noise(text) == text.strip() + + def test_dangling_open_tag_does_not_span_messages(self): + # THE span-eating bug: a stray unclosed <system-reminder> in one + # message must NOT merge with a closing tag in another message and + # silently delete everything in between. + text = ( + "> User 1: normal content <system-reminder>A\n" + "> Assistant: reply\n" + "> User 2: more content </system-reminder>tail" + ) + out = strip_noise(text) + assert "Assistant: reply" in out + assert "User 2: more content" in out + assert "User 1: normal content" in out + + +class TestStripNoiseRemovesSystemChrome: + """System-injected noise with standalone/line-anchored shape must be stripped.""" + + def test_strips_line_anchored_system_reminder_block(self): + text = ( + "> User:\n" + "<system-reminder>\n" + "Auto-save reminder...\n" + "</system-reminder>\n" + "> Real message." + ) + out = strip_noise(text) + assert "system-reminder" not in out + assert "Auto-save reminder" not in out + assert "Real message." in out + + def test_strips_system_reminder_with_blockquote_prefix(self): + # _messages_to_transcript prefixes lines with "> ", so the line + # anchor must also accept that shape. + text = "> User:\n" "> <system-reminder>Injected noise</system-reminder>\n" "> Real message." 
+ out = strip_noise(text) + assert "Injected noise" not in out + assert "Real message." in out + + def test_strips_standalone_ran_hook_line(self): + text = "Ran 2 Stop hook\n> User: real content" + out = strip_noise(text) + assert "Ran 2 Stop hook" not in out + assert "real content" in out + + def test_strips_known_hook_names(self): + for hook in ("Stop", "PreCompact", "PreToolUse", "PostToolUse", "UserPromptSubmit"): + text = f"Ran 1 {hook} hook\n> User: content" + assert hook not in strip_noise(text) + + def test_strips_current_time_standalone(self): + text = "CURRENT TIME: 2026-04-13 10:00 UTC\n> User: Hello" + out = strip_noise(text) + assert "CURRENT TIME" not in out + assert "Hello" in out + + def test_strips_collapsed_lines_marker(self): + text = "… +42 lines\n> User: Hello" + out = strip_noise(text) + assert "+42 lines" not in out + assert "Hello" in out + + def test_strips_token_count_ctrl_o_chrome(self): + # Claude Code's actual collapsed-output chrome: "[N tokens] (ctrl+o to expand)" + text = "> Assistant: some output [5 tokens] (ctrl+o to expand)\n> User: ok" + out = strip_noise(text) + assert "(ctrl+o to expand)" not in out + assert "[5 tokens]" not in out + assert "some output" in out + + def test_strips_each_known_noise_tag(self): + for tag in ( + "system-reminder", + "command-message", + "command-name", + "task-notification", + "user-prompt-submit-hook", + "hook_output", + ): + text = f"> User:\n<{tag}>junk</{tag}>\n> Real." + out = strip_noise(text) + assert tag not in out, f"{tag} leaked into output" + assert "Real." 
in out + + def test_collapses_excessive_blank_lines(self): + text = "line one\n\n\n\n\n\nline two" + out = strip_noise(text) + assert "line one" in out + assert "line two" in out + # Should collapse to no more than 3 newlines + assert "\n\n\n\n" not in out From 7e5eeda9a5c22168719067d15af8b2424662f586 Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Mon, 13 Apr 2026 16:20:55 -0300 Subject: [PATCH 7/8] feat(normalize): auto-rebuild stale drawers via NORMALIZE_VERSION schema gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without this, the strip_noise improvement only helps new mines. Every user who had already mined Claude Code JSONL sessions would keep their noise-polluted drawers forever, because convo_miner's file_already_mined skip short-circuits before re-processing. Adds a versioned schema gate so upgrades propagate silently: - palace.NORMALIZE_VERSION=2 — bumped when the normalization pipeline changes shape (this PR's strip_noise is the v1→v2 bump). - file_already_mined now returns False if the stored normalize_version is missing or less than current, triggering a rebuild on next mine. - Both miners stamp drawers with the current normalize_version. - convo_miner now purges stale drawers before inserting fresh chunks (mirrors miner.py's existing delete+insert), extracted into _file_convo_chunks helper to keep mine_convos under ruff's C901 limit. User experience: upgrade mempalace, run `mempalace mine` as usual, old noisy drawers get silently replaced with clean ones. No erase needed, no "you need to rebuild" changelog footgun. Tests: - test_file_already_mined_returns_false_for_stale_normalize_version — pins the version gate contract for missing/v1/current. - test_add_drawer_stamps_normalize_version — fresh project-miner drawers carry the field. 
- test_mine_convos_rebuilds_stale_drawers_after_schema_bump — end-to-end proof that a pre-v2 palace gets silently cleaned on next mine, with orphan drawers purged and NOT skipped. Existing test_file_already_mined_check_mtime updated to include the new field; all other tests unaffected. --- mempalace/convo_miner.py | 83 ++++++++++++++++++++++------------ mempalace/miner.py | 3 +- mempalace/palace.py | 28 ++++++++++-- tests/test_convo_miner.py | 83 ++++++++++++++++++++++++++++++++++ tests/test_miner.py | 94 +++++++++++++++++++++++++++++++++++++-- 5 files changed, 253 insertions(+), 38 deletions(-) diff --git a/mempalace/convo_miner.py b/mempalace/convo_miner.py index d406073..663f1a0 100644 --- a/mempalace/convo_miner.py +++ b/mempalace/convo_miner.py @@ -16,7 +16,7 @@ from datetime import datetime from collections import defaultdict from .normalize import normalize -from .palace import SKIP_DIRS, get_collection, file_already_mined +from .palace import NORMALIZE_VERSION, SKIP_DIRS, file_already_mined, get_collection # File types that might contain conversations @@ -51,6 +51,7 @@ def _register_file(collection, source_file: str, wing: str, agent: str): "added_by": agent, "filed_at": datetime.now().isoformat(), "ingest_mode": "registry", + "normalize_version": NORMALIZE_VERSION, } ], ) @@ -272,6 +273,52 @@ def scan_convos(convo_dir: str) -> list: # ============================================================================= +def _file_convo_chunks(collection, source_file, chunks, wing, room, agent, extract_mode): + """Purge stale drawers for ``source_file`` then upsert fresh chunks. + + Returns (drawers_added, room_counts_delta). + """ + # Purge stale drawers first. When the normalize schema bumps, + # file_already_mined() returns False for pre-v2 drawers and we land + # here — clean them out so the source doesn't end up with a mix of + # old-noise and new-clean drawers. 
+ try: + collection.delete(where={"source_file": source_file}) + except Exception: + pass + + room_counts_delta: dict = defaultdict(int) + drawers_added = 0 + for chunk in chunks: + chunk_room = chunk.get("memory_type", room) if extract_mode == "general" else room + if extract_mode == "general": + room_counts_delta[chunk_room] += 1 + drawer_id = f"drawer_{wing}_{chunk_room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}" + try: + collection.upsert( + documents=[chunk["content"]], + ids=[drawer_id], + metadatas=[ + { + "wing": wing, + "room": chunk_room, + "source_file": source_file, + "chunk_index": chunk["chunk_index"], + "added_by": agent, + "filed_at": datetime.now().isoformat(), + "ingest_mode": "convos", + "extract_mode": extract_mode, + "normalize_version": NORMALIZE_VERSION, + } + ], + ) + drawers_added += 1 + except Exception as e: + if "already exists" not in str(e).lower(): + raise + return drawers_added, room_counts_delta + + def mine_convos( convo_dir: str, palace_path: str, @@ -375,34 +422,12 @@ def mine_convos( if extract_mode != "general": room_counts[room] += 1 - # File each chunk - drawers_added = 0 - for chunk in chunks: - chunk_room = chunk.get("memory_type", room) if extract_mode == "general" else room - if extract_mode == "general": - room_counts[chunk_room] += 1 - drawer_id = f"drawer_{wing}_{chunk_room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}" - try: - collection.upsert( - documents=[chunk["content"]], - ids=[drawer_id], - metadatas=[ - { - "wing": wing, - "room": chunk_room, - "source_file": source_file, - "chunk_index": chunk["chunk_index"], - "added_by": agent, - "filed_at": datetime.now().isoformat(), - "ingest_mode": "convos", - "extract_mode": extract_mode, - } - ], - ) - drawers_added += 1 - except Exception as e: - if "already exists" not in str(e).lower(): - raise + # Purge stale drawers + file fresh chunks. 
+ drawers_added, room_delta = _file_convo_chunks( + collection, source_file, chunks, wing, room, agent, extract_mode + ) + for r, n in room_delta.items(): + room_counts[r] += n total_drawers += drawers_added print(f" ✓ [{i:4}/{len(files)}] {filepath.name[:50]:50} +{drawers_added}") diff --git a/mempalace/miner.py b/mempalace/miner.py index 22c8af3..49e0d25 100644 --- a/mempalace/miner.py +++ b/mempalace/miner.py @@ -15,7 +15,7 @@ from pathlib import Path from datetime import datetime from collections import defaultdict -from .palace import SKIP_DIRS, get_collection, file_already_mined +from .palace import NORMALIZE_VERSION, SKIP_DIRS, file_already_mined, get_collection READABLE_EXTENSIONS = { ".txt", @@ -381,6 +381,7 @@ def add_drawer( "chunk_index": chunk_index, "added_by": agent, "filed_at": datetime.now().isoformat(), + "normalize_version": NORMALIZE_VERSION, } # Store file mtime so we can detect modifications later. try: diff --git a/mempalace/palace.py b/mempalace/palace.py index 948fecc..9cfb55e 100644 --- a/mempalace/palace.py +++ b/mempalace/palace.py @@ -36,6 +36,16 @@ SKIP_DIRS = { _DEFAULT_BACKEND = ChromaBackend() +# Schema version for drawer normalization. Bump when the normalization +# pipeline changes in a way that existing drawers should be rebuilt to pick up +# (e.g., new noise-stripping rules). `file_already_mined` treats drawers with +# a missing or stale `normalize_version` as "not mined", so the next mine pass +# silently rebuilds them — users don't need to manually erase + re-mine. +# +# v2 (2026-04): introduced strip_noise() for Claude Code JSONL; previous +# drawers stored system tags / hook chrome verbatim. +NORMALIZE_VERSION = 2 + def get_collection( palace_path: str, @@ -53,16 +63,26 @@ def get_collection( def file_already_mined(collection, source_file: str, check_mtime: bool = False) -> bool: """Check if a file has already been filed in the palace. 
- When check_mtime=True (used by project miner), returns False if the file - has been modified since it was last mined, so it gets re-mined. - When check_mtime=False (used by convo miner), just checks existence. + Returns False (so the file gets re-mined) when: + - no drawers exist for this source_file + - the stored `normalize_version` is missing or older than the current + schema (triggers silent rebuild after a normalization upgrade) + - `check_mtime=True` and the file's mtime differs from the stored one + + When check_mtime=True (used by project miner), also re-mines on content + change. When check_mtime=False (used by convo miner), transcripts are + assumed immutable, so only the version gate triggers a rebuild. """ try: results = collection.get(where={"source_file": source_file}, limit=1) if not results.get("ids"): return False + stored_meta = results.get("metadatas", [{}])[0] or {} + # Pre-v2 drawers have no version field — treat them as stale. + stored_version = stored_meta.get("normalize_version", 1) + if stored_version < NORMALIZE_VERSION: + return False if check_mtime: - stored_meta = results.get("metadatas", [{}])[0] stored_mtime = stored_meta.get("source_mtime") if stored_mtime is None: return False diff --git a/tests/test_convo_miner.py b/tests/test_convo_miner.py index f5074b4..166644b 100644 --- a/tests/test_convo_miner.py +++ b/tests/test_convo_miner.py @@ -75,3 +75,86 @@ def test_mine_convos_does_not_reprocess_empty_chunk_files(capsys): assert "Files skipped (already filed): 1" in out2 finally: shutil.rmtree(tmpdir, ignore_errors=True) + + +def test_mine_convos_rebuilds_stale_drawers_after_schema_bump(capsys): + """When stored drawers have an older normalize_version, the next mine + silently purges them and refiles — no manual erase required. 
+ + This is what makes the strip_noise upgrade apply to existing corpora: + users just run `mempalace mine` again and old noise-filled drawers get + replaced with clean ones.""" + from mempalace.palace import NORMALIZE_VERSION + + tmpdir = tempfile.mkdtemp() + try: + convo_path = Path(tmpdir) / "chat.txt" + convo_path.write_text( + "> What is memory?\nMemory is persistence.\n\n" + "> Why does it matter?\nIt enables continuity.\n\n" + "> How do we build it?\nWith structured storage.\n" + ) + palace_path = os.path.join(tmpdir, "palace") + + # First mine — stamps drawers with NORMALIZE_VERSION + mine_convos(tmpdir, palace_path, wing="test") + capsys.readouterr() + + client = chromadb.PersistentClient(path=palace_path) + col = client.get_collection("mempalace_drawers") + resolved = str(Path(tmpdir).resolve() / "chat.txt") + first_pass = col.get(where={"source_file": resolved}) + first_ids = set(first_pass["ids"]) + assert first_ids, "first mine should produce drawers" + for meta in first_pass["metadatas"]: + assert meta.get("normalize_version") == NORMALIZE_VERSION + + # Simulate pre-v2 drawers: rewrite metadata to an older version, + # and replace content with "noise" so we can see it get cleaned up. + stale_metas = [] + for meta in first_pass["metadatas"]: + stale = dict(meta) + stale["normalize_version"] = 1 + stale_metas.append(stale) + col.update( + ids=list(first_pass["ids"]), + documents=["STALE NOISE"] * len(first_pass["ids"]), + metadatas=stale_metas, + ) + # Add an extra orphan drawer that should also be purged. 
+ col.add( + ids=["orphan_drawer"], + documents=["OLD ORPHAN"], + metadatas=[ + { + "wing": "test", + "room": "default", + "source_file": resolved, + "chunk_index": 999, + "normalize_version": 1, + } + ], + ) + del col, client + + # Second mine — version gate should trigger rebuild + mine_convos(tmpdir, palace_path, wing="test") + out = capsys.readouterr().out + assert ( + "Files skipped (already filed): 0" in out + ), "stale drawers should force a rebuild, not a skip" + + client = chromadb.PersistentClient(path=palace_path) + col = client.get_collection("mempalace_drawers") + rebuilt = col.get(where={"source_file": resolved}) + # Orphan is gone + assert "orphan_drawer" not in rebuilt["ids"] + # No stale content survived + assert all("STALE NOISE" not in d for d in rebuilt["documents"]) + assert all("OLD ORPHAN" not in d for d in rebuilt["documents"]) + # All rebuilt drawers carry the current version + for meta in rebuilt["metadatas"]: + assert meta.get("normalize_version") == NORMALIZE_VERSION + del col, client + finally: + shutil.rmtree(tmpdir, ignore_errors=True) diff --git a/tests/test_miner.py b/tests/test_miner.py index ea2f2a9..020d5bd 100644 --- a/tests/test_miner.py +++ b/tests/test_miner.py @@ -7,7 +7,7 @@ import chromadb import yaml from mempalace.miner import mine, scan_project, status -from mempalace.palace import file_already_mined +from mempalace.palace import NORMALIZE_VERSION, file_already_mined def write_file(path: Path, content: str): @@ -227,11 +227,17 @@ def test_file_already_mined_check_mtime(): assert file_already_mined(col, test_file) is False assert file_already_mined(col, test_file, check_mtime=True) is False - # Add it with mtime + # Add it with mtime + current normalize_version col.add( ids=["d1"], documents=["hello world"], - metadatas=[{"source_file": test_file, "source_mtime": str(mtime)}], + metadatas=[ + { + "source_file": test_file, + "source_mtime": str(mtime), + "normalize_version": NORMALIZE_VERSION, + } + ], ) # Already mined 
(no mtime check) @@ -253,7 +259,12 @@ def test_file_already_mined_check_mtime(): col.add( ids=["d2"], documents=["other"], - metadatas=[{"source_file": "/fake/no_mtime.txt"}], + metadatas=[ + { + "source_file": "/fake/no_mtime.txt", + "normalize_version": NORMALIZE_VERSION, + } + ], ) assert file_already_mined(col, "/fake/no_mtime.txt", check_mtime=True) is False finally: @@ -296,3 +307,78 @@ def test_status_missing_palace_does_not_create_empty_collection(tmp_path, capsys out = capsys.readouterr().out assert "No palace found" in out assert not palace_path.exists() + + +# ── normalize_version schema gate ─────────────────────────────────────── +# +# When the normalization pipeline changes shape (e.g., strip_noise lands), +# `NORMALIZE_VERSION` is bumped so pre-existing drawers can be silently +# rebuilt on the next mine. These tests pin that contract. + + +def test_file_already_mined_returns_false_for_stale_normalize_version(): + """Pre-v2 drawers (no field, or older integer) must not short-circuit.""" + tmpdir = tempfile.mkdtemp() + try: + palace_path = os.path.join(tmpdir, "palace") + os.makedirs(palace_path) + client = chromadb.PersistentClient(path=palace_path) + col = client.get_or_create_collection("mempalace_drawers") + + # Pre-v2 drawer: no normalize_version field at all + col.add( + ids=["d_old"], + documents=["old"], + metadatas=[{"source_file": "/fake/old.jsonl"}], + ) + assert file_already_mined(col, "/fake/old.jsonl") is False + + # Explicitly older version + col.add( + ids=["d_v1"], + documents=["v1"], + metadatas=[{"source_file": "/fake/v1.jsonl", "normalize_version": 1}], + ) + assert file_already_mined(col, "/fake/v1.jsonl") is False + + # Current version — short-circuits + col.add( + ids=["d_current"], + documents=["cur"], + metadatas=[ + { + "source_file": "/fake/current.jsonl", + "normalize_version": NORMALIZE_VERSION, + } + ], + ) + assert file_already_mined(col, "/fake/current.jsonl") is True + finally: + del col, client + shutil.rmtree(tmpdir, 
ignore_errors=True) + + +def test_add_drawer_stamps_normalize_version(tmp_path): + """Fresh drawers carry the current schema version so future upgrades work.""" + from mempalace.miner import add_drawer + + palace_path = tmp_path / "palace" + palace_path.mkdir() + client = chromadb.PersistentClient(path=str(palace_path)) + col = client.get_or_create_collection("mempalace_drawers") + try: + added = add_drawer( + collection=col, + wing="test", + room="notes", + content="hello", + source_file=str(tmp_path / "src.md"), + chunk_index=0, + agent="unit", + ) + assert added is True + stored = col.get(limit=1) + meta = stored["metadatas"][0] + assert meta["normalize_version"] == NORMALIZE_VERSION + finally: + del col, client From a3b7988d8791e10877293131e1f7c936e9a8aee1 Mon Sep 17 00:00:00 2001 From: MSL <232237854+milla-jovovich@users.noreply.github.com> Date: Mon, 13 Apr 2026 01:50:07 -0700 Subject: [PATCH 8/8] =?UTF-8?q?fix:=20stop=20hooks=20from=20making=20agent?= =?UTF-8?q?s=20write=20in=20chat=20=E2=80=94=20save=20tokens?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The save hook and precompact hook were telling the agent to write diary entries, add drawers, and add KG triples IN THE CHAT WINDOW. Every line written stays in conversation history and retransmits on every subsequent turn — ~$1/session in wasted tokens. Fix: hooks now say "saved in background, no action needed" and use decision: allow instead of block. The agent continues working without interruption. All filing happens via the background pipeline. 
Also updated hooks README with: - Known limitation: hooks require session restart after install - Updated cost section: zero tokens, background-only Co-Authored-By: Claude Opus 4.6 (1M context) --- hooks/README.md | 6 +++++- hooks/mempal_precompact_hook.sh | 6 +++--- hooks/mempal_save_hook.sh | 11 +++++++---- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/hooks/README.md b/hooks/README.md index d5380ef..977b109 100644 --- a/hooks/README.md +++ b/hooks/README.md @@ -133,6 +133,10 @@ Example output: [14:40:01] Session abc123: 18 exchanges, 3 since last save ``` +## Known Limitations + +**Hooks require session restart after install.** Claude Code loads hooks from `settings.json` at session start only. If you run `mempalace init` or manually edit hook config mid-session, the hooks won't fire until you restart Claude Code. This is a Claude Code limitation. + ## Cost -**Zero extra tokens.** The hooks are bash scripts that run locally. They don't call any API. The only "cost" is the AI spending a few seconds organizing memories at each checkpoint — and it's doing that with context it already has loaded. +**Zero extra tokens.** The hooks notify the AI that saves happened in the background — the AI doesn't need to write anything in the chat. All filing is handled automatically. Previous versions asked the AI to write diary entries and drawer content in the chat window, which cost ~$1/session in retransmitted tokens. diff --git a/hooks/mempal_precompact_hook.sh b/hooks/mempal_precompact_hook.sh index 550a813..1c14193 100755 --- a/hooks/mempal_precompact_hook.sh +++ b/hooks/mempal_precompact_hook.sh @@ -68,10 +68,10 @@ if [ -n "$MEMPAL_DIR" ] && [ -d "$MEMPAL_DIR" ]; then python3 -m mempalace mine "$MEMPAL_DIR" >> "$STATE_DIR/hook.log" 2>&1 fi -# Always block — compaction = save everything +# Notify — compaction is about to happen but filing is handled in background cat << 'HOOKJSON' { - "decision": "block", - "reason": "COMPACTION IMMINENT. 
Save ALL topics, decisions, quotes, code, and important context from this session to your memory system. Be thorough — after compaction, detailed context will be lost. Organize into appropriate categories. Use verbatim quotes where possible. Save everything, then allow compaction to proceed." + "decision": "allow", + "reason": "MemPalace pre-compaction save. Your full conversation has been saved verbatim in the background — no action needed. Compaction can proceed safely." } HOOKJSON diff --git a/hooks/mempal_save_hook.sh b/hooks/mempal_save_hook.sh index a0e4681..b15d961 100755 --- a/hooks/mempal_save_hook.sh +++ b/hooks/mempal_save_hook.sh @@ -140,12 +140,15 @@ if [ "$SINCE_LAST" -ge "$SAVE_INTERVAL" ] && [ "$EXCHANGE_COUNT" -gt 0 ]; then python3 -m mempalace mine "$MEMPAL_DIR" >> "$STATE_DIR/hook.log" 2>&1 & fi - # Block the AI and tell it to save - # The "reason" becomes a system message the AI sees and acts on + # Notify the AI that a checkpoint happened — but do NOT ask it to write + # anything in chat. All filing happens in the background via the pipeline. + # The old version asked the agent to write diary entries, add drawers, and + # add KG triples in the chat window — that cost ~$1/session in retransmitted + # tokens and cluttered the conversation. cat << 'HOOKJSON' { - "decision": "block", - "reason": "AUTO-SAVE checkpoint. Save key topics, decisions, quotes, and code from this session to your memory system. Organize into appropriate categories. Use verbatim quotes where possible. Continue conversation after saving." + "decision": "allow", + "reason": "MemPalace auto-save checkpoint. Your conversation is being saved verbatim in the background — no action needed from you. Continue working." } HOOKJSON else