From 9b99c136ee13c1dc97f3c20bb2f59f4464d7284d Mon Sep 17 00:00:00 2001 From: MSL <232237854+milla-jovovich@users.noreply.github.com> Date: Mon, 13 Apr 2026 01:55:25 -0700 Subject: [PATCH 1/8] fix: strip system tags, hook output, and Claude UI chrome from drawers normalize.py now strips before filing: - <system-reminder>, <command-message>, <command-name> tags - <task-notification>, <user-prompt-submit-hook>, <hook_output> tags - Hook status messages (CURRENT TIME, Checking verified facts, etc.) - Claude Code UI chrome (ctrl+o to expand, progress bars, etc.) - Collapsed runs of blank lines This noise was going straight into drawers, wasting storage space and polluting search results. strip_noise() runs on all normalized output regardless of input format (JSONL, JSON, plain text). 689/689 tests pass. Co-Authored-By: Claude Opus 4.6 (1M context) --- mempalace/normalize.py | 56 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 4 deletions(-) diff --git a/mempalace/normalize.py b/mempalace/normalize.py index e599df9..256a5e9 100644 --- a/mempalace/normalize.py +++ b/mempalace/normalize.py @@ -16,10 +16,54 @@ No API key. No internet. Everything local. import json import os +import re from pathlib import Path from typing import Optional +# ─── Noise stripping ───────────────────────────────────────────────────── +# Claude Code and other tools inject system tags, hook output, UI chrome, +# and tool-call JSON into transcripts. These waste drawer space and pollute +# search results. Strip them before filing. 
+ +_NOISE_TAG_PATTERNS = [ + re.compile(r"]*>.*?", re.DOTALL), + re.compile(r"]*>.*?", re.DOTALL), + re.compile(r"]*>.*?", re.DOTALL), + re.compile(r"]*>.*?", re.DOTALL), + re.compile(r"]*>.*?", re.DOTALL), + re.compile(r"]*>.*?", re.DOTALL), +] + +_NOISE_STRINGS = [ + "CURRENT TIME:", + "VERIFIED FACTS (do not contradict)", + "AGENT SPECIALIZATION:", + "Checking verified facts...", + "Injecting timestamp...", + "Starting background pipeline...", + "Checking emotional weights...", + "Auto-save reminder...", + "Checking pipeline...", + "MemPalace auto-save checkpoint.", +] + + +def strip_noise(text: str) -> str: + """Remove system tags, hook output, and Claude Code UI chrome from text.""" + for pat in _NOISE_TAG_PATTERNS: + text = pat.sub("", text) + for noise in _NOISE_STRINGS: + text = text.replace(noise, "") + # Strip Claude Code UI chrome + text = re.sub(r".*\(ctrl\+o to expand\).*\n?", "", text) + text = re.sub(r"Ran \d+ (?:stop|pre|post)\s*hook.*\n?", "", text, flags=re.IGNORECASE) + text = re.sub(r"…\s*\+\d+ lines.*\n?", "", text) + # Collapse runs of blank lines + text = re.sub(r"\n{4,}", "\n\n\n", text) + return text.strip() + + def normalize(filepath: str) -> str: """ Load a file and normalize to transcript format if it's a chat export. 
@@ -40,19 +84,23 @@ def normalize(filepath: str) -> str: if not content.strip(): return content - # Already has > markers — pass through + # Already has > markers — pass through (strip noise but preserve trailing newline) lines = content.split("\n") if sum(1 for line in lines if line.strip().startswith(">")) >= 3: - return content + cleaned = strip_noise(content) + # Preserve trailing newline if original had one + if content.endswith("\n") and not cleaned.endswith("\n"): + cleaned += "\n" + return cleaned # Try JSON normalization ext = Path(filepath).suffix.lower() if ext in (".json", ".jsonl") or content.strip()[:1] in ("{", "["): normalized = _try_normalize_json(content) if normalized: - return normalized + return strip_noise(normalized) - return content + return strip_noise(content) def _try_normalize_json(content: str) -> Optional[str]: From 69d6e2f7f3a6703396b10e39a790b8aa5e193a0c Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Mon, 13 Apr 2026 15:46:27 -0300 Subject: [PATCH 2/8] fix: sync version.py to 3.2.0 Commit 6614b9b bumped pyproject.toml to 3.2.0 but missed mempalace/version.py, breaking test_version_consistency on every PR's CI. This syncs them. 
--- mempalace/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mempalace/version.py b/mempalace/version.py index 1eb21a2..45176bc 100644 --- a/mempalace/version.py +++ b/mempalace/version.py @@ -1,3 +1,3 @@ """Single source of truth for the MemPalace package version.""" -__version__ = "3.1.0" +__version__ = "3.2.0" From 09f218cbb2912df53c6ec563c89f08251235a92f Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Mon, 13 Apr 2026 15:48:54 -0300 Subject: [PATCH 3/8] refactor: extract locked filing block to keep mine_convos under C901 Adding the per-file lock + double-checked file_already_mined() in the previous commit pushed mine_convos cyclomatic complexity from 25 to 26, just over ruff's max-complexity threshold. Hoist the locked critical section into _file_chunks_locked() so the outer loop stays within budget. No behavior change. --- mempalace/convo_miner.py | 82 ++++++++++++++++++++++++---------------- 1 file changed, 49 insertions(+), 33 deletions(-) diff --git a/mempalace/convo_miner.py b/mempalace/convo_miner.py index f24fa69..6a021ec 100644 --- a/mempalace/convo_miner.py +++ b/mempalace/convo_miner.py @@ -272,6 +272,47 @@ def scan_convos(convo_dir: str) -> list: # ============================================================================= +def _file_chunks_locked(collection, source_file, chunks, wing, room, agent, extract_mode): + """Acquire the per-file lock, double-check mined status, and upsert chunks. + + Returns (drawers_added, room_counts_delta, skipped). 
+ """ + room_counts_delta: dict = defaultdict(int) + drawers_added = 0 + with mine_lock(source_file): + # Re-check after lock — another agent may have just finished this file + if file_already_mined(collection, source_file): + return 0, room_counts_delta, True + + for chunk in chunks: + chunk_room = chunk.get("memory_type", room) if extract_mode == "general" else room + if extract_mode == "general": + room_counts_delta[chunk_room] += 1 + drawer_id = f"drawer_{wing}_{chunk_room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}" + try: + collection.upsert( + documents=[chunk["content"]], + ids=[drawer_id], + metadatas=[ + { + "wing": wing, + "room": chunk_room, + "source_file": source_file, + "chunk_index": chunk["chunk_index"], + "added_by": agent, + "filed_at": datetime.now().isoformat(), + "ingest_mode": "convos", + "extract_mode": extract_mode, + } + ], + ) + drawers_added += 1 + except Exception as e: + if "already exists" not in str(e).lower(): + raise + return drawers_added, room_counts_delta, False + + def mine_convos( convo_dir: str, palace_path: str, @@ -376,39 +417,14 @@ def mine_convos( room_counts[room] += 1 # File each chunk — lock to prevent concurrent agents duplicating - drawers_added = 0 - with mine_lock(source_file): - # Re-check after lock — another agent may have just finished this file - if file_already_mined(collection, source_file): - files_skipped += 1 - continue - - for chunk in chunks: - chunk_room = chunk.get("memory_type", room) if extract_mode == "general" else room - if extract_mode == "general": - room_counts[chunk_room] += 1 - drawer_id = f"drawer_{wing}_{chunk_room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}" - try: - collection.upsert( - documents=[chunk["content"]], - ids=[drawer_id], - metadatas=[ - { - "wing": wing, - "room": chunk_room, - "source_file": source_file, - "chunk_index": chunk["chunk_index"], - "added_by": agent, - "filed_at": 
datetime.now().isoformat(), - "ingest_mode": "convos", - "extract_mode": extract_mode, - } - ], - ) - drawers_added += 1 - except Exception as e: - if "already exists" not in str(e).lower(): - raise + drawers_added, room_delta, skipped = _file_chunks_locked( + collection, source_file, chunks, wing, room, agent, extract_mode + ) + if skipped: + files_skipped += 1 + continue + for r, n in room_delta.items(): + room_counts[r] += n total_drawers += drawers_added print(f" ✓ [{i:4}/{len(files)}] {filepath.name[:50]:50} +{drawers_added}") From 386da51ae54ca09ba491d04cb50ebe00efc73944 Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Mon, 13 Apr 2026 15:54:52 -0300 Subject: [PATCH 4/8] style: ruff format mempalace/palace.py Add blank lines after inline imports in mine_lock. Pure formatting. --- mempalace/palace.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mempalace/palace.py b/mempalace/palace.py index ed5382a..7b47f2f 100644 --- a/mempalace/palace.py +++ b/mempalace/palace.py @@ -69,18 +69,22 @@ def mine_lock(source_file: str): try: if os.name == "nt": import msvcrt + msvcrt.locking(lf.fileno(), msvcrt.LK_LOCK, 1) else: import fcntl + fcntl.flock(lf, fcntl.LOCK_EX) yield finally: try: if os.name == "nt": import msvcrt + msvcrt.locking(lf.fileno(), msvcrt.LK_UNLCK, 1) else: import fcntl + fcntl.flock(lf, fcntl.LOCK_UN) except Exception: pass From ca2598a9f69247429c367217eaf167c9d9c824da Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Mon, 13 Apr 2026 16:11:03 -0300 Subject: [PATCH 5/8] fix(normalize): make strip_noise verbatim-safe and scope it to Claude Code JSONL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The initial strip_noise() regressed on three fronts when audited against adversarial user content — each verified with executable repros against the cherry-picked code: 1. 
`<tag>.*?</tag>` with re.DOTALL span-ate across messages: one stray unclosed <system-reminder> anywhere in a session merged with the next closing tag, silently deleting everything between them (including full assistant replies). 2. `.*\(ctrl\+o to expand\).*\n?` nuked entire lines of user prose whenever a user happened to document the TUI shortcut. 3. `Ran \d+ (?:stop|pre|post)\s*hook.*` with IGNORECASE ate the second sentence from "our CI has a stop hook ... Ran 2 stop hooks last week" — legitimate user commentary. These are unambiguous violations of the project's "Verbatim always" design principle. Fixes: - All tag patterns are now line-anchored (`(?m)^(?:> )?`) and their body forbids crossing a blank line (`(?:(?!\n\s*\n)[\s\S])*?`), so a dangling open tag cannot eat neighboring messages. - `_NOISE_LINE_PREFIXES` are line-anchored and case-sensitive — user prose mentioning "CURRENT TIME:" mid-sentence is preserved. - Hook-run chrome requires `(?m)^`, explicit hook names (Stop, PreCompact, PreToolUse, etc.), and no IGNORECASE. - "… +N lines" is line-anchored. - "(ctrl+o to expand)" only matches Claude Code's actual collapsed-output chrome shape `[N tokens] (ctrl+o to expand)`; a bare parenthetical in user prose stays intact. Scope: - `strip_noise()` is no longer called on every normalization path. Only `_try_claude_code_jsonl` invokes it, per-extracted-message — so Claude.ai exports, ChatGPT exports, Slack JSON, Codex JSONL, and plain text with `>` markers pass through fully verbatim. Per-message application also makes span-eating structurally impossible. Tests: - 15 new tests in test_normalize.py pin the boundary: 6 guard user content that must survive (each of the adversarial repros), 9 assert real system chrome is still stripped. All pass; full suite 702 pass (2 failures are the unrelated pre-existing version.py bug, cleared by #820). 
Known limitation (not fixed here): convo_miner.py does not delete drawers on re-mine, so transcripts mined before this PR keep noise- filled drawers until the user manually erases + re-mines. Proper fix needs a schema-version field on drawer metadata + re-mine trigger — out of scope for this PR. --- mempalace/normalize.py | 101 +++++++++++++++++++-------- tests/test_normalize.py | 146 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 218 insertions(+), 29 deletions(-) diff --git a/mempalace/normalize.py b/mempalace/normalize.py index 256a5e9..f2b8173 100644 --- a/mempalace/normalize.py +++ b/mempalace/normalize.py @@ -22,20 +22,40 @@ from typing import Optional # ─── Noise stripping ───────────────────────────────────────────────────── -# Claude Code and other tools inject system tags, hook output, UI chrome, -# and tool-call JSON into transcripts. These waste drawer space and pollute -# search results. Strip them before filing. +# Claude Code and other tools inject system tags, hook output, and UI chrome +# into transcripts. These waste drawer space and pollute search results. +# +# Verbatim is sacred — every pattern here is anchored to line boundaries and +# refuses to cross blank lines, so a stray unclosed tag in one message can +# never eat content from neighboring messages. When in doubt, leave text +# alone. -_NOISE_TAG_PATTERNS = [ - re.compile(r"]*>.*?", re.DOTALL), - re.compile(r"]*>.*?", re.DOTALL), - re.compile(r"]*>.*?", re.DOTALL), - re.compile(r"]*>.*?", re.DOTALL), - re.compile(r"]*>.*?", re.DOTALL), - re.compile(r"]*>.*?", re.DOTALL), -] +_NOISE_TAGS = ( + "system-reminder", + "command-message", + "command-name", + "task-notification", + "user-prompt-submit-hook", + "hook_output", +) -_NOISE_STRINGS = [ + +def _tag_pattern(name: str) -> "re.Pattern[str]": + # Opening tag must begin a line (optionally after a `> ` blockquote marker, + # since _messages_to_transcript prefixes lines with `> `). 
Body is lazy but + # forbidden from crossing a blank line, so a dangling open tag can't span + # multiple messages. Closing tag eats optional trailing whitespace + newline. + return re.compile( + rf"(?m)^(?:> )?<{name}(?:\s[^>]*)?>" rf"(?:(?!\n\s*\n)[\s\S])*?" rf"[ \t]*\n?" + ) + + +_NOISE_TAG_PATTERNS = [_tag_pattern(t) for t in _NOISE_TAGS] + +# Strings that identify an entire noise line when found at its start. +# Matched case-sensitively and anchored to line-start so user prose mentioning +# e.g. "current time:" in a sentence is untouched. +_NOISE_LINE_PREFIXES = ( "CURRENT TIME:", "VERIFIED FACTS (do not contradict)", "AGENT SPECIALIZATION:", @@ -46,20 +66,39 @@ _NOISE_STRINGS = [ "Auto-save reminder...", "Checking pipeline...", "MemPalace auto-save checkpoint.", +) + +_NOISE_LINE_PATTERNS = [ + re.compile(rf"(?m)^(?:> )?{re.escape(p)}.*\n?") for p in _NOISE_LINE_PREFIXES ] +# Claude Code TUI hook-run chrome, e.g. "Ran 2 Stop hook", "Ran 1 PreCompact hook". +# Line-anchored, case-sensitive, explicit hook names — prose like +# "our CI has a stop hook" stays intact. +_HOOK_LINE_RE = re.compile( + r"(?m)^(?:> )?Ran \d+ (?:Stop|PreCompact|PreToolUse|PostToolUse|UserPromptSubmit|Notification|SessionStart|SessionEnd) hook[s]?.*\n?" +) + +# "… +N lines" collapsed-output marker, line-anchored. +_COLLAPSED_LINES_RE = re.compile(r"(?m)^(?:> )?…\s*\+\d+ lines.*\n?") + def strip_noise(text: str) -> str: - """Remove system tags, hook output, and Claude Code UI chrome from text.""" + """Remove system tags, hook output, and Claude Code UI chrome from text. + + All patterns are line-anchored. User prose that happens to mention these + strings inline (e.g., documenting them) is preserved verbatim. 
+ """ for pat in _NOISE_TAG_PATTERNS: text = pat.sub("", text) - for noise in _NOISE_STRINGS: - text = text.replace(noise, "") - # Strip Claude Code UI chrome - text = re.sub(r".*\(ctrl\+o to expand\).*\n?", "", text) - text = re.sub(r"Ran \d+ (?:stop|pre|post)\s*hook.*\n?", "", text, flags=re.IGNORECASE) - text = re.sub(r"…\s*\+\d+ lines.*\n?", "", text) - # Collapse runs of blank lines + for pat in _NOISE_LINE_PATTERNS: + text = pat.sub("", text) + text = _HOOK_LINE_RE.sub("", text) + text = _COLLAPSED_LINES_RE.sub("", text) + # Strip the Claude Code collapsed-output chrome "[N tokens] (ctrl+o to expand)". + # Narrow shape — a bare "(ctrl+o to expand)" in user prose stays intact. + text = re.sub(r"\s*\[\d+\s+tokens?\]\s*\(ctrl\+o to expand\)", "", text) + # Collapse runs of blank lines created by the removals text = re.sub(r"\n{4,}", "\n\n\n", text) return text.strip() @@ -84,23 +123,21 @@ def normalize(filepath: str) -> str: if not content.strip(): return content - # Already has > markers — pass through (strip noise but preserve trailing newline) + # Already has > markers — pass through unchanged. lines = content.split("\n") if sum(1 for line in lines if line.strip().startswith(">")) >= 3: - cleaned = strip_noise(content) - # Preserve trailing newline if original had one - if content.endswith("\n") and not cleaned.endswith("\n"): - cleaned += "\n" - return cleaned + return content - # Try JSON normalization + # Try JSON normalization. strip_noise is applied inside the Claude Code + # JSONL parser (the only format that injects system tags/hook chrome); + # other formats pass through verbatim. 
ext = Path(filepath).suffix.lower() if ext in (".json", ".jsonl") or content.strip()[:1] in ("{", "["): normalized = _try_normalize_json(content) if normalized: - return strip_noise(normalized) + return normalized - return strip_noise(content) + return content def _try_normalize_json(content: str) -> Optional[str]: @@ -160,6 +197,10 @@ def _try_claude_code_jsonl(content: str) -> Optional[str]: isinstance(b, dict) and b.get("type") == "tool_result" for b in msg_content ) text = _extract_content(msg_content, tool_use_map=tool_use_map) + # Strip Claude Code system-injected noise per message, never across + # message boundaries — prevents span-eating. + if text: + text = strip_noise(text) if text: if is_tool_only and messages and messages[-1][0] == "assistant": # Append tool results to the previous assistant message @@ -169,6 +210,8 @@ def _try_claude_code_jsonl(content: str) -> Optional[str]: messages.append(("user", text)) elif msg_type == "assistant": text = _extract_content(msg_content, tool_use_map=tool_use_map) + if text: + text = strip_noise(text) if text: # If previous message is also assistant (multi-turn tool loop), # merge into the same assistant turn diff --git a/tests/test_normalize.py b/tests/test_normalize.py index 7f0652a..53fc933 100644 --- a/tests/test_normalize.py +++ b/tests/test_normalize.py @@ -13,6 +13,7 @@ from mempalace.normalize import ( _try_normalize_json, _try_slack_json, normalize, + strip_noise, ) @@ -1048,3 +1049,148 @@ def test_normalize_rejects_large_file(): assert False, "Should have raised IOError" except IOError as e: assert "too large" in str(e).lower() + + +# ── strip_noise() — verbatim-safety boundary tests ───────────────────── +# +# The "Verbatim always" design principle requires that we never delete +# user-authored text. These tests pin down the boundary between system +# noise (which we strip) and user prose that happens to mention the same +# strings (which must survive untouched). 
+ + +class TestStripNoisePreservesUserContent: + """User prose that mentions noise strings inline must be preserved.""" + + def test_user_discusses_stop_hook_in_prose(self): + # Regression: original regex with IGNORECASE + `.*\n?` ate the second + # sentence from real user commentary. + text = ( + "> User:\n" + "> Our CI has a stop hook that rejects merges after 5pm. " + "Ran 2 stop hooks last week.\n" + "> Assistant:\n" + "> Got it." + ) + assert strip_noise(text) == text.strip() + + def test_user_mentions_system_reminder_inline(self): + # Inline tags inside user prose (e.g. documenting + # Claude Code behavior) must not be stripped. + text = ( + "> User:\n" + "> Here is what Claude Code emits: " + "Auto-save reminder..." + " — I want to ignore it." + ) + assert strip_noise(text) == text.strip() + + def test_ctrl_o_hint_in_prose_preserved(self): + # Regression: original `.*\(ctrl\+o to expand\).*\n?` nuked the whole + # line whenever a user documented the TUI shortcut. + text = ( + "> User:\n" + "> In the TUI you hit (ctrl+o to expand) to see more. " + "That is the shortcut I want to document." + ) + assert strip_noise(text) == text.strip() + + def test_current_time_inline_in_prose(self): + text = "> User:\n> At CURRENT TIME: the meeting starts, not before." + assert strip_noise(text) == text.strip() + + def test_plus_n_lines_marker_inline(self): + text = "> User:\n> The log showed … +50 lines of stack trace, useful." + assert strip_noise(text) == text.strip() + + def test_dangling_open_tag_does_not_span_messages(self): + # THE span-eating bug: a stray unclosed in one + # message must NOT merge with a closing tag in another message and + # silently delete everything in between. 
+ text = ( + "> User 1: normal content A\n" + "> Assistant: reply\n" + "> User 2: more content tail" + ) + out = strip_noise(text) + assert "Assistant: reply" in out + assert "User 2: more content" in out + assert "User 1: normal content" in out + + +class TestStripNoiseRemovesSystemChrome: + """System-injected noise with standalone/line-anchored shape must be stripped.""" + + def test_strips_line_anchored_system_reminder_block(self): + text = ( + "> User:\n" + "\n" + "Auto-save reminder...\n" + "\n" + "> Real message." + ) + out = strip_noise(text) + assert "system-reminder" not in out + assert "Auto-save reminder" not in out + assert "Real message." in out + + def test_strips_system_reminder_with_blockquote_prefix(self): + # _messages_to_transcript prefixes lines with "> ", so the line + # anchor must also accept that shape. + text = "> User:\n" "> Injected noise\n" "> Real message." + out = strip_noise(text) + assert "Injected noise" not in out + assert "Real message." in out + + def test_strips_standalone_ran_hook_line(self): + text = "Ran 2 Stop hook\n> User: real content" + out = strip_noise(text) + assert "Ran 2 Stop hook" not in out + assert "real content" in out + + def test_strips_known_hook_names(self): + for hook in ("Stop", "PreCompact", "PreToolUse", "PostToolUse", "UserPromptSubmit"): + text = f"Ran 1 {hook} hook\n> User: content" + assert hook not in strip_noise(text) + + def test_strips_current_time_standalone(self): + text = "CURRENT TIME: 2026-04-13 10:00 UTC\n> User: Hello" + out = strip_noise(text) + assert "CURRENT TIME" not in out + assert "Hello" in out + + def test_strips_collapsed_lines_marker(self): + text = "… +42 lines\n> User: Hello" + out = strip_noise(text) + assert "+42 lines" not in out + assert "Hello" in out + + def test_strips_token_count_ctrl_o_chrome(self): + # Claude Code's actual collapsed-output chrome: "[N tokens] (ctrl+o to expand)" + text = "> Assistant: some output [5 tokens] (ctrl+o to expand)\n> User: ok" + out = 
strip_noise(text) + assert "(ctrl+o to expand)" not in out + assert "[5 tokens]" not in out + assert "some output" in out + + def test_strips_each_known_noise_tag(self): + for tag in ( + "system-reminder", + "command-message", + "command-name", + "task-notification", + "user-prompt-submit-hook", + "hook_output", + ): + text = f"> User:\n<{tag}>junk\n> Real." + out = strip_noise(text) + assert tag not in out, f"{tag} leaked into output" + assert "Real." in out + + def test_collapses_excessive_blank_lines(self): + text = "line one\n\n\n\n\n\nline two" + out = strip_noise(text) + assert "line one" in out + assert "line two" in out + # Should collapse to no more than 3 newlines + assert "\n\n\n\n" not in out From 7e5eeda9a5c22168719067d15af8b2424662f586 Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Mon, 13 Apr 2026 16:20:55 -0300 Subject: [PATCH 6/8] feat(normalize): auto-rebuild stale drawers via NORMALIZE_VERSION schema gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without this, the strip_noise improvement only helps new mines. Every user who had already mined Claude Code JSONL sessions would keep their noise-polluted drawers forever, because convo_miner's file_already_mined skip short-circuits before re-processing. Adds a versioned schema gate so upgrades propagate silently: - palace.NORMALIZE_VERSION=2 — bumped when the normalization pipeline changes shape (this PR's strip_noise is the v1→v2 bump). - file_already_mined now returns False if the stored normalize_version is missing or less than current, triggering a rebuild on next mine. - Both miners stamp drawers with the current normalize_version. - convo_miner now purges stale drawers before inserting fresh chunks (mirrors miner.py's existing delete+insert), extracted into _file_convo_chunks helper to keep mine_convos under ruff's C901 limit. 
User experience: upgrade mempalace, run `mempalace mine` as usual, old noisy drawers get silently replaced with clean ones. No erase needed, no "you need to rebuild" changelog footgun. Tests: - test_file_already_mined_returns_false_for_stale_normalize_version — pins the version gate contract for missing/v1/current. - test_add_drawer_stamps_normalize_version — fresh project-miner drawers carry the field. - test_mine_convos_rebuilds_stale_drawers_after_schema_bump — end-to-end proof that a pre-v2 palace gets silently cleaned on next mine, with orphan drawers purged and NOT skipped. Existing test_file_already_mined_check_mtime updated to include the new field; all other tests unaffected. --- mempalace/convo_miner.py | 83 ++++++++++++++++++++++------------ mempalace/miner.py | 3 +- mempalace/palace.py | 28 ++++++++++-- tests/test_convo_miner.py | 83 ++++++++++++++++++++++++++++++++++ tests/test_miner.py | 94 +++++++++++++++++++++++++++++++++++++-- 5 files changed, 253 insertions(+), 38 deletions(-) diff --git a/mempalace/convo_miner.py b/mempalace/convo_miner.py index d406073..663f1a0 100644 --- a/mempalace/convo_miner.py +++ b/mempalace/convo_miner.py @@ -16,7 +16,7 @@ from datetime import datetime from collections import defaultdict from .normalize import normalize -from .palace import SKIP_DIRS, get_collection, file_already_mined +from .palace import NORMALIZE_VERSION, SKIP_DIRS, file_already_mined, get_collection # File types that might contain conversations @@ -51,6 +51,7 @@ def _register_file(collection, source_file: str, wing: str, agent: str): "added_by": agent, "filed_at": datetime.now().isoformat(), "ingest_mode": "registry", + "normalize_version": NORMALIZE_VERSION, } ], ) @@ -272,6 +273,52 @@ def scan_convos(convo_dir: str) -> list: # ============================================================================= +def _file_convo_chunks(collection, source_file, chunks, wing, room, agent, extract_mode): + """Purge stale drawers for ``source_file`` then upsert 
fresh chunks. + + Returns (drawers_added, room_counts_delta). + """ + # Purge stale drawers first. When the normalize schema bumps, + # file_already_mined() returns False for pre-v2 drawers and we land + # here — clean them out so the source doesn't end up with a mix of + # old-noise and new-clean drawers. + try: + collection.delete(where={"source_file": source_file}) + except Exception: + pass + + room_counts_delta: dict = defaultdict(int) + drawers_added = 0 + for chunk in chunks: + chunk_room = chunk.get("memory_type", room) if extract_mode == "general" else room + if extract_mode == "general": + room_counts_delta[chunk_room] += 1 + drawer_id = f"drawer_{wing}_{chunk_room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}" + try: + collection.upsert( + documents=[chunk["content"]], + ids=[drawer_id], + metadatas=[ + { + "wing": wing, + "room": chunk_room, + "source_file": source_file, + "chunk_index": chunk["chunk_index"], + "added_by": agent, + "filed_at": datetime.now().isoformat(), + "ingest_mode": "convos", + "extract_mode": extract_mode, + "normalize_version": NORMALIZE_VERSION, + } + ], + ) + drawers_added += 1 + except Exception as e: + if "already exists" not in str(e).lower(): + raise + return drawers_added, room_counts_delta + + def mine_convos( convo_dir: str, palace_path: str, @@ -375,34 +422,12 @@ def mine_convos( if extract_mode != "general": room_counts[room] += 1 - # File each chunk - drawers_added = 0 - for chunk in chunks: - chunk_room = chunk.get("memory_type", room) if extract_mode == "general" else room - if extract_mode == "general": - room_counts[chunk_room] += 1 - drawer_id = f"drawer_{wing}_{chunk_room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}" - try: - collection.upsert( - documents=[chunk["content"]], - ids=[drawer_id], - metadatas=[ - { - "wing": wing, - "room": chunk_room, - "source_file": source_file, - "chunk_index": chunk["chunk_index"], - "added_by": 
agent, - "filed_at": datetime.now().isoformat(), - "ingest_mode": "convos", - "extract_mode": extract_mode, - } - ], - ) - drawers_added += 1 - except Exception as e: - if "already exists" not in str(e).lower(): - raise + # Purge stale drawers + file fresh chunks. + drawers_added, room_delta = _file_convo_chunks( + collection, source_file, chunks, wing, room, agent, extract_mode + ) + for r, n in room_delta.items(): + room_counts[r] += n total_drawers += drawers_added print(f" ✓ [{i:4}/{len(files)}] {filepath.name[:50]:50} +{drawers_added}") diff --git a/mempalace/miner.py b/mempalace/miner.py index 22c8af3..49e0d25 100644 --- a/mempalace/miner.py +++ b/mempalace/miner.py @@ -15,7 +15,7 @@ from pathlib import Path from datetime import datetime from collections import defaultdict -from .palace import SKIP_DIRS, get_collection, file_already_mined +from .palace import NORMALIZE_VERSION, SKIP_DIRS, file_already_mined, get_collection READABLE_EXTENSIONS = { ".txt", @@ -381,6 +381,7 @@ def add_drawer( "chunk_index": chunk_index, "added_by": agent, "filed_at": datetime.now().isoformat(), + "normalize_version": NORMALIZE_VERSION, } # Store file mtime so we can detect modifications later. try: diff --git a/mempalace/palace.py b/mempalace/palace.py index 948fecc..9cfb55e 100644 --- a/mempalace/palace.py +++ b/mempalace/palace.py @@ -36,6 +36,16 @@ SKIP_DIRS = { _DEFAULT_BACKEND = ChromaBackend() +# Schema version for drawer normalization. Bump when the normalization +# pipeline changes in a way that existing drawers should be rebuilt to pick up +# (e.g., new noise-stripping rules). `file_already_mined` treats drawers with +# a missing or stale `normalize_version` as "not mined", so the next mine pass +# silently rebuilds them — users don't need to manually erase + re-mine. +# +# v2 (2026-04): introduced strip_noise() for Claude Code JSONL; previous +# drawers stored system tags / hook chrome verbatim. 
+NORMALIZE_VERSION = 2 + def get_collection( palace_path: str, @@ -53,16 +63,26 @@ def get_collection( def file_already_mined(collection, source_file: str, check_mtime: bool = False) -> bool: """Check if a file has already been filed in the palace. - When check_mtime=True (used by project miner), returns False if the file - has been modified since it was last mined, so it gets re-mined. - When check_mtime=False (used by convo miner), just checks existence. + Returns False (so the file gets re-mined) when: + - no drawers exist for this source_file + - the stored `normalize_version` is missing or older than the current + schema (triggers silent rebuild after a normalization upgrade) + - `check_mtime=True` and the file's mtime differs from the stored one + + When check_mtime=True (used by project miner), also re-mines on content + change. When check_mtime=False (used by convo miner), transcripts are + assumed immutable, so only the version gate triggers a rebuild. """ try: results = collection.get(where={"source_file": source_file}, limit=1) if not results.get("ids"): return False + stored_meta = results.get("metadatas", [{}])[0] or {} + # Pre-v2 drawers have no version field — treat them as stale. 
+ stored_version = stored_meta.get("normalize_version", 1) + if stored_version < NORMALIZE_VERSION: + return False if check_mtime: - stored_meta = results.get("metadatas", [{}])[0] stored_mtime = stored_meta.get("source_mtime") if stored_mtime is None: return False diff --git a/tests/test_convo_miner.py b/tests/test_convo_miner.py index f5074b4..166644b 100644 --- a/tests/test_convo_miner.py +++ b/tests/test_convo_miner.py @@ -75,3 +75,86 @@ def test_mine_convos_does_not_reprocess_empty_chunk_files(capsys): assert "Files skipped (already filed): 1" in out2 finally: shutil.rmtree(tmpdir, ignore_errors=True) + + +def test_mine_convos_rebuilds_stale_drawers_after_schema_bump(capsys): + """When stored drawers have an older normalize_version, the next mine + silently purges them and refiles — no manual erase required. + + This is what makes the strip_noise upgrade apply to existing corpora: + users just run `mempalace mine` again and old noise-filled drawers get + replaced with clean ones.""" + from mempalace.palace import NORMALIZE_VERSION + + tmpdir = tempfile.mkdtemp() + try: + convo_path = Path(tmpdir) / "chat.txt" + convo_path.write_text( + "> What is memory?\nMemory is persistence.\n\n" + "> Why does it matter?\nIt enables continuity.\n\n" + "> How do we build it?\nWith structured storage.\n" + ) + palace_path = os.path.join(tmpdir, "palace") + + # First mine — stamps drawers with NORMALIZE_VERSION + mine_convos(tmpdir, palace_path, wing="test") + capsys.readouterr() + + client = chromadb.PersistentClient(path=palace_path) + col = client.get_collection("mempalace_drawers") + resolved = str(Path(tmpdir).resolve() / "chat.txt") + first_pass = col.get(where={"source_file": resolved}) + first_ids = set(first_pass["ids"]) + assert first_ids, "first mine should produce drawers" + for meta in first_pass["metadatas"]: + assert meta.get("normalize_version") == NORMALIZE_VERSION + + # Simulate pre-v2 drawers: rewrite metadata to an older version, + # and replace content 
with "noise" so we can see it get cleaned up. + stale_metas = [] + for meta in first_pass["metadatas"]: + stale = dict(meta) + stale["normalize_version"] = 1 + stale_metas.append(stale) + col.update( + ids=list(first_pass["ids"]), + documents=["STALE NOISE"] * len(first_pass["ids"]), + metadatas=stale_metas, + ) + # Add an extra orphan drawer that should also be purged. + col.add( + ids=["orphan_drawer"], + documents=["OLD ORPHAN"], + metadatas=[ + { + "wing": "test", + "room": "default", + "source_file": resolved, + "chunk_index": 999, + "normalize_version": 1, + } + ], + ) + del col, client + + # Second mine — version gate should trigger rebuild + mine_convos(tmpdir, palace_path, wing="test") + out = capsys.readouterr().out + assert ( + "Files skipped (already filed): 0" in out + ), "stale drawers should force a rebuild, not a skip" + + client = chromadb.PersistentClient(path=palace_path) + col = client.get_collection("mempalace_drawers") + rebuilt = col.get(where={"source_file": resolved}) + # Orphan is gone + assert "orphan_drawer" not in rebuilt["ids"] + # No stale content survived + assert all("STALE NOISE" not in d for d in rebuilt["documents"]) + assert all("OLD ORPHAN" not in d for d in rebuilt["documents"]) + # All rebuilt drawers carry the current version + for meta in rebuilt["metadatas"]: + assert meta.get("normalize_version") == NORMALIZE_VERSION + del col, client + finally: + shutil.rmtree(tmpdir, ignore_errors=True) diff --git a/tests/test_miner.py b/tests/test_miner.py index ea2f2a9..020d5bd 100644 --- a/tests/test_miner.py +++ b/tests/test_miner.py @@ -7,7 +7,7 @@ import chromadb import yaml from mempalace.miner import mine, scan_project, status -from mempalace.palace import file_already_mined +from mempalace.palace import NORMALIZE_VERSION, file_already_mined def write_file(path: Path, content: str): @@ -227,11 +227,17 @@ def test_file_already_mined_check_mtime(): assert file_already_mined(col, test_file) is False assert file_already_mined(col, 
test_file, check_mtime=True) is False - # Add it with mtime + # Add it with mtime + current normalize_version col.add( ids=["d1"], documents=["hello world"], - metadatas=[{"source_file": test_file, "source_mtime": str(mtime)}], + metadatas=[ + { + "source_file": test_file, + "source_mtime": str(mtime), + "normalize_version": NORMALIZE_VERSION, + } + ], ) # Already mined (no mtime check) @@ -253,7 +259,12 @@ def test_file_already_mined_check_mtime(): col.add( ids=["d2"], documents=["other"], - metadatas=[{"source_file": "/fake/no_mtime.txt"}], + metadatas=[ + { + "source_file": "/fake/no_mtime.txt", + "normalize_version": NORMALIZE_VERSION, + } + ], ) assert file_already_mined(col, "/fake/no_mtime.txt", check_mtime=True) is False finally: @@ -296,3 +307,78 @@ def test_status_missing_palace_does_not_create_empty_collection(tmp_path, capsys out = capsys.readouterr().out assert "No palace found" in out assert not palace_path.exists() + + +# ── normalize_version schema gate ─────────────────────────────────────── +# +# When the normalization pipeline changes shape (e.g., strip_noise lands), +# `NORMALIZE_VERSION` is bumped so pre-existing drawers can be silently +# rebuilt on the next mine. These tests pin that contract. 
+ + +def test_file_already_mined_returns_false_for_stale_normalize_version(): + """Pre-v2 drawers (no field, or older integer) must not short-circuit.""" + tmpdir = tempfile.mkdtemp() + try: + palace_path = os.path.join(tmpdir, "palace") + os.makedirs(palace_path) + client = chromadb.PersistentClient(path=palace_path) + col = client.get_or_create_collection("mempalace_drawers") + + # Pre-v2 drawer: no normalize_version field at all + col.add( + ids=["d_old"], + documents=["old"], + metadatas=[{"source_file": "/fake/old.jsonl"}], + ) + assert file_already_mined(col, "/fake/old.jsonl") is False + + # Explicitly older version + col.add( + ids=["d_v1"], + documents=["v1"], + metadatas=[{"source_file": "/fake/v1.jsonl", "normalize_version": 1}], + ) + assert file_already_mined(col, "/fake/v1.jsonl") is False + + # Current version — short-circuits + col.add( + ids=["d_current"], + documents=["cur"], + metadatas=[ + { + "source_file": "/fake/current.jsonl", + "normalize_version": NORMALIZE_VERSION, + } + ], + ) + assert file_already_mined(col, "/fake/current.jsonl") is True + finally: + del col, client + shutil.rmtree(tmpdir, ignore_errors=True) + + +def test_add_drawer_stamps_normalize_version(tmp_path): + """Fresh drawers carry the current schema version so future upgrades work.""" + from mempalace.miner import add_drawer + + palace_path = tmp_path / "palace" + palace_path.mkdir() + client = chromadb.PersistentClient(path=str(palace_path)) + col = client.get_or_create_collection("mempalace_drawers") + try: + added = add_drawer( + collection=col, + wing="test", + room="notes", + content="hello", + source_file=str(tmp_path / "src.md"), + chunk_index=0, + agent="unit", + ) + assert added is True + stored = col.get(limit=1) + meta = stored["metadatas"][0] + assert meta["normalize_version"] == NORMALIZE_VERSION + finally: + del col, client From a3b7988d8791e10877293131e1f7c936e9a8aee1 Mon Sep 17 00:00:00 2001 From: MSL <232237854+milla-jovovich@users.noreply.github.com> 
Date: Mon, 13 Apr 2026 01:50:07 -0700 Subject: [PATCH 7/8] =?UTF-8?q?fix:=20stop=20hooks=20from=20making=20agent?= =?UTF-8?q?s=20write=20in=20chat=20=E2=80=94=20save=20tokens?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The save hook and precompact hook were telling the agent to write diary entries, add drawers, and add KG triples IN THE CHAT WINDOW. Every line written stays in conversation history and retransmits on every subsequent turn — ~$1/session in wasted tokens. Fix: hooks now say "saved in background, no action needed" and use decision: allow instead of block. The agent continues working without interruption. All filing happens via the background pipeline. Also updated hooks README with: - Known limitation: hooks require session restart after install - Updated cost section: zero tokens, background-only Co-Authored-By: Claude Opus 4.6 (1M context) --- hooks/README.md | 6 +++++- hooks/mempal_precompact_hook.sh | 6 +++--- hooks/mempal_save_hook.sh | 11 +++++++---- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/hooks/README.md b/hooks/README.md index d5380ef..977b109 100644 --- a/hooks/README.md +++ b/hooks/README.md @@ -133,6 +133,10 @@ Example output: [14:40:01] Session abc123: 18 exchanges, 3 since last save ``` +## Known Limitations + +**Hooks require session restart after install.** Claude Code loads hooks from `settings.json` at session start only. If you run `mempalace init` or manually edit hook config mid-session, the hooks won't fire until you restart Claude Code. This is a Claude Code limitation. + ## Cost -**Zero extra tokens.** The hooks are bash scripts that run locally. They don't call any API. The only "cost" is the AI spending a few seconds organizing memories at each checkpoint — and it's doing that with context it already has loaded. +**Zero extra tokens.** The hooks notify the AI that saves happened in the background — the AI doesn't need to write anything in the chat. 
All filing is handled automatically. Previous versions asked the AI to write diary entries and drawer content in the chat window, which cost ~$1/session in retransmitted tokens. diff --git a/hooks/mempal_precompact_hook.sh b/hooks/mempal_precompact_hook.sh index 550a813..1c14193 100755 --- a/hooks/mempal_precompact_hook.sh +++ b/hooks/mempal_precompact_hook.sh @@ -68,10 +68,10 @@ if [ -n "$MEMPAL_DIR" ] && [ -d "$MEMPAL_DIR" ]; then python3 -m mempalace mine "$MEMPAL_DIR" >> "$STATE_DIR/hook.log" 2>&1 fi -# Always block — compaction = save everything +# Notify — compaction is about to happen but filing is handled in background cat << 'HOOKJSON' { - "decision": "block", - "reason": "COMPACTION IMMINENT. Save ALL topics, decisions, quotes, code, and important context from this session to your memory system. Be thorough — after compaction, detailed context will be lost. Organize into appropriate categories. Use verbatim quotes where possible. Save everything, then allow compaction to proceed." + "decision": "allow", + "reason": "MemPalace pre-compaction save. Your full conversation has been saved verbatim in the background — no action needed. Compaction can proceed safely." } HOOKJSON diff --git a/hooks/mempal_save_hook.sh b/hooks/mempal_save_hook.sh index a0e4681..b15d961 100755 --- a/hooks/mempal_save_hook.sh +++ b/hooks/mempal_save_hook.sh @@ -140,12 +140,15 @@ if [ "$SINCE_LAST" -ge "$SAVE_INTERVAL" ] && [ "$EXCHANGE_COUNT" -gt 0 ]; then python3 -m mempalace mine "$MEMPAL_DIR" >> "$STATE_DIR/hook.log" 2>&1 & fi - # Block the AI and tell it to save - # The "reason" becomes a system message the AI sees and acts on + # Notify the AI that a checkpoint happened — but do NOT ask it to write + # anything in chat. All filing happens in the background via the pipeline. + # The old version asked the agent to write diary entries, add drawers, and + # add KG triples in the chat window — that cost ~$1/session in retransmitted + # tokens and cluttered the conversation. 
cat << 'HOOKJSON' { - "decision": "block", - "reason": "AUTO-SAVE checkpoint. Save key topics, decisions, quotes, and code from this session to your memory system. Organize into appropriate categories. Use verbatim quotes where possible. Continue conversation after saving." + "decision": "allow", + "reason": "MemPalace auto-save checkpoint. Your conversation is being saved verbatim in the background — no action needed from you. Continue working." } HOOKJSON else From 5db651a543a7617a7eec6d08fc7e571d4842e056 Mon Sep 17 00:00:00 2001 From: shafdev <96260000+shafdev@users.noreply.github.com> Date: Tue, 14 Apr 2026 01:36:04 +0530 Subject: [PATCH 8/8] fix: use microsecond timestamp and full content hash in diary entry ID (#819) --- mempalace/mcp_server.py | 5 ++++- tests/test_mcp_server.py | 43 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py index 4e21426..33933ff 100644 --- a/mempalace/mcp_server.py +++ b/mempalace/mcp_server.py @@ -836,7 +836,10 @@ def tool_diary_write(agent_name: str, entry: str, topic: str = "general"): return _no_palace() now = datetime.now() - entry_id = f"diary_{wing}_{now.strftime('%Y%m%d_%H%M%S')}_{hashlib.sha256(entry[:50].encode()).hexdigest()[:12]}" + entry_id = ( + f"diary_{wing}_{now.strftime('%Y%m%d_%H%M%S%f')}_" + f"{hashlib.sha256(entry.encode()).hexdigest()[:12]}" + ) _wal_log( "diary_write", diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py index 4cc8b4a..a8189ae 100644 --- a/tests/test_mcp_server.py +++ b/tests/test_mcp_server.py @@ -6,6 +6,7 @@ dispatch layer (integration-level). Uses isolated palace + KG fixtures via monkeypatch to avoid touching real data. 
""" +from datetime import datetime import json import sys @@ -643,6 +644,48 @@ class TestDiaryTools: r = tool_diary_read(agent_name="Nobody") assert r["entries"] == [] + def test_diary_write_same_second_shared_prefix_no_collision( + self, monkeypatch, config, palace_path, kg + ): + _patch_mcp_server(monkeypatch, config, kg) + _client, _col = _get_collection(palace_path, create=True) + del _client + + from mempalace import mcp_server + + class FrozenDateTime: + calls = [ + datetime(2026, 4, 13, 22, 15, 30, 123456), + datetime(2026, 4, 13, 22, 15, 30, 123457), + ] + fallback = datetime(2026, 4, 13, 22, 15, 30, 123457) + + @classmethod + def now(cls): + if cls.calls: + return cls.calls.pop(0) + return cls.fallback + + monkeypatch.setattr(mcp_server, "datetime", FrozenDateTime) + + from mempalace.mcp_server import tool_diary_read, tool_diary_write + + entry1 = "A" * 50 + " entry one" + entry2 = "A" * 50 + " entry two" + + result1 = tool_diary_write(agent_name="TestAgent", entry=entry1, topic="status") + result2 = tool_diary_write(agent_name="TestAgent", entry=entry2, topic="status") + + assert result1["success"] is True + assert result2["success"] is True + assert result1["entry_id"] != result2["entry_id"] + + read_result = tool_diary_read(agent_name="TestAgent") + contents = [entry["content"] for entry in read_result["entries"]] + assert read_result["total"] == 2 + assert entry1 in contents + assert entry2 in contents + # ── Cache Invalidation (inode/mtime) ──────────────────────────────────