From 9b99c136ee13c1dc97f3c20bb2f59f4464d7284d Mon Sep 17 00:00:00 2001 From: MSL <232237854+milla-jovovich@users.noreply.github.com> Date: Mon, 13 Apr 2026 01:55:25 -0700 Subject: [PATCH] fix: strip system tags, hook output, and Claude UI chrome from drawers normalize.py now strips before filing: - , , tags - , , tags - Hook status messages (CURRENT TIME, Checking verified facts, etc.) - Claude Code UI chrome (ctrl+o to expand, progress bars, etc.) - Collapsed runs of blank lines This noise was going straight into drawers, wasting storage space and polluting search results. strip_noise() runs on all normalized output regardless of input format (JSONL, JSON, plain text). 689/689 tests pass. Co-Authored-By: Claude Opus 4.6 (1M context) --- mempalace/normalize.py | 56 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 4 deletions(-) diff --git a/mempalace/normalize.py b/mempalace/normalize.py index e599df9..256a5e9 100644 --- a/mempalace/normalize.py +++ b/mempalace/normalize.py @@ -16,10 +16,54 @@ No API key. No internet. Everything local. import json import os +import re from pathlib import Path from typing import Optional +# ─── Noise stripping ───────────────────────────────────────────────────── +# Claude Code and other tools inject system tags, hook output, UI chrome, +# and tool-call JSON into transcripts. These waste drawer space and pollute +# search results. Strip them before filing. + +_NOISE_TAG_PATTERNS = [ + re.compile(r"]*>.*?", re.DOTALL), + re.compile(r"]*>.*?", re.DOTALL), + re.compile(r"]*>.*?", re.DOTALL), + re.compile(r"]*>.*?", re.DOTALL), + re.compile(r"]*>.*?", re.DOTALL), + re.compile(r"]*>.*?", re.DOTALL), +] + +_NOISE_STRINGS = [ + "CURRENT TIME:", + "VERIFIED FACTS (do not contradict)", + "AGENT SPECIALIZATION:", + "Checking verified facts...", + "Injecting timestamp...", + "Starting background pipeline...", + "Checking emotional weights...", + "Auto-save reminder...", + "Checking pipeline...", + "MemPalace auto-save checkpoint.", +] + + +def strip_noise(text: str) -> str: + """Remove system tags, hook output, and Claude Code UI chrome from text.""" + for pat in _NOISE_TAG_PATTERNS: + text = pat.sub("", text) + for noise in _NOISE_STRINGS: + text = text.replace(noise, "") + # Strip Claude Code UI chrome + text = re.sub(r".*\(ctrl\+o to expand\).*\n?", "", text) + text = re.sub(r"Ran \d+ (?:stop|pre|post)\s*hook.*\n?", "", text, flags=re.IGNORECASE) + text = re.sub(r"…\s*\+\d+ lines.*\n?", "", text) + # Collapse runs of blank lines + text = re.sub(r"\n{4,}", "\n\n\n", text) + return text.strip() + + def normalize(filepath: str) -> str: """ Load a file and normalize to transcript format if it's a chat export. @@ -40,19 +84,23 @@ def normalize(filepath: str) -> str: if not content.strip(): return content - # Already has > markers — pass through + # Already has > markers — pass through (strip noise but preserve trailing newline) lines = content.split("\n") if sum(1 for line in lines if line.strip().startswith(">")) >= 3: - return content + cleaned = strip_noise(content) + # Preserve trailing newline if original had one + if content.endswith("\n") and not cleaned.endswith("\n"): + cleaned += "\n" + return cleaned # Try JSON normalization ext = Path(filepath).suffix.lower() if ext in (".json", ".jsonl") or content.strip()[:1] in ("{", "["): normalized = _try_normalize_json(content) if normalized: - return normalized + return strip_noise(normalized) - return content + return strip_noise(content) def _try_normalize_json(content: str) -> Optional[str]: