From e7fe6cae144b97f1f33a1def2a1150ca219cf6ba Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 27 Apr 2026 21:41:48 +0000 Subject: [PATCH] fix(normalize): discard user/gemini turns before session_metadata sentinel Agent-Logs-Url: https://github.com/MemPalace/mempalace/sessions/4511e9aa-38e7-440e-a6f8-eda91e576f0f Co-authored-by: igorls <4753812+igorls@users.noreply.github.com> --- mempalace/normalize.py | 14 +++++++++++--- tests/test_normalize.py | 19 +++++++++++++++++++ 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/mempalace/normalize.py b/mempalace/normalize.py index 9326a62..4252afa 100644 --- a/mempalace/normalize.py +++ b/mempalace/normalize.py @@ -296,9 +296,12 @@ def _try_gemini_jsonl(content: str) -> Optional[str]: Detection requires a ``session_metadata`` record so this parser does not false-positive against Claude Code or Codex JSONL passed through - the dispatch chain. ``message_update`` entries are skipped — they - have no message text. Multiple text blocks within a single message's - content array are concatenated in order, separated by newlines. + the dispatch chain. Any ``user``/``gemini`` lines that appear before + ``session_metadata`` are discarded — they are treated as preamble + noise, not conversational turns. ``message_update`` entries are + skipped — they have no message text. Multiple text blocks within a + single message's content array are concatenated in order, separated + by newlines. """ lines = [line.strip() for line in content.strip().split("\n") if line.strip()] messages = [] @@ -316,6 +319,11 @@ def _try_gemini_jsonl(content: str) -> Optional[str]: has_session_metadata = True continue + # Discard everything (including user/gemini turns) until the + # session_metadata sentinel has been seen. + if not has_session_metadata: + continue + if entry_type not in ("user", "gemini"): # Skips message_update, system events, anything else. continue diff --git a/tests/test_normalize.py b/tests/test_normalize.py index be408d5..2b0f180 100644 --- a/tests/test_normalize.py +++ b/tests/test_normalize.py @@ -594,6 +594,25 @@ def test_gemini_jsonl_does_not_match_codex(): assert result is None +def test_gemini_jsonl_messages_before_session_metadata_discarded(): + """user/gemini turns that appear before the session_metadata sentinel must + be silently discarded, not counted as conversational messages. Only turns + after the sentinel contribute to the transcript.""" + lines = [ + json.dumps({"type": "user", "content": [{"text": "preamble Q"}]}), + json.dumps({"type": "gemini", "content": [{"text": "preamble A"}]}), + json.dumps({"type": "session_metadata", "sessionId": "s"}), + json.dumps({"type": "user", "content": [{"text": "real Q"}]}), + json.dumps({"type": "gemini", "content": [{"text": "real A"}]}), + ] + result = _try_gemini_jsonl("\n".join(lines)) + assert result is not None + assert "preamble Q" not in result + assert "preamble A" not in result + assert "> real Q" in result + assert "real A" in result + + # ── _try_claude_ai_json ───────────────────────────────────────────────