From f4440f1ce0b673a71bcc7c8f60432389aed0be06 Mon Sep 17 00:00:00 2001
From: MSL <232237854+milla-jovovich@users.noreply.github.com>
Date: Mon, 27 Apr 2026 00:44:40 -0700
Subject: [PATCH 1/3] feat(normalize): Gemini CLI session JSONL adapter

Adds a sixth format adapter to mempalace.normalize alongside the
existing Claude Code, Codex, Claude.ai, ChatGPT, and Slack parsers.
After this lands, mempalace mine --mode convos ingests Gemini CLI
session history without manual export.

Why now: Claude Code and Codex CLI are already supported by convo_miner;
adding Gemini closes the major-CLI-tool coverage gap. After this lands,
the README's "verbatim conversation history" promise is honestly
delivered for all three top-tier API-keyed coding CLIs (Claude Code,
Codex CLI, Gemini CLI), not just two of them. This is the third leg
of the trio Aya pushed for so the public claim matches the actual
ingest pipeline.

Gemini CLI stores sessions at ~/.gemini/tmp/<project_hash>/chats/ as
JSONL. The on-disk schema (per google-gemini/gemini-cli#15292):

    {"type":"session_metadata","sessionId":"...","projectHash":"...",...}
    {"type":"user","id":"msg1","content":[{"text":"Hello"}]}
    {"type":"gemini","id":"msg2","content":[{"text":"Hi"}]}
    {"type":"message_update","id":"msg2","tokens":{"input":10,"output":5}}

The new _try_gemini_jsonl parser:

  - requires a session_metadata record so it does not false-positive
    against Claude Code or Codex JSONL passing through the dispatch
    chain in _try_normalize_json
  - extracts user/gemini message text from each entry's content array
    of {"text": "..."} blocks, joining multiple blocks per message
    in order
  - skips message_update entries (token-count deltas with no message
    text) and any other unknown record types
  - returns None when fewer than two conversational messages are
    present, mirroring the codex parser's >=2-message guard

Test coverage: 9 new unit tests in tests/test_normalize.py mirroring
the codex test pattern - happy path, multi-turn, missing session
metadata, message_update skip, single-message rejection, multi-block
content concatenation, empty content skip, malformed-line resilience,
and explicit no-match against codex JSONL fixtures. Schema-level only;
real Gemini CLI session fixtures are a follow-up once a real user file
is available.

Closes part of #59 (the Gemini CLI portion of the umbrella request).
---
 mempalace/normalize.py  |  64 ++++++++++++++++++
 tests/test_normalize.py | 144 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 208 insertions(+)

diff --git a/mempalace/normalize.py b/mempalace/normalize.py
index 29516aa..f018935 100644
--- a/mempalace/normalize.py
+++ b/mempalace/normalize.py
@@ -157,6 +157,10 @@ def _try_normalize_json(content: str) -> Optional[str]:
     if normalized:
         return normalized
 
+    normalized = _try_gemini_jsonl(content)
+    if normalized:
+        return normalized
+
     try:
         data = json.loads(content)
     except json.JSONDecodeError:
@@ -280,6 +284,66 @@ def _try_codex_jsonl(content: str) -> Optional[str]:
     return None
 
 
+def _try_gemini_jsonl(content: str) -> Optional[str]:
+    """Gemini CLI sessions (~/.gemini/tmp/<project_hash>/chats/session-*.jsonl).
+
+    Schema (per google-gemini/gemini-cli#15292): a session_metadata record
+    on the first line, then a stream of ``{"type": "user", "content":
+    [{"text": "..."}]}`` and ``{"type": "gemini", "content": [...]}``
+    records, with optional ``message_update`` records carrying token
+    counts only.
+
+    Detection requires a ``session_metadata`` record so this parser does
+    not false-positive against Claude Code or Codex JSONL passed through
+    the dispatch chain. ``message_update`` entries are skipped — they
+    have no message text. Multiple text blocks within a single message's
+    content array are concatenated in order, separated by newlines.
+    """
+    lines = [line.strip() for line in content.strip().split("\n") if line.strip()]
+    messages = []
+    has_session_metadata = False
+    for line in lines:
+        try:
+            entry = json.loads(line)
+        except json.JSONDecodeError:
+            continue
+        if not isinstance(entry, dict):
+            continue
+
+        entry_type = entry.get("type", "")
+        if entry_type == "session_metadata":
+            has_session_metadata = True
+            continue
+
+        if entry_type not in ("user", "gemini"):
+            # Skips message_update, system events, anything else.
+            continue
+
+        content_blocks = entry.get("content", [])
+        if not isinstance(content_blocks, list):
+            continue
+
+        parts = []
+        for block in content_blocks:
+            if not isinstance(block, dict):
+                continue
+            text = block.get("text", "")
+            if isinstance(text, str) and text.strip():
+                parts.append(text)
+        if not parts:
+            continue
+        joined = "\n".join(parts)
+
+        if entry_type == "user":
+            messages.append(("user", joined))
+        else:  # "gemini"
+            messages.append(("assistant", joined))
+
+    if len(messages) >= 2 and has_session_metadata:
+        return _messages_to_transcript(messages)
+    return None
+
+
 def _try_claude_ai_json(data) -> Optional[str]:
     """Claude.ai JSON export: flat messages list or privacy export with chat_messages."""
     if isinstance(data, dict):
diff --git a/tests/test_normalize.py b/tests/test_normalize.py
index c175450..be408d5 100644
--- a/tests/test_normalize.py
+++ b/tests/test_normalize.py
@@ -11,6 +11,7 @@ from mempalace.normalize import (
     _try_claude_ai_json,
     _try_claude_code_jsonl,
     _try_codex_jsonl,
+    _try_gemini_jsonl,
     _try_normalize_json,
     _try_slack_json,
     normalize,
@@ -450,6 +451,149 @@ def test_codex_jsonl_payload_not_dict():
     assert result is not None
 
 
+# ── _try_gemini_jsonl ──────────────────────────────────────────────────
+#
+# Gemini CLI sessions live at ``~/.gemini/tmp/<project_hash>/chats/`` as
+# JSONL. The schema (per google-gemini/gemini-cli#15292):
+#
+#   {"type":"session_metadata","sessionId":"...","projectHash":"...",...}
+#   {"type":"user","id":"msg1","content":[{"text":"Hello"}]}
+#   {"type":"gemini","id":"msg2","content":[{"text":"Hi"}]}
+#   {"type":"message_update","id":"msg2","tokens":{"input":10,"output":5}}
+#
+# Detection requires a ``session_metadata`` record so this parser does
+# not false-positive against Claude Code or Codex JSONL. ``message_update``
+# entries (token-count deltas only) are skipped — they carry no message
+# text. ``content`` is an array of ``{"text": "..."}`` blocks; we join
+# all text blocks for a given message.
+
+
+def test_gemini_jsonl_valid():
+    lines = [
+        json.dumps({"type": "session_metadata", "sessionId": "abc", "projectHash": "h"}),
+        json.dumps({"type": "user", "id": "m1", "content": [{"text": "Hello"}]}),
+        json.dumps({"type": "gemini", "id": "m2", "content": [{"text": "Hi there"}]}),
+    ]
+    result = _try_gemini_jsonl("\n".join(lines))
+    assert result is not None
+    assert "> Hello" in result
+    assert "Hi there" in result
+
+
+def test_gemini_jsonl_multi_turn():
+    lines = [
+        json.dumps({"type": "session_metadata", "sessionId": "s"}),
+        json.dumps({"type": "user", "content": [{"text": "Q1"}]}),
+        json.dumps({"type": "gemini", "content": [{"text": "A1"}]}),
+        json.dumps({"type": "user", "content": [{"text": "Q2"}]}),
+        json.dumps({"type": "gemini", "content": [{"text": "A2"}]}),
+    ]
+    result = _try_gemini_jsonl("\n".join(lines))
+    assert result is not None
+    assert "> Q1" in result
+    assert "A1" in result
+    assert "> Q2" in result
+    assert "A2" in result
+
+
+def test_gemini_jsonl_no_session_metadata():
+    """Without session_metadata, parser returns None — guards against false
+    positives on Claude Code / Codex JSONL passed through the dispatch chain."""
+    lines = [
+        json.dumps({"type": "user", "content": [{"text": "Hi"}]}),
+        json.dumps({"type": "gemini", "content": [{"text": "Hello"}]}),
+    ]
+    result = _try_gemini_jsonl("\n".join(lines))
+    assert result is None
+
+
+def test_gemini_jsonl_skips_message_update():
+    """message_update records carry only token counts — must be ignored,
+    not turned into empty drawers or duplicated assistant turns."""
+    lines = [
+        json.dumps({"type": "session_metadata"}),
+        json.dumps({"type": "user", "content": [{"text": "Q"}]}),
+        json.dumps({"type": "gemini", "content": [{"text": "A"}]}),
+        json.dumps({"type": "message_update", "id": "m2", "tokens": {"input": 10, "output": 5}}),
+    ]
+    result = _try_gemini_jsonl("\n".join(lines))
+    assert result is not None
+    assert "tokens" not in result
+    assert "input" not in result
+
+
+def test_gemini_jsonl_too_few_messages():
+    """Mirror codex/claude_code behavior: < 2 conversational messages = None."""
+    lines = [
+        json.dumps({"type": "session_metadata"}),
+        json.dumps({"type": "user", "content": [{"text": "only one msg"}]}),
+    ]
+    result = _try_gemini_jsonl("\n".join(lines))
+    assert result is None
+
+
+def test_gemini_jsonl_multi_block_content():
+    """A single message can have multiple text blocks in its content array
+    (e.g. a thinking block + a final answer). Both should be concatenated
+    into one transcript turn, in order."""
+    lines = [
+        json.dumps({"type": "session_metadata"}),
+        json.dumps({"type": "user", "content": [{"text": "Q"}]}),
+        json.dumps(
+            {
+                "type": "gemini",
+                "content": [{"text": "First part."}, {"text": "Second part."}],
+            }
+        ),
+    ]
+    result = _try_gemini_jsonl("\n".join(lines))
+    assert result is not None
+    assert "First part." in result
+    assert "Second part." in result
+
+
+def test_gemini_jsonl_empty_content_skipped():
+    """A message whose content array yields no text should be skipped, not
+    emit an empty turn that would corrupt the transcript."""
+    lines = [
+        json.dumps({"type": "session_metadata"}),
+        json.dumps({"type": "user", "content": []}),
+        json.dumps({"type": "user", "content": [{"text": "real Q"}]}),
+        json.dumps({"type": "gemini", "content": [{"text": "real A"}]}),
+    ]
+    result = _try_gemini_jsonl("\n".join(lines))
+    assert result is not None
+    assert "> real Q" in result
+    assert "real A" in result
+
+
+def test_gemini_jsonl_invalid_json_lines_skipped():
+    """A malformed line in the middle of the stream must not abort parsing —
+    the rest of the session should still produce a transcript."""
+    lines = [
+        json.dumps({"type": "session_metadata"}),
+        "not-valid-json{",
+        json.dumps({"type": "user", "content": [{"text": "Q"}]}),
+        json.dumps({"type": "gemini", "content": [{"text": "A"}]}),
+    ]
+    result = _try_gemini_jsonl("\n".join(lines))
+    assert result is not None
+    assert "> Q" in result
+
+
+def test_gemini_jsonl_does_not_match_codex():
+    """Codex JSONL passed in must NOT be parsed by the gemini adapter — the
+    dispatch chain in _try_normalize_json relies on each adapter returning
+    None when it doesn't recognize a format."""
+    lines = [
+        json.dumps({"type": "session_meta", "payload": {}}),
+        json.dumps({"type": "event_msg", "payload": {"type": "user_message", "message": "Q"}}),
+        json.dumps({"type": "event_msg", "payload": {"type": "agent_message", "message": "A"}}),
+    ]
+    result = _try_gemini_jsonl("\n".join(lines))
+    assert result is None
+
+
 # ── _try_claude_ai_json ───────────────────────────────────────────────
 
 

From a3e3691e864c8920114c53aa37faa8dfc6c440c3 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 27 Apr 2026 19:00:18 +0000
Subject: [PATCH 2/3] docs(normalize): add Gemini CLI JSONL to module-level
 supported formats list

Agent-Logs-Url: https://github.com/MemPalace/mempalace/sessions/a32f48bb-2a78-494a-9698-e69304732d3f

Co-authored-by: igorls <4753812+igorls@users.noreply.github.com>
---
 mempalace/normalize.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mempalace/normalize.py b/mempalace/normalize.py
index f018935..9326a62 100644
--- a/mempalace/normalize.py
+++ b/mempalace/normalize.py
@@ -8,6 +8,7 @@ Supported:
     - ChatGPT conversations.json
     - Claude Code JSONL (with tool_use/tool_result block capture)
     - OpenAI Codex CLI JSONL
+    - Gemini CLI JSONL (~/.gemini/tmp/<project_hash>/chats/session-*.jsonl)
     - Slack JSON export
     - Plain text (pass through for paragraph chunking)
 

From e7fe6cae144b97f1f33a1def2a1150ca219cf6ba Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 27 Apr 2026 21:41:48 +0000
Subject: [PATCH 3/3] fix(normalize): discard user/gemini turns before
 session_metadata sentinel

Agent-Logs-Url: https://github.com/MemPalace/mempalace/sessions/4511e9aa-38e7-440e-a6f8-eda91e576f0f

Co-authored-by: igorls <4753812+igorls@users.noreply.github.com>
---
 mempalace/normalize.py  | 14 +++++++++++---
 tests/test_normalize.py | 19 +++++++++++++++++++
 2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/mempalace/normalize.py b/mempalace/normalize.py
index 9326a62..4252afa 100644
--- a/mempalace/normalize.py
+++ b/mempalace/normalize.py
@@ -296,9 +296,12 @@ def _try_gemini_jsonl(content: str) -> Optional[str]:
 
     Detection requires a ``session_metadata`` record so this parser does
     not false-positive against Claude Code or Codex JSONL passed through
-    the dispatch chain. ``message_update`` entries are skipped — they
-    have no message text. Multiple text blocks within a single message's
-    content array are concatenated in order, separated by newlines.
+    the dispatch chain. Any ``user``/``gemini`` lines that appear before
+    ``session_metadata`` are discarded — they are treated as preamble
+    noise, not conversational turns. ``message_update`` entries are
+    skipped — they have no message text. Multiple text blocks within a
+    single message's content array are concatenated in order, separated
+    by newlines.
     """
     lines = [line.strip() for line in content.strip().split("\n") if line.strip()]
     messages = []
@@ -316,6 +319,11 @@ def _try_gemini_jsonl(content: str) -> Optional[str]:
             has_session_metadata = True
             continue
 
+        # Discard everything (including user/gemini turns) until the
+        # session_metadata sentinel has been seen.
+        if not has_session_metadata:
+            continue
+
         if entry_type not in ("user", "gemini"):
             # Skips message_update, system events, anything else.
             continue
diff --git a/tests/test_normalize.py b/tests/test_normalize.py
index be408d5..2b0f180 100644
--- a/tests/test_normalize.py
+++ b/tests/test_normalize.py
@@ -594,6 +594,25 @@ def test_gemini_jsonl_does_not_match_codex():
     assert result is None
 
 
+def test_gemini_jsonl_messages_before_session_metadata_discarded():
+    """user/gemini turns that appear before the session_metadata sentinel must
+    be silently discarded, not counted as conversational messages.  Only turns
+    after the sentinel contribute to the transcript."""
+    lines = [
+        json.dumps({"type": "user", "content": [{"text": "preamble Q"}]}),
+        json.dumps({"type": "gemini", "content": [{"text": "preamble A"}]}),
+        json.dumps({"type": "session_metadata", "sessionId": "s"}),
+        json.dumps({"type": "user", "content": [{"text": "real Q"}]}),
+        json.dumps({"type": "gemini", "content": [{"text": "real A"}]}),
+    ]
+    result = _try_gemini_jsonl("\n".join(lines))
+    assert result is not None
+    assert "preamble Q" not in result
+    assert "preamble A" not in result
+    assert "> real Q" in result
+    assert "real A" in result
+
+
 # ── _try_claude_ai_json ───────────────────────────────────────────────