Merge pull request #1234 from MemPalace/feat/normalize-gemini-cli

feat(normalize): Gemini CLI session JSONL adapter
This commit is contained in:
Igor Lins e Silva
2026-04-27 20:42:06 -03:00
committed by GitHub
2 changed files with 236 additions and 0 deletions
+73
View File
@@ -8,6 +8,7 @@ Supported:
- ChatGPT conversations.json
- Claude Code JSONL (with tool_use/tool_result block capture)
- OpenAI Codex CLI JSONL
- Gemini CLI JSONL (~/.gemini/tmp/<project_hash>/chats/session-*.jsonl)
- Slack JSON export
- Plain text (pass through for paragraph chunking)
@@ -157,6 +158,10 @@ def _try_normalize_json(content: str) -> Optional[str]:
if normalized:
return normalized
normalized = _try_gemini_jsonl(content)
if normalized:
return normalized
try:
data = json.loads(content)
except json.JSONDecodeError:
@@ -280,6 +285,74 @@ def _try_codex_jsonl(content: str) -> Optional[str]:
return None
def _try_gemini_jsonl(content: str) -> Optional[str]:
    """Normalize a Gemini CLI session JSONL file into a transcript.

    Gemini CLI sessions live at
    ``~/.gemini/tmp/<project_hash>/chats/session-*.jsonl``.

    Schema (per google-gemini/gemini-cli#15292): a ``session_metadata``
    record on the first line, then a stream of ``{"type": "user",
    "content": [{"text": "..."}]}`` and ``{"type": "gemini", "content":
    [...]}`` records, with optional ``message_update`` records carrying
    token counts only.

    Detection requires a ``session_metadata`` record so this parser does
    not false-positive against Claude Code or Codex JSONL passed through
    the dispatch chain. Any ``user``/``gemini`` lines that appear before
    ``session_metadata`` are discarded — they are treated as preamble
    noise, not conversational turns. ``message_update`` entries (and any
    other record type) are skipped — they have no message text. Lines
    that are not valid JSON, or that do not decode to an object, are
    ignored rather than aborting the parse.

    Multiple text blocks within a single message's content are
    concatenated in order, separated by newlines.

    NOTE(review): besides the documented list-of-``{"text": ...}`` form,
    a plain-string ``content``, a single part object, and bare-string
    list elements are also accepted (the SDK's ``PartListUnion`` shape)
    so such turns are not silently dropped — confirm against real
    session files.

    Args:
        content: Raw file text, one JSON record per line.

    Returns:
        The value of ``_messages_to_transcript`` for the collected
        ``(role, text)`` pairs (role is ``"user"`` or ``"assistant"``),
        or ``None`` when no ``session_metadata`` record was seen or fewer
        than two turns with text follow it.
    """
    messages = []
    has_session_metadata = False

    for raw_line in content.strip().split("\n"):
        line = raw_line.strip()
        if not line:
            continue
        try:
            entry = json.loads(line)
        except json.JSONDecodeError:
            continue
        if not isinstance(entry, dict):
            continue

        entry_type = entry.get("type", "")
        if entry_type == "session_metadata":
            has_session_metadata = True
            continue
        # Discard everything (including user/gemini turns) until the
        # session_metadata sentinel has been seen.
        if not has_session_metadata:
            continue
        if entry_type not in ("user", "gemini"):
            # Skips message_update, system events, anything else.
            continue

        text = _gemini_content_text(entry.get("content", []))
        if not text:
            # No usable text: do not emit an empty turn.
            continue
        role = "user" if entry_type == "user" else "assistant"
        messages.append((role, text))

    # Turns are only collected after the sentinel, so a non-empty list
    # already implies session_metadata was present.
    if len(messages) < 2:
        return None
    return _messages_to_transcript(messages)


def _gemini_content_text(content_field) -> str:
    """Return the newline-joined text of a Gemini ``content`` value ("" if none)."""
    if isinstance(content_field, (str, dict)):
        # Single part (bare string or one part object): treat as a 1-item list.
        blocks = [content_field]
    elif isinstance(content_field, list):
        blocks = content_field
    else:
        return ""

    parts = []
    for block in blocks:
        text = block.get("text", "") if isinstance(block, dict) else block
        # Whitespace-only and non-string values carry no message text.
        if isinstance(text, str) and text.strip():
            parts.append(text)
    return "\n".join(parts)
def _try_claude_ai_json(data) -> Optional[str]:
"""Claude.ai JSON export: flat messages list or privacy export with chat_messages."""
if isinstance(data, dict):
+163
View File
@@ -11,6 +11,7 @@ from mempalace.normalize import (
_try_claude_ai_json,
_try_claude_code_jsonl,
_try_codex_jsonl,
_try_gemini_jsonl,
_try_normalize_json,
_try_slack_json,
normalize,
@@ -450,6 +451,168 @@ def test_codex_jsonl_payload_not_dict():
assert result is not None
# ── _try_gemini_jsonl ──────────────────────────────────────────────────
#
# Gemini CLI sessions live at ``~/.gemini/tmp/<project_hash>/chats/`` as
# JSONL. The schema (per google-gemini/gemini-cli#15292):
#
# {"type":"session_metadata","sessionId":"...","projectHash":"...",...}
# {"type":"user","id":"msg1","content":[{"text":"Hello"}]}
# {"type":"gemini","id":"msg2","content":[{"text":"Hi"}]}
# {"type":"message_update","id":"msg2","tokens":{"input":10,"output":5}}
#
# Detection requires a ``session_metadata`` record so this parser does
# not false-positive against Claude Code or Codex JSONL. ``message_update``
# entries (token-count deltas only) are skipped — they carry no message
# text. ``content`` is an array of ``{"text": "..."}`` blocks; we join
# all text blocks for a given message.
def test_gemini_jsonl_valid():
    """A metadata record plus one user and one gemini turn produces a transcript."""
    records = (
        {"type": "session_metadata", "sessionId": "abc", "projectHash": "h"},
        {"type": "user", "id": "m1", "content": [{"text": "Hello"}]},
        {"type": "gemini", "id": "m2", "content": [{"text": "Hi there"}]},
    )
    payload = "\n".join(json.dumps(record) for record in records)

    transcript = _try_gemini_jsonl(payload)

    assert transcript is not None
    # The user turn is checked with its "> " prefix.
    assert "> Hello" in transcript
    assert "Hi there" in transcript
def test_gemini_jsonl_multi_turn():
    """Two question/answer rounds after the metadata record all reach the transcript."""
    records = [{"type": "session_metadata", "sessionId": "s"}]
    for question, answer in (("Q1", "A1"), ("Q2", "A2")):
        records.append({"type": "user", "content": [{"text": question}]})
        records.append({"type": "gemini", "content": [{"text": answer}]})
    payload = "\n".join(json.dumps(record) for record in records)

    transcript = _try_gemini_jsonl(payload)

    assert transcript is not None
    for expected in ("> Q1", "A1", "> Q2", "A2"):
        assert expected in transcript
def test_gemini_jsonl_no_session_metadata():
    """The parser must return None when no session_metadata record exists.

    This is the guard against false positives on Claude Code / Codex JSONL
    that reaches this adapter through the dispatch chain.
    """
    user_line = json.dumps({"type": "user", "content": [{"text": "Hi"}]})
    gemini_line = json.dumps({"type": "gemini", "content": [{"text": "Hello"}]})

    outcome = _try_gemini_jsonl(user_line + "\n" + gemini_line)

    assert outcome is None
def test_gemini_jsonl_skips_message_update():
    """message_update records hold only token counts and must be ignored.

    They must not become empty drawers or duplicated assistant turns, so
    none of their field names may leak into the transcript.
    """
    records = (
        {"type": "session_metadata"},
        {"type": "user", "content": [{"text": "Q"}]},
        {"type": "gemini", "content": [{"text": "A"}]},
        {"type": "message_update", "id": "m2", "tokens": {"input": 10, "output": 5}},
    )
    payload = "\n".join(json.dumps(record) for record in records)

    transcript = _try_gemini_jsonl(payload)

    assert transcript is not None
    for leaked_key in ("tokens", "input"):
        assert leaked_key not in transcript
def test_gemini_jsonl_too_few_messages():
    """Fewer than two conversational messages gives None, as for codex/claude_code."""
    metadata_line = json.dumps({"type": "session_metadata"})
    lone_turn = json.dumps({"type": "user", "content": [{"text": "only one msg"}]})

    outcome = _try_gemini_jsonl("\n".join((metadata_line, lone_turn)))

    assert outcome is None
def test_gemini_jsonl_multi_block_content():
    """A single message can have multiple text blocks in its content array
    (e.g. a thinking block + a final answer). Both should be concatenated
    into one transcript turn, in order."""
    lines = [
        json.dumps({"type": "session_metadata"}),
        json.dumps({"type": "user", "content": [{"text": "Q"}]}),
        json.dumps(
            {
                "type": "gemini",
                "content": [{"text": "First part."}, {"text": "Second part."}],
            }
        ),
    ]
    result = _try_gemini_jsonl("\n".join(lines))
    assert result is not None
    assert "First part." in result
    assert "Second part." in result
    # The docstring promises ordering, so pin it: presence checks alone
    # would still pass if the blocks were emitted in reverse.
    assert result.index("First part.") < result.index("Second part.")
def test_gemini_jsonl_empty_content_skipped():
    """A message with an empty content array is dropped, not emitted as an empty turn.

    The remaining user/gemini pair must still produce a transcript.
    """
    records = (
        {"type": "session_metadata"},
        {"type": "user", "content": []},
        {"type": "user", "content": [{"text": "real Q"}]},
        {"type": "gemini", "content": [{"text": "real A"}]},
    )
    payload = "\n".join(json.dumps(record) for record in records)

    transcript = _try_gemini_jsonl(payload)

    assert transcript is not None
    assert "> real Q" in transcript
    assert "real A" in transcript
def test_gemini_jsonl_invalid_json_lines_skipped():
    """A malformed line mid-stream must not abort parsing.

    The valid records around it should still produce a transcript.
    """
    metadata_line = json.dumps({"type": "session_metadata"})
    broken_line = "not-valid-json{"
    user_line = json.dumps({"type": "user", "content": [{"text": "Q"}]})
    gemini_line = json.dumps({"type": "gemini", "content": [{"text": "A"}]})
    payload = "\n".join((metadata_line, broken_line, user_line, gemini_line))

    transcript = _try_gemini_jsonl(payload)

    assert transcript is not None
    assert "> Q" in transcript
def test_gemini_jsonl_does_not_match_codex():
    """Codex-shaped JSONL must be rejected by the gemini adapter.

    The dispatch chain in _try_normalize_json relies on each adapter
    returning None for formats it does not recognize.
    """
    codex_records = [{"type": "session_meta", "payload": {}}]
    for payload_type, message in (("user_message", "Q"), ("agent_message", "A")):
        codex_records.append(
            {"type": "event_msg", "payload": {"type": payload_type, "message": message}}
        )
    payload = "\n".join(json.dumps(record) for record in codex_records)

    outcome = _try_gemini_jsonl(payload)

    assert outcome is None
def test_gemini_jsonl_messages_before_session_metadata_discarded():
    """Turns that precede the session_metadata sentinel are silently discarded.

    Only user/gemini turns after the sentinel may count as conversational
    messages or appear in the transcript.
    """
    records = (
        {"type": "user", "content": [{"text": "preamble Q"}]},
        {"type": "gemini", "content": [{"text": "preamble A"}]},
        {"type": "session_metadata", "sessionId": "s"},
        {"type": "user", "content": [{"text": "real Q"}]},
        {"type": "gemini", "content": [{"text": "real A"}]},
    )
    payload = "\n".join(json.dumps(record) for record in records)

    transcript = _try_gemini_jsonl(payload)

    assert transcript is not None
    for dropped in ("preamble Q", "preamble A"):
        assert dropped not in transcript
    assert "> real Q" in transcript
    assert "real A" in transcript
# ── _try_claude_ai_json ───────────────────────────────────────────────