Merge pull request #1234 from MemPalace/feat/normalize-gemini-cli
feat(normalize): Gemini CLI session JSONL adapter
This commit is contained in:
@@ -8,6 +8,7 @@ Supported:
|
|||||||
- ChatGPT conversations.json
|
- ChatGPT conversations.json
|
||||||
- Claude Code JSONL (with tool_use/tool_result block capture)
|
- Claude Code JSONL (with tool_use/tool_result block capture)
|
||||||
- OpenAI Codex CLI JSONL
|
- OpenAI Codex CLI JSONL
|
||||||
|
- Gemini CLI JSONL (~/.gemini/tmp/<project_hash>/chats/session-*.jsonl)
|
||||||
- Slack JSON export
|
- Slack JSON export
|
||||||
- Plain text (pass through for paragraph chunking)
|
- Plain text (pass through for paragraph chunking)
|
||||||
|
|
||||||
@@ -157,6 +158,10 @@ def _try_normalize_json(content: str) -> Optional[str]:
|
|||||||
if normalized:
|
if normalized:
|
||||||
return normalized
|
return normalized
|
||||||
|
|
||||||
|
normalized = _try_gemini_jsonl(content)
|
||||||
|
if normalized:
|
||||||
|
return normalized
|
||||||
|
|
||||||
try:
|
try:
|
||||||
data = json.loads(content)
|
data = json.loads(content)
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
@@ -280,6 +285,74 @@ def _try_codex_jsonl(content: str) -> Optional[str]:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _try_gemini_jsonl(content: str) -> Optional[str]:
|
||||||
|
"""Gemini CLI sessions (~/.gemini/tmp/<project_hash>/chats/session-*.jsonl).
|
||||||
|
|
||||||
|
Schema (per google-gemini/gemini-cli#15292): a session_metadata record
|
||||||
|
on the first line, then a stream of ``{"type": "user", "content":
|
||||||
|
[{"text": "..."}]}`` and ``{"type": "gemini", "content": [...]}``
|
||||||
|
records, with optional ``message_update`` records carrying token
|
||||||
|
counts only.
|
||||||
|
|
||||||
|
Detection requires a ``session_metadata`` record so this parser does
|
||||||
|
not false-positive against Claude Code or Codex JSONL passed through
|
||||||
|
the dispatch chain. Any ``user``/``gemini`` lines that appear before
|
||||||
|
``session_metadata`` are discarded — they are treated as preamble
|
||||||
|
noise, not conversational turns. ``message_update`` entries are
|
||||||
|
skipped — they have no message text. Multiple text blocks within a
|
||||||
|
single message's content array are concatenated in order, separated
|
||||||
|
by newlines.
|
||||||
|
"""
|
||||||
|
lines = [line.strip() for line in content.strip().split("\n") if line.strip()]
|
||||||
|
messages = []
|
||||||
|
has_session_metadata = False
|
||||||
|
for line in lines:
|
||||||
|
try:
|
||||||
|
entry = json.loads(line)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
continue
|
||||||
|
if not isinstance(entry, dict):
|
||||||
|
continue
|
||||||
|
|
||||||
|
entry_type = entry.get("type", "")
|
||||||
|
if entry_type == "session_metadata":
|
||||||
|
has_session_metadata = True
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Discard everything (including user/gemini turns) until the
|
||||||
|
# session_metadata sentinel has been seen.
|
||||||
|
if not has_session_metadata:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if entry_type not in ("user", "gemini"):
|
||||||
|
# Skips message_update, system events, anything else.
|
||||||
|
continue
|
||||||
|
|
||||||
|
content_blocks = entry.get("content", [])
|
||||||
|
if not isinstance(content_blocks, list):
|
||||||
|
continue
|
||||||
|
|
||||||
|
parts = []
|
||||||
|
for block in content_blocks:
|
||||||
|
if not isinstance(block, dict):
|
||||||
|
continue
|
||||||
|
text = block.get("text", "")
|
||||||
|
if isinstance(text, str) and text.strip():
|
||||||
|
parts.append(text)
|
||||||
|
if not parts:
|
||||||
|
continue
|
||||||
|
joined = "\n".join(parts)
|
||||||
|
|
||||||
|
if entry_type == "user":
|
||||||
|
messages.append(("user", joined))
|
||||||
|
else: # "gemini"
|
||||||
|
messages.append(("assistant", joined))
|
||||||
|
|
||||||
|
if len(messages) >= 2 and has_session_metadata:
|
||||||
|
return _messages_to_transcript(messages)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _try_claude_ai_json(data) -> Optional[str]:
|
def _try_claude_ai_json(data) -> Optional[str]:
|
||||||
"""Claude.ai JSON export: flat messages list or privacy export with chat_messages."""
|
"""Claude.ai JSON export: flat messages list or privacy export with chat_messages."""
|
||||||
if isinstance(data, dict):
|
if isinstance(data, dict):
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ from mempalace.normalize import (
|
|||||||
_try_claude_ai_json,
|
_try_claude_ai_json,
|
||||||
_try_claude_code_jsonl,
|
_try_claude_code_jsonl,
|
||||||
_try_codex_jsonl,
|
_try_codex_jsonl,
|
||||||
|
_try_gemini_jsonl,
|
||||||
_try_normalize_json,
|
_try_normalize_json,
|
||||||
_try_slack_json,
|
_try_slack_json,
|
||||||
normalize,
|
normalize,
|
||||||
@@ -450,6 +451,168 @@ def test_codex_jsonl_payload_not_dict():
|
|||||||
assert result is not None
|
assert result is not None
|
||||||
|
|
||||||
|
|
||||||
|
# ── _try_gemini_jsonl ──────────────────────────────────────────────────
|
||||||
|
#
|
||||||
|
# Gemini CLI sessions live at ``~/.gemini/tmp/<project_hash>/chats/`` as
|
||||||
|
# JSONL. The schema (per google-gemini/gemini-cli#15292):
|
||||||
|
#
|
||||||
|
# {"type":"session_metadata","sessionId":"...","projectHash":"...",...}
|
||||||
|
# {"type":"user","id":"msg1","content":[{"text":"Hello"}]}
|
||||||
|
# {"type":"gemini","id":"msg2","content":[{"text":"Hi"}]}
|
||||||
|
# {"type":"message_update","id":"msg2","tokens":{"input":10,"output":5}}
|
||||||
|
#
|
||||||
|
# Detection requires a ``session_metadata`` record so this parser does
|
||||||
|
# not false-positive against Claude Code or Codex JSONL. ``message_update``
|
||||||
|
# entries (token-count deltas only) are skipped — they carry no message
|
||||||
|
# text. ``content`` is an array of ``{"text": "..."}`` blocks; we join
|
||||||
|
# all text blocks for a given message.
|
||||||
|
|
||||||
|
|
||||||
|
def test_gemini_jsonl_valid():
    """A metadata record followed by one user and one gemini turn parses."""
    records = [
        {"type": "session_metadata", "sessionId": "abc", "projectHash": "h"},
        {"type": "user", "id": "m1", "content": [{"text": "Hello"}]},
        {"type": "gemini", "id": "m2", "content": [{"text": "Hi there"}]},
    ]
    payload = "\n".join(json.dumps(record) for record in records)

    transcript = _try_gemini_jsonl(payload)

    assert transcript is not None
    for fragment in ("> Hello", "Hi there"):
        assert fragment in transcript
|
||||||
|
|
||||||
|
|
||||||
|
def test_gemini_jsonl_multi_turn():
    """Two question/answer rounds all show up in the transcript."""
    records = [{"type": "session_metadata", "sessionId": "s"}]
    # Alternate user / gemini records: Q1, A1, Q2, A2.
    for question, answer in (("Q1", "A1"), ("Q2", "A2")):
        records.append({"type": "user", "content": [{"text": question}]})
        records.append({"type": "gemini", "content": [{"text": answer}]})
    payload = "\n".join(json.dumps(record) for record in records)

    transcript = _try_gemini_jsonl(payload)

    assert transcript is not None
    for fragment in ("> Q1", "A1", "> Q2", "A2"):
        assert fragment in transcript
|
||||||
|
|
||||||
|
|
||||||
|
def test_gemini_jsonl_no_session_metadata():
    """No session_metadata record means no match: the parser returns None.

    This is the guard against false positives on Claude Code / Codex JSONL
    passed through the dispatch chain.
    """
    user_record = json.dumps({"type": "user", "content": [{"text": "Hi"}]})
    gemini_record = json.dumps({"type": "gemini", "content": [{"text": "Hello"}]})

    outcome = _try_gemini_jsonl(user_record + "\n" + gemini_record)

    assert outcome is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_gemini_jsonl_skips_message_update():
    """message_update records (token counts only) must leave no trace.

    They must not become empty drawers or duplicated assistant turns.
    """
    records = [
        {"type": "session_metadata"},
        {"type": "user", "content": [{"text": "Q"}]},
        {"type": "gemini", "content": [{"text": "A"}]},
        {"type": "message_update", "id": "m2", "tokens": {"input": 10, "output": 5}},
    ]
    payload = "\n".join(json.dumps(record) for record in records)

    transcript = _try_gemini_jsonl(payload)

    assert transcript is not None
    # Neither key of the token-count record may leak into the output.
    for leaked in ("tokens", "input"):
        assert leaked not in transcript
|
||||||
|
|
||||||
|
|
||||||
|
def test_gemini_jsonl_too_few_messages():
    """Fewer than two conversational messages yields None (as codex/claude_code do)."""
    metadata = json.dumps({"type": "session_metadata"})
    lone_turn = json.dumps({"type": "user", "content": [{"text": "only one msg"}]})

    outcome = _try_gemini_jsonl("\n".join((metadata, lone_turn)))

    assert outcome is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_gemini_jsonl_multi_block_content():
    """A single message can have multiple text blocks in its content array
    (e.g. a thinking block + a final answer). Both should be concatenated
    into one transcript turn, in order."""
    lines = [
        json.dumps({"type": "session_metadata"}),
        json.dumps({"type": "user", "content": [{"text": "Q"}]}),
        json.dumps(
            {
                "type": "gemini",
                "content": [{"text": "First part."}, {"text": "Second part."}],
            }
        ),
    ]
    result = _try_gemini_jsonl("\n".join(lines))
    assert result is not None
    assert "First part." in result
    assert "Second part." in result
    # The docstring promises ordering, so pin it: a parser that reversed or
    # shuffled the blocks would otherwise still pass the presence checks.
    assert result.index("First part.") < result.index("Second part.")
|
||||||
|
|
||||||
|
|
||||||
|
def test_gemini_jsonl_empty_content_skipped():
    """A message with an empty content array is skipped rather than emitted
    as an empty turn that would corrupt the transcript."""
    records = [
        {"type": "session_metadata"},
        {"type": "user", "content": []},
        {"type": "user", "content": [{"text": "real Q"}]},
        {"type": "gemini", "content": [{"text": "real A"}]},
    ]
    payload = "\n".join(json.dumps(record) for record in records)

    transcript = _try_gemini_jsonl(payload)

    assert transcript is not None
    for fragment in ("> real Q", "real A"):
        assert fragment in transcript
|
||||||
|
|
||||||
|
|
||||||
|
def test_gemini_jsonl_invalid_json_lines_skipped():
    """A malformed line mid-stream is skipped; the remaining records must
    still produce a transcript."""
    metadata = json.dumps({"type": "session_metadata"})
    garbage = "not-valid-json{"
    question = json.dumps({"type": "user", "content": [{"text": "Q"}]})
    answer = json.dumps({"type": "gemini", "content": [{"text": "A"}]})

    transcript = _try_gemini_jsonl("\n".join((metadata, garbage, question, answer)))

    assert transcript is not None
    assert "> Q" in transcript
|
||||||
|
|
||||||
|
|
||||||
|
def test_gemini_jsonl_does_not_match_codex():
    """Codex-shaped JSONL must be rejected by the gemini adapter.

    The dispatch chain in _try_normalize_json relies on each adapter
    returning None for formats it does not recognize.
    """
    codex_records = [{"type": "session_meta", "payload": {}}]
    for payload_type, message in (("user_message", "Q"), ("agent_message", "A")):
        codex_records.append(
            {"type": "event_msg", "payload": {"type": payload_type, "message": message}}
        )
    payload = "\n".join(json.dumps(record) for record in codex_records)

    outcome = _try_gemini_jsonl(payload)

    assert outcome is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_gemini_jsonl_messages_before_session_metadata_discarded():
    """Turns placed ahead of the session_metadata sentinel are dropped
    silently; only turns after it reach the transcript."""
    records = [
        {"type": "user", "content": [{"text": "preamble Q"}]},
        {"type": "gemini", "content": [{"text": "preamble A"}]},
        {"type": "session_metadata", "sessionId": "s"},
        {"type": "user", "content": [{"text": "real Q"}]},
        {"type": "gemini", "content": [{"text": "real A"}]},
    ]
    payload = "\n".join(json.dumps(record) for record in records)

    transcript = _try_gemini_jsonl(payload)

    assert transcript is not None
    for dropped in ("preamble Q", "preamble A"):
        assert dropped not in transcript
    for kept in ("> real Q", "real A"):
        assert kept in transcript
|
||||||
|
|
||||||
|
|
||||||
# ── _try_claude_ai_json ───────────────────────────────────────────────
|
# ── _try_claude_ai_json ───────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user