diff --git a/mempalace/miner.py b/mempalace/miner.py index f0177fa..4e809a8 100644 --- a/mempalace/miner.py +++ b/mempalace/miner.py @@ -63,7 +63,11 @@ SKIP_FILENAMES = { CHUNK_SIZE = 800 # chars per drawer CHUNK_OVERLAP = 100 # overlap between chunks MIN_CHUNK_SIZE = 50 # skip tiny chunks -MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB — skip files larger than this +MAX_FILE_SIZE = 500 * 1024 * 1024 # 500 MB — skip files larger than this. +# Long Claude Code sessions and large transcript exports routinely exceed +# 10 MB. The cap exists as a defensive rail against pathological binary +# files, not as a limit on legitimate text. Chunking at 800 chars per +# drawer means source size does not affect storage or embedding cost. # ============================================================================= diff --git a/tests/test_miner_jsonl_visibility.py b/tests/test_miner_jsonl_visibility.py index 6d24670..2ef3e18 100644 --- a/tests/test_miner_jsonl_visibility.py +++ b/tests/test_miner_jsonl_visibility.py @@ -23,8 +23,9 @@ Written BEFORE the fix. import tempfile from pathlib import Path +from unittest.mock import patch -from mempalace.miner import READABLE_EXTENSIONS, scan_project +from mempalace.miner import MAX_FILE_SIZE, READABLE_EXTENSIONS, scan_project class TestJsonlNotSilentlySkipped: @@ -67,3 +68,57 @@ class TestJsonlNotSilentlySkipped: f"Returned: {found_names}. Users placing transcript " "exports in a project directory expect them to be mined." ) + + def test_large_jsonl_not_silently_dropped_by_size_cap(self): + """Long sessions produce >10 MB transcripts. They must still mine. + + The legacy cap was 10 MB, which is smaller than a long Claude Code + session's transcript. Users hitting the cap lost their entire + conversation to a silent `if size > MAX: continue` at miner.py:732. + Raise the cap well above any realistic transcript size. + """ + # 10 MB cap was silent failure — real Claude Code long sessions + # exceed this. The cap must accommodate them. + assert MAX_FILE_SIZE >= 100 * 1024 * 1024, ( + f"MAX_FILE_SIZE is {MAX_FILE_SIZE} bytes " + f"({MAX_FILE_SIZE / 1024 / 1024:.0f} MB). Long Claude Code " + "sessions produce transcripts larger than 10 MB and get " + "silently dropped. Raise to at least 100 MB — chunking " + "at 800 chars per drawer means source file size doesn't " + "matter for downstream storage." + ) + + def test_scan_project_picks_up_50mb_jsonl(self): + """A 50 MB .jsonl must not be filtered out by the size cap. + + We don't actually write 50 MB (slow test). Instead, we mock + stat().st_size to report a 50 MB file and confirm scan_project + still includes it. + """ + with tempfile.TemporaryDirectory() as tmp: + tmpdir = Path(tmp) + big_jsonl = tmpdir / "big_transcript.jsonl" + # Write a small real file so the existence / extension / text + # checks pass; then mock its reported size. + big_jsonl.write_text('{"role": "user", "content": "hi"}\n') + fake_size = 50 * 1024 * 1024 # 50 MB + + real_stat = Path.stat + + def fake_stat(self, *args, **kwargs): + result = real_stat(self, *args, **kwargs) + if self.name == "big_transcript.jsonl": + class _FakeStat: + st_size = fake_size + st_mode = result.st_mode + return _FakeStat() + return result + + with patch.object(Path, "stat", fake_stat): + found = scan_project(str(tmpdir)) + + found_names = [p.name for p in found] + assert "big_transcript.jsonl" in found_names, ( + f"50 MB .jsonl was dropped by size cap (MAX_FILE_SIZE=" + f"{MAX_FILE_SIZE}). Returned: {found_names}." + )