Raise MAX_FILE_SIZE cap from 10 MB to 500 MB

Long Claude Code sessions routinely produce transcripts larger than 10 MB. The previous cap, defined at miner.py:65, caused them to be silently dropped at miner.py:732 by `if filepath.stat().st_size > MAX_FILE_SIZE: continue` — the same silent-failure pattern as the .jsonl extension bug. The cap exists as a safety rail against pathological binaries, not as a limit on legitimate text. Downstream chunking at 800 chars per drawer means source file size does not affect storage or embedding cost. 500 MB leaves headroom for year-long continuous transcripts while still catching accidental multi-GB binary mines.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 07:09:05 -07:00
parent 560fdbdc9f
commit d137d12313
2 changed files with 61 additions and 2 deletions
@@ -63,7 +63,11 @@ SKIP_FILENAMES = {
 CHUNK_SIZE = 800  # chars per drawer
 CHUNK_OVERLAP = 100  # overlap between chunks
 MIN_CHUNK_SIZE = 50  # skip tiny chunks
-MAX_FILE_SIZE = 10 * 1024 * 1024  # 10 MB — skip files larger than this
+MAX_FILE_SIZE = 500 * 1024 * 1024  # 500 MB — skip files larger than this.
 # Long Claude Code sessions and large transcript exports routinely exceed
 # 10 MB. The cap exists as a defensive rail against pathological binary
 # files, not as a limit on legitimate text. Chunking at 800 chars per
 # drawer means source size does not affect storage or embedding cost.
 # =============================================================================
@@ -23,8 +23,9 @@ Written BEFORE the fix.
 import tempfile
 from pathlib import Path
 from unittest.mock import patch
-from mempalace.miner import READABLE_EXTENSIONS, scan_project
+from mempalace.miner import MAX_FILE_SIZE, READABLE_EXTENSIONS, scan_project
 class TestJsonlNotSilentlySkipped:
@@ -67,3 +68,57 @@ class TestJsonlNotSilentlySkipped:
                f"Returned: {found_names}. Users placing transcript "
                "exports in a project directory expect them to be mined."
            )
    def test_large_jsonl_not_silently_dropped_by_size_cap(self):
        """Guard the size cap against regressing below realistic transcripts.

        Long Claude Code sessions produce transcripts larger than the
        legacy 10 MB cap; those files were lost to a silent
        `if size > MAX: continue` at miner.py:732. This test pins the cap
        to at least 100 MB so such transcripts are still mined.
        """
        # Smallest acceptable cap: comfortably above the old 10 MB limit
        # that real long-session transcripts were exceeding.
        required_floor = 100 * 1024 * 1024
        failure_message = (
            f"MAX_FILE_SIZE is {MAX_FILE_SIZE} bytes "
            f"({MAX_FILE_SIZE / 1024 / 1024:.0f} MB). Long Claude Code "
            "sessions produce transcripts larger than 10 MB and get "
            "silently dropped. Raise to at least 100 MB — chunking "
            "at 800 chars per drawer means source file size doesn't "
            "matter for downstream storage."
        )
        assert MAX_FILE_SIZE >= required_floor, failure_message
    def test_scan_project_picks_up_50mb_jsonl(self):
        """A 50 MB .jsonl must not be filtered out by the size cap.

        Writing 50 MB for real would make the test slow, so a tiny file
        is written and ``Path.stat`` is patched to report 50 MB for it.
        Only ``st_size`` is overridden; every other stat field is
        delegated to the real result, so any other stat attribute read
        on this path still resolves instead of raising AttributeError.
        """
        fake_size = 50 * 1024 * 1024  # 50 MB
        real_stat = Path.stat

        class _InflatedStat:
            """Stat proxy: reports ``fake_size``, delegates everything else."""

            st_size = fake_size

            def __init__(self, real):
                self._real = real

            def __getattr__(self, attr):
                # Only reached for attributes not defined on the proxy,
                # i.e. everything except st_size and _real.
                return getattr(self._real, attr)

        def fake_stat(path, *args, **kwargs):
            # Forward any arguments (e.g. follow_symlinks) to the real
            # implementation so lstat-style calls keep working.
            result = real_stat(path, *args, **kwargs)
            if path.name == "big_transcript.jsonl":
                return _InflatedStat(result)
            return result

        with tempfile.TemporaryDirectory() as tmp:
            tmpdir = Path(tmp)
            big_jsonl = tmpdir / "big_transcript.jsonl"
            # Write a small real file so the existence / extension / text
            # checks pass; its reported size comes from the patched stat.
            big_jsonl.write_text(
                '{"role": "user", "content": "hi"}\n', encoding="utf-8"
            )
            with patch.object(Path, "stat", fake_stat):
                found = scan_project(str(tmpdir))
            found_names = [p.name for p in found]
            assert "big_transcript.jsonl" in found_names, (
                f"50 MB .jsonl was dropped by size cap (MAX_FILE_SIZE="
                f"{MAX_FILE_SIZE}). Returned: {found_names}."
            )