Raise MAX_FILE_SIZE cap from 10 MB to 500 MB

Long Claude Code sessions routinely produce transcripts larger than 10
MB. The previous cap (defined at miner.py:65) caused them to be silently
skipped at line 732 by `if filepath.stat().st_size > MAX_FILE_SIZE:
continue` — the same silent-failure pattern as the .jsonl extension bug.

The cap exists as a safety rail against pathological binaries, not as
a limit on legitimate text. Downstream chunking at 800 chars per drawer
means source file size does not affect storage or embedding cost.

500 MB leaves headroom for year-long continuous transcripts while still
catching accidental multi-GB binary mines.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MSL
2026-04-18 07:09:05 -07:00
committed by Igor Lins e Silva
parent 560fdbdc9f
commit d137d12313
2 changed files with 61 additions and 2 deletions
+5 -1
View File
@@ -63,7 +63,11 @@ SKIP_FILENAMES = {
CHUNK_SIZE = 800 # chars per drawer CHUNK_SIZE = 800 # chars per drawer
CHUNK_OVERLAP = 100 # overlap between chunks CHUNK_OVERLAP = 100 # overlap between chunks
MIN_CHUNK_SIZE = 50 # skip tiny chunks MIN_CHUNK_SIZE = 50 # skip tiny chunks
MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB — skip files larger than this MAX_FILE_SIZE = 500 * 1024 * 1024 # 500 MB — skip files larger than this.
# Long Claude Code sessions and large transcript exports routinely exceed
# 10 MB. The cap exists as a defensive rail against pathological binary
# files, not as a limit on legitimate text. Chunking at 800 chars per
# drawer means source size does not affect storage or embedding cost.
# ============================================================================= # =============================================================================
+56 -1
View File
@@ -23,8 +23,9 @@ Written BEFORE the fix.
import tempfile import tempfile
from pathlib import Path from pathlib import Path
from unittest.mock import patch
from mempalace.miner import READABLE_EXTENSIONS, scan_project from mempalace.miner import MAX_FILE_SIZE, READABLE_EXTENSIONS, scan_project
class TestJsonlNotSilentlySkipped: class TestJsonlNotSilentlySkipped:
@@ -67,3 +68,57 @@ class TestJsonlNotSilentlySkipped:
f"Returned: {found_names}. Users placing transcript " f"Returned: {found_names}. Users placing transcript "
"exports in a project directory expect them to be mined." "exports in a project directory expect them to be mined."
) )
def test_large_jsonl_not_silently_dropped_by_size_cap(self):
    """Guard against the size cap regressing below 100 MB.

    The earlier 10 MB cap was smaller than the transcript of a long
    Claude Code session, so such files were skipped during scanning
    without any warning. This test pins the constant itself: it must
    stay comfortably above any realistic transcript size.
    """
    # Lower bound the cap must never fall beneath (100 MB in bytes).
    required_minimum = 100 * 1024 * 1024

    # Built up front so the assertion line stays short; it is only
    # shown when the assertion fails.
    failure_message = (
        f"MAX_FILE_SIZE is {MAX_FILE_SIZE} bytes "
        f"({MAX_FILE_SIZE / 1024 / 1024:.0f} MB). Long Claude Code "
        "sessions produce transcripts larger than 10 MB and get "
        "silently dropped. Raise to at least 100 MB — chunking "
        "at 800 chars per drawer means source file size doesn't "
        "matter for downstream storage."
    )

    assert MAX_FILE_SIZE >= required_minimum, failure_message
def test_scan_project_picks_up_50mb_jsonl(self):
    """A 50 MB .jsonl must not be filtered out by the size cap.

    Writing a real 50 MB file would make the test slow, so a tiny file
    is written instead and ``Path.stat`` is patched to report a 50 MB
    size for it. ``scan_project`` must still return that file.
    """
    with tempfile.TemporaryDirectory() as tmp:
        tmpdir = Path(tmp)
        big_jsonl = tmpdir / "big_transcript.jsonl"
        # A small real file lets the existence / extension / text checks
        # pass; only its reported size is faked below.
        big_jsonl.write_text(
            '{"role": "user", "content": "hi"}\n', encoding="utf-8"
        )

        fake_size = 50 * 1024 * 1024  # 50 MB
        real_stat = Path.stat

        class _FakeStat:
            """Real stat result with only ``st_size`` overridden.

            Every other field (st_mode, st_mtime, st_ino, ...) is
            forwarded to the genuine result, so code that reads more
            than the size does not hit an AttributeError.
            """

            def __init__(self, real, size):
                self._real = real
                self.st_size = size

            def __getattr__(self, name):
                # Only reached when normal lookup fails, i.e. for any
                # field other than the overridden st_size.
                return getattr(self._real, name)

        def fake_stat(self, *args, **kwargs):
            result = real_stat(self, *args, **kwargs)
            if self.name == "big_transcript.jsonl":
                return _FakeStat(result, fake_size)
            return result

        with patch.object(Path, "stat", fake_stat):
            found = scan_project(str(tmpdir))
            # Consume the result while the patch is active, in case
            # scan_project yields its paths lazily.
            found_names = [p.name for p in found]

        assert "big_transcript.jsonl" in found_names, (
            f"50 MB .jsonl was dropped by size cap (MAX_FILE_SIZE="
            f"{MAX_FILE_SIZE}). Returned: {found_names}."
        )