Raise MAX_FILE_SIZE cap from 10 MB to 500 MB
Long Claude Code sessions routinely produce transcripts larger than 10 MB. The previous cap, defined at miner.py:65, caused those files to be silently dropped at miner.py:732 by `if filepath.stat().st_size > MAX_FILE_SIZE: continue` — the same silent-failure pattern as the .jsonl extension bug. The cap exists as a safety rail against pathological binaries, not as a limit on legitimate text. Downstream chunking at 800 chars per drawer means source file size does not affect storage or embedding cost. 500 MB leaves headroom for year-long continuous transcripts while still catching accidental mining of multi-GB binary files.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+5
-1
@@ -63,7 +63,11 @@ SKIP_FILENAMES = {
|
|||||||
CHUNK_SIZE = 800 # chars per drawer
|
CHUNK_SIZE = 800 # chars per drawer
|
||||||
CHUNK_OVERLAP = 100 # overlap between chunks
|
CHUNK_OVERLAP = 100 # overlap between chunks
|
||||||
MIN_CHUNK_SIZE = 50 # skip tiny chunks
|
MIN_CHUNK_SIZE = 50 # skip tiny chunks
|
||||||
MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB — skip files larger than this
|
MAX_FILE_SIZE = 500 * 1024 * 1024 # 500 MB — skip files larger than this.
|
||||||
|
# Long Claude Code sessions and large transcript exports routinely exceed
|
||||||
|
# 10 MB. The cap exists as a defensive rail against pathological binary
|
||||||
|
# files, not as a limit on legitimate text. Chunking at 800 chars per
|
||||||
|
# drawer means source size does not affect storage or embedding cost.
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|||||||
@@ -23,8 +23,9 @@ Written BEFORE the fix.
|
|||||||
|
|
||||||
import tempfile
|
import tempfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
from mempalace.miner import READABLE_EXTENSIONS, scan_project
|
from mempalace.miner import MAX_FILE_SIZE, READABLE_EXTENSIONS, scan_project
|
||||||
|
|
||||||
|
|
||||||
class TestJsonlNotSilentlySkipped:
|
class TestJsonlNotSilentlySkipped:
|
||||||
@@ -67,3 +68,57 @@ class TestJsonlNotSilentlySkipped:
|
|||||||
f"Returned: {found_names}. Users placing transcript "
|
f"Returned: {found_names}. Users placing transcript "
|
||||||
"exports in a project directory expect them to be mined."
|
"exports in a project directory expect them to be mined."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_large_jsonl_not_silently_dropped_by_size_cap(self):
    """Guard the size cap against regressing below a usable floor.

    Transcripts from long Claude Code sessions exceed 10 MB. With the
    legacy 10 MB cap those files were skipped by a silent
    `if size > MAX: continue` at miner.py:732, so the whole conversation
    was lost without any warning. This test pins MAX_FILE_SIZE to at
    least 100 MB so that regression cannot come back unnoticed.
    """
    # Floor the cap must clear; realistic transcripts sit well below it.
    minimum_cap = 100 * 1024 * 1024
    cap_in_mb = MAX_FILE_SIZE / 1024 / 1024

    failure_message = (
        f"MAX_FILE_SIZE is {MAX_FILE_SIZE} bytes "
        f"({cap_in_mb:.0f} MB). Long Claude Code "
        "sessions produce transcripts larger than 10 MB and get "
        "silently dropped. Raise to at least 100 MB — chunking "
        "at 800 chars per drawer means source file size doesn't "
        "matter for downstream storage."
    )
    assert MAX_FILE_SIZE >= minimum_cap, failure_message
|
||||||
|
|
||||||
|
def test_scan_project_picks_up_50mb_jsonl(self):
    """A 50 MB .jsonl must not be filtered out by the size cap.

    Writing 50 MB for real would make the test slow, so a tiny file is
    written instead and ``Path.stat`` is patched to report a 50 MB size
    for that one file. The test then confirms scan_project still
    returns it.
    """
    with tempfile.TemporaryDirectory() as tmp:
        tmpdir = Path(tmp)
        big_jsonl = tmpdir / "big_transcript.jsonl"
        # Write a small real file so the existence / extension / text
        # checks pass; only its reported size is faked.
        big_jsonl.write_text('{"role": "user", "content": "hi"}\n')
        fake_size = 50 * 1024 * 1024  # 50 MB

        real_stat = Path.stat

        def fake_stat(self, *args, **kwargs):
            # Always take the real stat first so every other path (and
            # every other field of this path) behaves normally.
            result = real_stat(self, *args, **kwargs)
            if self.name != "big_transcript.jsonl":
                return result

            class _FakeStat:
                # Only the size is overridden.
                st_size = fake_size

                def __getattr__(self, attr):
                    # Forward st_mode, st_mtime, st_ino, ... to the real
                    # stat result, so reading any other field does not
                    # raise AttributeError and fail the test for a reason
                    # unrelated to the size cap.
                    return getattr(result, attr)

            return _FakeStat()

        with patch.object(Path, "stat", fake_stat):
            found = scan_project(str(tmpdir))

        found_names = [p.name for p in found]
        assert "big_transcript.jsonl" in found_names, (
            f"50 MB .jsonl was dropped by size cap (MAX_FILE_SIZE="
            f"{MAX_FILE_SIZE}). Returned: {found_names}."
        )
|
||||||
|
|||||||
Reference in New Issue
Block a user