diff --git a/mempalace/convo_miner.py b/mempalace/convo_miner.py index ce25d1e..bdec5f1 100644 --- a/mempalace/convo_miner.py +++ b/mempalace/convo_miner.py @@ -28,6 +28,7 @@ CONVO_EXTENSIONS = { } MIN_CHUNK_SIZE = 30 +CHUNK_SIZE = 800 # chars per drawer — align with miner.py MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB — skip files larger than this @@ -51,7 +52,12 @@ def chunk_exchanges(content: str) -> list: def _chunk_by_exchange(lines: list) -> list: - """One user turn (>) + the AI response that follows = one chunk.""" + """One user turn (>) + the AI response that follows = one or more chunks. + + The full AI response is preserved verbatim. When the combined + user-turn + response exceeds CHUNK_SIZE the response is split across + consecutive drawers so nothing is silently discarded. + """ chunks = [] i = 0 @@ -73,7 +79,24 @@ def _chunk_by_exchange(lines: list) -> list: ai_response = " ".join(ai_lines) content = f"{user_turn}\n{ai_response}" if ai_response else user_turn - if len(content.strip()) > MIN_CHUNK_SIZE: + # Split into multiple drawers when the exchange exceeds CHUNK_SIZE + if len(content) > CHUNK_SIZE: + # First chunk: user turn + as much response as fits + first_part = content[:CHUNK_SIZE] + if len(first_part.strip()) > MIN_CHUNK_SIZE: + chunks.append( + {"content": first_part, "chunk_index": len(chunks)} + ) + # Remaining response in CHUNK_SIZE-sized continuation drawers + remainder = content[CHUNK_SIZE:] + while remainder: + part = remainder[:CHUNK_SIZE] + remainder = remainder[CHUNK_SIZE:] + if len(part.strip()) > MIN_CHUNK_SIZE: + chunks.append( + {"content": part, "chunk_index": len(chunks)} + ) + elif len(content.strip()) > MIN_CHUNK_SIZE: chunks.append( { "content": content,