The _chunk_by_exchange() function was silently truncating AI responses to 8 lines via ai_lines[:8]. Any content beyond line 8 was discarded, violating the project's verbatim storage principle. Now the full AI response is preserved. When a combined exchange exceeds CHUNK_SIZE (800 chars, aligned with miner.py), it is split across consecutive drawers instead of being truncated.
This commit is contained in:
@@ -28,6 +28,7 @@ CONVO_EXTENSIONS = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
MIN_CHUNK_SIZE = 30
|
MIN_CHUNK_SIZE = 30
|
||||||
|
CHUNK_SIZE = 800 # chars per drawer — align with miner.py
|
||||||
MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB — skip files larger than this
|
MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB — skip files larger than this
|
||||||
|
|
||||||
|
|
||||||
@@ -51,7 +52,12 @@ def chunk_exchanges(content: str) -> list:
|
|||||||
|
|
||||||
|
|
||||||
def _chunk_by_exchange(lines: list) -> list:
|
def _chunk_by_exchange(lines: list) -> list:
|
||||||
"""One user turn (>) + the AI response that follows = one chunk."""
|
"""One user turn (>) + the AI response that follows = one or more chunks.
|
||||||
|
|
||||||
|
The full AI response is preserved verbatim. When the combined
|
||||||
|
user-turn + response exceeds CHUNK_SIZE the response is split across
|
||||||
|
consecutive drawers so nothing is silently discarded.
|
||||||
|
"""
|
||||||
chunks = []
|
chunks = []
|
||||||
i = 0
|
i = 0
|
||||||
|
|
||||||
@@ -73,7 +79,24 @@ def _chunk_by_exchange(lines: list) -> list:
|
|||||||
ai_response = " ".join(ai_lines)
|
ai_response = " ".join(ai_lines)
|
||||||
content = f"{user_turn}\n{ai_response}" if ai_response else user_turn
|
content = f"{user_turn}\n{ai_response}" if ai_response else user_turn
|
||||||
|
|
||||||
if len(content.strip()) > MIN_CHUNK_SIZE:
|
# Split into multiple drawers when the exchange exceeds CHUNK_SIZE
|
||||||
|
if len(content) > CHUNK_SIZE:
|
||||||
|
# First chunk: user turn + as much response as fits
|
||||||
|
first_part = content[:CHUNK_SIZE]
|
||||||
|
if len(first_part.strip()) > MIN_CHUNK_SIZE:
|
||||||
|
chunks.append(
|
||||||
|
{"content": first_part, "chunk_index": len(chunks)}
|
||||||
|
)
|
||||||
|
# Remaining response in CHUNK_SIZE-sized continuation drawers
|
||||||
|
remainder = content[CHUNK_SIZE:]
|
||||||
|
while remainder:
|
||||||
|
part = remainder[:CHUNK_SIZE]
|
||||||
|
remainder = remainder[CHUNK_SIZE:]
|
||||||
|
if len(part.strip()) > MIN_CHUNK_SIZE:
|
||||||
|
chunks.append(
|
||||||
|
{"content": part, "chunk_index": len(chunks)}
|
||||||
|
)
|
||||||
|
elif len(content.strip()) > MIN_CHUNK_SIZE:
|
||||||
chunks.append(
|
chunks.append(
|
||||||
{
|
{
|
||||||
"content": content,
|
"content": content,
|
||||||
|
|||||||
Reference in New Issue
Block a user