* fix: register 0-chunk files to prevent re-processing on every mine (#654)

  mine_convos() has three early-exit paths (normalize() raising OSError or ValueError, content too short, zero chunks) that skip writing anything to ChromaDB. Since file_already_mined() checks for the presence of a document with a matching source_file, these files are re-read and re-processed on every subsequent run.

  Add _register_file(), which upserts a lightweight sentinel document (room="_registry", ingest_mode="registry") so that file_already_mined() returns True on future runs.

  Note: Bug 2 from the issue (drawers_added counter always 0) was already resolved upstream via the switch from collection.add() to collection.upsert().

* fix: resolve macOS path symlink in test + remove unused variable
This commit is contained in:
committed by
GitHub
parent
9b60c6edd7
commit
87e8bafad8
@@ -32,6 +32,30 @@ CHUNK_SIZE = 800 # chars per drawer — align with miner.py
|
||||
MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB — skip files larger than this
|
||||
|
||||
|
||||
def _register_file(collection, source_file: str, wing: str, agent: str):
    """Upsert a registry placeholder for a file that produced nothing to store.

    The intent is that file_already_mined() finds a document whose
    ``source_file`` metadata matches this path and returns True, so files
    that normalize to nothing or yield zero chunks are not re-read and
    re-processed on every mine run.

    The entry's id is derived from a SHA-256 hash of ``source_file``, so
    registering the same path again overwrites the existing entry instead
    of adding a new one.
    """
    # The first 24 hex characters of the digest give a stable id per path.
    path_digest = hashlib.sha256(source_file.encode()).hexdigest()
    registry_id = f"_reg_{path_digest[:24]}"

    # "_registry" / "registry" mark this entry as a placeholder, not content.
    registry_meta = {
        "wing": wing,
        "room": "_registry",
        "source_file": source_file,
        "added_by": agent,
        "filed_at": datetime.now().isoformat(),
        "ingest_mode": "registry",
    }

    collection.upsert(
        documents=[f"[registry] {source_file}"],
        ids=[registry_id],
        metadatas=[registry_meta],
    )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CHUNKING — exchange pairs for conversations
|
||||
# =============================================================================
|
||||
@@ -305,9 +329,13 @@ def mine_convos(
|
||||
try:
|
||||
content = normalize(str(filepath))
|
||||
except (OSError, ValueError):
|
||||
if not dry_run:
|
||||
_register_file(collection, source_file, wing, agent)
|
||||
continue
|
||||
|
||||
if not content or len(content.strip()) < MIN_CHUNK_SIZE:
|
||||
if not dry_run:
|
||||
_register_file(collection, source_file, wing, agent)
|
||||
continue
|
||||
|
||||
# Chunk — either exchange pairs or general extraction
|
||||
@@ -320,6 +348,8 @@ def mine_convos(
|
||||
chunks = chunk_exchanges(content)
|
||||
|
||||
if not chunks:
|
||||
if not dry_run:
|
||||
_register_file(collection, source_file, wing, agent)
|
||||
continue
|
||||
|
||||
# Detect room from content (general mode uses memory_type instead)
|
||||
|
||||
Reference in New Issue
Block a user