From 87e8bafad82420e9c2f69b290107f3b5229aa9ca Mon Sep 17 00:00:00 2001
From: Mikhail Valentsev <michael@valentsev.ru>
Date: Mon, 13 Apr 2026 02:25:34 +0500
Subject: [PATCH] fix: prevent convo_miner from re-processing 0-chunk files on
 every run (#654) (#732)

* fix: register 0-chunk files to prevent re-processing on every mine (#654)

mine_convos() has three early-exit paths (normalize() raising OSError or
ValueError, content too short, zero chunks) that skip writing anything to
ChromaDB. Since file_already_mined() checks for the presence of a
document with a matching source_file, these files are re-read and
re-processed on every subsequent run.

Add _register_file() that upserts a lightweight sentinel document
(room="_registry", ingest_mode="registry") so file_already_mined()
returns True on future runs.

Note: Bug 2 from the issue (drawers_added counter always 0) was
already resolved upstream via the switch from collection.add() to
collection.upsert().

* fix: resolve macOS path symlink in test + remove unused variable
---
 mempalace/convo_miner.py  | 30 +++++++++++++++++++++++
 tests/test_convo_miner.py | 51 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 81 insertions(+)

diff --git a/mempalace/convo_miner.py b/mempalace/convo_miner.py
index bdec5f1..02419ba 100644
--- a/mempalace/convo_miner.py
+++ b/mempalace/convo_miner.py
@@ -32,6 +32,30 @@ CHUNK_SIZE = 800  # chars per drawer — align with miner.py
 MAX_FILE_SIZE = 10 * 1024 * 1024  # 10 MB — skip files larger than this
 
 
+def _register_file(collection, source_file: str, wing: str, agent: str):
+    """Upsert a sentinel so file_already_mined() returns True for this file.
+
+    mine_convos() calls this on its early-exit paths (normalize() raising
+    OSError/ValueError, content under MIN_CHUNK_SIZE, zero chunks); those paths
+    write nothing else, so without the sentinel the file is re-read every run.
+    """
+    sentinel_id = f"_reg_{hashlib.sha256(source_file.encode()).hexdigest()[:24]}"
+    collection.upsert(
+        documents=[f"[registry] {source_file}"],
+        ids=[sentinel_id],
+        metadatas=[
+            {
+                "wing": wing,
+                "room": "_registry",
+                "source_file": source_file,
+                "added_by": agent,
+                "filed_at": datetime.now().isoformat(),
+                "ingest_mode": "registry",
+            }
+        ],
+    )
+
+
 # =============================================================================
 # CHUNKING — exchange pairs for conversations
 # =============================================================================
@@ -305,9 +329,13 @@ def mine_convos(
         try:
             content = normalize(str(filepath))
         except (OSError, ValueError):
+            if not dry_run:
+                _register_file(collection, source_file, wing, agent)
             continue
 
         if not content or len(content.strip()) < MIN_CHUNK_SIZE:
+            if not dry_run:
+                _register_file(collection, source_file, wing, agent)
             continue
 
         # Chunk — either exchange pairs or general extraction
@@ -320,6 +348,8 @@ def mine_convos(
             chunks = chunk_exchanges(content)
 
         if not chunks:
+            if not dry_run:
+                _register_file(collection, source_file, wing, agent)
             continue
 
         # Detect room from content (general mode uses memory_type instead)
diff --git a/tests/test_convo_miner.py b/tests/test_convo_miner.py
index 0ac0019..f5074b4 100644
--- a/tests/test_convo_miner.py
+++ b/tests/test_convo_miner.py
@@ -1,8 +1,12 @@
 import os
 import tempfile
 import shutil
+from pathlib import Path
+
 import chromadb
+
 from mempalace.convo_miner import mine_convos
+from mempalace.palace import file_already_mined
 
 
 def test_convo_mining():
@@ -24,3 +28,50 @@ def test_convo_mining():
     assert len(results["documents"][0]) > 0
 
     shutil.rmtree(tmpdir, ignore_errors=True)
+
+
+def test_mine_convos_does_not_reprocess_short_files(capsys):
+    """Files below MIN_CHUNK_SIZE get a sentinel so they are skipped on re-run."""
+    tmpdir = tempfile.mkdtemp()
+    try:
+        # A file too short to produce any chunks
+        with open(os.path.join(tmpdir, "tiny.txt"), "w") as f:
+            f.write("hi")
+
+        palace_path = os.path.join(tmpdir, "palace")
+
+        # First run -- file is processed (sentinel written)
+        mine_convos(tmpdir, palace_path, wing="test")
+        capsys.readouterr()  # discard first-run output so out2 holds only the second run
+
+        # Verify sentinel was written (resolve path -- macOS /var -> /private/var)
+        resolved_file = str(Path(tmpdir).resolve() / "tiny.txt")
+        client = chromadb.PersistentClient(path=palace_path)
+        col = client.get_collection("mempalace_drawers")
+        assert file_already_mined(col, resolved_file)
+
+        # Second run -- file should be skipped
+        mine_convos(tmpdir, palace_path, wing="test")
+        out2 = capsys.readouterr().out
+        assert "Files skipped (already filed): 1" in out2
+    finally:
+        shutil.rmtree(tmpdir, ignore_errors=True)
+
+
+def test_mine_convos_does_not_reprocess_empty_chunk_files(capsys):
+    """Files that normalize but produce 0 exchange chunks get a sentinel."""
+    tmpdir = tempfile.mkdtemp()
+    try:
+        # Content intended to pass the MIN_CHUNK_SIZE check yet contain no exchange
+        # markers (no "> " lines). NOTE(review): confirm chunk_exchanges returns [] here.
+        with open(os.path.join(tmpdir, "no_exchanges.txt"), "w") as f:
+            f.write("This is a plain paragraph without any exchange markers. " * 5)
+
+        palace_path = os.path.join(tmpdir, "palace")
+
+        mine_convos(tmpdir, palace_path, wing="test")
+        mine_convos(tmpdir, palace_path, wing="test")
+        out2 = capsys.readouterr().out  # NOTE: not drained between runs, so this holds both runs' output
+        assert "Files skipped (already filed): 1" in out2
+    finally:
+        shutil.rmtree(tmpdir, ignore_errors=True)