From 4949aab68b71f540514b3a7aad45828fa6366f5a Mon Sep 17 00:00:00 2001
From: eldar702 <eldarshlomi7@gmail.com>
Date: Sun, 19 Apr 2026 11:13:50 +0300
Subject: [PATCH 01/60] fix: guard None metadata/doc in tool_check_duplicate
 and Layer1/Layer2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Chroma 1.5.x can return ``None`` inside the ``metadatas`` / ``documents``
lists of a query/get result for partially-flushed rows. The codebase
already has a systemic None-guard pattern (merged #999, #1013, #1019)
but three call sites were still unguarded:

* ``mcp_server.tool_check_duplicate`` (``mcp_server.py:487-488``) —
  ``meta = results["metadatas"][0][i]`` followed by ``meta.get(...)``
  raises ``AttributeError: 'NoneType' object has no attribute 'get'``.
  The broad ``except Exception`` wrapper (line 504) swallows it and
  returns an uninformative ``"Duplicate check failed"``.

* ``layers.Layer1.generate`` (``layers.py:126``) — iterates
  ``zip(docs, metas)`` and calls ``meta.get(key)`` in the importance
  loop. A single None metadata blows up the entire wake-up render.

* ``layers.Layer2.retrieve`` (``layers.py:224``) — same pattern, same
  crash path for the on-demand render.

Apply the same ``meta = meta or {}`` / ``doc = doc or ""`` idiom used
by the merged guards in the search path. Three-line additions, no
behaviour change on well-formed results.

Tests added:

* ``test_check_duplicate_handles_none_metadata`` — mocks the collection
  query to return ``None`` for one metadata and document, asserts the
  call does not crash and the sentinel-rendered entry has wing/room "?"
  and empty content.
* ``test_layer1_handles_none_metadata`` / ``_handles_none_document``
* ``test_layer2_handles_none_metadata``

Relationship to other open PRs:

* **#1019** guarded ``searcher.py`` loops. This PR extends the same
  guard to the three call sites #1019 did not touch.
* **#979** fixed ``tool_check_duplicate`` negative similarity but left
  the None-metadata path unguarded.
* Does not overlap **#1013** (``Layer3.search_raw``) or **#999**.
---
 mempalace/layers.py      |  4 +++
 mempalace/mcp_server.py  |  6 ++--
 tests/test_layers.py     | 69 ++++++++++++++++++++++++++++++++++++++++
 tests/test_mcp_server.py | 36 +++++++++++++++++++++
 4 files changed, 113 insertions(+), 2 deletions(-)

diff --git a/mempalace/layers.py b/mempalace/layers.py
index a0f9b6d..b20c656 100644
--- a/mempalace/layers.py
+++ b/mempalace/layers.py
@@ -124,6 +124,8 @@ class Layer1:
         # Score each drawer: prefer high importance, recent filing
         scored = []
         for doc, meta in zip(docs, metas):
+            meta = meta or {}
+            doc = doc or ""
             importance = 3
             # Try multiple metadata keys that might carry weight info
             for key in ("importance", "emotional_weight", "weight"):
@@ -222,6 +224,8 @@ class Layer2:
 
         lines = [f"## L2 — ON-DEMAND ({len(docs)} drawers)"]
         for doc, meta in zip(docs[:n_results], metas[:n_results]):
+            meta = meta or {}
+            doc = doc or ""
             room_name = meta.get("room", "?")
             source = Path(meta.get("source_file", "")).name if meta.get("source_file") else ""
             snippet = doc.strip().replace("\n", " ")
diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index 06355c4..ae1eb71 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -484,8 +484,10 @@ def tool_check_duplicate(content: str, threshold: float = 0.9):
                 dist = results["distances"][0][i]
                 similarity = round(1 - dist, 3)
                 if similarity >= threshold:
-                    meta = results["metadatas"][0][i]
-                    doc = results["documents"][0][i]
+                    # Chroma 1.5.x can return None for partially-flushed rows;
+                    # coerce to empty sentinels so downstream .get() is safe.
+                    meta = results["metadatas"][0][i] or {}
+                    doc = results["documents"][0][i] or ""
                     duplicates.append(
                         {
                             "id": drawer_id,
diff --git a/tests/test_layers.py b/tests/test_layers.py
index 575183f..d4c54ce 100644
--- a/tests/test_layers.py
+++ b/tests/test_layers.py
@@ -655,3 +655,72 @@ def test_memory_stack_status_with_palace(tmp_path):
 
     assert result["total_drawers"] == 42
     assert result["L0_identity"]["exists"] is True
+
+
+# ── Layer1 / Layer2 None-metadata guards ───────────────────────────────
+#
+# Chroma 1.5.x can return ``None`` inside the ``metadatas`` / ``documents``
+# lists for partially-flushed rows. The Layer1.generate() and
+# Layer2.retrieve() loops previously called ``meta.get(...)`` without
+# coercing, raising ``AttributeError: 'NoneType' object has no attribute
+# 'get'`` and blowing up the whole wake-up render. These tests guard that
+# the loops tolerate the None entries and render the rest of the result.
+
+
+def test_layer1_handles_none_metadata():
+    """Layer1.generate tolerates None entries in the metadatas list."""
+    docs = ["important memory", "another memory"]
+    metas = [{"room": "decisions", "source_file": "a.txt"}, None]
+    mock_col = _mock_chromadb_for_layer(docs, metas)
+
+    with (
+        patch("mempalace.layers.MempalaceConfig") as mock_cfg,
+        patch("mempalace.layers._get_collection", return_value=mock_col),
+    ):
+        mock_cfg.return_value.palace_path = "/fake"
+        layer = Layer1(palace_path="/fake")
+        # Should not raise AttributeError on the None entry.
+        result = layer.generate()
+
+    assert "ESSENTIAL STORY" in result
+    assert "important memory" in result
+
+
+def test_layer1_handles_none_document():
+    """Layer1.generate tolerates None entries in the documents list."""
+    docs = ["first doc", None]
+    metas = [
+        {"room": "r", "source_file": "a.txt"},
+        {"room": "r", "source_file": "b.txt"},
+    ]
+    mock_col = _mock_chromadb_for_layer(docs, metas)
+
+    with (
+        patch("mempalace.layers.MempalaceConfig") as mock_cfg,
+        patch("mempalace.layers._get_collection", return_value=mock_col),
+    ):
+        mock_cfg.return_value.palace_path = "/fake"
+        layer = Layer1(palace_path="/fake")
+        result = layer.generate()
+
+    assert result  # Render succeeded despite the None document.
+
+
+def test_layer2_handles_none_metadata():
+    """Layer2.retrieve tolerates None entries in the metadatas list."""
+    mock_col = MagicMock()
+    mock_col.get.return_value = {
+        "documents": ["first doc", "second doc"],
+        "metadatas": [{"room": "r", "source_file": "a.txt"}, None],
+    }
+
+    with (
+        patch("mempalace.layers.MempalaceConfig") as mock_cfg,
+        patch("mempalace.layers._get_collection", return_value=mock_col),
+    ):
+        mock_cfg.return_value.palace_path = "/fake"
+        layer = Layer2(palace_path="/fake")
+        # Should not raise AttributeError on the None entry.
+        result = layer.retrieve()
+
+    assert "L2 — ON-DEMAND" in result
diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index 899e6a7..e376f43 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -9,6 +9,7 @@ via monkeypatch to avoid touching real data.
 from datetime import datetime
 import json
 import sys
+from unittest.mock import MagicMock
 
 import pytest
 
@@ -495,6 +496,41 @@ class TestWriteTools:
         result = tool_delete_drawer("nonexistent_drawer")
         assert result["success"] is False
 
+    def test_check_duplicate_handles_none_metadata(self, monkeypatch, config, kg):
+        """tool_check_duplicate must tolerate None entries in the result lists
+        that ChromaDB 1.5.x returns for partially-flushed rows.
+
+        Previously ``meta = results["metadatas"][0][i]`` was unguarded and
+        raised ``AttributeError: 'NoneType' object has no attribute 'get'``
+        the moment the first matching drawer came back with None metadata —
+        surfacing to the MCP client as the uninformative
+        ``"Duplicate check failed"`` because the broad ``except Exception``
+        wrapper swallows the real cause.
+        """
+        _patch_mcp_server(monkeypatch, config, kg)
+        from mempalace import mcp_server
+
+        mock_col = MagicMock()
+        mock_col.query.return_value = {
+            "ids": [["d1", "d2"]],
+            "distances": [[0.05, 0.05]],
+            "metadatas": [[{"wing": "w", "room": "r"}, None]],
+            "documents": [["first doc", None]],
+        }
+        monkeypatch.setattr(mcp_server, "_get_collection", lambda: mock_col)
+
+        result = mcp_server.tool_check_duplicate("any content", threshold=0.5)
+
+        # Both entries land in matches (above threshold), None ones rendered
+        # with sentinel values rather than crashing the whole response.
+        assert result.get("is_duplicate") is True
+        assert len(result["matches"]) == 2
+        # The None-metadata entry falls back to sentinels.
+        none_entry = result["matches"][1]
+        assert none_entry["wing"] == "?"
+        assert none_entry["room"] == "?"
+        assert none_entry["content"] == ""
+
     def test_check_duplicate(self, monkeypatch, config, palace_path, seeded_collection, kg):
         _patch_mcp_server(monkeypatch, config, kg)
         from mempalace.mcp_server import tool_check_duplicate

From 35b033d77ff378b9ff2999bff965d639e0672039 Mon Sep 17 00:00:00 2001
From: alonehobo <sergey.martemyanov@gmail.com>
Date: Tue, 21 Apr 2026 12:33:58 +0500
Subject: [PATCH 02/60] fix(mcp): force UTF-8 on stdio to fix -32000 on
 non-ASCII payloads

On Windows, Python defaults sys.stdin/sys.stdout to the system codepage
(e.g. cp1251 on Russian locales, cp1252 on Western European), while MCP
JSON-RPC is always UTF-8. Non-ASCII payloads (Cyrillic, CJK, accented
European) get mis-decoded before reaching handlers, causing json.loads
to fail or tool handlers to receive garbled strings. Both surface to
the client as a generic MCP error -32000.

Reproduction:
  1. On Windows with a non-Latin locale, call mempalace_add_drawer or
     mempalace_kg_add with Cyrillic/CJK in content or KG object.
  2. Server returns: MCP error -32000: Internal tool error.
  3. Calling the handler directly from Python works fine -- the bug is
     purely in the stdio transport.

Fix:
  Reconfigure stdin/stdout to UTF-8 at the start of main(), after
  _restore_stdout(). Uses errors="replace" defensively so a lone bad
  byte cannot take down the server. Guarded by hasattr(reconfigure)
  for exotic stream replacements.

This matches the behaviour of PYTHONUTF8=1 / python -X utf8 without
requiring users to set an env var.
---
 mempalace/mcp_server.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index 6fe8225..48e8031 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -1689,6 +1689,16 @@ def _restore_stdout():
 
 def main():
     _restore_stdout()
+    # Force UTF-8 on stdio. MCP JSON-RPC is UTF-8, but Python on Windows
+    # defaults stdin/stdout to the system codepage (e.g. cp1251), which
+    # corrupts non-ASCII payloads and surfaces as generic -32000 errors on
+    # Cyrillic/CJK content. See PEP 540.
+    for stream in (sys.stdin, sys.stdout):
+        if hasattr(stream, "reconfigure"):
+            try:
+                stream.reconfigure(encoding="utf-8", errors="replace")
+            except (AttributeError, OSError):
+                pass
     logger.info("MemPalace MCP Server starting...")
     while True:
         try:

From c2e053176cf4194fedde2a3361c2785e16426008 Mon Sep 17 00:00:00 2001
From: Sathvik-1007 <spacetime1007@gmail.com>
Date: Thu, 23 Apr 2026 01:40:38 +0530
Subject: [PATCH 03/60] fix: add total count to tool_list_drawers pagination
 response
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The list_drawers response only included count (current page size) with
no total field, making it impossible for callers to know when pagination
is exhausted. A page returning count == limit is ambiguous — it could
be the last exact-fit page or there could be more results.

Add a total field that reports the full number of matching drawers.
For unfiltered requests this uses col.count(); for filtered requests
(wing/room) it uses a lightweight col.get(include=[]) to count
matching IDs without fetching documents.
---
 mempalace/mcp_server.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index 6fe8225..e5057af 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -746,6 +746,13 @@ def tool_list_drawers(wing: str = None, room: str = None, limit: int = 20, offse
             kwargs["where"] = where
         result = col.get(**kwargs)
 
+        # Compute total matching drawers for pagination.
+        if where:
+            total_result = col.get(where=where, include=[])
+            total = len(total_result["ids"])
+        else:
+            total = col.count()
+
         drawers = []
         for i, did in enumerate(result["ids"]):
             meta = result["metadatas"][i]
@@ -760,6 +767,7 @@ def tool_list_drawers(wing: str = None, room: str = None, limit: int = 20, offse
             )
         return {
             "drawers": drawers,
+            "total": total,
             "count": len(drawers),
             "offset": offset,
             "limit": limit,
@@ -1436,7 +1444,7 @@ TOOLS = {
         "handler": tool_get_drawer,
     },
     "mempalace_list_drawers": {
-        "description": "List drawers with pagination. Optional wing/room filter. Returns IDs, wings, rooms, and content previews.",
+        "description": "List drawers with pagination. Optional wing/room filter. Returns IDs, wings, rooms, content previews, and total matching count for pagination.",
         "input_schema": {
             "type": "object",
             "properties": {

From 0b8c2c158f1e9edbd14aac1d9b85abfd2bed97e4 Mon Sep 17 00:00:00 2001
From: Arnold Wender <arnold.wender@gmail.com>
Date: Sun, 26 Apr 2026 13:00:27 +0200
Subject: [PATCH 04/60] fix(kg): reject inverted intervals in add_triple
 (valid_to < valid_from)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A triple with valid_to < valid_from satisfies neither of the temporal
filter clauses in query_entity():

    valid_from <= as_of AND valid_to >= as_of

so the triple is invisible to every query — silently corrupt. Reject
at write time with a clear error instead of letting bad data pile up
in the SQLite store.

The guard only fires when both bounds are present; open intervals
(only valid_from or only valid_to) are still accepted, and same-day
intervals (valid_from == valid_to, point-in-time facts) are explicitly
allowed.
---
 mempalace/knowledge_graph.py  |  9 +++++++++
 tests/test_knowledge_graph.py | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+)

diff --git a/mempalace/knowledge_graph.py b/mempalace/knowledge_graph.py
index 9096ab2..30055a1 100644
--- a/mempalace/knowledge_graph.py
+++ b/mempalace/knowledge_graph.py
@@ -171,6 +171,15 @@ class KnowledgeGraph:
             add_triple("Max", "does", "swimming", valid_from="2025-01-01")
             add_triple("Alice", "worried_about", "Max injury", valid_from="2026-01", valid_to="2026-02")
         """
+        # Reject inverted intervals: a triple with valid_to < valid_from
+        # would never satisfy `valid_from <= as_of AND valid_to >= as_of`,
+        # so it would be invisible to every query — silently corrupt.
+        if valid_from is not None and valid_to is not None and valid_to < valid_from:
+            raise ValueError(
+                f"valid_to={valid_to!r} is before valid_from={valid_from!r}; "
+                "an inverted interval would be invisible to every KG query"
+            )
+
         sub_id = self._entity_id(subject)
         obj_id = self._entity_id(obj)
         pred = predicate.lower().replace(" ", "_")
diff --git a/tests/test_knowledge_graph.py b/tests/test_knowledge_graph.py
index d7d9838..6eeb8d3 100644
--- a/tests/test_knowledge_graph.py
+++ b/tests/test_knowledge_graph.py
@@ -5,6 +5,8 @@ Covers: entity CRUD, triple CRUD, temporal queries, invalidation,
 timeline, stats, and edge cases (duplicate triples, ID collisions).
 """
 
+import pytest
+
 
 class TestEntityOperations:
     def test_add_entity(self, kg):
@@ -45,6 +47,38 @@ class TestTripleOperations:
         tid2 = kg.add_triple("Alice", "works_at", "Acme")
         assert tid1 != tid2  # new triple since old one was closed
 
+    def test_add_triple_rejects_inverted_interval(self, kg):
+        # valid_to before valid_from would never satisfy
+        # `valid_from <= as_of AND valid_to >= as_of` — silently invisible
+        # to every query. Reject at write time instead.
+        with pytest.raises(ValueError, match="before valid_from"):
+            kg.add_triple(
+                "Alice",
+                "worked_at",
+                "Acme",
+                valid_from="2026-03-01",
+                valid_to="2026-02-01",
+            )
+
+    def test_add_triple_accepts_equal_dates(self, kg):
+        # Same-day intervals are valid (point-in-time facts).
+        tid = kg.add_triple(
+            "Alice",
+            "joined",
+            "Acme",
+            valid_from="2026-03-15",
+            valid_to="2026-03-15",
+        )
+        assert tid.startswith("t_alice_joined_acme_")
+
+    def test_add_triple_allows_only_one_bound(self, kg):
+        # The guard only fires when BOTH bounds are set.
+        tid1 = kg.add_triple("Alice", "knows", "Bob", valid_from="2026-01-01")
+        assert tid1.startswith("t_alice_knows_bob_")
+        kg.invalidate("Alice", "knows", "Bob", ended="2026-02-01")
+        tid2 = kg.add_triple("Alice", "knew", "Bob", valid_to="2026-03-01")
+        assert tid2.startswith("t_alice_knew_bob_")
+
 
 class TestQueries:
     def test_query_outgoing(self, seeded_kg):

From f30fdf2672f340c773d4f0554b0017abb48ed4b9 Mon Sep 17 00:00:00 2001
From: imtylervo <toanntq@gmail.com>
Date: Mon, 27 Apr 2026 14:16:20 +1000
Subject: [PATCH 05/60] fix: serialize ChromaCollection writes through palace
 lock

#976 protects `mempalace mine`, but MCP/direct backend writers still call
ChromaCollection.add/upsert/update/delete without the palace lock. This
moves the lock boundary to the Chroma backend seam so all Chroma writes
share the same palace-level serialization, with a re-entrant guard for
miner paths that already hold the lock.

mine_palace_lock(palace_path) gains a per-thread re-entrant guard
(threading.local + pid-tag against fork inheritance) so
ChromaCollection write methods can take the lock without
self-deadlocking when called from inside miner.mine()'s outer hold.

ChromaCollection.__init__ accepts an optional palace_path; when set,
add/upsert/update/delete wrap their underlying chromadb call with
mine_palace_lock(palace_path). palace_path=None preserves the legacy
no-lock behaviour for direct callers and tests. ChromaBackend's
get_collection/create_collection pass palace_path through;
mcp_server._get_collection forwards _config.palace_path so all MCP
write tools inherit the wrapping.

Tests: 5 new in tests/test_chroma_collection_lock.py covering opt-in,
writer-blocks-during-mine, re-entrant-inside-mine, two-process
serialization, and a source-level read-path-not-locked pin. Plus 1 new
+ 1 rewritten in tests/test_palace_locks.py for the re-entrant
semantics. 52 passed in 1.01s including the existing test_backends.py
regression suite.

Refs #1161.
---
 mempalace/backends/chroma.py         |  54 ++++-
 mempalace/mcp_server.py              |   4 +-
 mempalace/palace.py                  |  59 ++++-
 tests/test_chroma_collection_lock.py | 327 +++++++++++++++++++++++++++
 tests/test_palace_locks.py           |  70 +++++-
 5 files changed, 497 insertions(+), 17 deletions(-)
 create mode 100644 tests/test_chroma_collection_lock.py

diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py
index ad7748f..d438236 100644
--- a/mempalace/backends/chroma.py
+++ b/mempalace/backends/chroma.py
@@ -1,5 +1,6 @@
 """ChromaDB-backed MemPalace storage backend (RFC 001 reference implementation)."""
 
+import contextlib
 import datetime as _dt
 import logging
 import os
@@ -573,10 +574,43 @@ def _as_list(v: Any) -> list:
 
 
 class ChromaCollection(BaseCollection):
-    """Thin adapter translating ChromaDB dict returns into typed results."""
+    """Thin adapter translating ChromaDB dict returns into typed results.
 
-    def __init__(self, collection):
+    When ``palace_path`` is set, all write methods (``add``, ``upsert``,
+    ``update``, ``delete``) acquire ``mine_palace_lock(palace_path)`` for the
+    duration of the underlying chromadb call. This serializes MCP and other
+    direct-backend writers against ``mempalace mine`` and against each other,
+    closing the race between concurrent writers that triggers ChromaDB's
+    multi-threaded HNSW corruption (#974/#965).
+
+    The lock is the same primitive used by ``miner.mine()`` so re-entrant
+    acquisition from inside the mine pipeline (mine -> _mine_body ->
+    collection.upsert) is short-circuited by the per-thread guard inside
+    ``mine_palace_lock`` — no self-deadlock.
+
+    ``palace_path=None`` disables the wrapping, preserving the legacy
+    no-lock behaviour for callers that construct a ``ChromaCollection``
+    directly without going through ``ChromaBackend``.
+    """
+
+    def __init__(self, collection, palace_path: Optional[str] = None):
         self._collection = collection
+        self._palace_path = palace_path
+
+    @contextlib.contextmanager
+    def _write_lock(self):
+        """Acquire ``mine_palace_lock`` for the configured palace, if any.
+
+        No-op (yields immediately) when ``self._palace_path`` is None.
+        """
+        if self._palace_path is None:
+            yield
+            return
+        # Late import — palace.py imports ChromaBackend from this module.
+        from ..palace import mine_palace_lock
+
+        with mine_palace_lock(self._palace_path):
+            yield
 
     # ------------------------------------------------------------------
     # Writes
@@ -588,7 +622,8 @@ class ChromaCollection(BaseCollection):
             kwargs["metadatas"] = metadatas
         if embeddings is not None:
             kwargs["embeddings"] = embeddings
-        self._collection.add(**kwargs)
+        with self._write_lock():
+            self._collection.add(**kwargs)
 
     def upsert(self, *, documents, ids, metadatas=None, embeddings=None):
         kwargs: dict[str, Any] = {"documents": documents, "ids": ids}
@@ -596,7 +631,8 @@ class ChromaCollection(BaseCollection):
             kwargs["metadatas"] = metadatas
         if embeddings is not None:
             kwargs["embeddings"] = embeddings
-        self._collection.upsert(**kwargs)
+        with self._write_lock():
+            self._collection.upsert(**kwargs)
 
     def update(
         self,
@@ -615,7 +651,8 @@ class ChromaCollection(BaseCollection):
             kwargs["metadatas"] = metadatas
         if embeddings is not None:
             kwargs["embeddings"] = embeddings
-        self._collection.update(**kwargs)
+        with self._write_lock():
+            self._collection.update(**kwargs)
 
     # ------------------------------------------------------------------
     # Reads
@@ -759,7 +796,8 @@ class ChromaCollection(BaseCollection):
             kwargs["ids"] = ids
         if where is not None:
             kwargs["where"] = where
-        self._collection.delete(**kwargs)
+        with self._write_lock():
+            self._collection.delete(**kwargs)
 
     def count(self):
         return self._collection.count()
@@ -998,7 +1036,7 @@ class ChromaBackend(BaseBackend):
         else:
             collection = client.get_collection(collection_name, **ef_kwargs)
         _pin_hnsw_threads(collection)
-        return ChromaCollection(collection)
+        return ChromaCollection(collection, palace_path=palace_path)
 
     def close_palace(self, palace) -> None:
         """Drop cached handles for ``palace``. Accepts ``PalaceRef`` or legacy path str."""
@@ -1045,7 +1083,7 @@ class ChromaBackend(BaseBackend):
             metadata={"hnsw:space": hnsw_space, "hnsw:num_threads": 1},
             **ef_kwargs,
         )
-        return ChromaCollection(collection)
+        return ChromaCollection(collection, palace_path=palace_path)
 
 
 def _normalize_get_collection_args(args, kwargs):
diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index 9cc454e..7e22227 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -288,13 +288,13 @@ def _get_collection(create=False):
                 metadata={"hnsw:space": "cosine", "hnsw:num_threads": 1},
             )
             _pin_hnsw_threads(raw)
-            _collection_cache = ChromaCollection(raw)
+            _collection_cache = ChromaCollection(raw, palace_path=_config.palace_path)
             _metadata_cache = None
             _metadata_cache_time = 0
         elif _collection_cache is None:
             raw = client.get_collection(_config.collection_name)
             _pin_hnsw_threads(raw)
-            _collection_cache = ChromaCollection(raw)
+            _collection_cache = ChromaCollection(raw, palace_path=_config.palace_path)
             _metadata_cache = None
             _metadata_cache_time = 0
         return _collection_cache
diff --git a/mempalace/palace.py b/mempalace/palace.py
index 07efb6a..97f67ff 100644
--- a/mempalace/palace.py
+++ b/mempalace/palace.py
@@ -8,6 +8,7 @@ import contextlib
 import hashlib
 import os
 import re
+import threading
 
 from .backends.chroma import ChromaBackend
 
@@ -314,6 +315,47 @@ class MineAlreadyRunning(RuntimeError):
     """Raised when another `mempalace mine` already holds the per-palace lock."""
 
 
+# Per-thread record of palaces this thread already holds the lock for. Used by
+# `mine_palace_lock` to short-circuit re-entrant acquisition from the same
+# thread (e.g. miner.mine() acquires the outer lock then calls
+# ChromaCollection.upsert which now also tries to acquire). Without this guard
+# the inner call would block on its own outer flock (Linux fcntl locks are per
+# open file description, so a same-thread second open of the lock file is a
+# distinct lock and self-deadlocks).
+#
+# The holder set is tagged with ``pid`` so that a forked child does NOT
+# inherit re-entrant credit from its parent: the OS-level flock IS NOT
+# inherited as a "we hold it" semantically — the child must reacquire — but
+# Python's ``threading.local`` IS inherited across fork. The pid check
+# clears stale state so a forked child correctly hits the fcntl path.
+_palace_lock_holders = threading.local()
+
+
+def _holder_state():
+    """Return the per-thread (pid, keys) record, refreshing after fork."""
+    keys = getattr(_palace_lock_holders, "keys", None)
+    pid = getattr(_palace_lock_holders, "pid", None)
+    current_pid = os.getpid()
+    if keys is None or pid != current_pid:
+        keys = set()
+        _palace_lock_holders.keys = keys
+        _palace_lock_holders.pid = current_pid
+    return keys
+
+
+def _held_by_this_thread(lock_key: str) -> bool:
+    """Return True if this thread already holds ``mine_palace_lock`` for ``lock_key``."""
+    return lock_key in _holder_state()
+
+
+def _mark_held(lock_key: str) -> None:
+    _holder_state().add(lock_key)
+
+
+def _mark_released(lock_key: str) -> None:
+    _holder_state().discard(lock_key)
+
+
 @contextlib.contextmanager
 def mine_palace_lock(palace_path: str):
     """Per-palace non-blocking lock around the full `mine` pipeline.
@@ -338,6 +380,12 @@ def mine_palace_lock(palace_path: str):
     Non-blocking: if another `mine` is already writing to this palace,
     raise MineAlreadyRunning so the caller can exit cleanly instead of
     piling up as a waiting worker.
+
+    Re-entrant: if the current thread already holds the lock for the same
+    palace, the context manager passes through without re-acquiring. This
+    lets ChromaCollection write methods (which acquire the lock themselves
+    to protect MCP/direct callers) compose with miner.mine() (which holds
+    the outer lock for the entire mine pipeline) without self-deadlock.
     """
     lock_dir = os.path.join(os.path.expanduser("~"), ".mempalace", "locks")
     os.makedirs(lock_dir, exist_ok=True)
@@ -346,6 +394,11 @@ def mine_palace_lock(palace_path: str):
     palace_key = hashlib.sha256(lock_key_source.encode()).hexdigest()[:16]
     lock_path = os.path.join(lock_dir, f"mine_palace_{palace_key}.lock")
 
+    if _held_by_this_thread(palace_key):
+        # Same thread already holds the lock for this palace — pass through.
+        yield
+        return
+
     lf = open(lock_path, "w")
     acquired = False
     try:
@@ -369,7 +422,11 @@ def mine_palace_lock(palace_path: str):
                 raise MineAlreadyRunning(
                     f"another `mempalace mine` is already running against {resolved}"
                 ) from exc
-        yield
+        _mark_held(palace_key)
+        try:
+            yield
+        finally:
+            _mark_released(palace_key)
     finally:
         if acquired:
             try:
diff --git a/tests/test_chroma_collection_lock.py b/tests/test_chroma_collection_lock.py
new file mode 100644
index 0000000..b5d30fb
--- /dev/null
+++ b/tests/test_chroma_collection_lock.py
@@ -0,0 +1,327 @@
+"""Tests for ChromaCollection's palace-write-lock integration.
+
+Closes the gap left by ``mine_palace_lock`` only protecting the
+``mempalace mine`` pipeline: MCP/direct writers that call
+``ChromaCollection.add/upsert/update/delete`` must also serialize against
+mine and against each other to avoid the multi-threaded HNSW corruption
+documented in #974/#965.
+
+Property tested:
+
+* ``ChromaCollection(c, palace_path=p)`` wraps every write with
+  ``mine_palace_lock(p)``.
+* Writes raise ``MineAlreadyRunning`` when another holder owns the lock
+  (instead of silently racing into the underlying chromadb call).
+* Re-entrant composition with ``miner.mine()`` does not self-deadlock:
+  ``with mine_palace_lock(p): col.upsert(...)`` runs to completion.
+* ``ChromaCollection(c)`` (no palace_path) preserves legacy no-lock
+  behaviour for tests/callers that build the adapter directly without
+  going through ``ChromaBackend``.
+
+POSIX-only: ``mine_palace_lock`` uses ``fcntl`` on Unix and ``msvcrt`` on
+Windows; the contention semantics differ enough that the cross-process
+tests are skipped on Windows runners.
+"""
+
+from __future__ import annotations
+
+import multiprocessing
+import os
+import time
+
+import pytest
+
+from mempalace.backends.chroma import ChromaCollection
+from mempalace.palace import MineAlreadyRunning, mine_palace_lock
+
+
+def _get_mp_context():
+    """Same start-method picker as test_palace_locks.py."""
+    start_method = "spawn" if os.name == "nt" else "fork"
+    return multiprocessing.get_context(start_method)
+
+
+# ---------------------------------------------------------------------------
+# Fakes
+# ---------------------------------------------------------------------------
+
+
+class _FakeChromaCollection:
+    """Records calls; never blocks. Stand-in for chromadb.Collection."""
+
+    def __init__(self):
+        self.adds: list[dict] = []
+        self.upserts: list[dict] = []
+        self.updates: list[dict] = []
+        self.deletes: list[dict] = []
+
+    def add(self, **kwargs):
+        self.adds.append(kwargs)
+
+    def upsert(self, **kwargs):
+        self.upserts.append(kwargs)
+
+    def update(self, **kwargs):
+        self.updates.append(kwargs)
+
+    def delete(self, **kwargs):
+        self.deletes.append(kwargs)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _hold_lock(palace_path: str, ready_flag: str, release_flag: str) -> int:
+    """Acquire ``mine_palace_lock``, signal readiness, wait for release.
+
+    Mirrors the helper in ``test_palace_locks.py`` so the contention
+    semantics match across both test files.
+    """
+    try:
+        with mine_palace_lock(palace_path):
+            open(ready_flag, "w").close()
+            for _ in range(500):
+                if os.path.exists(release_flag):
+                    return 0
+                time.sleep(0.01)
+            return 0
+    except MineAlreadyRunning:
+        return 1
+
+
+# ---------------------------------------------------------------------------
+# Tests — opt-in lock wiring
+# ---------------------------------------------------------------------------
+
+
+def test_palace_path_none_skips_lock(tmp_path, monkeypatch):
+    """Legacy callers (``ChromaCollection(c)``) keep no-lock behaviour.
+
+    A ``ChromaCollection`` built without ``palace_path`` must not touch the
+    lock infrastructure at all. This guards against regressions where a
+    test or third-party caller relies on the historical bare-write path.
+    """
+    monkeypatch.setenv("HOME", str(tmp_path))
+    fake = _FakeChromaCollection()
+    col = ChromaCollection(fake)  # no palace_path -> no lock
+
+    # Hold the lock in a child process. Without palace_path, the parent
+    # write must still succeed (the lock does not gate this caller).
+    palace = str(tmp_path / "palace")
+    ready = str(tmp_path / "ready")
+    release = str(tmp_path / "release")
+    ctx = _get_mp_context()
+    holder = ctx.Process(target=_hold_lock, args=(palace, ready, release))
+    holder.start()
+    try:
+        for _ in range(500):
+            if os.path.exists(ready):
+                break
+            time.sleep(0.01)
+        assert os.path.exists(ready), "holder failed to acquire lock"
+
+        col.upsert(documents=["doc"], ids=["id-1"])
+        assert fake.upserts == [{"documents": ["doc"], "ids": ["id-1"]}]
+    finally:
+        open(release, "w").close()
+        holder.join(timeout=5)
+
+
+def test_writer_blocks_during_mine(tmp_path, monkeypatch):
+    """A held ``mine_palace_lock`` causes ``ChromaCollection`` writes to raise.
+
+    This is the property that closes the MCP-bypass gap: when a mine is in
+    flight, MCP/direct writes raise ``MineAlreadyRunning`` rather than
+    silently entering chromadb's write path concurrent with mine.
+    """
+    monkeypatch.setenv("HOME", str(tmp_path))
+    palace = str(tmp_path / "palace")
+    ready = str(tmp_path / "ready")
+    release = str(tmp_path / "release")
+
+    ctx = _get_mp_context()
+    holder = ctx.Process(target=_hold_lock, args=(palace, ready, release))
+    holder.start()
+    try:
+        for _ in range(500):
+            if os.path.exists(ready):
+                break
+            time.sleep(0.01)
+        assert os.path.exists(ready), "holder failed to acquire lock"
+
+        fake = _FakeChromaCollection()
+        col = ChromaCollection(fake, palace_path=palace)
+
+        with pytest.raises(MineAlreadyRunning):
+            col.upsert(documents=["doc"], ids=["id-1"])
+        with pytest.raises(MineAlreadyRunning):
+            col.add(documents=["doc"], ids=["id-2"])
+        with pytest.raises(MineAlreadyRunning):
+            col.update(ids=["id-3"], documents=["doc"])
+        with pytest.raises(MineAlreadyRunning):
+            col.delete(ids=["id-4"])
+
+        # The fake must have received NO calls — the lock must gate
+        # before reaching the underlying chromadb layer.
+        assert fake.upserts == []
+        assert fake.adds == []
+        assert fake.updates == []
+        assert fake.deletes == []
+    finally:
+        open(release, "w").close()
+        holder.join(timeout=5)
+
+
+def test_reentrant_inside_mine_passes_through(tmp_path, monkeypatch):
+    """``ChromaCollection.upsert`` inside ``mine_palace_lock`` does not deadlock.
+
+    ``miner.mine()`` already holds ``mine_palace_lock(palace_path)`` for the
+    full mine pipeline; ``_mine_body`` then calls
+    ``collection.upsert(...)``. With the per-thread re-entrant guard in
+    ``mine_palace_lock``, the inner acquire is a pass-through and the
+    underlying chromadb call runs immediately.
+    """
+    monkeypatch.setenv("HOME", str(tmp_path))
+    palace = str(tmp_path / "palace")
+    fake = _FakeChromaCollection()
+    col = ChromaCollection(fake, palace_path=palace)
+
+    with mine_palace_lock(palace):
+        # If the re-entrant guard were missing, this would self-deadlock on
+        # the underlying flock. We rely on pytest-timeout (configured in
+        # pyproject.toml) to enforce this in CI; the assertion just confirms
+        # the call landed.
+        col.upsert(documents=["d"], ids=["i"], metadatas=[{"k": "v"}])
+        col.add(documents=["d2"], ids=["i2"])
+        col.update(ids=["i"], documents=["d-updated"])
+        col.delete(ids=["i2"])
+
+    assert len(fake.upserts) == 1
+    assert len(fake.adds) == 1
+    assert len(fake.updates) == 1
+    assert len(fake.deletes) == 1
+
+
+class _SlowFakeChromaCollection(_FakeChromaCollection):
+    """Fake whose write methods hold the caller for ``hold_seconds``.
+
+    Used to keep ``mine_palace_lock`` acquired long enough for a sibling
+    process to contend deterministically.
+    """
+
+    def __init__(self, hold_seconds: float = 0.3):
+        super().__init__()
+        self._hold = hold_seconds
+
+    def upsert(self, **kwargs):
+        time.sleep(self._hold)
+        super().upsert(**kwargs)
+
+
+def _slow_writer_target(palace_path, tmp_path_str, pid, result_q):
+    """Subprocess target: try a slow upsert, report ok/busy."""
+    os.environ["HOME"] = tmp_path_str
+    # Fresh import inside child so HOME monkeypatch routes the lock dir.
+    from mempalace.backends.chroma import ChromaCollection as _CC
+    from mempalace.palace import MineAlreadyRunning as _MAR
+
+    fake = _SlowFakeChromaCollection(hold_seconds=0.3)
+    col = _CC(fake, palace_path=palace_path)
+    try:
+        col.upsert(documents=[f"d{pid}"], ids=[f"i{pid}"])
+        result_q.put(("ok", pid))
+    except _MAR:
+        result_q.put(("busy", pid))
+
+
+def test_concurrent_writers_serialize(tmp_path, monkeypatch):
+    """Two processes calling ``ChromaCollection.upsert`` against the same
+    palace must be serialized: at most one enters chromadb at a time, the
+    other raises ``MineAlreadyRunning``.
+
+    This is the property that prevents the parallel HNSW insert race that
+    drives #974/#965 — under concurrent MCP write fan-out, exactly one
+    writer reaches chromadb and the rest fail loudly instead of corrupting
+    the index.
+
+    The slow fake holds the lock for 0.3s per writer, large enough for the
+    second process to contend even on slow CI runners.
+    """
+    monkeypatch.setenv("HOME", str(tmp_path))
+    palace = str(tmp_path / "palace")
+
+    ctx = _get_mp_context()
+    result_q = ctx.Queue()
+
+    p1 = ctx.Process(
+        target=_slow_writer_target, args=(palace, str(tmp_path), 1, result_q)
+    )
+    p2 = ctx.Process(
+        target=_slow_writer_target, args=(palace, str(tmp_path), 2, result_q)
+    )
+    p1.start()
+    # Tiny stagger so p1 wins the race deterministically; without it the
+    # OS scheduler can pick either, which is also a valid outcome but
+    # makes the assertion brittle on slow CI.
+    time.sleep(0.05)
+    p2.start()
+    p1.join(timeout=5)
+    p2.join(timeout=5)
+
+    outcomes = [result_q.get(timeout=1) for _ in range(2)]
+    statuses = sorted(o[0] for o in outcomes)
+    assert statuses == ["busy", "ok"], (
+        f"expected one ok + one busy, got {outcomes}"
+    )
+
+
+def test_read_path_does_not_acquire_lock(tmp_path, monkeypatch):
+    """``query`` / ``get`` / ``count`` must not be gated by the write lock.
+
+    Read traffic is the dominant workload (semantic search, MCP get, etc.)
+    and serializing it against mine would tank latency for no correctness
+    benefit. This test pins that property: with another process holding
+    the write lock, reads must still complete instantly.
+    """
+    monkeypatch.setenv("HOME", str(tmp_path))
+    palace = str(tmp_path / "palace")
+    ready = str(tmp_path / "ready")
+    release = str(tmp_path / "release")
+
+    ctx = _get_mp_context()
+    holder = ctx.Process(target=_hold_lock, args=(palace, ready, release))
+    holder.start()
+    try:
+        for _ in range(500):
+            if os.path.exists(ready):
+                break
+            time.sleep(0.01)
+        assert os.path.exists(ready), "holder failed to acquire lock"
+
+        # _FakeChromaCollection doesn't implement query/get/count; we only
+        # need to confirm the wrapper does not call into mine_palace_lock
+        # for reads, which we assert by observing the wrapped methods are
+        # NOT in ChromaCollection's _write_lock path. A direct check via
+        # source inspection is more honest than mocking the entire chroma
+        # surface here.
+        import inspect
+
+        from mempalace.backends.chroma import ChromaCollection as _CC
+
+        for write_attr in ("add", "upsert", "update", "delete"):
+            src = inspect.getsource(getattr(_CC, write_attr))
+            assert "_write_lock" in src, f"{write_attr} should acquire write lock"
+
+        for read_attr in ("query", "get", "count"):
+            method = getattr(_CC, read_attr, None)
+            if method is None:
+                continue
+            src = inspect.getsource(method)
+            assert "_write_lock" not in src, (
+                f"{read_attr} must NOT acquire the write lock (read path)"
+            )
+    finally:
+        open(release, "w").close()
+        holder.join(timeout=5)
diff --git a/tests/test_palace_locks.py b/tests/test_palace_locks.py
index 601c894..39aa50c 100644
--- a/tests/test_palace_locks.py
+++ b/tests/test_palace_locks.py
@@ -135,19 +135,77 @@ def test_different_palaces_dont_conflict(tmp_path, monkeypatch):
 
 
 def test_palace_path_is_normalized(tmp_path, monkeypatch):
-    """Relative and absolute forms of the same path must use the same lock."""
+    """Relative and absolute forms of the same path must use the same lock.
+
+    Cross-process variant: a child holds the absolute form, a relative form
+    in the parent must hash to the same lock key and raise
+    ``MineAlreadyRunning``. (The same-thread case is now a re-entrant
+    pass-through by design — see ``test_reentrant_same_thread_passes_through``
+    — so we exercise the normalization invariant across a process boundary
+    where re-entrance does not apply.)
+    """
     monkeypatch.setenv("HOME", str(tmp_path))
     monkeypatch.chdir(tmp_path)
     os.makedirs(tmp_path / "palace", exist_ok=True)
     absolute = str(tmp_path / "palace")
-    relative = "palace"
+    ready = str(tmp_path / "ready")
+    release = str(tmp_path / "release")
 
-    # Hold the lock with the absolute form; attempting to re-acquire with
-    # the relative form (which resolves to the same absolute path) must fail.
-    with mine_palace_lock(absolute):
+    ctx = _get_mp_context()
+    holder = ctx.Process(target=_hold_lock, args=(absolute, ready, release))
+    holder.start()
+    try:
+        for _ in range(500):
+            if os.path.exists(ready):
+                break
+            time.sleep(0.01)
+        assert os.path.exists(ready), "holder failed to acquire lock in time"
+
+        # Parent holds CWD = tmp_path so "palace" is the same on-disk dir as
+        # the absolute form. The lock key is sha256(realpath+normcase) so the
+        # two forms must collide.
         with pytest.raises(MineAlreadyRunning):
-            with mine_palace_lock(relative):
+            with mine_palace_lock("palace"):
                 pytest.fail("normalized path collision should have raised")
+    finally:
+        open(release, "w").close()
+        holder.join(timeout=5)
+
+
+def test_reentrant_same_thread_passes_through(tmp_path, monkeypatch):
+    """Same thread re-acquiring the same palace lock must not deadlock or raise.
+
+    This is the invariant that makes ``ChromaCollection`` write methods (which
+    take ``mine_palace_lock`` for MCP/direct-writer protection) compose with
+    ``miner.mine()`` (which already holds the lock for the entire mine
+    pipeline). Without the per-thread re-entrant guard the inner acquire
+    would self-deadlock on the outer flock.
+    """
+    monkeypatch.setenv("HOME", str(tmp_path))
+    palace = str(tmp_path / "palace")
+    with mine_palace_lock(palace):
+        # Re-enter from the same thread — must yield without raising or hanging.
+        with mine_palace_lock(palace):
+            pass
+        # After the inner exits, the outer is still held: confirm via a
+        # subprocess that tries to acquire and reports back.
+        ctx = _get_mp_context()
+        result_q = ctx.Queue()
+        child = ctx.Process(target=_try_acquire_expect_busy, args=(palace, result_q))
+        child.start()
+        child.join(timeout=5)
+        assert result_q.get(timeout=1) == "busy", (
+            "outer lock should still be held by parent after inner re-entrant exit"
+        )
+
+
+def _try_acquire_expect_busy(palace_path, result_q):
+    """Helper: try to acquire, push 'busy' (raised) or 'free' (acquired) into queue."""
+    try:
+        with mine_palace_lock(palace_path):
+            result_q.put("free")
+    except MineAlreadyRunning:
+        result_q.put("busy")
 
 
 def test_mine_global_lock_is_alias_for_back_compat(tmp_path, monkeypatch):

From db28bf1e846592f997835ca42050e43c12b09303 Mon Sep 17 00:00:00 2001
From: sha2fiddy <103975074+sha2fiddy@users.noreply.github.com>
Date: Wed, 29 Apr 2026 19:01:54 -0400
Subject: [PATCH 06/60] fix: paginate closet_llm col.get (#1073)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Mirror the pagination pattern PR #851 landed in miner.py:status().
A single drawers_col.get(limit=total, ...) on palaces larger than
SQLite's SQLITE_MAX_VARIABLE_NUMBER (32766) crashes inside chromadb.

Fetch drawers in batch_size=5000 chunks, stepping offset until the
collection is drained. by_source aggregation semantics are preserved
exactly — grouping, wing filter, meta capture all unchanged.

Closes #1073. Related: #802, #850, #1016.
---
 mempalace/closet_llm.py  |  33 +++++---
 tests/test_closet_llm.py | 176 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 198 insertions(+), 11 deletions(-)

diff --git a/mempalace/closet_llm.py b/mempalace/closet_llm.py
index c00b735..6274f79 100644
--- a/mempalace/closet_llm.py
+++ b/mempalace/closet_llm.py
@@ -221,17 +221,28 @@ def regenerate_closets(
         print("No drawers in palace.")
         return {"processed": 0}
 
-    all_data = drawers_col.get(limit=total, include=["documents", "metadatas"])
-    by_source = {}
-    for doc_id, doc, meta in zip(all_data["ids"], all_data["documents"], all_data["metadatas"]):
-        source = meta.get("source_file", "unknown")
-        w = meta.get("wing", "")
-        if wing and w != wing:
-            continue
-        if source not in by_source:
-            by_source[source] = {"drawer_ids": [], "content": [], "meta": meta}
-        by_source[source]["drawer_ids"].append(doc_id)
-        by_source[source]["content"].append(doc)
+    # Paginate the fetch — a single get(limit=total, ...) blows through
+    # SQLite's SQLITE_MAX_VARIABLE_NUMBER (32766) on large palaces and
+    # crashes inside chromadb (see #802, #850, #1073).
+    by_source: dict = {}
+    batch_size = 5000
+    offset = 0
+    while offset < total:
+        batch = drawers_col.get(limit=batch_size, offset=offset, include=["documents", "metadatas"])
+        ids = batch["ids"]
+        if not ids:
+            break
+        for doc_id, doc, meta in zip(ids, batch["documents"], batch["metadatas"]):
+            meta = meta or {}
+            source = meta.get("source_file", "unknown")
+            w = meta.get("wing", "")
+            if wing and w != wing:
+                continue
+            if source not in by_source:
+                by_source[source] = {"drawer_ids": [], "content": [], "meta": meta}
+            by_source[source]["drawer_ids"].append(doc_id)
+            by_source[source]["content"].append(doc)
+        offset += len(ids)
 
     sources = list(by_source.keys())
     if sample > 0:
diff --git a/tests/test_closet_llm.py b/tests/test_closet_llm.py
index a92e2fa..3a0e84e 100644
--- a/tests/test_closet_llm.py
+++ b/tests/test_closet_llm.py
@@ -296,6 +296,182 @@ class TestRegenerateClosets:
             assert meta.get("generated_by", "").startswith("llm:")
             assert meta.get("normalize_version") == NORMALIZE_VERSION
 
+    def test_regen_paginates_drawer_fetch(self, tmp_path):
+        """Regression for #1073: drawers_col.get must be paginated at
+        batch_size=5000. A single get(limit=total, ...) on a palace with
+        more than SQLite's SQLITE_MAX_VARIABLE_NUMBER (32766) drawers
+        blows up inside chromadb. Matches the miner.status pattern
+        introduced in #851 (see #802, #850, #1073)."""
+        from mempalace import closet_llm as closet_llm_mod
+
+        palace = str(tmp_path / "palace")
+
+        # Build a fake collection: 12_000 drawers across 3 source files,
+        # enough to force 3 batches of batch_size=5000 (5000 + 5000 + 2000).
+        n_drawers = 12_000
+        ids = [f"d{i:05d}" for i in range(n_drawers)]
+        docs = [f"doc body {i}" for i in range(n_drawers)]
+        metas = [
+            {
+                "wing": "w",
+                "room": "r",
+                "source_file": f"/src/file_{i % 3}.md",
+                "entities": "",
+            }
+            for i in range(n_drawers)
+        ]
+
+        get_calls: list = []
+
+        class FakeDrawersCol:
+            def count(self):
+                return n_drawers
+
+            def get(self, limit=None, offset=0, include=None, **kwargs):
+                get_calls.append({"limit": limit, "offset": offset, "include": include})
+                end = min(offset + (limit or n_drawers), n_drawers)
+                return {
+                    "ids": ids[offset:end],
+                    "documents": docs[offset:end],
+                    "metadatas": metas[offset:end],
+                }
+
+        class FakeClosetsCol:
+            """Accept the purge + upsert calls the success path makes."""
+
+            def get(self, *a, **kw):
+                return {"ids": [], "documents": [], "metadatas": []}
+
+            def delete(self, *a, **kw):
+                return None
+
+            def upsert(self, *a, **kw):
+                return None
+
+        fake_drawers = FakeDrawersCol()
+        fake_closets = FakeClosetsCol()
+
+        def fake_urlopen(req, timeout=None):
+            return _FakeResp(
+                {
+                    "choices": [
+                        {"message": {"content": '{"topics":["t1"],"quotes":[],"summary":""}'}}
+                    ],
+                    "usage": {"prompt_tokens": 1, "completion_tokens": 1},
+                }
+            )
+
+        cfg = LLMConfig(endpoint="http://local/v1", model="m")
+
+        with (
+            patch.object(closet_llm_mod, "get_collection", return_value=fake_drawers),
+            patch.object(closet_llm_mod, "get_closets_collection", return_value=fake_closets),
+            patch.object(closet_llm_mod, "purge_file_closets", return_value=None),
+            patch.object(closet_llm_mod, "upsert_closet_lines", return_value=None),
+            patch("urllib.request.urlopen", side_effect=fake_urlopen),
+        ):
+            result = regenerate_closets(palace, cfg=cfg, dry_run=True)
+
+        # Three paginated calls: (limit=5000, offset=0), (5000, 5000), (5000, 10000).
+        assert len(get_calls) == 3, f"expected 3 batched fetches, got {len(get_calls)}"
+        for call in get_calls:
+            assert (
+                call["limit"] == 5000
+            ), f"batch must be 5000 — got {call['limit']} (would risk SQLITE_MAX_VARIABLE_NUMBER)"
+            # include must still request both documents and metadatas
+            assert "documents" in call["include"]
+            assert "metadatas" in call["include"]
+        assert [c["offset"] for c in get_calls] == [0, 5000, 10_000]
+
+        # by_source aggregation must be preserved exactly across batches:
+        # 12_000 drawers, 3 source files → 4_000 drawers each.
+        # dry_run=True short-circuits LLM calls but still walks by_source.
+        assert result.get("processed", 0) == 0  # dry_run
+        # Verify no single call tried to pull more than batch_size.
+        assert max(c["limit"] for c in get_calls) <= 5000
+
+    def test_regen_by_source_aggregates_across_batches(self, tmp_path):
+        """Pagination must not change the by_source grouping — drawers for
+        the same source_file split across batches still land in one group."""
+        from mempalace import closet_llm as closet_llm_mod
+
+        palace = str(tmp_path / "palace")
+
+        # 7_500 drawers, alternating between two source files → forces
+        # splits across the 5000/2500 boundary. Each source ends up with
+        # 3_750 drawers after regrouping.
+        n_drawers = 7_500
+        ids = [f"d{i:05d}" for i in range(n_drawers)]
+        docs = [f"body-{i}" for i in range(n_drawers)]
+        metas = [
+            {
+                "wing": "w",
+                "room": "r",
+                "source_file": f"/src/file_{i % 2}.md",
+                "entities": "",
+            }
+            for i in range(n_drawers)
+        ]
+
+        captured_sources: dict = {}
+
+        class FakeDrawersCol:
+            def count(self):
+                return n_drawers
+
+            def get(self, limit=None, offset=0, include=None, **kwargs):
+                end = min(offset + (limit or n_drawers), n_drawers)
+                return {
+                    "ids": ids[offset:end],
+                    "documents": docs[offset:end],
+                    "metadatas": metas[offset:end],
+                }
+
+        class FakeClosetsCol:
+            def get(self, *a, **kw):
+                return {"ids": [], "documents": [], "metadatas": []}
+
+            def delete(self, *a, **kw):
+                return None
+
+            def upsert(self, *a, **kw):
+                return None
+
+        # Hook _call_llm to inspect what regenerate_closets aggregated
+        # per source before the HTTP boundary.
+        real_call_llm = closet_llm_mod._call_llm
+
+        def spying_call_llm(cfg, source_file, wing, room, content):
+            captured_sources[source_file] = content
+            return (
+                {"topics": ["t"], "quotes": [], "summary": ""},
+                {"prompt_tokens": 1, "completion_tokens": 1},
+            )
+
+        cfg = LLMConfig(endpoint="http://local/v1", model="m")
+
+        with (
+            patch.object(closet_llm_mod, "get_collection", return_value=FakeDrawersCol()),
+            patch.object(closet_llm_mod, "get_closets_collection", return_value=FakeClosetsCol()),
+            patch.object(closet_llm_mod, "purge_file_closets", return_value=None),
+            patch.object(closet_llm_mod, "upsert_closet_lines", return_value=None),
+            patch.object(closet_llm_mod, "_call_llm", side_effect=spying_call_llm),
+        ):
+            regenerate_closets(palace, cfg=cfg)
+
+        # Both sources survived the pagination boundary.
+        assert set(captured_sources.keys()) == {"/src/file_0.md", "/src/file_1.md"}
+        # Each source accumulated exactly 3_750 drawer bodies, concatenated
+        # with the "\n\n" separator the regenerate path uses.
+        for source, content in captured_sources.items():
+            assert content.count("\n\n") == 3_749, (
+                f"{source}: expected 3_750 chunks joined (3_749 separators), "
+                f"got {content.count(chr(10) + chr(10)) + 1}"
+            )
+
+        # Silence unused-var lint.
+        assert real_call_llm is not None
+
     def test_regen_uses_basename_not_split_slash(self, tmp_path, monkeypatch):
         """Regression: the old closet_id base used ``source.split('/')[-1]``
         which silently degrades on Windows paths (``C:\\proj\\a.md`` →

From 4d98b0524084f7682f2b7df04c77be2e4312c081 Mon Sep 17 00:00:00 2001
From: Arnold Wender <arnold.wender@gmail.com>
Date: Fri, 24 Apr 2026 11:09:16 +0200
Subject: [PATCH 07/60] fix(kg): validate ISO-8601 date formats at MCP boundary
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

tool_kg_query (as_of), tool_kg_add (valid_from), and tool_kg_invalidate
(ended) accepted any string and forwarded it to SQLite without format
validation. Parameterized queries prevent SQL injection, but invalid
date strings silently produce empty result sets — callers cannot
distinguish "no fact at this time" from "your date format was
unrecognized." This is especially painful for natural-language LLM
callers that synthesize dates like "March 2026" or "Jan 2025".

Add sanitize_iso_date() in config.py alongside the other input
validators. It accepts YYYY, YYYY-MM, and YYYY-MM-DD forms; passes
through None/empty; and raises ValueError with a field-named message
on anything else. Call it from the three kg MCP tool wrappers before
values reach the storage layer so the caller gets a clear error
instead of a silent miss.

Closes #1164
---
 mempalace/config.py      | 28 ++++++++++++++++
 mempalace/mcp_server.py  |  4 +++
 tests/test_config.py     | 70 +++++++++++++++++++++++++++++++++++++++-
 tests/test_mcp_server.py | 46 ++++++++++++++++++++++++++
 4 files changed, 147 insertions(+), 1 deletion(-)

diff --git a/mempalace/config.py b/mempalace/config.py
index cacd1f9..4005779 100644
--- a/mempalace/config.py
+++ b/mempalace/config.py
@@ -81,6 +81,34 @@ def sanitize_kg_value(value: str, field_name: str = "value") -> str:
     return value
 
 
+# ISO-8601 date validator for knowledge-graph temporal parameters
+# (as_of, valid_from, valid_to, ended). Parameterized queries already
+# prevent SQL injection, but unvalidated date strings silently miss
+# every row — callers cannot distinguish "no fact at this time" from
+# "your date format was unrecognized." Accept YYYY, YYYY-MM, YYYY-MM-DD.
+_ISO_DATE_RE = re.compile(r"^\d{4}(?:-(?:0[1-9]|1[0-2])(?:-(?:0[1-9]|[12]\d|3[01]))?)?$")
+
+
+def sanitize_iso_date(value, field_name: str = "date"):
+    """Validate an ISO-8601 date string, accepting None or empty as-is.
+
+    Accepts ``YYYY``, ``YYYY-MM``, or ``YYYY-MM-DD``. Raises ValueError
+    on any other non-empty input so the MCP layer can surface a clear
+    error to the caller instead of silently returning empty results.
+    """
+    if value is None or value == "":
+        return value
+    if not isinstance(value, str):
+        raise ValueError(f"{field_name} must be a string")
+    value = value.strip()
+    if not _ISO_DATE_RE.match(value):
+        raise ValueError(
+            f"{field_name}={value!r} is not a valid ISO-8601 date "
+            f"(expected YYYY, YYYY-MM, or YYYY-MM-DD)"
+        )
+    return value
+
+
 def sanitize_content(value: str, max_length: int = 100_000) -> str:
     """Validate drawer/diary content length."""
     if not isinstance(value, str) or not value.strip():
diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index 43897c8..8aecd05 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -55,6 +55,7 @@ from .config import (  # noqa: E402
     sanitize_kg_value,
     sanitize_name,
     sanitize_content,
+    sanitize_iso_date,
 )
 from .version import __version__  # noqa: E402
 from .backends.chroma import (  # noqa: E402
@@ -1021,6 +1022,7 @@ def tool_kg_query(entity: str, as_of: str = None, direction: str = "both"):
     """Query the knowledge graph for an entity's relationships."""
     try:
         entity = sanitize_kg_value(entity, "entity")
+        as_of = sanitize_iso_date(as_of, "as_of")
     except ValueError as e:
         return {"error": str(e)}
     if direction not in ("outgoing", "incoming", "both"):
@@ -1037,6 +1039,7 @@ def tool_kg_add(
         subject = sanitize_kg_value(subject, "subject")
         predicate = sanitize_name(predicate, "predicate")
         object = sanitize_kg_value(object, "object")
+        valid_from = sanitize_iso_date(valid_from, "valid_from")
     except ValueError as e:
         return {"success": False, "error": str(e)}
 
@@ -1062,6 +1065,7 @@ def tool_kg_invalidate(subject: str, predicate: str, object: str, ended: str = N
         subject = sanitize_kg_value(subject, "subject")
         predicate = sanitize_name(predicate, "predicate")
         object = sanitize_kg_value(object, "object")
+        ended = sanitize_iso_date(ended, "ended")
     except ValueError as e:
         return {"success": False, "error": str(e)}
     _wal_log(
diff --git a/tests/test_config.py b/tests/test_config.py
index d7707d9..f5064e2 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -3,7 +3,13 @@ import json
 import tempfile
 
 import pytest
-from mempalace.config import MempalaceConfig, normalize_wing_name, sanitize_kg_value, sanitize_name
+from mempalace.config import (
+    MempalaceConfig,
+    normalize_wing_name,
+    sanitize_iso_date,
+    sanitize_kg_value,
+    sanitize_name,
+)
 
 
 def test_default_config():
@@ -212,3 +218,65 @@ def test_kg_value_rejects_null_bytes():
 def test_kg_value_rejects_over_length():
     with pytest.raises(ValueError):
         sanitize_kg_value("a" * 129)
+
+
+# --- sanitize_iso_date ---
+
+
+def test_iso_date_accepts_year_only():
+    assert sanitize_iso_date("2026") == "2026"
+
+
+def test_iso_date_accepts_year_month():
+    assert sanitize_iso_date("2026-03") == "2026-03"
+
+
+def test_iso_date_accepts_full_date():
+    assert sanitize_iso_date("2026-03-15") == "2026-03-15"
+
+
+def test_iso_date_passes_through_none():
+    assert sanitize_iso_date(None) is None
+
+
+def test_iso_date_passes_through_empty_string():
+    assert sanitize_iso_date("") == ""
+
+
+def test_iso_date_strips_whitespace():
+    assert sanitize_iso_date("  2026-03-15  ") == "2026-03-15"
+
+
+def test_iso_date_rejects_natural_language():
+    with pytest.raises(ValueError):
+        sanitize_iso_date("March 2026")
+
+
+def test_iso_date_rejects_abbreviated_month():
+    with pytest.raises(ValueError):
+        sanitize_iso_date("Jan 2025")
+
+
+def test_iso_date_rejects_us_format():
+    with pytest.raises(ValueError):
+        sanitize_iso_date("03/15/2026")
+
+
+def test_iso_date_rejects_invalid_month():
+    with pytest.raises(ValueError):
+        sanitize_iso_date("2026-13")
+
+
+def test_iso_date_rejects_invalid_day():
+    with pytest.raises(ValueError):
+        sanitize_iso_date("2026-02-32")
+
+
+def test_iso_date_rejects_non_string():
+    with pytest.raises(ValueError):
+        sanitize_iso_date(20260315)
+
+
+def test_iso_date_error_names_field():
+    with pytest.raises(ValueError, match="valid_from"):
+        sanitize_iso_date("yesterday", "valid_from")
diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index 480b6bd..1b80f36 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -665,6 +665,52 @@ class TestKGTools:
         result = tool_kg_stats()
         assert result["entities"] >= 4
 
+    # --- Date validation at the MCP boundary (issue #1164) ---
+
+    def test_kg_add_rejects_invalid_valid_from(self, monkeypatch, config, palace_path, kg):
+        _patch_mcp_server(monkeypatch, config, kg)
+        from mempalace.mcp_server import tool_kg_add
+
+        result = tool_kg_add(
+            subject="Alice",
+            predicate="likes",
+            object="coffee",
+            valid_from="Jan 2025",
+        )
+        assert result["success"] is False
+        assert "valid_from" in result["error"]
+        assert "ISO-8601" in result["error"]
+
+    def test_kg_query_rejects_invalid_as_of(self, monkeypatch, config, palace_path, seeded_kg):
+        _patch_mcp_server(monkeypatch, config, seeded_kg)
+        from mempalace.mcp_server import tool_kg_query
+
+        result = tool_kg_query(entity="Max", as_of="March 2026")
+        assert "error" in result
+        assert "as_of" in result["error"]
+
+    def test_kg_invalidate_rejects_invalid_ended(self, monkeypatch, config, palace_path, seeded_kg):
+        _patch_mcp_server(monkeypatch, config, seeded_kg)
+        from mempalace.mcp_server import tool_kg_invalidate
+
+        result = tool_kg_invalidate(
+            subject="Max",
+            predicate="does",
+            object="chess",
+            ended="yesterday",
+        )
+        assert result["success"] is False
+        assert "ended" in result["error"]
+
+    def test_kg_query_accepts_partial_iso_dates(self, monkeypatch, config, palace_path, seeded_kg):
+        _patch_mcp_server(monkeypatch, config, seeded_kg)
+        from mempalace.mcp_server import tool_kg_query
+
+        # YYYY and YYYY-MM are valid ISO-8601 forms — must not be rejected.
+        for value in ("2026", "2026-03", "2026-03-15"):
+            result = tool_kg_query(entity="Max", as_of=value)
+            assert "error" not in result, f"rejected valid date {value!r}: {result}"
+
 
 # ── Diary Tools ─────────────────────────────────────────────────────────
 

From abe85763d4da974b8652bdfe3654bee3a7534034 Mon Sep 17 00:00:00 2001
From: Arnold Wender <arnold.wender@gmail.com>
Date: Sun, 26 Apr 2026 12:50:43 +0200
Subject: [PATCH 08/60] fix(kg): reject partial ISO dates to avoid silent empty
 result sets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per qodo-ai review on PR #1167: sanitize_iso_date() previously accepted
YYYY and YYYY-MM, but KnowledgeGraph.query_entity() compares valid_from/
valid_to TEXT columns lexicographically against as_of. Lexicographic
comparison treats '2026-01-01' as greater than '2026' (because '-' >
end-of-string), so partial as_of values silently excluded valid facts —
re-introducing the silent-empty-results problem this PR was meant to
fix.

Tighten _ISO_DATE_RE to require YYYY-MM-DD only. Update docstring and
error message accordingly. Invert the two test cases that asserted
partials were accepted.
---
 mempalace/config.py      | 18 +++++++++++-------
 tests/test_config.py     | 12 ++++++++----
 tests/test_mcp_server.py | 15 +++++++++++----
 3 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/mempalace/config.py b/mempalace/config.py
index 4005779..2252a49 100644
--- a/mempalace/config.py
+++ b/mempalace/config.py
@@ -85,16 +85,21 @@ def sanitize_kg_value(value: str, field_name: str = "value") -> str:
 # (as_of, valid_from, valid_to, ended). Parameterized queries already
 # prevent SQL injection, but unvalidated date strings silently miss
 # every row — callers cannot distinguish "no fact at this time" from
-# "your date format was unrecognized." Accept YYYY, YYYY-MM, YYYY-MM-DD.
-_ISO_DATE_RE = re.compile(r"^\d{4}(?:-(?:0[1-9]|1[0-2])(?:-(?:0[1-9]|[12]\d|3[01]))?)?$")
+# "your date format was unrecognized." Require full YYYY-MM-DD: KG
+# queries compare TEXT dates lexicographically, so partials like "2026"
+# would re-introduce silent empty results (e.g. "2026-01-01" <= "2026"
+# is False), defeating the purpose of validation.
+_ISO_DATE_RE = re.compile(r"^\d{4}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])$")
 
 
 def sanitize_iso_date(value, field_name: str = "date"):
     """Validate an ISO-8601 date string, accepting None or empty as-is.
 
-    Accepts ``YYYY``, ``YYYY-MM``, or ``YYYY-MM-DD``. Raises ValueError
-    on any other non-empty input so the MCP layer can surface a clear
-    error to the caller instead of silently returning empty results.
+    Accepts only ``YYYY-MM-DD``. Raises ValueError on any other
+    non-empty input so the MCP layer can surface a clear error to the
+    caller instead of silently returning empty results. Partial dates
+    (``YYYY``, ``YYYY-MM``) are rejected because KG queries compare
+    TEXT dates lexicographically and would silently exclude valid facts.
     """
     if value is None or value == "":
         return value
@@ -103,8 +108,7 @@ def sanitize_iso_date(value, field_name: str = "date"):
     value = value.strip()
     if not _ISO_DATE_RE.match(value):
         raise ValueError(
-            f"{field_name}={value!r} is not a valid ISO-8601 date "
-            f"(expected YYYY, YYYY-MM, or YYYY-MM-DD)"
+            f"{field_name}={value!r} is not a valid ISO-8601 date " f"(expected YYYY-MM-DD)"
         )
     return value
 
diff --git a/tests/test_config.py b/tests/test_config.py
index f5064e2..204faae 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -223,12 +223,16 @@ def test_kg_value_rejects_over_length():
 # --- sanitize_iso_date ---
 
 
-def test_iso_date_accepts_year_only():
-    assert sanitize_iso_date("2026") == "2026"
+def test_iso_date_rejects_year_only():
+    # Partial dates re-introduce silent empty result sets via lexicographic
+    # TEXT comparison in KG queries (e.g. "2026-01-01" <= "2026" is False).
+    with pytest.raises(ValueError):
+        sanitize_iso_date("2026")
 
 
-def test_iso_date_accepts_year_month():
-    assert sanitize_iso_date("2026-03") == "2026-03"
+def test_iso_date_rejects_year_month():
+    with pytest.raises(ValueError):
+        sanitize_iso_date("2026-03")
 
 
 def test_iso_date_accepts_full_date():
diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index 1b80f36..136b6f3 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -702,14 +702,21 @@ class TestKGTools:
         assert result["success"] is False
         assert "ended" in result["error"]
 
-    def test_kg_query_accepts_partial_iso_dates(self, monkeypatch, config, palace_path, seeded_kg):
+    def test_kg_query_rejects_partial_iso_dates(self, monkeypatch, config, palace_path, seeded_kg):
         _patch_mcp_server(monkeypatch, config, seeded_kg)
         from mempalace.mcp_server import tool_kg_query
 
-        # YYYY and YYYY-MM are valid ISO-8601 forms — must not be rejected.
-        for value in ("2026", "2026-03", "2026-03-15"):
+        # Partial ISO dates are rejected: KG queries compare TEXT dates
+        # lexicographically, so "2026-01-01" <= "2026" is False, which
+        # silently excludes facts. Reject at the boundary — only YYYY-MM-DD
+        # produces correct results.
+        for value in ("2026", "2026-03"):
             result = tool_kg_query(entity="Max", as_of=value)
-            assert "error" not in result, f"rejected valid date {value!r}: {result}"
+            assert "error" in result, f"accepted partial date {value!r}: {result}"
+
+        # Full ISO-8601 dates still pass.
+        result = tool_kg_query(entity="Max", as_of="2026-03-15")
+        assert "error" not in result, f"rejected valid date: {result}"
 
 
 # ── Diary Tools ─────────────────────────────────────────────────────────

From c3e1104e75d5310aeb2147c999fb19af8a9b1dcc Mon Sep 17 00:00:00 2001
From: Mika Cohen <mjc@kernel.org>
Date: Thu, 30 Apr 2026 09:31:32 -0600
Subject: [PATCH 09/60] fix(chroma): harden HNSW startup preflight

---
 mempalace/backends/chroma.py | 131 ++++++++++++++--
 tests/test_backends.py       | 283 ++++++++++++++++++++++++++++++++++-
 2 files changed, 395 insertions(+), 19 deletions(-)

diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py
index 01ac627..646969b 100644
--- a/mempalace/backends/chroma.py
+++ b/mempalace/backends/chroma.py
@@ -3,7 +3,9 @@
 import datetime as _dt
 import logging
 import os
+import pickle
 import sqlite3
+from numbers import Integral
 from pathlib import Path
 from typing import Any, Optional
 
@@ -591,6 +593,97 @@ def _pin_hnsw_threads(collection) -> None:
 _BLOB_FIX_MARKER = ".blob_seq_ids_migrated"
 
 
+def _valid_dimensionality(value: object) -> bool:
+    return isinstance(value, Integral) and not isinstance(value, bool) and int(value) > 0
+
+
+def _persisted_metadata_fields(obj: object) -> tuple[object, object]:
+    if isinstance(obj, dict):
+        return obj.get("dimensionality"), obj.get("id_to_label")
+    return getattr(obj, "dimensionality", None), getattr(obj, "id_to_label", None)
+
+
+def quarantine_invalid_hnsw_metadata(palace_path: str) -> list[str]:
+    """Quarantine segment dirs whose ``index_metadata.pickle`` is unreadable or invalid.
+
+    Chroma's persisted HNSW metadata is untrusted disk state. If a segment has
+    labels but no valid positive dimensionality, current Chroma versions can
+    accept the pickle and crash later in the Rust loader. We rename the entire
+    segment out of the way before ``PersistentClient`` opens so Chroma can
+    rebuild cleanly instead of touching known-bad metadata.
+    """
+    try:
+        entries = os.listdir(palace_path)
+    except OSError:
+        return []
+
+    moved: list[str] = []
+    for name in entries:
+        if "-" not in name or name.startswith(".") or ".drift-" in name or ".corrupt-" in name:
+            continue
+        seg_dir = os.path.join(palace_path, name)
+        if not os.path.isdir(seg_dir):
+            continue
+
+        meta_path = os.path.join(seg_dir, "index_metadata.pickle")
+        if not os.path.isfile(meta_path):
+            continue
+
+        reason = None
+        try:
+            persisted = _SafePersistentDataUnpickler.load(meta_path)
+        except (EOFError, OSError):
+            logger.debug(
+                "Skipping invalid-HNSW quarantine for transient metadata read in %s",
+                meta_path,
+                exc_info=True,
+            )
+            continue
+        except pickle.UnpicklingError as exc:
+            if "truncated" in str(exc).lower() or "ran out of input" in str(exc).lower():
+                logger.debug(
+                    "Skipping invalid-HNSW quarantine for transient metadata read in %s",
+                    meta_path,
+                    exc_info=True,
+                )
+                continue
+            reason = f"invalid index_metadata.pickle: {exc}"
+        except Exception as exc:
+            reason = f"invalid index_metadata.pickle: {exc}"
+        else:
+            if not isinstance(persisted, dict) and not (
+                hasattr(persisted, "dimensionality") or hasattr(persisted, "id_to_label")
+            ):
+                reason = f"unrecognized index_metadata.pickle payload: {type(persisted).__name__}"
+            else:
+                dimensionality, id_to_label = _persisted_metadata_fields(persisted)
+                if id_to_label is not None and not isinstance(id_to_label, dict):
+                    reason = f"invalid id_to_label type {type(id_to_label).__name__}"
+                else:
+                    has_labels = bool(id_to_label)
+                    if has_labels and not _valid_dimensionality(dimensionality):
+                        reason = (
+                            "labels present but dimensionality is missing or invalid "
+                            f"({dimensionality!r})"
+                        )
+                    elif dimensionality is not None and not _valid_dimensionality(dimensionality):
+                        reason = f"invalid dimensionality {dimensionality!r}"
+
+        if reason is None:
+            continue
+
+        stamp = _dt.datetime.now().strftime("%Y%m%d-%H%M%S")
+        target = f"{seg_dir}.corrupt-{stamp}"
+        try:
+            os.rename(seg_dir, target)
+            moved.append(target)
+            logger.warning("Quarantined invalid HNSW metadata in %s: %s", seg_dir, reason)
+        except OSError:
+            logger.exception("Failed to quarantine invalid HNSW metadata in %s", seg_dir)
+
+    return moved
+
+
 def _fix_blob_seq_ids(palace_path: str) -> None:
     """Fix ChromaDB 0.6.x -> 1.5.x migration bug: BLOB seq_ids -> INTEGER.
 
@@ -994,6 +1087,12 @@ class ChromaBackend(BaseBackend):
 
         if cached is None or inode_changed or mtime_changed or mtime_appeared:
             _fix_blob_seq_ids(palace_path)
+            if inode_changed:
+                ChromaBackend._quarantined_paths.discard(palace_path)
+            if palace_path not in ChromaBackend._quarantined_paths:
+                quarantine_invalid_hnsw_metadata(palace_path)
+                quarantine_stale_hnsw(palace_path)
+                ChromaBackend._quarantined_paths.add(palace_path)
             cached = chromadb.PersistentClient(path=palace_path)
             self._clients[palace_path] = cached
             # Re-stat after the client constructor runs: chromadb creates
@@ -1006,26 +1105,27 @@ class ChromaBackend(BaseBackend):
     # Public static helpers (legacy; prefer :meth:`get_collection`)
     # ------------------------------------------------------------------
 
-    # Per-process record of palaces that have already had quarantine_stale_hnsw
-    # invoked at least once. The proactive drift check is a *cold-start*
-    # protection — it catches HNSW segments that arrived stale relative to
-    # ``chroma.sqlite3`` (e.g. cross-machine replication, partial restore,
-    # crashed-mid-write). Once a long-running process has opened the palace
-    # cleanly, re-firing on every reconnect is a *runtime thrash*: the
-    # daemon's own writes bump sqlite mtime but HNSW flushes batch on
-    # chromadb's internal cadence, so the mtime gap naturally exceeds the
-    # threshold under steady write load even though nothing is corrupt.
+    # Per-process record of palaces that have already had the cold-start
+    # quarantine invoked at least once. The proactive HNSW checks are a
+    # *cold-start* protection — they catch segments that arrive stale relative
+    # to ``chroma.sqlite3`` or invalid on disk (e.g. cross-machine replication,
+    # partial restore, crashed-mid-write). Once a long-running process has
+    # opened the palace cleanly, re-firing the stale check on every reconnect
+    # is a *runtime thrash*: the daemon's own writes bump sqlite mtime but HNSW
+    # flushes batch on chromadb's internal cadence, so the mtime gap naturally
+    # exceeds the threshold under steady write load even though nothing is
+    # corrupt.
     # Real runtime drift is still handled — palace-daemon's ``_auto_repair``
     # calls :func:`quarantine_stale_hnsw` directly on observed HNSW errors,
     # which bypasses this gate.
     #
     # Thread-safety: this set is mutated without a lock. Two concurrent
     # ``make_client()`` calls for the same palace can both pass the
-    # membership check and both invoke ``quarantine_stale_hnsw``. That's
-    # safe because the function is idempotent (mtime check + timestamped
-    # rename of distinct directories), so the worst-case race produces
-    # one redundant rename attempt that no-ops. Idempotency is the
-    # safety property; locking would add cost without correctness gain.
+    # membership check and both invoke the cold-start quarantine. That's
+    # safe because the functions are idempotent (mtime checks + timestamped
+    # rename of distinct directories), so the worst-case race produces one
+    # redundant rename attempt that no-ops. Idempotency is the safety
+    # property; locking would add cost without correctness gain.
     _quarantined_paths: set[str] = set()
 
     @staticmethod
@@ -1036,12 +1136,13 @@ class ChromaBackend(BaseBackend):
         own client cache. New code should obtain a collection through
         :meth:`get_collection` which manages caching internally.
 
-        Quarantines stale HNSW segments **once per palace per process**. See
+        Quarantines HNSW segments **once per palace per process**. See
         :attr:`_quarantined_paths` for the rationale (cold-start protection
         vs. runtime thrash on steady-write daemons).
         """
         _fix_blob_seq_ids(palace_path)
         if palace_path not in ChromaBackend._quarantined_paths:
+            quarantine_invalid_hnsw_metadata(palace_path)
             quarantine_stale_hnsw(palace_path)
             ChromaBackend._quarantined_paths.add(palace_path)
         return chromadb.PersistentClient(path=palace_path)
diff --git a/tests/test_backends.py b/tests/test_backends.py
index 5efa71b..cbbcdef 100644
--- a/tests/test_backends.py
+++ b/tests/test_backends.py
@@ -1,4 +1,5 @@
 import os
+import pickle
 import sqlite3
 from pathlib import Path
 
@@ -18,6 +19,7 @@ from mempalace.backends.chroma import (
     ChromaCollection,
     _fix_blob_seq_ids,
     _pin_hnsw_threads,
+    quarantine_invalid_hnsw_metadata,
     quarantine_stale_hnsw,
 )
 
@@ -708,7 +710,10 @@ def test_make_client_quarantines_only_on_first_call_per_palace(tmp_path, monkeyp
     """Quarantine fires on first ``make_client()`` for a palace, then is
     skipped on subsequent calls — prevents runtime thrash where a daemon's
     own steady writes bump ``chroma.sqlite3`` faster than HNSW flushes,
-    making the mtime heuristic falsely trigger every reconnect."""
+    making the mtime heuristic falsely trigger every reconnect.
+
+    Invalid metadata quarantine shares the same cold-start gate here; the
+    more aggressive refresh path lives in ``_client()``."""
     from mempalace.backends.chroma import ChromaBackend
 
     palace_path = str(tmp_path / "palace")
@@ -730,9 +735,37 @@ def test_make_client_quarantines_only_on_first_call_per_palace(tmp_path, monkeyp
     ChromaBackend.make_client(palace_path)
     ChromaBackend.make_client(palace_path)
 
-    assert calls == [
-        palace_path
-    ], "quarantine_stale_hnsw should fire once per palace per process, not on every reconnect"
+    assert calls == [palace_path], (
+        "quarantine_stale_hnsw should fire once per palace per process, not on every reconnect"
+    )
+
+
+def test_make_client_gates_invalid_metadata_on_first_call(tmp_path, monkeypatch):
+    """Invalid metadata quarantine is gated on the first make_client() call."""
+    from mempalace.backends.chroma import ChromaBackend
+
+    palace_path = str(tmp_path / "palace")
+    os.makedirs(palace_path, exist_ok=True)
+    (Path(palace_path) / "chroma.sqlite3").write_text("")
+
+    monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set())
+
+    calls: list[str] = []
+
+    def _invalid(path, *args, **kwargs):
+        calls.append(path)
+        return []
+
+    def _stale(path, stale_seconds=300.0):
+        return []
+
+    monkeypatch.setattr("mempalace.backends.chroma.quarantine_invalid_hnsw_metadata", _invalid)
+    monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _stale)
+
+    ChromaBackend.make_client(palace_path)
+    ChromaBackend.make_client(palace_path)
+
+    assert calls == [palace_path]
 
 
 def test_make_client_quarantines_each_palace_independently(tmp_path, monkeypatch):
@@ -811,3 +844,245 @@ def test_get_collection_applies_retrofit_on_existing_palace(tmp_path):
     )
 
     assert wrapper._collection.configuration_json["hnsw"]["num_threads"] == 1
+
+
+def test_quarantine_invalid_hnsw_metadata_renames_missing_dimensionality(tmp_path):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    seg = palace / "abcd-1234-5678"
+    seg.mkdir()
+    with open(seg / "index_metadata.pickle", "wb") as f:
+        pickle.dump({"dimensionality": None, "id_to_label": {"a": 1}}, f)
+
+    moved = quarantine_invalid_hnsw_metadata(str(palace))
+
+    assert len(moved) == 1
+    assert ".corrupt-" in moved[0]
+    assert not seg.exists()
+
+
+def test_quarantine_invalid_hnsw_metadata_allows_uninitialized_segment(tmp_path):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    seg = palace / "abcd-1234-5678"
+    seg.mkdir()
+    with open(seg / "index_metadata.pickle", "wb") as f:
+        pickle.dump({"dimensionality": None, "id_to_label": {}}, f)
+
+    moved = quarantine_invalid_hnsw_metadata(str(palace))
+
+    assert moved == []
+    assert seg.exists()
+
+
+def test_quarantine_invalid_hnsw_metadata_rejects_non_dict_id_to_label(tmp_path):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    seg = palace / "abcd-1234-5678"
+    seg.mkdir()
+    with open(seg / "index_metadata.pickle", "wb") as f:
+        pickle.dump({"dimensionality": 8, "id_to_label": ["a", "b"]}, f)
+
+    moved = quarantine_invalid_hnsw_metadata(str(palace))
+
+    assert len(moved) == 1
+    assert ".corrupt-" in moved[0]
+    assert not seg.exists()
+
+
+def test_quarantine_invalid_hnsw_metadata_rejects_non_schema_payload(tmp_path):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    seg = palace / "abcd-1234-5678"
+    seg.mkdir()
+    with open(seg / "index_metadata.pickle", "wb") as f:
+        pickle.dump(["not", "a", "metadata", "object"], f)
+
+    moved = quarantine_invalid_hnsw_metadata(str(palace))
+
+    assert len(moved) == 1
+    assert ".corrupt-" in moved[0]
+    assert not seg.exists()
+
+
+def _dangerous_pickle_payload_executed():
+    raise AssertionError("unsafe pickle payload executed")
+
+
+class _DangerousPickle:
+    def __reduce__(self):
+        return (_dangerous_pickle_payload_executed, ())
+
+
+def test_quarantine_invalid_hnsw_metadata_rejects_unsafe_pickle(tmp_path):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    seg = palace / "abcd-1234-5678"
+    seg.mkdir()
+    with open(seg / "index_metadata.pickle", "wb") as f:
+        pickle.dump(_DangerousPickle(), f)
+
+    moved = quarantine_invalid_hnsw_metadata(str(palace))
+
+    assert len(moved) == 1
+    assert ".corrupt-" in moved[0]
+    assert not seg.exists()
+
+
+def test_quarantine_invalid_hnsw_metadata_skips_transient_read_errors(tmp_path, monkeypatch):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    seg = palace / "abcd-1234-5678"
+    seg.mkdir()
+    meta = seg / "index_metadata.pickle"
+    meta.write_bytes(b"partial")
+
+    monkeypatch.setattr(
+        "mempalace.backends.chroma._SafePersistentDataUnpickler.load",
+        lambda path: (_ for _ in ()).throw(EOFError("flush in progress")),
+    )
+
+    moved = quarantine_invalid_hnsw_metadata(str(palace))
+
+    assert moved == []
+    assert seg.exists()
+
+
+def test_quarantine_invalid_hnsw_metadata_skips_truncated_pickle(tmp_path, monkeypatch):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    seg = palace / "abcd-1234-5678"
+    seg.mkdir()
+    meta = seg / "index_metadata.pickle"
+    meta.write_bytes(b"partial")
+
+    monkeypatch.setattr(
+        "mempalace.backends.chroma._SafePersistentDataUnpickler.load",
+        lambda path: (_ for _ in ()).throw(pickle.UnpicklingError("pickle data was truncated")),
+    )
+
+    moved = quarantine_invalid_hnsw_metadata(str(palace))
+
+    assert moved == []
+    assert seg.exists()
+
+
+def test_chroma_backend_preflights_metadata_before_persistent_client(tmp_path, monkeypatch):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    calls = []
+
+    def _record(name):
+        def inner(path, *args, **kwargs):
+            calls.append((name, path))
+            return [] if name != "blob" else None
+
+        return inner
+
+    monkeypatch.setattr("mempalace.backends.chroma._fix_blob_seq_ids", _record("blob"))
+    monkeypatch.setattr(
+        "mempalace.backends.chroma.quarantine_invalid_hnsw_metadata", _record("invalid")
+    )
+    monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _record("stale"))
+
+    class DummyClient:
+        pass
+
+    monkeypatch.setattr(
+        "mempalace.backends.chroma.chromadb.PersistentClient", lambda path: DummyClient()
+    )
+
+    backend = ChromaBackend()
+    backend._client(str(palace))
+
+    assert calls == [
+        ("blob", str(palace)),
+        ("invalid", str(palace)),
+        ("stale", str(palace)),
+    ]
+
+
+def test_chroma_backend_stale_quarantine_is_cold_start_only_on_refresh(tmp_path, monkeypatch):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    (palace / "chroma.sqlite3").write_text("")
+    calls = []
+
+    def _record(name):
+        def inner(path, *args, **kwargs):
+            calls.append((name, path))
+            return [] if name != "blob" else None
+
+        return inner
+
+    monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set())
+    monkeypatch.setattr("mempalace.backends.chroma._fix_blob_seq_ids", _record("blob"))
+    monkeypatch.setattr(
+        "mempalace.backends.chroma.quarantine_invalid_hnsw_metadata", _record("invalid")
+    )
+    monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _record("stale"))
+
+    class DummyClient:
+        pass
+
+    monkeypatch.setattr(
+        "mempalace.backends.chroma.chromadb.PersistentClient", lambda path: DummyClient()
+    )
+
+    backend = ChromaBackend()
+    stats = iter([(1, 1.0), (1, 1.0), (1, 2.0), (1, 2.0)])
+    monkeypatch.setattr(backend, "_db_stat", lambda path: next(stats))
+
+    backend._client(str(palace))
+    backend._client(str(palace))
+
+    assert calls == [
+        ("blob", str(palace)),
+        ("invalid", str(palace)),
+        ("stale", str(palace)),
+        ("blob", str(palace)),
+    ]
+
+
+def test_chroma_backend_requarantines_after_inode_replacement(tmp_path, monkeypatch):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    (palace / "chroma.sqlite3").write_text("")
+    calls = []
+
+    def _record(name):
+        def inner(path, *args, **kwargs):
+            calls.append((name, path))
+            return [] if name != "blob" else None
+
+        return inner
+
+    monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set())
+    monkeypatch.setattr("mempalace.backends.chroma._fix_blob_seq_ids", _record("blob"))
+    monkeypatch.setattr(
+        "mempalace.backends.chroma.quarantine_invalid_hnsw_metadata", _record("invalid")
+    )
+    monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _record("stale"))
+
+    class DummyClient:
+        pass
+
+    monkeypatch.setattr(
+        "mempalace.backends.chroma.chromadb.PersistentClient", lambda path: DummyClient()
+    )
+
+    backend = ChromaBackend()
+    stats = iter([(1, 1.0), (1, 1.0), (2, 2.0), (2, 2.0)])
+    monkeypatch.setattr(backend, "_db_stat", lambda path: next(stats))
+
+    backend._client(str(palace))
+    backend._client(str(palace))
+
+    assert calls == [
+        ("blob", str(palace)),
+        ("invalid", str(palace)),
+        ("stale", str(palace)),
+        ("blob", str(palace)),
+        ("invalid", str(palace)),
+        ("stale", str(palace)),
+    ]

From 7fa27bd23101f0c7dc4fa33f84be0b1d9710dbe2 Mon Sep 17 00:00:00 2001
From: Mika Cohen <mjc@kernel.org>
Date: Thu, 30 Apr 2026 09:31:32 -0600
Subject: [PATCH 10/60] fix(repair): rebuild collections through temp staging

---
 mempalace/repair.py  | 159 +++++++++++++++++-----
 tests/test_repair.py | 314 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 436 insertions(+), 37 deletions(-)

diff --git a/mempalace/repair.py b/mempalace/repair.py
index 1cd1556..49d6abe 100644
--- a/mempalace/repair.py
+++ b/mempalace/repair.py
@@ -37,10 +37,13 @@ import time
 from datetime import datetime
 from typing import Optional
 
+from chromadb.errors import NotFoundError as ChromaNotFoundError
+
 from .backends.chroma import ChromaBackend, hnsw_capacity_status
 
 
 COLLECTION_NAME = "mempalace_drawers"
+REPAIR_TEMP_COLLECTION = f"{COLLECTION_NAME}__repair_tmp"
 
 
 def _get_palace_path():
@@ -83,6 +86,108 @@ def _paginate_ids(col, where=None):
     return ids
 
 
+def _extract_drawers(col, total: int, batch_size: int):
+    all_ids = []
+    all_docs = []
+    all_metas = []
+    offset = 0
+    while offset < total:
+        batch = col.get(limit=batch_size, offset=offset, include=["documents", "metadatas"])
+        if not batch["ids"]:
+            break
+        all_ids.extend(batch["ids"])
+        all_docs.extend(batch["documents"])
+        all_metas.extend(batch["metadatas"])
+        offset += len(batch["ids"])
+    return all_ids, all_docs, all_metas
+
+
+def _verify_collection_count(col, expected: int, label: str) -> None:
+    actual = col.count()
+    if actual != expected:
+        raise RuntimeError(f"{label} count mismatch: expected {expected}, got {actual}")
+
+
+def _is_missing_collection_value_error(exc: ValueError) -> bool:
+    message = str(exc).lower()
+    return "does not exist" in message or "not found" in message
+
+
+def _delete_collection_if_exists(backend, palace_path: str, collection_name: str) -> None:
+    try:
+        backend.delete_collection(palace_path, collection_name)
+    except ValueError as exc:
+        if _is_missing_collection_value_error(exc):
+            return
+        raise
+    except (FileNotFoundError, ChromaNotFoundError):
+        return
+
+
+class RebuildCollectionError(RuntimeError):
+    """Raised when temp rebuild fails, carrying whether the live swap happened."""
+
+    def __init__(self, message: str, *, live_replaced: bool):
+        super().__init__(message)
+        self.live_replaced = live_replaced
+
+
+def _rebuild_collection_via_temp(
+    backend,
+    palace_path: str,
+    all_ids,
+    all_docs,
+    all_metas,
+    batch_size: int,
+    progress=print,
+) -> int:
+    expected = len(all_ids)
+    temp_name = REPAIR_TEMP_COLLECTION
+    live_replaced = False
+
+    try:
+        _delete_collection_if_exists(backend, palace_path, temp_name)
+
+        progress(f"  Building temporary collection: {temp_name}")
+        temp_col = backend.create_collection(palace_path, temp_name)
+        staged = 0
+        for i in range(0, expected, batch_size):
+            batch_ids = all_ids[i : i + batch_size]
+            batch_docs = all_docs[i : i + batch_size]
+            batch_metas = all_metas[i : i + batch_size]
+            temp_col.upsert(documents=batch_docs, ids=batch_ids, metadatas=batch_metas)
+            staged += len(batch_ids)
+            progress(f"  Staged {staged}/{expected} drawers...")
+        _verify_collection_count(temp_col, expected, "temporary rebuild")
+
+        progress("  Rebuilding live collection...")
+        backend.delete_collection(palace_path, COLLECTION_NAME)
+        live_replaced = True
+        new_col = backend.create_collection(palace_path, COLLECTION_NAME)
+
+        rebuilt = 0
+        for i in range(0, expected, batch_size):
+            batch_ids = all_ids[i : i + batch_size]
+            batch_docs = all_docs[i : i + batch_size]
+            batch_metas = all_metas[i : i + batch_size]
+            new_col.upsert(documents=batch_docs, ids=batch_ids, metadatas=batch_metas)
+            rebuilt += len(batch_ids)
+            progress(f"  Re-filed {rebuilt}/{expected} drawers...")
+        _verify_collection_count(new_col, expected, "rebuilt live collection")
+
+        try:
+            _delete_collection_if_exists(backend, palace_path, temp_name)
+        except Exception:
+            pass
+        return rebuilt
+    except Exception as exc:
+        try:
+            _delete_collection_if_exists(backend, palace_path, temp_name)
+        except Exception:
+            pass
+        raise RebuildCollectionError(str(exc), live_replaced=live_replaced) from exc
+
+
 def scan_palace(palace_path=None, only_wing=None):
     """Scan the palace for corrupt/unfetchable IDs.
 
@@ -373,18 +478,7 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
     # Extract all drawers in batches
     print("\n  Extracting drawers...")
     batch_size = 5000
-    all_ids = []
-    all_docs = []
-    all_metas = []
-    offset = 0
-    while offset < total:
-        batch = col.get(limit=batch_size, offset=offset, include=["documents", "metadatas"])
-        if not batch["ids"]:
-            break
-        all_ids.extend(batch["ids"])
-        all_docs.extend(batch["documents"])
-        all_metas.extend(batch["metadatas"])
-        offset += len(batch["ids"])
+    all_ids, all_docs, all_metas = _extract_drawers(col, total, batch_size)
     print(f"  Extracted {len(all_ids)} drawers")
 
     # ── #1208 guard ──────────────────────────────────────────────────
@@ -407,28 +501,33 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
 
     # Rebuild with correct HNSW settings
     print("  Rebuilding collection with hnsw:space=cosine...")
-    backend.delete_collection(palace_path, COLLECTION_NAME)
-    new_col = backend.create_collection(palace_path, COLLECTION_NAME)
-
-    filed = 0
     try:
-        for i in range(0, len(all_ids), batch_size):
-            batch_ids = all_ids[i : i + batch_size]
-            batch_docs = all_docs[i : i + batch_size]
-            batch_metas = all_metas[i : i + batch_size]
-            new_col.upsert(documents=batch_docs, ids=batch_ids, metadatas=batch_metas)
-            filed += len(batch_ids)
-            print(f"  Re-filed {filed}/{len(all_ids)} drawers...")
-    except Exception as e:
+        filed = _rebuild_collection_via_temp(
+            backend,
+            palace_path,
+            all_ids,
+            all_docs,
+            all_metas,
+            batch_size,
+            progress=print,
+        )
+    except RebuildCollectionError as e:
         print(f"\n  ERROR during rebuild: {e}")
-        print(f"  Only {filed}/{len(all_ids)} drawers were re-filed.")
-        if os.path.exists(backup_path):
+        print("  Rebuild aborted before completion.")
+        if e.live_replaced and os.path.exists(backup_path):
             print(f"  Restoring from backup: {backup_path}")
-            backend.delete_collection(palace_path, COLLECTION_NAME)
-            shutil.copy2(backup_path, sqlite_path)
-            print("  Backup restored. Palace is back to pre-repair state.")
-        else:
+            try:
+                _close_chroma_handles(palace_path)
+                _delete_collection_if_exists(backend, palace_path, COLLECTION_NAME)
+                shutil.copy2(backup_path, sqlite_path)
+                print("  Backup restored. Palace is back to pre-repair state.")
+            except Exception as restore_error:
+                print(f"  Backup restore failed: {restore_error}")
+                print(f"  Manual restore required from: {backup_path}")
+        elif e.live_replaced:
             print("  No backup available. Re-mine from source files to recover.")
+        else:
+            print("  Live collection was not replaced; leaving the original palace untouched.")
         raise
 
     print(f"\n  Repair complete. {filed} drawers rebuilt.")
diff --git a/tests/test_repair.py b/tests/test_repair.py
index bc770dd..33daad9 100644
--- a/tests/test_repair.py
+++ b/tests/test_repair.py
@@ -2,7 +2,7 @@
 
 import os
 import sqlite3
-from unittest.mock import MagicMock, patch
+from unittest.mock import MagicMock, call, patch
 
 import pytest
 
@@ -229,8 +229,11 @@ def test_rebuild_index_success(mock_backend_cls, mock_shutil, tmp_path):
     }
 
     mock_new_col = MagicMock()
+    mock_new_col.count.return_value = 2
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 2
     mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
-    mock_backend.create_collection.return_value = mock_new_col
+    mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
 
     repair.rebuild_index(palace_path=str(tmp_path))
 
@@ -239,14 +242,74 @@ def test_rebuild_index_success(mock_backend_cls, mock_shutil, tmp_path):
     assert "chroma.sqlite3" in str(mock_shutil.copy2.call_args)
 
     # Verify: deleted and recreated (cosine is the backend default)
-    mock_backend.delete_collection.assert_called_once_with(str(tmp_path), "mempalace_drawers")
-    mock_backend.create_collection.assert_called_once_with(str(tmp_path), "mempalace_drawers")
+    assert mock_backend.create_collection.call_args_list == [
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+        call(str(tmp_path), "mempalace_drawers"),
+    ]
+    assert mock_backend.delete_collection.call_args_list == [
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+        call(str(tmp_path), "mempalace_drawers"),
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+    ]
 
     # Verify: used upsert not add
+    mock_temp_col.upsert.assert_called_once()
     mock_new_col.upsert.assert_called_once()
     mock_new_col.add.assert_not_called()
 
 
+@patch("mempalace.repair.shutil")
+@patch("mempalace.repair.ChromaBackend")
+def test_rebuild_index_ignores_missing_temp_collection_at_start(
+    mock_backend_cls, mock_shutil, tmp_path
+):
+    sqlite_path = tmp_path / "chroma.sqlite3"
+    sqlite_path.write_text("fake")
+
+    def _fake_copy2(src, dst):
+        with open(dst, "w") as handle:
+            handle.write("backup")
+
+    mock_shutil.copy2.side_effect = _fake_copy2
+
+    mock_col = MagicMock()
+    mock_col.count.return_value = 2
+    mock_col.get.return_value = {
+        "ids": ["id1", "id2"],
+        "documents": ["doc1", "doc2"],
+        "metadatas": [{"wing": "a"}, {"wing": "b"}],
+    }
+
+    mock_new_col = MagicMock()
+    mock_new_col.count.return_value = 2
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 2
+    mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
+    mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
+    mock_backend.delete_collection.side_effect = [
+        ValueError("Collection [mempalace_drawers__repair_tmp] does not exist"),
+        None,
+        None,
+    ]
+
+    repair.rebuild_index(palace_path=str(tmp_path))
+
+    assert mock_shutil.copy2.call_count == 1
+    assert mock_backend.delete_collection.call_args_list == [
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+        call(str(tmp_path), "mempalace_drawers"),
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+    ]
+
+
+def test_delete_collection_if_exists_reraises_unexpected_value_error():
+    mock_backend = MagicMock()
+    mock_backend.delete_collection.side_effect = ValueError("invalid collection name")
+
+    with pytest.raises(ValueError, match="invalid collection name"):
+        repair._delete_collection_if_exists(mock_backend, "/palace", "bad/name")
+
+
 @patch("mempalace.repair.shutil")
 @patch("mempalace.repair.ChromaBackend")
 def test_rebuild_index_error_reading(mock_backend_cls, mock_shutil, tmp_path):
@@ -365,19 +428,256 @@ def test_rebuild_index_proceeds_with_override(mock_backend_cls, mock_shutil, tmp
         },
         {"ids": [], "documents": [], "metadatas": []},
     ]
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 10_000
     mock_new_col = MagicMock()
+    mock_new_col.count.return_value = 10_000
     mock_backend.get_collection.return_value = mock_col
-    mock_backend.create_collection.return_value = mock_new_col
+    mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
     mock_backend_cls.return_value = mock_backend
 
     with patch("mempalace.repair.sqlite_drawer_count", return_value=67_580):
         repair.rebuild_index(palace_path=str(tmp_path), confirm_truncation_ok=True)
 
-    mock_backend.delete_collection.assert_called_once()
-    mock_backend.create_collection.assert_called_once()
+    assert mock_backend.delete_collection.call_count == 3
+    assert mock_backend.create_collection.call_count == 2
+    mock_temp_col.upsert.assert_called()
     mock_new_col.upsert.assert_called()
 
 
+@patch("mempalace.repair.shutil")
+@patch("mempalace.repair.ChromaBackend")
+def test_rebuild_index_stage_failure_leaves_live_collection_untouched(
+    mock_backend_cls, mock_shutil, tmp_path
+):
+    sqlite_path = tmp_path / "chroma.sqlite3"
+    sqlite_path.write_text("fake")
+
+    mock_col = MagicMock()
+    mock_col.count.return_value = 2
+    mock_col.get.return_value = {
+        "ids": ["id1", "id2"],
+        "documents": ["doc1", "doc2"],
+        "metadatas": [{"wing": "a"}, {"wing": "b"}],
+    }
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 1
+    mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
+    mock_backend.create_collection.return_value = mock_temp_col
+
+    with pytest.raises(repair.RebuildCollectionError) as excinfo:
+        repair.rebuild_index(palace_path=str(tmp_path))
+
+    assert excinfo.value.live_replaced is False
+    assert mock_shutil.copy2.call_count == 1
+    assert mock_backend.delete_collection.call_args_list == [
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+    ]
+
+
+@patch("mempalace.repair.shutil")
+@patch("mempalace.repair.ChromaBackend")
+def test_rebuild_index_live_failure_restores_backup(mock_backend_cls, mock_shutil, tmp_path):
+    sqlite_path = tmp_path / "chroma.sqlite3"
+    sqlite_path.write_text("fake")
+
+    def _fake_copy2(src, dst):
+        with open(dst, "w") as handle:
+            handle.write("backup")
+
+    mock_shutil.copy2.side_effect = _fake_copy2
+
+    mock_col = MagicMock()
+    mock_col.count.return_value = 2
+    mock_col.get.return_value = {
+        "ids": ["id1", "id2"],
+        "documents": ["doc1", "doc2"],
+        "metadatas": [{"wing": "a"}, {"wing": "b"}],
+    }
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 2
+    mock_new_col = MagicMock()
+    mock_new_col.upsert.side_effect = RuntimeError("live upsert failed")
+    mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
+    mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
+
+    with pytest.raises(repair.RebuildCollectionError) as excinfo:
+        repair.rebuild_index(palace_path=str(tmp_path))
+
+    assert excinfo.value.live_replaced is True
+    assert mock_shutil.copy2.call_count == 2
+    assert mock_backend.delete_collection.call_args_list == [
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+        call(str(tmp_path), "mempalace_drawers"),
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+        call(str(tmp_path), "mempalace_drawers"),
+    ]
+
+
+@patch("mempalace.repair.shutil")
+@patch("mempalace.repair.ChromaBackend")
+def test_rebuild_index_live_delete_missing_still_restores_backup(
+    mock_backend_cls, mock_shutil, tmp_path
+):
+    sqlite_path = tmp_path / "chroma.sqlite3"
+    sqlite_path.write_text("fake")
+
+    def _fake_copy2(src, dst):
+        with open(dst, "w") as handle:
+            handle.write("backup")
+
+    mock_shutil.copy2.side_effect = _fake_copy2
+
+    mock_col = MagicMock()
+    mock_col.count.return_value = 2
+    mock_col.get.return_value = {
+        "ids": ["id1", "id2"],
+        "documents": ["doc1", "doc2"],
+        "metadatas": [{"wing": "a"}, {"wing": "b"}],
+    }
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 2
+    mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
+    mock_backend.create_collection.side_effect = [mock_temp_col, RuntimeError("create failed")]
+    mock_backend.delete_collection.side_effect = [
+        None,
+        None,
+        None,
+        repair.ChromaNotFoundError("missing"),
+    ]
+
+    with pytest.raises(repair.RebuildCollectionError) as excinfo:
+        repair.rebuild_index(palace_path=str(tmp_path))
+
+    assert excinfo.value.live_replaced is True
+    assert mock_shutil.copy2.call_count == 2
+    assert mock_backend.delete_collection.call_args_list == [
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+        call(str(tmp_path), "mempalace_drawers"),
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+        call(str(tmp_path), "mempalace_drawers"),
+    ]
+
+
+@patch("mempalace.repair.shutil")
+@patch("mempalace.repair.ChromaBackend")
+def test_rebuild_index_restore_failure_preserves_original_error(
+    mock_backend_cls, mock_shutil, tmp_path, capsys
+):
+    sqlite_path = tmp_path / "chroma.sqlite3"
+    sqlite_path.write_text("fake")
+
+    def _copy2_side_effect(src, dst):
+        if str(src).endswith(".backup"):
+            raise PermissionError("locked sqlite")
+        with open(dst, "w") as handle:
+            handle.write("backup")
+
+    mock_shutil.copy2.side_effect = _copy2_side_effect
+
+    mock_col = MagicMock()
+    mock_col.count.return_value = 2
+    mock_col.get.return_value = {
+        "ids": ["id1", "id2"],
+        "documents": ["doc1", "doc2"],
+        "metadatas": [{"wing": "a"}, {"wing": "b"}],
+    }
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 2
+    mock_new_col = MagicMock()
+    mock_new_col.upsert.side_effect = RuntimeError("live upsert failed")
+    mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
+    mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
+
+    with pytest.raises(repair.RebuildCollectionError) as excinfo:
+        repair.rebuild_index(palace_path=str(tmp_path))
+
+    out = capsys.readouterr().out
+    assert "locked sqlite" in out
+    assert "Manual restore required" in out
+    assert "live upsert failed" in str(excinfo.value)
+
+
+@patch("mempalace.repair.ChromaBackend")
+def test_rebuild_collection_via_temp_keeps_original_error_when_cleanup_fails(
+    mock_backend_cls,
+):
+    mock_col = MagicMock()
+    mock_col.count.return_value = 2
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 2
+    mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
+    mock_backend.create_collection.side_effect = [mock_temp_col, RuntimeError("live build failed")]
+    mock_backend.delete_collection.side_effect = [
+        None,
+        None,
+        RuntimeError("cleanup failed"),
+    ]
+
+    with pytest.raises(repair.RebuildCollectionError) as excinfo:
+        repair._rebuild_collection_via_temp(
+            mock_backend,
+            "/palace",
+            ["id1", "id2"],
+            ["doc1", "doc2"],
+            [{"wing": "a"}, {"wing": "b"}],
+            batch_size=5000,
+            progress=lambda *args, **kwargs: None,
+        )
+
+    assert "live build failed" in str(excinfo.value)
+    assert excinfo.value.live_replaced is True
+    assert mock_backend.delete_collection.call_args_list == [
+        call("/palace", "mempalace_drawers__repair_tmp"),
+        call("/palace", "mempalace_drawers"),
+        call("/palace", "mempalace_drawers__repair_tmp"),
+    ]
+
+
+@patch("mempalace.repair.shutil")
+@patch("mempalace.repair.ChromaBackend")
+def test_rebuild_index_ignores_temp_cleanup_failure_after_success(
+    mock_backend_cls, mock_shutil, tmp_path
+):
+    sqlite_path = tmp_path / "chroma.sqlite3"
+    sqlite_path.write_text("fake")
+
+    def _fake_copy2(src, dst):
+        with open(dst, "w") as handle:
+            handle.write("backup")
+
+    mock_shutil.copy2.side_effect = _fake_copy2
+
+    mock_col = MagicMock()
+    mock_col.count.return_value = 2
+    mock_col.get.return_value = {
+        "ids": ["id1", "id2"],
+        "documents": ["doc1", "doc2"],
+        "metadatas": [{"wing": "a"}, {"wing": "b"}],
+    }
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 2
+    mock_new_col = MagicMock()
+    mock_new_col.count.return_value = 2
+    mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
+    mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
+    mock_backend.delete_collection.side_effect = [
+        None,
+        None,
+        RuntimeError("cleanup failed"),
+    ]
+
+    repair.rebuild_index(palace_path=str(tmp_path))
+
+    assert mock_shutil.copy2.call_count == 1
+    assert mock_backend.delete_collection.call_args_list == [
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+        call(str(tmp_path), "mempalace_drawers"),
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+    ]
+
+
 # ── repair_max_seq_id ─────────────────────────────────────────────────
 
 
From 2f509b4789d7ae44f8fbf1281df812e098207aa3 Mon Sep 17 00:00:00 2001
From: Mika Cohen <mjc@kernel.org>
Date: Thu, 30 Apr 2026 09:31:32 -0600
Subject: [PATCH 11/60] fix(cli): restore backup on repair failure

---
 mempalace/cli.py  | 62 +++++++++++++++++++++++++++--------------------
 tests/test_cli.py | 50 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 85 insertions(+), 27 deletions(-)

diff --git a/mempalace/cli.py b/mempalace/cli.py
index ca9798b..27c81d3 100644
--- a/mempalace/cli.py
+++ b/mempalace/cli.py
@@ -648,7 +648,14 @@ def cmd_repair(args):
     import shutil
     from .backends.chroma import ChromaBackend
     from .migrate import confirm_destructive_action, contains_palace_database
-    from .repair import TruncationDetected, check_extraction_safety
+    from .repair import (
+        RebuildCollectionError,
+        TruncationDetected,
+        _close_chroma_handles,
+        _extract_drawers,
+        _rebuild_collection_via_temp,
+        check_extraction_safety,
+    )
 
     palace_path = os.path.abspath(
         os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path
@@ -705,18 +712,7 @@ def cmd_repair(args):
     # Extract all drawers in batches
     print("\n  Extracting drawers...")
     batch_size = 5000
-    all_ids = []
-    all_docs = []
-    all_metas = []
-    offset = 0
-    while offset < total:
-        batch = col.get(limit=batch_size, offset=offset, include=["documents", "metadatas"])
-        if not batch["ids"]:
-            break
-        all_ids.extend(batch["ids"])
-        all_docs.extend(batch["documents"])
-        all_metas.extend(batch["metadatas"])
-        offset += len(batch["ids"])
+    all_ids, all_docs, all_metas = _extract_drawers(col, total, batch_size)
     print(f"  Extracted {len(all_ids)} drawers")
 
     # ── #1208 guard ──────────────────────────────────────────────────
@@ -736,7 +732,6 @@ def cmd_repair(args):
         print(e.message)
         return
 
-    # Backup and rebuild
     palace_path = os.path.normpath(palace_path)
     backup_path = palace_path + ".backup"
     if os.path.exists(backup_path):
@@ -750,18 +745,33 @@ def cmd_repair(args):
     print(f"  Backing up to {backup_path}...")
     shutil.copytree(palace_path, backup_path)
 
-    print("  Rebuilding collection...")
-    backend.delete_collection(palace_path, "mempalace_drawers")
-    new_col = backend.create_collection(palace_path, "mempalace_drawers")
-
-    filed = 0
-    for i in range(0, len(all_ids), batch_size):
-        batch_ids = all_ids[i : i + batch_size]
-        batch_docs = all_docs[i : i + batch_size]
-        batch_metas = all_metas[i : i + batch_size]
-        new_col.add(documents=batch_docs, ids=batch_ids, metadatas=batch_metas)
-        filed += len(batch_ids)
-        print(f"  Re-filed {filed}/{len(all_ids)} drawers...")
+    try:
+        filed = _rebuild_collection_via_temp(
+            backend,
+            palace_path,
+            all_ids,
+            all_docs,
+            all_metas,
+            batch_size,
+            progress=print,
+        )
+    except RebuildCollectionError as e:
+        print(f"  Repair failed: {e}")
+        if getattr(e, "live_replaced", False):
+            print("  Live collection was already replaced; restoring from backup...")
+            try:
+                _close_chroma_handles(palace_path)
+                if os.path.exists(palace_path):
+                    shutil.rmtree(palace_path)
+                shutil.copytree(backup_path, palace_path)
+                print(f"  Restore complete from backup: {backup_path}")
+            except Exception as restore_error:
+                print(f"  Automatic restore failed: {restore_error}")
+                print("  Manual recovery required:")
+                print(f"    1. Remove or rename the broken directory: {palace_path}")
+                print(f"    2. Restore the backup directory to: {palace_path}")
+                print(f"       Backup location: {backup_path}")
+        sys.exit(1)
 
     print(f"\n  Repair complete. {filed} drawers rebuilt.")
     print(f"  Backup saved at {backup_path}")
diff --git a/tests/test_cli.py b/tests/test_cli.py
index af7b39d..11845fe 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -4,7 +4,7 @@ import argparse
 import shlex
 import sys
 from pathlib import Path
-from unittest.mock import MagicMock, patch
+from unittest.mock import MagicMock, call, patch
 
 import pytest
 
@@ -760,13 +760,61 @@ def test_cmd_repair_success(mock_config_cls, tmp_path, capsys):
         "documents": ["doc1", "doc2"],
         "metadatas": [{"wing": "a"}, {"wing": "b"}],
     }
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 2
     mock_new_col = MagicMock()
+    mock_new_col.count.return_value = 2
     mock_backend = _mock_backend_for(col=mock_col, new_col=mock_new_col)
+    mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
     with patch("mempalace.backends.chroma.ChromaBackend", return_value=mock_backend):
         cmd_repair(args)
     out = capsys.readouterr().out
     assert "Repair complete" in out
     assert "2 drawers rebuilt" in out
+    assert mock_backend.delete_collection.call_args_list == [
+        call(str(palace_dir), "mempalace_drawers__repair_tmp"),
+        call(str(palace_dir), "mempalace_drawers"),
+        call(str(palace_dir), "mempalace_drawers__repair_tmp"),
+    ]
+    mock_temp_col.upsert.assert_called_once()
+    mock_new_col.upsert.assert_called_once()
+    mock_new_col.add.assert_not_called()
+
+
+@patch("mempalace.cli.MempalaceConfig")
+def test_cmd_repair_restores_backup_on_live_rebuild_failure(mock_config_cls, tmp_path, capsys):
+    palace_dir = tmp_path / "palace"
+    palace_dir.mkdir()
+    (palace_dir / "chroma.sqlite3").write_text("db")
+    mock_config_cls.return_value.palace_path = str(palace_dir)
+    args = argparse.Namespace(palace=None, yes=True)
+    mock_col = MagicMock()
+    mock_col.count.return_value = 2
+    mock_col.get.return_value = {
+        "ids": ["id1", "id2"],
+        "documents": ["doc1", "doc2"],
+        "metadatas": [{"wing": "a"}, {"wing": "b"}],
+    }
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 2
+    mock_backend = _mock_backend_for(col=mock_col)
+    mock_backend.create_collection.side_effect = [mock_temp_col, RuntimeError("live build failed")]
+    with (
+        patch("mempalace.backends.chroma.ChromaBackend", return_value=mock_backend),
+        patch("mempalace.repair._close_chroma_handles") as mock_close_handles,
+    ):
+        with pytest.raises(SystemExit) as excinfo:
+            cmd_repair(args)
+    out = capsys.readouterr().out
+    assert excinfo.value.code == 1
+    assert "Repair failed" in out
+    assert "restoring from backup" in out
+    mock_close_handles.assert_called_once_with(str(palace_dir))
+    assert mock_backend.delete_collection.call_args_list == [
+        call(str(palace_dir), "mempalace_drawers__repair_tmp"),
+        call(str(palace_dir), "mempalace_drawers"),
+        call(str(palace_dir), "mempalace_drawers__repair_tmp"),
+    ]
 
 
 @patch("mempalace.cli.MempalaceConfig")

From f57f30025f2551efc3d917a368f5ed0dae336c46 Mon Sep 17 00:00:00 2001
From: Mika Cohen <mjc@kernel.org>
Date: Thu, 30 Apr 2026 09:57:39 -0600
Subject: [PATCH 12/60] fix(repair): close active backend before rollback
 restore

Rollback cleanup was instantiating a fresh ChromaBackend, so the live backend that had opened the PersistentClient could keep file handles alive during restore. Close the active backend instance instead so rollback and CLI recovery can release Windows-safe locks before copying the backup back into place.
---
 mempalace/cli.py     |  2 +-
 mempalace/repair.py  | 14 ++++++++++----
 tests/test_cli.py    |  7 ++-----
 tests/test_repair.py | 11 ++++++++---
 4 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/mempalace/cli.py b/mempalace/cli.py
index 27c81d3..9a1e8e4 100644
--- a/mempalace/cli.py
+++ b/mempalace/cli.py
@@ -760,7 +760,7 @@ def cmd_repair(args):
         if getattr(e, "live_replaced", False):
             print("  Live collection was already replaced; restoring from backup...")
             try:
-                _close_chroma_handles(palace_path)
+                _close_chroma_handles(palace_path, backend=backend)
                 if os.path.exists(palace_path):
                     shutil.rmtree(palace_path)
                 shutil.copytree(backup_path, palace_path)
diff --git a/mempalace/repair.py b/mempalace/repair.py
index 49d6abe..0585405 100644
--- a/mempalace/repair.py
+++ b/mempalace/repair.py
@@ -517,7 +517,7 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
         if e.live_replaced and os.path.exists(backup_path):
             print(f"  Restoring from backup: {backup_path}")
             try:
-                _close_chroma_handles(palace_path)
+                _close_chroma_handles(palace_path, backend=backend)
                 _delete_collection_if_exists(backend, palace_path, COLLECTION_NAME)
                 shutil.copy2(backup_path, sqlite_path)
                 print("  Backup restored. Palace is back to pre-repair state.")
@@ -593,12 +593,18 @@ def status(palace_path=None) -> dict:
 # ---------------------------------------------------------------------------
 
 
-def _close_chroma_handles(palace_path: str) -> None:
-    """Drop ChromaBackend + chromadb singleton caches so OS mmap handles release."""
+def _close_chroma_handles(palace_path: str, backend: ChromaBackend | None = None) -> None:
+    """Drop ChromaBackend + chromadb singleton caches so OS mmap handles release.
+
+    When ``backend`` is provided, close the live instance so rollback/restore
+    releases the handles it was already using. Otherwise fall back to a
+    transient backend instance for the max-seq-id repair path.
+    """
     import gc
 
     try:
-        ChromaBackend().close_palace(palace_path)
+        closer = backend if backend is not None else ChromaBackend()
+        closer.close_palace(palace_path)
     except Exception:
         pass
     try:
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 11845fe..6572f1d 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -799,17 +799,14 @@ def test_cmd_repair_restores_backup_on_live_rebuild_failure(mock_config_cls, tmp
     mock_temp_col.count.return_value = 2
     mock_backend = _mock_backend_for(col=mock_col)
     mock_backend.create_collection.side_effect = [mock_temp_col, RuntimeError("live build failed")]
-    with (
-        patch("mempalace.backends.chroma.ChromaBackend", return_value=mock_backend),
-        patch("mempalace.repair._close_chroma_handles") as mock_close_handles,
-    ):
+    with patch("mempalace.backends.chroma.ChromaBackend", return_value=mock_backend):
         with pytest.raises(SystemExit) as excinfo:
             cmd_repair(args)
     out = capsys.readouterr().out
     assert excinfo.value.code == 1
     assert "Repair failed" in out
     assert "restoring from backup" in out
-    mock_close_handles.assert_called_once_with(str(palace_dir))
+    mock_backend.close_palace.assert_called_once_with(str(palace_dir))
     assert mock_backend.delete_collection.call_args_list == [
         call(str(palace_dir), "mempalace_drawers__repair_tmp"),
         call(str(palace_dir), "mempalace_drawers"),
diff --git a/tests/test_repair.py b/tests/test_repair.py
index 33daad9..9cd12dd 100644
--- a/tests/test_repair.py
+++ b/tests/test_repair.py
@@ -499,20 +499,25 @@ def test_rebuild_index_live_failure_restores_backup(mock_backend_cls, mock_shuti
     mock_temp_col.count.return_value = 2
     mock_new_col = MagicMock()
     mock_new_col.upsert.side_effect = RuntimeError("live upsert failed")
-    mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
-    mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
+    active_backend = MagicMock()
+    active_backend.get_collection.return_value = mock_col
+    active_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
+    helper_backend = MagicMock()
+    mock_backend_cls.side_effect = [active_backend, helper_backend]
 
     with pytest.raises(repair.RebuildCollectionError) as excinfo:
         repair.rebuild_index(palace_path=str(tmp_path))
 
     assert excinfo.value.live_replaced is True
     assert mock_shutil.copy2.call_count == 2
-    assert mock_backend.delete_collection.call_args_list == [
+    assert active_backend.delete_collection.call_args_list == [
         call(str(tmp_path), "mempalace_drawers__repair_tmp"),
         call(str(tmp_path), "mempalace_drawers"),
         call(str(tmp_path), "mempalace_drawers__repair_tmp"),
         call(str(tmp_path), "mempalace_drawers"),
     ]
+    active_backend.close_palace.assert_called_once_with(str(tmp_path))
+    helper_backend.close_palace.assert_not_called()
 
 
 @patch("mempalace.repair.shutil")

From 0e32b9643c3d4ceca36b6f610ddbcbbf58ad8dd7 Mon Sep 17 00:00:00 2001
From: Mika Cohen <mjc@kernel.org>
Date: Thu, 30 Apr 2026 21:47:08 -0600
Subject: [PATCH 13/60] fix: avoid false hnsw divergence fallback

---
 mempalace/backends/chroma.py |  27 +++---
 mempalace/searcher.py        |  43 ++++++++--
 tests/test_hnsw_capacity.py  | 160 ++++++++++++++++++++++++++++++++++-
 3 files changed, 204 insertions(+), 26 deletions(-)

diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py
index 646969b..7ce8fd8 100644
--- a/mempalace/backends/chroma.py
+++ b/mempalace/backends/chroma.py
@@ -491,22 +491,17 @@ def hnsw_capacity_status(palace_path: str, collection_name: str = "mempalace_dra
         divergence_floor = max(_HNSW_DIVERGENCE_FALLBACK_FLOOR, 2 * sync_threshold)
 
         if hnsw_count is None:
-            # No pickle yet — segment hasn't persisted metadata. Could be
-            # fresh-but-unflushed (normal) or interrupted-mid-flush (bad).
-            # We can't distinguish without the pickle, so only flag
-            # divergence when sqlite holds clearly more than two flush
-            # windows worth — same threshold as the with-pickle path.
-            if sqlite_count > divergence_floor:
-                out["status"] = "diverged"
-                out["diverged"] = True
-                out["divergence"] = sqlite_count
-                out["message"] = (
-                    f"sqlite holds {sqlite_count:,} embeddings but the HNSW segment "
-                    "has never flushed metadata — vector search will return nothing "
-                    "until the segment is rebuilt. Run `mempalace repair`."
-                )
-            else:
-                out["message"] = "HNSW segment metadata not yet flushed; skipping"
+            # No pickle yet, so this probe cannot measure HNSW capacity.
+            # Chroma 1.5.x can have binary HNSW files without a flushed
+            # metadata pickle; absence of the pickle alone is not proof that
+            # vector search is unusable or dangerous. Keep the status unknown
+            # so MCP does not globally disable vectors on an inconclusive
+            # signal. Corrupt/invalid metadata, when present, is handled by
+            # quarantine_invalid_hnsw_metadata before Chroma opens.
+            out["message"] = (
+                "HNSW capacity unavailable: metadata has not been flushed; "
+                "leaving vector search enabled"
+            )
             return out
 
         divergence = sqlite_count - hnsw_count
diff --git a/mempalace/searcher.py b/mempalace/searcher.py
index a14d90d..4ff0f23 100644
--- a/mempalace/searcher.py
+++ b/mempalace/searcher.py
@@ -396,6 +396,31 @@ def _bm25_only_via_sqlite(
             "hint": "Run: mempalace init <dir> && mempalace mine <dir>",
         }
 
+    def _metadata_filter_sql(row_id_expr: str) -> tuple[str, list[str]]:
+        clauses = []
+        params = []
+        for key, value in (("wing", wing), ("room", room)):
+            if not value:
+                continue
+            clauses.append(
+                f"""
+                AND EXISTS (
+                    SELECT 1
+                    FROM embedding_metadata mf
+                    WHERE mf.id = {row_id_expr}
+                      AND mf.key = ?
+                      AND COALESCE(
+                        mf.string_value,
+                        CAST(mf.int_value AS TEXT),
+                        CAST(mf.float_value AS TEXT),
+                        CAST(mf.bool_value AS TEXT)
+                      ) = ?
+                )
+                """
+            )
+            params.extend([key, value])
+        return "".join(clauses), params
+
     try:
         conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
     except sqlite3.Error as e:
@@ -408,15 +433,17 @@ def _bm25_only_via_sqlite(
         candidate_ids: list[int] = []
         if tokens:
             fts_query = " OR ".join(tokens)
+            filter_sql, filter_params = _metadata_filter_sql("embedding_fulltext_search.rowid")
             try:
                 rows = conn.execute(
-                    """
+                    f"""
                     SELECT rowid
                     FROM embedding_fulltext_search
                     WHERE embedding_fulltext_search MATCH ?
+                    {filter_sql}
                     LIMIT ?
                     """,
-                    (fts_query, max_candidates),
+                    (fts_query, *filter_params, max_candidates),
                 ).fetchall()
                 candidate_ids = [r[0] for r in rows]
             except sqlite3.Error:
@@ -434,17 +461,19 @@ def _bm25_only_via_sqlite(
             # fall back to ordering by primary-key id and finally to an
             # empty result rather than letting search raise.
             try:
+                filter_sql, filter_params = _metadata_filter_sql("e.id")
                 rows = conn.execute(
-                    """
+                    f"""
                     SELECT e.id
                     FROM embeddings e
                     JOIN segments s ON e.segment_id = s.id
                     JOIN collections c ON s.collection = c.id
                     WHERE c.name = 'mempalace_drawers'
+                    {filter_sql}
                     ORDER BY e.created_at DESC
                     LIMIT ?
                     """,
-                    (max_candidates,),
+                    (*filter_params, max_candidates),
                 ).fetchall()
                 candidate_ids = [r[0] for r in rows]
             except sqlite3.Error:
@@ -453,17 +482,19 @@ def _bm25_only_via_sqlite(
                     exc_info=True,
                 )
                 try:
+                    filter_sql, filter_params = _metadata_filter_sql("e.id")
                     rows = conn.execute(
-                        """
+                        f"""
                         SELECT e.id
                         FROM embeddings e
                         JOIN segments s ON e.segment_id = s.id
                         JOIN collections c ON s.collection = c.id
                         WHERE c.name = 'mempalace_drawers'
+                        {filter_sql}
                         ORDER BY e.id DESC
                         LIMIT ?
                         """,
-                        (max_candidates,),
+                        (*filter_params, max_candidates),
                     ).fetchall()
                     candidate_ids = [r[0] for r in rows]
                 except sqlite3.Error:
diff --git a/tests/test_hnsw_capacity.py b/tests/test_hnsw_capacity.py
index 512fc9c..912def8 100644
--- a/tests/test_hnsw_capacity.py
+++ b/tests/test_hnsw_capacity.py
@@ -238,14 +238,39 @@ def test_capacity_status_tolerates_flush_lag(tmp_path):
     assert info["status"] == "ok"
 
 
-def test_capacity_status_flags_unflushed_with_large_sqlite(tmp_path):
-    """No pickle + many sqlite rows is its own divergence signal."""
+def test_capacity_status_does_not_flag_unflushed_with_large_sqlite(tmp_path):
+    """No pickle + many sqlite rows is inconclusive, not divergence."""
     seg = "seg-noflush"
     _seed_chroma_db(str(tmp_path), sqlite_count=10_000, segment_id=seg)
     info = hnsw_capacity_status(str(tmp_path), COLLECTION)
-    assert info["diverged"] is True
+    assert info["diverged"] is False
+    assert info["status"] == "unknown"
+    assert info["divergence"] is None
     assert info["hnsw_count"] is None
-    assert "never flushed" in info["message"]
+    assert "capacity unavailable" in info["message"]
+    assert "leaving vector search enabled" in info["message"]
+
+
+def test_mcp_probe_does_not_disable_vectors_for_unflushed_metadata(tmp_path, monkeypatch):
+    """The MCP preflight must not route all searches to BM25 on this signal."""
+    from mempalace import mcp_server
+
+    seg = "seg-mcp-noflush"
+    _seed_chroma_db(str(tmp_path), sqlite_count=10_000, segment_id=seg)
+
+    class _Cfg:
+        palace_path = str(tmp_path)
+
+    monkeypatch.setattr(mcp_server, "_config", _Cfg())
+    monkeypatch.setattr(mcp_server, "_vector_disabled", True)
+    monkeypatch.setattr(mcp_server, "_vector_disabled_reason", "old divergence")
+
+    mcp_server._refresh_vector_disabled_flag()
+
+    assert mcp_server._vector_disabled is False
+    assert mcp_server._vector_disabled_reason == ""
+    assert mcp_server._vector_capacity_status["status"] == "unknown"
+    assert "leaving vector search enabled" in mcp_server._vector_capacity_status["message"]
 
 
 def test_capacity_status_quiet_for_empty_palace(tmp_path):
@@ -372,6 +397,17 @@ def _seed_drawers(palace: str, segment_id: str, drawers: list[tuple[str, dict, s
         conn.close()
 
 
+def _set_drawer_created_at(palace: str, timestamps: dict[int, str]) -> None:
+    db_path = os.path.join(palace, "chroma.sqlite3")
+    conn = sqlite3.connect(db_path)
+    try:
+        for emb_id, created_at in timestamps.items():
+            conn.execute("UPDATE embeddings SET created_at = ? WHERE id = ?", (created_at, emb_id))
+        conn.commit()
+    finally:
+        conn.close()
+
+
 @pytest.fixture
 def palace_with_drawers(tmp_path):
     seg = "seg-bm25"
@@ -417,6 +453,122 @@ def test_bm25_fallback_filters_by_wing(palace_with_drawers):
     assert all(r["wing"] == "design" for r in out["results"])
 
 
+def test_bm25_fallback_applies_wing_before_fts_candidate_limit(tmp_path):
+    seg = "seg-bm25-fts-limit"
+    _seed_chroma_db(str(tmp_path), sqlite_count=0, segment_id=seg)
+    _seed_drawers(
+        str(tmp_path),
+        seg,
+        [
+            (
+                "shared token outside target wing",
+                {"wing": "ops", "room": "incidents", "source_file": "/x/ops.md"},
+                "d-1",
+            ),
+            (
+                "shared token inside target wing",
+                {"wing": "project", "room": "diary", "source_file": "/x/project.md"},
+                "d-2",
+            ),
+        ],
+    )
+
+    out = _bm25_only_via_sqlite("shared token", str(tmp_path), wing="project", max_candidates=1)
+
+    assert out["total_before_filter"] == 1
+    assert len(out["results"]) == 1
+    assert out["results"][0]["wing"] == "project"
+
+
+def test_bm25_fallback_applies_room_before_fts_candidate_limit(tmp_path):
+    seg = "seg-bm25-room-limit"
+    _seed_chroma_db(str(tmp_path), sqlite_count=0, segment_id=seg)
+    _seed_drawers(
+        str(tmp_path),
+        seg,
+        [
+            (
+                "shared token wrong room",
+                {"wing": "project", "room": "scratch", "source_file": "/x/scratch.md"},
+                "d-1",
+            ),
+            (
+                "shared token right room",
+                {"wing": "project", "room": "diary", "source_file": "/x/diary.md"},
+                "d-2",
+            ),
+        ],
+    )
+
+    out = _bm25_only_via_sqlite(
+        "shared token",
+        str(tmp_path),
+        wing="project",
+        room="diary",
+        max_candidates=1,
+    )
+
+    assert out["total_before_filter"] == 1
+    assert len(out["results"]) == 1
+    assert out["results"][0]["wing"] == "project"
+    assert out["results"][0]["room"] == "diary"
+
+
+def test_bm25_fallback_applies_wing_before_recency_candidate_limit(tmp_path):
+    seg = "seg-bm25-recency-limit"
+    _seed_chroma_db(str(tmp_path), sqlite_count=0, segment_id=seg)
+    _seed_drawers(
+        str(tmp_path),
+        seg,
+        [
+            (
+                "target drawer for short query",
+                {"wing": "project", "room": "diary", "source_file": "/x/project.md"},
+                "d-1",
+            ),
+            (
+                "newer drawer outside target wing",
+                {"wing": "ops", "room": "incidents", "source_file": "/x/ops.md"},
+                "d-2",
+            ),
+        ],
+    )
+    _set_drawer_created_at(
+        str(tmp_path),
+        {
+            1: "2026-01-01 00:00:00",
+            2: "2026-02-01 00:00:00",
+        },
+    )
+
+    out = _bm25_only_via_sqlite("a", str(tmp_path), wing="project", max_candidates=1)
+
+    assert out["total_before_filter"] == 1
+    assert len(out["results"]) == 1
+    assert out["results"][0]["wing"] == "project"
+
+
+def test_bm25_fallback_returns_empty_when_filtered_wing_has_no_candidates(tmp_path):
+    seg = "seg-bm25-empty-filter"
+    _seed_chroma_db(str(tmp_path), sqlite_count=0, segment_id=seg)
+    _seed_drawers(
+        str(tmp_path),
+        seg,
+        [
+            (
+                "shared token outside target wing",
+                {"wing": "ops", "room": "incidents", "source_file": "/x/ops.md"},
+                "d-1",
+            ),
+        ],
+    )
+
+    out = _bm25_only_via_sqlite("shared token", str(tmp_path), wing="project", max_candidates=1)
+
+    assert out["total_before_filter"] == 0
+    assert out["results"] == []
+
+
 def test_bm25_fallback_no_palace(tmp_path):
     out = _bm25_only_via_sqlite("anything", str(tmp_path))
     assert "error" in out

From 8472d553a34b83cb3ba6d50142640a8d2744d80c Mon Sep 17 00:00:00 2001
From: lcatlett <lindsey.catlett@gmail.com>
Date: Fri, 1 May 2026 19:26:43 -0400
Subject: [PATCH 14/60] fix(hooks): treat absent ~/.mempalace as auto-save off

When the user removes ~/.mempalace/ (a strong "do not auto-capture"
signal), the next hook fire would silently recreate the entire dir
hierarchy and ingest existing transcripts:

1. _log() at hooks_cli.py:148 unconditionally calls
   STATE_DIR.mkdir(parents=True, exist_ok=True), so the act of
   writing the hook log line recreated ~/.mempalace/hook_state/
2. With no config file present, hook_stop_auto_save and
   hook_precompact_auto_save defaulted to True (no override to read)
3. The full save path then ran, materializing palace/, wal/,
   knowledge_graph.sqlite3, and N drawers from existing transcripts
   in ~/.claude/projects/*.jsonl

All four entry points (hook_stop, hook_precompact, hook_session_start,
and _log itself) now check a new PALACE_ROOT = Path.home() / ".mempalace"
constant first and short-circuit (returning {} on stdout, never logging)
when the dir is absent. The user-removable directory is now a kill-switch.

Five unit tests in tests/test_hooks_cli.py cover: hook_stop /
hook_precompact / hook_session_start do not create the dir when absent;
_log() does not create it when absent; existing dir proceeds normally
(regression).

Caught in the wild on a downstream fork: ~146 drawers materialized in
under a second after a deliberate `rm -rf ~/.mempalace/`, into a planning
session that was explicitly not meant to be captured.
---
 CHANGELOG.md            |  1 +
 mempalace/hooks_cli.py  | 23 +++++++++++++
 tests/test_hooks_cli.py | 76 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 100 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 41dfaac..7a75842 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,6 +20,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 ### Bug Fixes
 
 - **MCP server `tool_diary_write` SIGSEGV when default EF provider differs.** `mcp_server._get_collection` bypassed `ChromaBackend.get_collection` and called `client.get_collection` / `client.create_collection` without `embedding_function=`. ChromaDB 1.x persists the EF *identity* (its `name()`) with the collection but not the EF *instance/configuration*, so the MCP server's reopen silently bound chromadb's built-in `DefaultEmbeddingFunction` — its `name()` matches `mempalace.embedding`'s spoofed `"default"` so the identity check passes, but its provider list is chromadb's default rather than the user's resolved device. The miner / Stop hook ingest path routes through the backend helper and binds the configured EF instead. On bleeding-edge interpreters (python 3.14 + chromadb 1.5.x on Apple Silicon) the default provider selection could SIGSEGV the host process on first `col.add()`, killing the MCP stdio server and leaving every subsequent tool call returning `Connection closed` until Claude Code was relaunched. `_get_collection` now reuses `ChromaBackend._resolve_embedding_function()` on the reopen branches that actually open a collection (warm-cache reads stay zero-cost), matching the miner/backend path. (#1299, follow-up to #1262 / #1289)
+- **Hooks no longer recreate `~/.mempalace/` after the user removes it.** When `~/.mempalace/` is deleted (a strong "do not auto-capture" signal), the next `Stop`, `PreCompact`, or `SessionStart` hook would silently rebuild the dir hierarchy and ingest existing transcripts: `_log()` called `STATE_DIR.mkdir(parents=True, exist_ok=True)` unconditionally, so the very act of writing `[HH:MM] SESSION START …` recreated `~/.mempalace/hook_state/`; subsequent calls in the save path then materialized `palace/`, `wal/`, `knowledge_graph.sqlite3`, and N drawers from `~/.claude/projects/*.jsonl`. All four entry points (`hook_stop`, `hook_precompact`, `hook_session_start`, and `_log` itself) now check a new module-level `PALACE_ROOT = Path.home() / ".mempalace"` constant first and short-circuit (returning `{}` on stdout, never logging) when the directory is absent. The user-removable directory becomes a kill-switch — `rm -rf ~/.mempalace` is now a stable state. Net: 23 lines added in `mempalace/hooks_cli.py`, 5 unit tests in `tests/test_hooks_cli.py`. (#1305)
 - **Cross-wing topic tunnels for hyphenated dir names.** `mempalace init` recorded the `topics_by_wing` registry key under the raw directory name (e.g. `mempalace-public`), while `mempalace.yaml`'s `wing` field used the lower-cased + separator-collapsed slug (`mempalace_public`). At mine time the miner read the slug from the yaml and missed the registry, so `_compute_topic_tunnels_for_wing` returned `0` silently. Real-world: any project whose folder contained a hyphen or space lost every topic tunnel. Now both call sites route through a shared `normalize_wing_name()` in `config.py`. (#1194, follow-up to #1180)
 - **CLI `mempalace search` retrieval quality.** The CLI was using pure ChromaDB cosine distance with no BM25 rerank, so drawers containing every query term but embedding as noise (directory listings, diff output, shell logs) scored `Match: 0.0` alongside genuinely irrelevant results with no way to tell them apart. Wired the CLI through the same `_hybrid_rank` the `mempalace_search` MCP tool already used, and surfaced both `cosine=` and `bm25=` scores in the output so users see which component of the match is firing. MCP search was unaffected; this fixes the human-facing CLI parity gap.
 - **Legacy-palace distance-metric warning.** CLI search now detects palaces created before `hnsw:space=cosine` was consistently set and prints a one-line notice pointing at `mempalace repair`. Without the warning such palaces silently used L2 distance, under which the similarity display floored every result to `Match: 0.0`. New palaces mined today already set cosine correctly and now have invariant tests pinning that behavior so future refactors can't silently regress it. (#1179)
diff --git a/mempalace/hooks_cli.py b/mempalace/hooks_cli.py
index d4f8317..ca8fb60 100644
--- a/mempalace/hooks_cli.py
+++ b/mempalace/hooks_cli.py
@@ -16,6 +16,18 @@ from pathlib import Path
 
 SAVE_INTERVAL = 15
 STATE_DIR = Path.home() / ".mempalace" / "hook_state"
+PALACE_ROOT = Path.home() / ".mempalace"
+
+
+def _palace_root_exists() -> bool:
+    """User-removable kill-switch.
+
+    If ~/.mempalace/ does not exist, the user has explicitly cleared it.
+    All hook side effects (logging, state dir creation, mining, ingestion)
+    must respect this and short-circuit BEFORE touching disk — including
+    before logging the short-circuit itself.
+    """
+    return PALACE_ROOT.exists()
 
 
 def _mempalace_python() -> str:
@@ -142,6 +154,8 @@ _state_dir_initialized = False
 
 def _log(message: str):
     """Append to hook state log file."""
+    if not PALACE_ROOT.exists():
+        return  # User removed the palace; do not recreate by logging
     global _state_dir_initialized
     try:
         if not _state_dir_initialized:
@@ -550,6 +564,9 @@ def _wing_from_transcript_path(transcript_path: str) -> str:
 
 def hook_stop(data: dict, harness: str):
     """Stop hook: block every N messages for auto-save."""
+    if not _palace_root_exists():
+        _output({})
+        return
     parsed = _parse_harness_input(data, harness)
     session_id = parsed["session_id"]
     stop_hook_active = parsed["stop_hook_active"]
@@ -659,6 +676,9 @@ def hook_stop(data: dict, harness: str):
 
 def hook_session_start(data: dict, harness: str):
     """Session start hook: initialize session tracking state."""
+    if not _palace_root_exists():
+        _output({})
+        return
     parsed = _parse_harness_input(data, harness)
     session_id = parsed["session_id"]
 
@@ -673,6 +693,9 @@ def hook_session_start(data: dict, harness: str):
 
 def hook_precompact(data: dict, harness: str):
     """Precompact hook: mine transcript synchronously, then allow compaction."""
+    if not _palace_root_exists():
+        _output({})
+        return
     parsed = _parse_harness_input(data, harness)
     session_id = parsed["session_id"]
     transcript_path = parsed["transcript_path"]
diff --git a/tests/test_hooks_cli.py b/tests/test_hooks_cli.py
index 1ceb530..941288d 100644
--- a/tests/test_hooks_cli.py
+++ b/tests/test_hooks_cli.py
@@ -959,3 +959,79 @@ def test_stop_hook_rejects_injected_stop_hook_active(tmp_path):
     # The injected value is not "true"/"1"/"yes", so the hook should NOT pass through.
     # Save must have been attempted.
     assert mock_save.called
+
+
+# --- Absent palace root: hooks must not recreate ~/.mempalace ---
+#
+# When the user removes ~/.mempalace (e.g. `rm -rf`), that is the strongest
+# possible "do not auto-capture" signal. Hooks must short-circuit BEFORE
+# touching disk — including before the log-line that previously triggered
+# STATE_DIR.mkdir() on its own.
+
+
+import mempalace.hooks_cli as hooks_cli_mod
+
+
+def _redirect_palace_root(monkeypatch, tmp_path):
+    """Point PALACE_ROOT and STATE_DIR at a tmp location that does NOT exist."""
+    fake_root = tmp_path / "absent-mempalace"
+    monkeypatch.setattr(hooks_cli_mod, "PALACE_ROOT", fake_root)
+    monkeypatch.setattr(hooks_cli_mod, "STATE_DIR", fake_root / "hook_state")
+    monkeypatch.setattr(hooks_cli_mod, "_state_dir_initialized", False)
+    return fake_root
+
+
+def test_hook_stop_does_not_create_palace_dir_when_absent(tmp_path, monkeypatch):
+    fake_root = _redirect_palace_root(monkeypatch, tmp_path)
+    transcript = tmp_path / "t.jsonl"
+    transcript.write_text("")
+    buf = io.StringIO()
+    with contextlib.redirect_stdout(buf):
+        hook_stop(
+            {"session_id": "absent", "transcript_path": str(transcript), "stop_hook_active": False},
+            "claude-code",
+        )
+    assert json.loads(buf.getvalue() or "{}") == {}
+    assert not fake_root.exists()
+
+
+def test_hook_precompact_does_not_create_palace_dir_when_absent(tmp_path, monkeypatch):
+    fake_root = _redirect_palace_root(monkeypatch, tmp_path)
+    transcript = tmp_path / "t.jsonl"
+    transcript.write_text("")
+    buf = io.StringIO()
+    with contextlib.redirect_stdout(buf):
+        hook_precompact(
+            {"session_id": "absent", "transcript_path": str(transcript)},
+            "claude-code",
+        )
+    assert json.loads(buf.getvalue() or "{}") == {}
+    assert not fake_root.exists()
+
+
+def test_hook_session_start_does_not_create_palace_dir_when_absent(tmp_path, monkeypatch):
+    fake_root = _redirect_palace_root(monkeypatch, tmp_path)
+    buf = io.StringIO()
+    with contextlib.redirect_stdout(buf):
+        hook_session_start({"session_id": "absent"}, "claude-code")
+    assert json.loads(buf.getvalue() or "{}") == {}
+    assert not fake_root.exists()
+
+
+def test_log_does_not_create_palace_dir_when_absent(tmp_path, monkeypatch):
+    fake_root = _redirect_palace_root(monkeypatch, tmp_path)
+    _log("test message")
+    assert not fake_root.exists()
+
+
+def test_existing_dir_proceeds_normally(tmp_path, monkeypatch):
+    """Regression: when PALACE_ROOT exists, hooks must proceed (no short-circuit)."""
+    fake_root = tmp_path / "present-mempalace"
+    fake_root.mkdir()
+    monkeypatch.setattr(hooks_cli_mod, "PALACE_ROOT", fake_root)
+    monkeypatch.setattr(hooks_cli_mod, "STATE_DIR", fake_root / "hook_state")
+    monkeypatch.setattr(hooks_cli_mod, "_state_dir_initialized", False)
+    _log("test message")
+    # _log should have created the state dir under the existing palace root
+    assert (fake_root / "hook_state").exists()
+    assert (fake_root / "hook_state" / "hook.log").is_file()

From 2d50b214d4c2c05112a3258faa8bef39cf6c07c0 Mon Sep 17 00:00:00 2001
From: lcatlett <lindsey.catlett@gmail.com>
Date: Sat, 2 May 2026 20:37:47 -0400
Subject: [PATCH 15/60] fix(hooks): use is_dir() for palace root check (review
 feedback)

Both @igorls and the Qodo bot flagged that `_palace_root_exists()` used
`Path.exists()`, which returns True for a regular file. A stray file at
`~/.mempalace` would let the kill-switch be bypassed and crash later in
`STATE_DIR.mkdir()` with NotADirectoryError.

Switched to `Path.is_dir()`. Also fold `_log()`'s inline check through
`_palace_root_exists()` so both kill-switch sites use the same predicate.

New test pins the behavior: a regular file at the palace root path is
treated as absent (hook short-circuits, _log does not crash, the stray
file is left untouched).
---
 mempalace/hooks_cli.py  |  9 +++++++--
 tests/test_hooks_cli.py | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/mempalace/hooks_cli.py b/mempalace/hooks_cli.py
index ca8fb60..8498103 100644
--- a/mempalace/hooks_cli.py
+++ b/mempalace/hooks_cli.py
@@ -26,8 +26,13 @@ def _palace_root_exists() -> bool:
     All hook side effects (logging, state dir creation, mining, ingestion)
     must respect this and short-circuit BEFORE touching disk — including
     before logging the short-circuit itself.
+
+    Uses ``is_dir()`` rather than ``exists()`` so a stray regular file at
+    ``~/.mempalace`` (or a broken symlink) is treated as absent — otherwise
+    the kill-switch would be bypassed and ``STATE_DIR.mkdir()`` would later
+    crash on ``NotADirectoryError``.
     """
-    return PALACE_ROOT.exists()
+    return PALACE_ROOT.is_dir()
 
 
 def _mempalace_python() -> str:
@@ -154,7 +159,7 @@ _state_dir_initialized = False
 
 def _log(message: str):
     """Append to hook state log file."""
-    if not PALACE_ROOT.exists():
+    if not _palace_root_exists():
         return  # User removed the palace; do not recreate by logging
     global _state_dir_initialized
     try:
diff --git a/tests/test_hooks_cli.py b/tests/test_hooks_cli.py
index 941288d..487acf7 100644
--- a/tests/test_hooks_cli.py
+++ b/tests/test_hooks_cli.py
@@ -1035,3 +1035,35 @@ def test_existing_dir_proceeds_normally(tmp_path, monkeypatch):
     # _log should have created the state dir under the existing palace root
     assert (fake_root / "hook_state").exists()
     assert (fake_root / "hook_state" / "hook.log").is_file()
+
+
+def test_regular_file_at_palace_root_treated_as_absent(tmp_path, monkeypatch):
+    """A regular file at ~/.mempalace must be treated the same as absent.
+
+    ``Path.exists()`` returns True for a regular file, which would let the
+    kill-switch be bypassed and crash later when ``STATE_DIR.mkdir()`` runs
+    on ``NotADirectoryError``. ``_palace_root_exists()`` must use
+    ``is_dir()`` so a stray file (or broken symlink) short-circuits cleanly.
+    """
+    fake_root = tmp_path / "file-not-dir"
+    fake_root.write_text("oops, this is a file not a directory")
+    monkeypatch.setattr(hooks_cli_mod, "PALACE_ROOT", fake_root)
+    monkeypatch.setattr(hooks_cli_mod, "STATE_DIR", fake_root / "hook_state")
+    monkeypatch.setattr(hooks_cli_mod, "_state_dir_initialized", False)
+
+    # _palace_root_exists() is the source of truth — it must return False.
+    assert hooks_cli_mod._palace_root_exists() is False
+
+    # Hooks must short-circuit (return {} on stdout) and not touch disk.
+    buf = io.StringIO()
+    with contextlib.redirect_stdout(buf):
+        hook_session_start({"session_id": "file-at-root"}, "claude-code")
+    assert json.loads(buf.getvalue() or "{}") == {}
+
+    # _log must also short-circuit — it must NOT try to mkdir a path under a
+    # regular file (which would raise NotADirectoryError).
+    _log("test message")  # would raise if not short-circuited
+
+    # The stray file is left untouched; we never try to convert it.
+    assert fake_root.is_file()
+    assert fake_root.read_text() == "oops, this is a file not a directory"

From beac5d99547121ca04ff069dbeaac7619fc741a8 Mon Sep 17 00:00:00 2001
From: mvalentsev <michael@valentsev.ru>
Date: Fri, 24 Apr 2026 12:46:31 +0500
Subject: [PATCH 16/60] refactor(mcp): replace eager _kg with lazy per-path
 cache (#1136)

Swap the module-level KnowledgeGraph singleton for a lazy, per-path
cache keyed by the resolved sqlite path. Import no longer creates a
sqlite file as a side effect, and MCP servers started with --palace
now route KG calls to the correct tenant when MEMPALACE_PALACE_PATH
changes between calls, matching the per-call behavior of _get_client()
on the ChromaDB side.

Default-path behavior is preserved: without --palace at startup, KG
stays on DEFAULT_KG_PATH regardless of env var. The "no --palace but
env var set" case is #540's scope and is not changed here.
---
 mempalace/mcp_server.py | 52 +++++++++++++++++++++++++++++++----------
 1 file changed, 40 insertions(+), 12 deletions(-)

diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index e3e89c6..eae048b 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -46,6 +46,7 @@ import argparse  # noqa: E402  (deferred until after stdio protection above)
 import json  # noqa: E402
 import logging  # noqa: E402
 import hashlib  # noqa: E402
+import threading  # noqa: E402
 import time  # noqa: E402
 from datetime import date, datetime  # noqa: E402
 from pathlib import Path  # noqa: E402
@@ -78,7 +79,7 @@ from .palace_graph import (  # noqa: E402
     follow_tunnels,
 )
 
-from .knowledge_graph import KnowledgeGraph  # noqa: E402
+from .knowledge_graph import KnowledgeGraph, DEFAULT_KG_PATH  # noqa: E402
 
 logging.basicConfig(level=logging.INFO, format="%(message)s", stream=sys.stderr)
 logger = logging.getLogger("mempalace_mcp")
@@ -103,12 +104,39 @@ if _args.palace:
     os.environ["MEMPALACE_PALACE_PATH"] = os.path.abspath(_args.palace)
 
 _config = MempalaceConfig()
-# Only override KG path when --palace is explicitly provided; otherwise use
-# KnowledgeGraph's default (~/.mempalace/knowledge_graph.sqlite3).
-if _args.palace:
-    _kg = KnowledgeGraph(db_path=os.path.join(_config.palace_path, "knowledge_graph.sqlite3"))
-else:
-    _kg = KnowledgeGraph()
+
+# Lazy per-path KG cache. Import no longer creates the sqlite file as a side
+# effect (see issue #1136). The path is resolved on each tool call so that a
+# multi-tenant host rotating MEMPALACE_PALACE_PATH between calls routes each
+# call to the correct KG file, matching the per-call behavior of _get_client()
+# on the ChromaDB side.
+_kg_by_path: dict[str, KnowledgeGraph] = {}
+_kg_cache_lock = threading.Lock()
+
+# Whether --palace was given at startup. Controls default-path resolution:
+# with the flag, KG follows _config.palace_path per call; without it, KG stays
+# on DEFAULT_KG_PATH regardless of env var (issue #540's territory, out of
+# scope here).
+_palace_flag_given: bool = bool(_args.palace)
+
+
+def _resolve_kg_path() -> str:
+    if _palace_flag_given:
+        return os.path.join(_config.palace_path, "knowledge_graph.sqlite3")
+    return DEFAULT_KG_PATH
+
+
+def _get_kg() -> KnowledgeGraph:
+    path = os.path.abspath(_resolve_kg_path())
+    kg = _kg_by_path.get(path)
+    if kg is not None:
+        return kg
+    with _kg_cache_lock:
+        kg = _kg_by_path.get(path)
+        if kg is None:
+            kg = KnowledgeGraph(db_path=path)
+            _kg_by_path[path] = kg
+    return kg
 
 
 _client_cache = None
@@ -1063,7 +1091,7 @@ def tool_kg_query(entity: str, as_of: str = None, direction: str = "both"):
         return {"error": str(e)}
     if direction not in ("outgoing", "incoming", "both"):
         return {"error": "direction must be 'outgoing', 'incoming', or 'both'"}
-    results = _kg.query_entity(entity, as_of=as_of, direction=direction)
+    results = _get_kg().query_entity(entity, as_of=as_of, direction=direction)
     return {"entity": entity, "as_of": as_of, "facts": results, "count": len(results)}
 
 
@@ -1108,7 +1136,7 @@ def tool_kg_add(
             "source_drawer_id": source_drawer_id,
         },
     )
-    triple_id = _kg.add_triple(
+    triple_id = _get_kg().add_triple(
         subject,
         predicate,
         object,
@@ -1147,7 +1175,7 @@ def tool_kg_invalidate(subject: str, predicate: str, object: str, ended: str = N
             "ended": resolved_ended,
         },
     )
-    _kg.invalidate(subject, predicate, object, ended=resolved_ended)
+    _get_kg().invalidate(subject, predicate, object, ended=resolved_ended)
     return {
         "success": True,
         "fact": f"{subject} → {predicate} → {object}",
@@ -1162,13 +1190,13 @@ def tool_kg_timeline(entity: str = None):
             entity = sanitize_kg_value(entity, "entity")
         except ValueError as e:
             return {"error": str(e)}
-    results = _kg.timeline(entity)
+    results = _get_kg().timeline(entity)
     return {"entity": entity or "all", "timeline": results, "count": len(results)}
 
 
 def tool_kg_stats():
     """Knowledge graph overview: entities, triples, relationship types."""
-    return _kg.stats()
+    return _get_kg().stats()
 
 
 # ==================== AGENT DIARY ====================

From 9e730098e97c173476b9949a5eca2253e56a08a5 Mon Sep 17 00:00:00 2001
From: mvalentsev <michael@valentsev.ru>
Date: Fri, 24 Apr 2026 12:48:00 +0500
Subject: [PATCH 17/60] test(mcp): migrate _kg monkeypatches to _get_kg (#1136)

Direct module-attribute patching of _kg is obsolete after the lazy
cache refactor. Switch test helpers to patch _get_kg instead so the
fixture KG replaces the factory rather than a now-missing singleton.

- tests/test_mcp_server.py: _patch_mcp_server helper
- tests/benchmarks/test_mcp_bench.py: _patch_mcp_config helper
- tests/benchmarks/test_memory_profile.py: inline patch in test_tool_status_repeated_calls
---
 tests/benchmarks/test_mcp_bench.py      | 3 ++-
 tests/benchmarks/test_memory_profile.py | 3 ++-
 tests/test_mcp_server.py                | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/benchmarks/test_mcp_bench.py b/tests/benchmarks/test_mcp_bench.py
index 4e8330b..42e73ec 100644
--- a/tests/benchmarks/test_mcp_bench.py
+++ b/tests/benchmarks/test_mcp_bench.py
@@ -40,8 +40,9 @@ def _patch_mcp_config(monkeypatch, palace_path, tmp_path):
 
     import mempalace.mcp_server as mcp_mod
 
+    kg = KnowledgeGraph(db_path=str(tmp_path / "kg.sqlite3"))
     monkeypatch.setattr(mcp_mod, "_config", cfg)
-    monkeypatch.setattr(mcp_mod, "_kg", KnowledgeGraph(db_path=str(tmp_path / "kg.sqlite3")))
+    monkeypatch.setattr(mcp_mod, "_get_kg", lambda: kg)
 
 
 def _get_rss_mb():
diff --git a/tests/benchmarks/test_memory_profile.py b/tests/benchmarks/test_memory_profile.py
index b299b2d..047bfaa 100644
--- a/tests/benchmarks/test_memory_profile.py
+++ b/tests/benchmarks/test_memory_profile.py
@@ -84,8 +84,9 @@ class TestToolStatusMemoryProfile:
 
         cfg = MempalaceConfig(config_dir=str(tmp_path / "cfg"))
         monkeypatch.setattr(cfg, "_file_config", {"palace_path": palace_path})
+        kg = KnowledgeGraph(db_path=str(tmp_path / "kg.sqlite3"))
         monkeypatch.setattr(mcp_mod, "_config", cfg)
-        monkeypatch.setattr(mcp_mod, "_kg", KnowledgeGraph(db_path=str(tmp_path / "kg.sqlite3")))
+        monkeypatch.setattr(mcp_mod, "_get_kg", lambda: kg)
 
         from mempalace.mcp_server import tool_status
 
diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index ec9562b..2ab2eb8 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -18,7 +18,7 @@ def _patch_mcp_server(monkeypatch, config, kg):
     from mempalace import mcp_server
 
     monkeypatch.setattr(mcp_server, "_config", config)
-    monkeypatch.setattr(mcp_server, "_kg", kg)
+    monkeypatch.setattr(mcp_server, "_get_kg", lambda: kg)
 
 
 def _get_collection(palace_path, create=False):

From c69a622a18db0a082bce8729d35c1d6a8d98efdf Mon Sep 17 00:00:00 2001
From: mvalentsev <michael@valentsev.ru>
Date: Fri, 24 Apr 2026 12:53:19 +0500
Subject: [PATCH 18/60] test(mcp): add multi-tenant and lazy-init tests for KG
 (#1136)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

TestKGLazyCache covers the scenarios behind the lazy per-path refactor:

- test_lazy_init_no_import_side_effect: a fresh subprocess import does
  not create ~/.mempalace/knowledge_graph.sqlite3 (what closed PR #167
  was aiming at).
- test_get_kg_returns_same_instance: two _get_kg() calls under the same
  resolved path return the same object, cache has one entry.
- test_get_kg_different_paths_different_instances: rotating env var
  produces distinct KGs.
- test_multi_tenant_env_switch: the exact scenario from #1136 — write
  under path A, query under path B returns empty, switching back to A
  sees the fact.
- test_cache_thread_safe: 16 threads racing _get_kg() end up with one
  shared instance and one cache entry.
---
 tests/test_mcp_server.py | 112 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 112 insertions(+)

diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index 2ab2eb8..86d5878 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -8,6 +8,7 @@ via monkeypatch to avoid touching real data.
 
 from datetime import datetime
 import json
+import os
 import sys
 
 import pytest
@@ -1143,3 +1144,114 @@ class TestCacheInvalidation:
         for kwargs in captured["get"]:
             assert "embedding_function" in kwargs
             assert kwargs["embedding_function"] is not None
+
+
+class TestKGLazyCache:
+    """Lazy per-path KnowledgeGraph cache (issue #1136)."""
+
+    def test_lazy_init_no_import_side_effect(self, tmp_path):
+        """Importing mcp_server must not create knowledge_graph.sqlite3.
+
+        Runs in a fresh subprocess with HOME pointed at tmp_path so the
+        assertion targets a clean filesystem, independent of conftest's
+        session-level HOME patch.
+        """
+        import subprocess
+        import sys
+
+        kg_file = tmp_path / ".mempalace" / "knowledge_graph.sqlite3"
+        result = subprocess.run(
+            [sys.executable, "-c", "import mempalace.mcp_server"],
+            env={
+                "HOME": str(tmp_path),
+                "USERPROFILE": str(tmp_path),
+                "PATH": os.environ.get("PATH", ""),
+                "PYTHONPATH": os.environ.get("PYTHONPATH", ""),
+            },
+            capture_output=True,
+            text=True,
+            timeout=30,
+        )
+        assert result.returncode == 0, f"import failed: {result.stderr}"
+        assert not kg_file.exists(), f"import created sqlite file at {kg_file} as a side effect"
+
+    def test_get_kg_returns_same_instance(self, tmp_path, monkeypatch):
+        """Two calls with the same resolved path return the same KG."""
+        from mempalace import mcp_server
+
+        monkeypatch.setattr(mcp_server, "_kg_by_path", {})
+        monkeypatch.setattr(mcp_server, "_palace_flag_given", True)
+        monkeypatch.setenv("MEMPALACE_PALACE_PATH", str(tmp_path))
+
+        kg1 = mcp_server._get_kg()
+        kg2 = mcp_server._get_kg()
+        assert kg1 is kg2
+        assert len(mcp_server._kg_by_path) == 1
+
+    def test_get_kg_different_paths_different_instances(self, tmp_path, monkeypatch):
+        """Different palace paths map to different KG instances."""
+        from mempalace import mcp_server
+
+        tmp_a = tmp_path / "a"
+        tmp_b = tmp_path / "b"
+        tmp_a.mkdir()
+        tmp_b.mkdir()
+
+        monkeypatch.setattr(mcp_server, "_kg_by_path", {})
+        monkeypatch.setattr(mcp_server, "_palace_flag_given", True)
+
+        monkeypatch.setenv("MEMPALACE_PALACE_PATH", str(tmp_a))
+        kg_a = mcp_server._get_kg()
+        monkeypatch.setenv("MEMPALACE_PALACE_PATH", str(tmp_b))
+        kg_b = mcp_server._get_kg()
+
+        assert kg_a is not kg_b
+        assert len(mcp_server._kg_by_path) == 2
+
+    def test_multi_tenant_env_switch(self, tmp_path, monkeypatch):
+        """The issue #1136 acceptance scenario.
+
+        Rotating MEMPALACE_PALACE_PATH between MCP tool calls must route
+        each call to the correct tenant's KG sqlite file.
+        """
+        from mempalace import mcp_server
+
+        tmp_a = tmp_path / "tenant_a"
+        tmp_b = tmp_path / "tenant_b"
+        tmp_a.mkdir()
+        tmp_b.mkdir()
+
+        monkeypatch.setattr(mcp_server, "_kg_by_path", {})
+        monkeypatch.setattr(mcp_server, "_palace_flag_given", True)
+
+        monkeypatch.setenv("MEMPALACE_PALACE_PATH", str(tmp_a))
+        add_result = mcp_server.tool_kg_add(
+            subject="alice_secret",
+            predicate="owns",
+            object="repo_a",
+        )
+        assert add_result.get("success") is True, add_result
+
+        monkeypatch.setenv("MEMPALACE_PALACE_PATH", str(tmp_b))
+        query_b = mcp_server.tool_kg_query(entity="alice_secret")
+        assert query_b.get("count", 0) == 0, f"tenant B leaked tenant A's fact: {query_b}"
+
+        monkeypatch.setenv("MEMPALACE_PALACE_PATH", str(tmp_a))
+        query_a = mcp_server.tool_kg_query(entity="alice_secret")
+        assert query_a.get("count", 0) >= 1, f"tenant A lost its own fact: {query_a}"
+
+    def test_cache_thread_safe(self, tmp_path, monkeypatch):
+        """Concurrent _get_kg() for the same path yields one instance."""
+        import concurrent.futures
+        from mempalace import mcp_server
+
+        monkeypatch.setattr(mcp_server, "_kg_by_path", {})
+        monkeypatch.setattr(mcp_server, "_palace_flag_given", True)
+        monkeypatch.setenv("MEMPALACE_PALACE_PATH", str(tmp_path))
+
+        with concurrent.futures.ThreadPoolExecutor(max_workers=16) as pool:
+            results = list(pool.map(lambda _: mcp_server._get_kg(), range(16)))
+
+        ids = {id(kg) for kg in results}
+        assert len(ids) == 1, f"expected 1 unique instance, got {len(ids)}"
+        assert len(mcp_server._kg_by_path) == 1

From 84f9726a39e65be3f8afba0d14d4183999520c0d Mon Sep 17 00:00:00 2001
From: mvalentsev <michael@valentsev.ru>
Date: Fri, 24 Apr 2026 13:03:12 +0500
Subject: [PATCH 19/60] test(mcp): fix Windows subprocess env in KG lazy-init
 test

Passing a stripped env dict without SYSTEMROOT/WINDIR breaks Python
bootstrap on Windows (_Py_HashRandomization_Init). Inherit the parent
env and strip MEMPAL* vars instead, then override HOME/USERPROFILE to
the tmp dir.
---
 tests/test_mcp_server.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index 86d5878..638ac15 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -1160,14 +1160,12 @@ class TestKGLazyCache:
         import sys
 
         kg_file = tmp_path / ".mempalace" / "knowledge_graph.sqlite3"
+        env = {k: v for k, v in os.environ.items() if not k.startswith("MEMPAL")}
+        env["HOME"] = str(tmp_path)
+        env["USERPROFILE"] = str(tmp_path)
         result = subprocess.run(
             [sys.executable, "-c", "import mempalace.mcp_server"],
-            env={
-                "HOME": str(tmp_path),
-                "USERPROFILE": str(tmp_path),
-                "PATH": os.environ.get("PATH", ""),
-                "PYTHONPATH": os.environ.get("PYTHONPATH", ""),
-            },
+            env=env,
             capture_output=True,
             text=True,
             timeout=30,

From 19f8a4ff682fe5fdad71c6c5d31a45fac6ce2310 Mon Sep 17 00:00:00 2001
From: mvalentsev <michael@valentsev.ru>
Date: Fri, 24 Apr 2026 13:07:34 +0500
Subject: [PATCH 20/60] style(mcp): drop issue-tracker comments from KG cache
 block

Inline comments referencing #1136 and #540 add no information the
identifiers do not already convey. PR description carries the context;
code stays quiet.
---
 mempalace/mcp_server.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index eae048b..5f74df6 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -105,18 +105,8 @@ if _args.palace:
 
 _config = MempalaceConfig()
 
-# Lazy per-path KG cache. Import no longer creates the sqlite file as a side
-# effect (see issue #1136). The path is resolved on each tool call so that a
-# multi-tenant host rotating MEMPALACE_PALACE_PATH between calls routes each
-# call to the correct KG file, matching the per-call behavior of _get_client()
-# on the ChromaDB side.
 _kg_by_path: dict[str, KnowledgeGraph] = {}
 _kg_cache_lock = threading.Lock()
-
-# Whether --palace was given at startup. Controls default-path resolution:
-# with the flag, KG follows _config.palace_path per call; without it, KG stays
-# on DEFAULT_KG_PATH regardless of env var (issue #540's territory, out of
-# scope here).
 _palace_flag_given: bool = bool(_args.palace)
 
 
From 0a626580513c09bd3a9f7719e53dc3e4810463f5 Mon Sep 17 00:00:00 2001
From: mvalentsev <michael@valentsev.ru>
Date: Sat, 2 May 2026 18:00:36 +0500
Subject: [PATCH 21/60] fix(mcp): drain KG cache on tool_reconnect

tool_reconnect cleared ChromaDB caches but left _kg_by_path entries
intact. After an external replacement of knowledge_graph.sqlite3 the
server kept serving the old open sqlite3.Connection, returning stale
results.

Now iterate _kg_by_path under _kg_cache_lock, call close() best-effort,
and clear the dict so the next tool call reopens the KG from disk.
Two new tests in TestKGLazyCache verify cache invalidation and that a
failing close() does not block the clear.
---
 mempalace/mcp_server.py  | 14 ++++++++++++--
 tests/test_mcp_server.py | 42 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index 5f74df6..3b3b4e2 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -1418,10 +1418,11 @@ def tool_memories_filed_away():
 
 
 def tool_reconnect():
-    """Force the MCP server to drop the cached ChromaDB collection and reconnect.
+    """Force the MCP server to drop cached ChromaDB + KnowledgeGraph state.
 
     Use after external scripts or CLI commands modify the palace database
-    directly, which can leave the in-memory HNSW index stale.
+    or replace ``knowledge_graph.sqlite3`` directly, which can leave the
+    in-memory HNSW index stale or pin a closed-on-disk SQLite connection.
     """
     global \
         _client_cache, \
@@ -1439,6 +1440,15 @@ def tool_reconnect():
     # still applies after the reconnect.
     _vector_disabled = False
     _vector_disabled_reason = ""
+    # Drain the per-path KnowledgeGraph cache so a replaced sqlite file is
+    # reopened on the next tool call rather than served from a stale handle.
+    with _kg_cache_lock:
+        for kg in _kg_by_path.values():
+            try:
+                kg.close()
+            except Exception:
+                pass
+        _kg_by_path.clear()
     try:
         col = _get_collection()
         if col is None:
diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index 638ac15..092b707 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -1253,3 +1253,45 @@ class TestKGLazyCache:
         ids = {id(kg) for kg in results}
         assert len(ids) == 1, f"expected 1 unique instance, got {len(ids)}"
         assert len(mcp_server._kg_by_path) == 1
+
+    def test_tool_reconnect_drains_kg_cache(self, monkeypatch):
+        """``tool_reconnect`` must close cached KG instances and clear the dict.
+
+        Without this, an external replacement of ``knowledge_graph.sqlite3``
+        leaves the server pinned to a stale ``sqlite3.Connection``.
+        """
+        from mempalace import mcp_server
+
+        class _FakeKG:
+            def __init__(self):
+                self.closed = False
+
+            def close(self):
+                self.closed = True
+
+        fake_a = _FakeKG()
+        fake_b = _FakeKG()
+        monkeypatch.setattr(mcp_server, "_kg_by_path", {"/a": fake_a, "/b": fake_b})
+        # Bypass real ChromaDB so the test isolates KG-cache behaviour.
+        monkeypatch.setattr(mcp_server, "_get_collection", lambda: None)
+
+        mcp_server.tool_reconnect()
+
+        assert fake_a.closed is True
+        assert fake_b.closed is True
+        assert mcp_server._kg_by_path == {}
+
+    def test_tool_reconnect_swallows_kg_close_errors(self, monkeypatch):
+        """A failing ``close()`` on one cached KG must not block cache clearing."""
+        from mempalace import mcp_server
+
+        class _BoomKG:
+            def close(self):
+                raise RuntimeError("boom")
+
+        monkeypatch.setattr(mcp_server, "_kg_by_path", {"/a": _BoomKG()})
+        monkeypatch.setattr(mcp_server, "_get_collection", lambda: None)
+
+        mcp_server.tool_reconnect()
+
+        assert mcp_server._kg_by_path == {}

From 45df1a265748200f709a1d737bfb8003061cbb32 Mon Sep 17 00:00:00 2001
From: mvalentsev <michael@valentsev.ru>
Date: Wed, 22 Apr 2026 15:59:43 +0500
Subject: [PATCH 22/60] fix(backends/chroma): release SQLite file lock on
 close_palace/close (#1067)

ChromaBackend.close_palace() and close() evicted cached PersistentClients
from self._clients without calling client.close(), so chromadb 1.5.x kept
the rust-side SQLite file lock until GC. Reopening the same palace path
after shutil.rmtree + re-create within one process then failed with
SQLITE_READONLY_DBMOVED (SQLite code 1032).

Add _close_client() helper with a try/except fallback for older chromadb,
and route close_palace(), close(), and the DB-file-missing invalidation
branch of _client() through it. The mtime/inode auto-invalidation branch
is left as-is: callers there may still hold a live ChromaCollection
handle, and closing out from under them clears the rust bindings mid-use.

Regression tests cover close_palace reopen-same-path and whole-backend
close for multiple palaces.
---
 mempalace/backends/chroma.py | 28 ++++++++++++++++++---
 tests/test_backends.py       | 47 ++++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+), 3 deletions(-)

diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py
index d9b99a4..e7c2e6f 100644
--- a/mempalace/backends/chroma.py
+++ b/mempalace/backends/chroma.py
@@ -676,6 +676,20 @@ def _as_list(v: Any) -> list:
     return [v]
 
 
+def _close_client(client) -> None:
+    """Call ``PersistentClient.close()`` if available, swallow otherwise.
+
+    chromadb 1.5.x exposes ``Client.close()`` to release rust-side SQLite
+    file locks; older versions relied on GC. Try/except keeps forward-compat.
+    """
+    if client is None:
+        return
+    try:
+        client.close()
+    except Exception:
+        logger.debug("client.close() unavailable or failed", exc_info=True)
+
+
 class ChromaCollection(BaseCollection):
     """Thin adapter translating ChromaDB dict returns into typed results."""
 
@@ -977,7 +991,7 @@ class ChromaBackend(BaseBackend):
         db_path = os.path.join(palace_path, "chroma.sqlite3")
         # DB was present when cache was built but is now missing → invalidate.
         if cached is not None and not os.path.isfile(db_path):
-            self._clients.pop(palace_path, None)
+            _close_client(self._clients.pop(palace_path, None))
             self._freshness.pop(palace_path, None)
             cached = None
             cached_inode, cached_mtime = 0, 0.0
@@ -1134,14 +1148,22 @@ class ChromaBackend(BaseBackend):
         return ChromaCollection(collection)
 
     def close_palace(self, palace) -> None:
-        """Drop cached handles for ``palace``. Accepts ``PalaceRef`` or legacy path str."""
+        """Drop cached handles for ``palace`` and release its SQLite file lock.
+
+        Accepts ``PalaceRef`` or legacy path str. chromadb's rust-side file
+        lock is held until ``PersistentClient.close()`` is called, so plain
+        dict eviction would leave the palace path unreopenable and
+        unremovable in the same process.
+        """
         path = palace.local_path if isinstance(palace, PalaceRef) else palace
         if path is None:
             return
-        self._clients.pop(path, None)
+        _close_client(self._clients.pop(path, None))
         self._freshness.pop(path, None)
 
     def close(self) -> None:
+        for client in self._clients.values():
+            _close_client(client)
         self._clients.clear()
         self._freshness.clear()
         self._closed = True
diff --git a/tests/test_backends.py b/tests/test_backends.py
index 4ddfe12..06998c5 100644
--- a/tests/test_backends.py
+++ b/tests/test_backends.py
@@ -1,4 +1,5 @@
 import os
+import shutil
 import sqlite3
 from pathlib import Path
 
@@ -206,6 +207,52 @@ def test_query_empty_preserves_embeddings_outer_shape_when_requested():
     assert not_requested.embeddings is None
 
 
+def test_chroma_close_palace_releases_sqlite_lock_for_reopen(tmp_path):
+    """close_palace must release chromadb's rust-side SQLite file lock so
+    a fresh PersistentClient on the same path after shutil.rmtree can
+    write without hitting SQLITE_READONLY_DBMOVED."""
+    backend = ChromaBackend()
+    palace_path = tmp_path / "palace-a"
+    ref = PalaceRef(id=str(palace_path), local_path=str(palace_path))
+
+    col = backend.get_collection(palace=ref, collection_name="mempalace_drawers", create=True)
+    col.upsert(documents=["hello"], ids=["a"], metadatas=[{"k": "v"}])
+
+    backend.close_palace(ref)
+    shutil.rmtree(palace_path)
+
+    col = backend.get_collection(palace=ref, collection_name="mempalace_drawers", create=True)
+    col.upsert(documents=["world"], ids=["b"], metadatas=[{"k": "v2"}])
+    assert col.count() == 1
+
+
+def test_chroma_close_releases_all_cached_clients(tmp_path):
+    """close() must release every cached client's SQLite file lock so any
+    of their palace paths can be reopened by a fresh backend in the same
+    process."""
+    backend = ChromaBackend()
+    palace_a = tmp_path / "palace-a"
+    palace_b = tmp_path / "palace-b"
+    ref_a = PalaceRef(id=str(palace_a), local_path=str(palace_a))
+    ref_b = PalaceRef(id=str(palace_b), local_path=str(palace_b))
+
+    for ref in (ref_a, ref_b):
+        backend.get_collection(palace=ref, collection_name="mempalace_drawers", create=True).upsert(
+            documents=["x"], ids=["x"], metadatas=[{"k": "v"}]
+        )
+
+    backend.close()
+
+    for path in (palace_a, palace_b):
+        shutil.rmtree(path)
+        ref = PalaceRef(id=str(path), local_path=str(path))
+        fresh = ChromaBackend()
+        col = fresh.get_collection(palace=ref, collection_name="mempalace_drawers", create=True)
+        col.upsert(documents=["y"], ids=["y"], metadatas=[{"k": "v2"}])
+        assert col.count() == 1
+        fresh.close()
+
+
 def test_chroma_cache_invalidates_when_db_file_missing(tmp_path):
     """A palace rebuild that removes chroma.sqlite3 must drop the stale cache.
 

From 7cee74c8c8c6c31acb8b363788443612e36dac77 Mon Sep 17 00:00:00 2001
From: mvalentsev <michael@valentsev.ru>
Date: Thu, 30 Apr 2026 14:49:02 +0500
Subject: [PATCH 23/60] fix(fact-checker): reconfigure stdio to UTF-8 on
 Windows

The `python -m mempalace.fact_checker --stdin` entry point reads non-ASCII
text through the system ANSI codepage (cp1252/cp1251/cp950) on Windows,
which mojibakes characters before claim-extraction sees them. Reconfigure
stdin/stdout/stderr to UTF-8 with `errors="strict"`, wrapped in try/except
so a replaced stream (Jupyter, test harness) logs a warning rather than
crashing the CLI.

Mirrors the same fix shipped for `mcp_server.py:main()` (#400) and
`hooks_cli.py:run_hook()` (#1280) -- this is the third and last
stdin-reading entry point in the package.
---
 mempalace/fact_checker.py  | 27 +++++++++++++++++
 tests/test_fact_checker.py | 60 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 87 insertions(+)

diff --git a/mempalace/fact_checker.py b/mempalace/fact_checker.py
index 50e8842..c894859 100644
--- a/mempalace/fact_checker.py
+++ b/mempalace/fact_checker.py
@@ -303,11 +303,38 @@ def _edit_distance(s1: str, s2: str) -> int:
     return prev[-1]
 
 
+def _reconfigure_stdio_utf8_on_windows():
+    """Decode --stdin payload as UTF-8 on Windows.
+
+    Without this, Python defaults stdio to the system ANSI codepage
+    (cp1252/cp1251/cp950 depending on locale), which mojibakes
+    non-ASCII fact text before pattern parsing sees it.
+    """
+    import sys
+
+    if sys.platform != "win32":
+        return
+    for name in ("stdin", "stdout", "stderr"):
+        stream = getattr(sys, name, None)
+        reconfigure = getattr(stream, "reconfigure", None)
+        if reconfigure is None:
+            continue
+        try:
+            reconfigure(encoding="utf-8", errors="strict")
+        except Exception as exc:
+            print(
+                f"WARNING: Could not reconfigure {name} to UTF-8: {exc}",
+                file=sys.stderr,
+            )
+
+
 if __name__ == "__main__":
     import argparse
     import json
     import sys
 
+    _reconfigure_stdio_utf8_on_windows()
+
     parser = argparse.ArgumentParser(
         description="Check text against known facts in the MemPalace palace.",
         epilog="Exits 0 when no issues found, 1 when one or more issues detected.",
diff --git a/tests/test_fact_checker.py b/tests/test_fact_checker.py
index 5b34a40..9db370e 100644
--- a/tests/test_fact_checker.py
+++ b/tests/test_fact_checker.py
@@ -286,3 +286,63 @@ class TestCLI:
         assert "similar_name" in out
         # Silence unused import warning.
         _ = (MagicMock, patch, fact_checker)
+
+    def test_reconfigures_stdio_to_utf8_on_windows(self):
+        """Windows fact_checker --stdin must decode payload as UTF-8.
+
+        Without this, Python defaults stdio to the system ANSI codepage
+        (cp1252/cp1251/cp950), which mojibakes non-ASCII text before
+        pattern parsing sees it.
+        """
+        import io
+        import sys
+
+        from mempalace.fact_checker import _reconfigure_stdio_utf8_on_windows
+
+        class _ReconfigurableStringIO(io.StringIO):
+            def __init__(self, initial_value=""):
+                super().__init__(initial_value)
+                self.reconfigure_calls = []
+
+            def reconfigure(self, **kwargs):
+                self.reconfigure_calls.append(kwargs)
+
+        stdin = _ReconfigurableStringIO()
+        stdout = _ReconfigurableStringIO()
+        stderr = _ReconfigurableStringIO()
+        with (
+            patch.object(sys, "platform", "win32"),
+            patch.object(sys, "stdin", stdin),
+            patch.object(sys, "stdout", stdout),
+            patch.object(sys, "stderr", stderr),
+        ):
+            _reconfigure_stdio_utf8_on_windows()
+
+        expected = {"encoding": "utf-8", "errors": "strict"}
+        assert stdin.reconfigure_calls == [expected]
+        assert stdout.reconfigure_calls == [expected]
+        assert stderr.reconfigure_calls == [expected]
+
+    def test_reconfigure_stdio_is_noop_off_windows(self):
+        """Linux/macOS already default to UTF-8 stdio -- helper must not touch streams."""
+        import io
+        import sys
+
+        from mempalace.fact_checker import _reconfigure_stdio_utf8_on_windows
+
+        class _ReconfigurableStringIO(io.StringIO):
+            def __init__(self):
+                super().__init__()
+                self.reconfigure_calls = []
+
+            def reconfigure(self, **kwargs):
+                self.reconfigure_calls.append(kwargs)
+
+        stdin = _ReconfigurableStringIO()
+        with (
+            patch.object(sys, "platform", "linux"),
+            patch.object(sys, "stdin", stdin),
+        ):
+            _reconfigure_stdio_utf8_on_windows()
+
+        assert stdin.reconfigure_calls == []

From 32f4dfa26d25b8ff243bfd2e636f5e96d8947a83 Mon Sep 17 00:00:00 2001
From: mvalentsev <michael@valentsev.ru>
Date: Thu, 30 Apr 2026 15:00:37 +0500
Subject: [PATCH 24/60] fix(cli): reconfigure stdio to UTF-8 on Windows

The primary `mempalace` console_script (`cli.py:main()`) reads non-ASCII
arguments via piped stdin and writes verbatim drawer text / wing names
through `print()`. On Windows, Python defaults stdio to the system ANSI
codepage (cp1252/cp1251/cp950), so:

- `mempalace search "..." > out.txt` mojibakes any drawer text containing
  non-Latin characters
- `mempalace ... < input.txt` mojibakes piped non-ASCII input

Reconfigure stdin/stdout/stderr to UTF-8 (`errors="strict"`) at the top
of `main()`, mirroring the helper added in this PR for fact_checker's
`__main__` block. Wrapped in try/except so a replaced stream (Jupyter,
test harness) logs a warning and continues rather than crashing the CLI.

The reconfigure cascades through every `mempalace` subcommand
(`init`/`mine`/`search`/`status`/`hook`/etc.) and through the interactive
flows that read non-ASCII names via `input()` (onboarding, entity
detector, room detector). With this commit the package's three
user-facing entry points (`mempalace`, `mempalace-mcp`, and
`python -m mempalace.fact_checker`) all reconfigure stdio identically on
Windows.
---
 mempalace/cli.py  | 27 ++++++++++++++++++++++++
 tests/test_cli.py | 52 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+)

diff --git a/mempalace/cli.py b/mempalace/cli.py
index f2606a4..7372cd7 100644
--- a/mempalace/cli.py
+++ b/mempalace/cli.py
@@ -935,7 +935,34 @@ def cmd_compress(args):
         print("  (dry run -- nothing stored)")
 
 
+def _reconfigure_stdio_utf8_on_windows():
+    """Decode stdio as UTF-8 on Windows for the primary `mempalace` CLI.
+
+    Without this, Python defaults stdio to the system ANSI codepage
+    (cp1252/cp1251/cp950 depending on locale). That mojibakes non-ASCII
+    content piped in (`mempalace search ... < query.txt`) or piped out
+    (`mempalace search "..." > out.txt`) when verbatim drawer text or
+    wing/room names contain non-Latin characters.
+    """
+    if sys.platform != "win32":
+        return
+    for name in ("stdin", "stdout", "stderr"):
+        stream = getattr(sys, name, None)
+        reconfigure = getattr(stream, "reconfigure", None)
+        if reconfigure is None:
+            continue
+        try:
+            reconfigure(encoding="utf-8", errors="strict")
+        except Exception as exc:
+            print(
+                f"WARNING: Could not reconfigure {name} to UTF-8: {exc}",
+                file=sys.stderr,
+            )
+
+
 def main():
+    _reconfigure_stdio_utf8_on_windows()
+
     version_label = f"MemPalace {__version__}"
     parser = argparse.ArgumentParser(
         description="MemPalace — Give your AI a memory. No API key required.",
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 328b90c..4836d69 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1042,3 +1042,55 @@ def test_cmd_repair_trailing_slash_does_not_recurse():
     palace_path = os.path.expanduser(args.palace).rstrip(os.sep)
     backup_path = palace_path + ".backup"
     assert not backup_path.startswith(palace_path + os.sep)
+
+
+# ── stdio reconfigure on Windows ─────────────────────────────────────
+
+
+class _ReconfigurableStringIO:
+    def __init__(self):
+        self.reconfigure_calls = []
+
+    def reconfigure(self, **kwargs):
+        self.reconfigure_calls.append(kwargs)
+
+
+def test_reconfigures_stdio_to_utf8_on_windows():
+    """Windows `mempalace` CLI must decode/encode stdio as UTF-8.
+
+    Without this, piped non-ASCII input (`mempalace search ... < q.txt`)
+    or piped non-ASCII output (`mempalace search "..." > out.txt`) is
+    mojibaked through the system ANSI codepage on non-Latin Windows
+    locales (cp1252/cp1251/cp950).
+    """
+    from mempalace.cli import _reconfigure_stdio_utf8_on_windows
+
+    stdin = _ReconfigurableStringIO()
+    stdout = _ReconfigurableStringIO()
+    stderr = _ReconfigurableStringIO()
+    with (
+        patch.object(sys, "platform", "win32"),
+        patch.object(sys, "stdin", stdin),
+        patch.object(sys, "stdout", stdout),
+        patch.object(sys, "stderr", stderr),
+    ):
+        _reconfigure_stdio_utf8_on_windows()
+
+    expected = {"encoding": "utf-8", "errors": "strict"}
+    assert stdin.reconfigure_calls == [expected]
+    assert stdout.reconfigure_calls == [expected]
+    assert stderr.reconfigure_calls == [expected]
+
+
+def test_reconfigure_stdio_is_noop_off_windows():
+    """Linux/macOS already default to UTF-8 stdio -- helper must not touch streams."""
+    from mempalace.cli import _reconfigure_stdio_utf8_on_windows
+
+    stdin = _ReconfigurableStringIO()
+    with (
+        patch.object(sys, "platform", "linux"),
+        patch.object(sys, "stdin", stdin),
+    ):
+        _reconfigure_stdio_utf8_on_windows()
+
+    assert stdin.reconfigure_calls == []

From 03643eb507e4ba81c65d50b519fcfb4dfb3c769f Mon Sep 17 00:00:00 2001
From: mvalentsev <michael@valentsev.ru>
Date: Sun, 3 May 2026 21:37:12 +0500
Subject: [PATCH 25/60] fix(cli, fact-checker): per-stream stdio errors policy
 on Windows

Previously all three streams reconfigured to UTF-8 with errors='strict'.
That kills 'mempalace search' the moment a drawer carrying a surrogate
half (round-tripped from a filename via surrogateescape) hits print(),
losing the rest of the result block. Same hazard for warning lines on
stderr.

Split the policy:
  stdin  -> surrogateescape (malformed bytes from a redirected file
            survive as lone surrogates instead of crashing the read)
  stdout -> replace (drawer text with a stray surrogate becomes U+FFFD
            instead of UnicodeEncodeError mid-print)
  stderr -> replace (same protection for logger / warning paths)

Applied identically in the cli.py and fact_checker.py helpers; the DRY
extraction into a shared module is a separate cleanup ask, kept out of
this fix to keep the diff narrow.

Tests updated for the new per-stream assertion.
---
 mempalace/cli.py           | 20 ++++++++++++++++++--
 mempalace/fact_checker.py  | 18 ++++++++++++++++--
 tests/test_cli.py          | 11 +++++++----
 tests/test_fact_checker.py | 11 +++++++----
 4 files changed, 48 insertions(+), 12 deletions(-)

diff --git a/mempalace/cli.py b/mempalace/cli.py
index 7372cd7..7052e1f 100644
--- a/mempalace/cli.py
+++ b/mempalace/cli.py
@@ -943,16 +943,32 @@ def _reconfigure_stdio_utf8_on_windows():
     content piped in (`mempalace search ... < query.txt`) or piped out
     (`mempalace search "..." > out.txt`) when verbatim drawer text or
     wing/room names contain non-Latin characters.
+
+    Per-stream errors policy:
+      stdin  -- surrogateescape: malformed bytes from a redirected file
+                survive as lone surrogates instead of crashing the read.
+      stdout -- replace: ``mempalace search`` prints verbatim drawer
+                text. A drawer that round-tripped a filename through
+                surrogateescape can hold a lone surrogate, which would
+                otherwise raise ``UnicodeEncodeError`` mid-print and
+                lose the rest of the search result block.
+      stderr -- replace: same hazard for logger output that quotes
+                user-supplied path or content.
     """
     if sys.platform != "win32":
         return
-    for name in ("stdin", "stdout", "stderr"):
+    policies = (
+        ("stdin", "surrogateescape"),
+        ("stdout", "replace"),
+        ("stderr", "replace"),
+    )
+    for name, errors in policies:
         stream = getattr(sys, name, None)
         reconfigure = getattr(stream, "reconfigure", None)
         if reconfigure is None:
             continue
         try:
-            reconfigure(encoding="utf-8", errors="strict")
+            reconfigure(encoding="utf-8", errors=errors)
         except Exception as exc:
             print(
                 f"WARNING: Could not reconfigure {name} to UTF-8: {exc}",
diff --git a/mempalace/fact_checker.py b/mempalace/fact_checker.py
index c894859..1844c45 100644
--- a/mempalace/fact_checker.py
+++ b/mempalace/fact_checker.py
@@ -309,18 +309,32 @@ def _reconfigure_stdio_utf8_on_windows():
     Without this, Python defaults stdio to the system ANSI codepage
     (cp1252/cp1251/cp950 depending on locale), which mojibakes
     non-ASCII fact text before pattern parsing sees it.
+
+    Per-stream errors policy mirrors the primary CLI helper in
+    ``mempalace/cli.py``:
+      stdin  -- surrogateescape: malformed input bytes survive as lone
+                surrogates instead of crashing the read.
+      stdout -- replace: extracted fact text can include surrogate
+                halves round-tripped from filenames; replace prevents
+                a UnicodeEncodeError mid-print.
+      stderr -- replace: same protection for warning lines.
     """
     import sys
 
     if sys.platform != "win32":
         return
-    for name in ("stdin", "stdout", "stderr"):
+    policies = (
+        ("stdin", "surrogateescape"),
+        ("stdout", "replace"),
+        ("stderr", "replace"),
+    )
+    for name, errors in policies:
         stream = getattr(sys, name, None)
         reconfigure = getattr(stream, "reconfigure", None)
         if reconfigure is None:
             continue
         try:
-            reconfigure(encoding="utf-8", errors="strict")
+            reconfigure(encoding="utf-8", errors=errors)
         except Exception as exc:
             print(
                 f"WARNING: Could not reconfigure {name} to UTF-8: {exc}",
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 4836d69..6b4b7b3 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1076,10 +1076,13 @@ def test_reconfigures_stdio_to_utf8_on_windows():
     ):
         _reconfigure_stdio_utf8_on_windows()
 
-    expected = {"encoding": "utf-8", "errors": "strict"}
-    assert stdin.reconfigure_calls == [expected]
-    assert stdout.reconfigure_calls == [expected]
-    assert stderr.reconfigure_calls == [expected]
+    # Per-stream errors policy: stdin survives bad bytes via
+    # surrogateescape so a redirected non-UTF-8 file does not crash
+    # the read; stdout/stderr use replace so a drawer carrying a
+    # round-tripped surrogate half does not crash mid-print.
+    assert stdin.reconfigure_calls == [{"encoding": "utf-8", "errors": "surrogateescape"}]
+    assert stdout.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}]
+    assert stderr.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}]
 
 
 def test_reconfigure_stdio_is_noop_off_windows():
diff --git a/tests/test_fact_checker.py b/tests/test_fact_checker.py
index 9db370e..89d8366 100644
--- a/tests/test_fact_checker.py
+++ b/tests/test_fact_checker.py
@@ -318,10 +318,13 @@ class TestCLI:
         ):
             _reconfigure_stdio_utf8_on_windows()
 
-        expected = {"encoding": "utf-8", "errors": "strict"}
-        assert stdin.reconfigure_calls == [expected]
-        assert stdout.reconfigure_calls == [expected]
-        assert stderr.reconfigure_calls == [expected]
+        # Per-stream errors policy: stdin uses surrogateescape so a stray
+        # malformed byte from a redirected file does not crash the read,
+        # stdout/stderr use replace so an extracted fact carrying a
+        # surrogate half does not crash mid-print.
+        assert stdin.reconfigure_calls == [{"encoding": "utf-8", "errors": "surrogateescape"}]
+        assert stdout.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}]
+        assert stderr.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}]
 
     def test_reconfigure_stdio_is_noop_off_windows(self):
         """Linux/macOS already default to UTF-8 stdio -- helper must not touch streams."""

From b8816e0fe2fa857efb984e0aff9e52739fb2af34 Mon Sep 17 00:00:00 2001
From: mvalentsev <michael@valentsev.ru>
Date: Sun, 3 May 2026 21:43:51 +0500
Subject: [PATCH 26/60] fix(mcp): retry KG handlers once on concurrent close
 race
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Race scenario: a KG tool handler calls _get_kg() and gets a live
KnowledgeGraph; another thread fires tool_reconnect() between that
return and the handler's kg.add_triple()/kg.query_entity()/etc call.
tool_reconnect drains _kg_by_path and closes the underlying
sqlite3.Connection; the handler then raises sqlite3.ProgrammingError:
'Cannot operate on a closed database', which surfaces as a -32000
to the MCP client even though the user just asked for a reconnect.

New _call_kg(op) helper wraps each handler's kg call in a one-shot
retry: catch exactly sqlite3.ProgrammingError, evict the stale entry
(only if the cache slot still points at the closed instance — another
thread may have already replaced it), and rerun op against a fresh
_get_kg(). Beyond one retry give up so a sustained close-stream
surfaces clearly instead of looping.

All five KG handlers (tool_kg_query, tool_kg_add, tool_kg_invalidate,
tool_kg_timeline, tool_kg_stats) now route through _call_kg.

Tests pin the contract:
  * retries with a fresh KG and returns the second result
  * non-ProgrammingError exceptions propagate without retry
  * gives up after exactly one retry on sustained close
---
 mempalace/mcp_server.py  | 61 ++++++++++++++++++++++++-------
 tests/test_mcp_server.py | 77 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 125 insertions(+), 13 deletions(-)

diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index 3b3b4e2..c67619e 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -46,6 +46,7 @@ import argparse  # noqa: E402  (deferred until after stdio protection above)
 import json  # noqa: E402
 import logging  # noqa: E402
 import hashlib  # noqa: E402
+import sqlite3  # noqa: E402
 import threading  # noqa: E402
 import time  # noqa: E402
 from datetime import date, datetime  # noqa: E402
@@ -129,6 +130,38 @@ def _get_kg() -> KnowledgeGraph:
     return kg
 
 
+def _call_kg(op):
+    """Run ``op(kg)`` against the cached KG with one-shot retry on close.
+
+    Race we're guarding against: a handler grabs ``kg = _get_kg()`` and is
+    about to call ``kg.add_triple(...)`` when ``tool_reconnect`` fires on
+    another thread, drains ``_kg_by_path``, and closes the underlying
+    sqlite3.Connection. The handler's call then raises
+    ``sqlite3.ProgrammingError: Cannot operate on a closed database`` and
+    bubbles up as a -32000 to the MCP client even though the user just
+    asked for a reconnect.
+
+    Catch that single class of error, evict the stale entry from the
+    cache (only if it still points at the closed instance — another
+    thread may have already replaced it), and try once more with a fresh
+    KG. Beyond one retry give up: a second close means we're losing a
+    sustained race we won't win in this loop, and a hung loop is worse
+    than a clear failure surface.
+    """
+    for attempt in range(2):
+        kg = _get_kg()
+        try:
+            return op(kg)
+        except sqlite3.ProgrammingError:
+            if attempt == 0:
+                path = os.path.abspath(_resolve_kg_path())
+                with _kg_cache_lock:
+                    if _kg_by_path.get(path) is kg:
+                        _kg_by_path.pop(path, None)
+                continue
+            raise
+
+
 _client_cache = None
 _collection_cache = None
 _palace_db_inode = 0  # inode of chroma.sqlite3 at cache time
@@ -1081,7 +1114,7 @@ def tool_kg_query(entity: str, as_of: str = None, direction: str = "both"):
         return {"error": str(e)}
     if direction not in ("outgoing", "incoming", "both"):
         return {"error": "direction must be 'outgoing', 'incoming', or 'both'"}
-    results = _get_kg().query_entity(entity, as_of=as_of, direction=direction)
+    results = _call_kg(lambda kg: kg.query_entity(entity, as_of=as_of, direction=direction))
     return {"entity": entity, "as_of": as_of, "facts": results, "count": len(results)}
 
 
@@ -1126,15 +1159,17 @@ def tool_kg_add(
             "source_drawer_id": source_drawer_id,
         },
     )
-    triple_id = _get_kg().add_triple(
-        subject,
-        predicate,
-        object,
-        valid_from=valid_from,
-        valid_to=valid_to,
-        source_closet=source_closet,
-        source_file=source_file,
-        source_drawer_id=source_drawer_id,
+    triple_id = _call_kg(
+        lambda kg: kg.add_triple(
+            subject,
+            predicate,
+            object,
+            valid_from=valid_from,
+            valid_to=valid_to,
+            source_closet=source_closet,
+            source_file=source_file,
+            source_drawer_id=source_drawer_id,
+        )
     )
     return {"success": True, "triple_id": triple_id, "fact": f"{subject} → {predicate} → {object}"}
 
@@ -1165,7 +1200,7 @@ def tool_kg_invalidate(subject: str, predicate: str, object: str, ended: str = N
             "ended": resolved_ended,
         },
     )
-    _get_kg().invalidate(subject, predicate, object, ended=resolved_ended)
+    _call_kg(lambda kg: kg.invalidate(subject, predicate, object, ended=resolved_ended))
     return {
         "success": True,
         "fact": f"{subject} → {predicate} → {object}",
@@ -1180,13 +1215,13 @@ def tool_kg_timeline(entity: str = None):
             entity = sanitize_kg_value(entity, "entity")
         except ValueError as e:
             return {"error": str(e)}
-    results = _get_kg().timeline(entity)
+    results = _call_kg(lambda kg: kg.timeline(entity))
     return {"entity": entity or "all", "timeline": results, "count": len(results)}
 
 
 def tool_kg_stats():
     """Knowledge graph overview: entities, triples, relationship types."""
-    return _get_kg().stats()
+    return _call_kg(lambda kg: kg.stats())
 
 
 # ==================== AGENT DIARY ====================
diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index 092b707..e365afc 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -1295,3 +1295,80 @@ class TestKGLazyCache:
         mcp_server.tool_reconnect()
 
         assert mcp_server._kg_by_path == {}
+
+    def test_call_kg_retries_after_concurrent_close(self, monkeypatch):
+        """A KG closed mid-handler must trigger a one-shot retry with a fresh
+        instance — not surface a -32000 to the MCP client."""
+        import sqlite3 as _sqlite3
+
+        from mempalace import mcp_server
+
+        path = "/fake/palace/knowledge_graph.sqlite3"
+        monkeypatch.setattr(mcp_server, "_resolve_kg_path", lambda: path)
+
+        class _ClosedKG:
+            def query_entity(self, entity, **kwargs):
+                raise _sqlite3.ProgrammingError("Cannot operate on a closed database")
+
+        class _FreshKG:
+            def query_entity(self, entity, **kwargs):
+                return [{"entity": entity}]
+
+        cache = {os.path.abspath(path): _ClosedKG()}
+        monkeypatch.setattr(mcp_server, "_kg_by_path", cache)
+
+        # Second _get_kg() call (after the cache eviction) constructs a new
+        # KG. Patch the constructor so we don't open a real sqlite file.
+        monkeypatch.setattr(mcp_server, "KnowledgeGraph", lambda **_: _FreshKG())
+
+        result = mcp_server._call_kg(lambda kg: kg.query_entity("Alice"))
+        assert result == [{"entity": "Alice"}]
+        # The closed instance must be evicted; the fresh one must be cached.
+        assert isinstance(cache[os.path.abspath(path)], _FreshKG)
+
+    def test_call_kg_does_not_retry_on_other_errors(self, monkeypatch):
+        """Non-ProgrammingError exceptions must propagate without retry —
+        we don't want the retry guard masking real bugs."""
+        from mempalace import mcp_server
+
+        path = "/fake/palace/knowledge_graph.sqlite3"
+        monkeypatch.setattr(mcp_server, "_resolve_kg_path", lambda: path)
+
+        calls = {"count": 0}
+
+        class _FailingKG:
+            def query_entity(self, entity, **kwargs):
+                calls["count"] += 1
+                raise ValueError("bad input")
+
+        monkeypatch.setattr(mcp_server, "_kg_by_path", {os.path.abspath(path): _FailingKG()})
+        monkeypatch.setattr(mcp_server, "KnowledgeGraph", lambda **_: _FailingKG())
+
+        with pytest.raises(ValueError, match="bad input"):
+            mcp_server._call_kg(lambda kg: kg.query_entity("Alice"))
+        assert calls["count"] == 1, "non-ProgrammingError must not trigger retry"
+
+    def test_call_kg_gives_up_after_one_retry(self, monkeypatch):
+        """If the second attempt also hits a closed DB, give up rather than
+        loop forever — a sustained close-stream is a different bug."""
+        import sqlite3 as _sqlite3
+
+        from mempalace import mcp_server
+
+        path = "/fake/palace/knowledge_graph.sqlite3"
+        monkeypatch.setattr(mcp_server, "_resolve_kg_path", lambda: path)
+
+        calls = {"count": 0}
+
+        class _AlwaysClosedKG:
+            def query_entity(self, entity, **kwargs):
+                calls["count"] += 1
+                raise _sqlite3.ProgrammingError("closed again")
+
+        cache = {}
+        monkeypatch.setattr(mcp_server, "_kg_by_path", cache)
+        monkeypatch.setattr(mcp_server, "KnowledgeGraph", lambda **_: _AlwaysClosedKG())
+
+        with pytest.raises(_sqlite3.ProgrammingError):
+            mcp_server._call_kg(lambda kg: kg.query_entity("Alice"))
+        assert calls["count"] == 2, "expected exactly one retry beyond the initial attempt"

From 75ad8ae7819ee9eebdf635cc0c3e969b2f9bc73b Mon Sep 17 00:00:00 2001
From: mvalentsev <michael@valentsev.ru>
Date: Sun, 3 May 2026 22:04:22 +0500
Subject: [PATCH 27/60] ci: retrigger linux 3.13 (transient onnx download
 flake)


From 285b3b4f2e387c1e8eda865569a2edc400f5c1f1 Mon Sep 17 00:00:00 2001
From: mvalentsev <michael@valentsev.ru>
Date: Sun, 3 May 2026 22:25:31 +0500
Subject: [PATCH 28/60] refactor(stdio): extract Windows UTF-8 reconfigure into
 shared helper

Both cli.py and fact_checker.py carried identical 28-line Windows stdio
reconfigure helpers; pull the loop into mempalace/_stdio.py so the same
machine drives the CLI, the fact_checker --stdin entry point, and the
MCP server. The thin per-call-site wrappers stay so existing tests keep
importing _reconfigure_stdio_utf8_on_windows from the same module they
always have.

CLI / fact_checker policy unchanged: stdin=surrogateescape (don't crash
on a malformed redirected file), stdout/stderr=replace (don't crash
mid-print on a surrogate half round-tripped from a filename).
---
 mempalace/_stdio.py       | 71 +++++++++++++++++++++++++++++++++++++++
 mempalace/cli.py          | 45 ++++++-------------------
 mempalace/fact_checker.py | 39 ++++-----------------
 3 files changed, 88 insertions(+), 67 deletions(-)
 create mode 100644 mempalace/_stdio.py

diff --git a/mempalace/_stdio.py b/mempalace/_stdio.py
new file mode 100644
index 0000000..13e9509
--- /dev/null
+++ b/mempalace/_stdio.py
@@ -0,0 +1,71 @@
+"""Stdio UTF-8 reconfiguration helper for Windows entry points.
+
+Python on Windows defaults stdio to the system ANSI codepage
+(cp1252/cp1251/cp950 depending on locale), which mojibakes UTF-8 input
+or output the moment a non-Latin character shows up. Every console
+entry point that touches stdio needs to fix this on Windows -- the MCP
+server, the CLI, the fact_checker `--stdin` mode -- so the
+reconfigure code lives here in one place to keep the per-stream
+errors policies aligned across them.
+
+Per-stream errors policy is caller-chosen:
+
+* MCP server uses ``strict`` on stdout/stderr because everything written
+  there is server-controlled JSON-RPC; any encode failure is a real bug
+  the operator wants loud.
+* CLI / fact_checker use ``replace`` on stdout/stderr because they print
+  verbatim drawer text that may contain surrogate halves round-tripped
+  from filenames -- ``strict`` would crash mid-print.
+* All callers use ``surrogateescape`` on stdin so a malformed byte from
+  a redirected file or a misbehaving client survives as a lone surrogate
+  the consumer's parser surfaces, instead of ``UnicodeDecodeError``
+  killing the read loop on the first bad byte.
+"""
+
+from __future__ import annotations
+
+import sys
+from typing import Callable, Optional
+
+
+def reconfigure_stdio_utf8_on_windows(
+    *,
+    stdin_errors: str = "surrogateescape",
+    stdout_errors: str = "strict",
+    stderr_errors: str = "strict",
+    on_failure: Optional[Callable[[str, BaseException], None]] = None,
+) -> None:
+    """Reconfigure stdio to UTF-8 on Windows. No-op elsewhere.
+
+    Args:
+        stdin_errors: errors= policy for stdin.reconfigure().
+        stdout_errors: errors= policy for stdout.reconfigure().
+        stderr_errors: errors= policy for stderr.reconfigure().
+        on_failure: optional ``(stream_name, exc) -> None`` callback for
+            streams whose ``reconfigure`` raises (e.g. Jupyter-replaced
+            streams that lack the method-shape we expect). Defaults to a
+            ``WARNING:`` line on the original sys.stderr.
+    """
+    if sys.platform != "win32":
+        return
+
+    policies = (
+        ("stdin", stdin_errors),
+        ("stdout", stdout_errors),
+        ("stderr", stderr_errors),
+    )
+    for name, errors in policies:
+        stream = getattr(sys, name, None)
+        reconfigure = getattr(stream, "reconfigure", None)
+        if reconfigure is None:
+            continue
+        try:
+            reconfigure(encoding="utf-8", errors=errors)
+        except Exception as exc:  # noqa: BLE001 -- last-resort guard
+            if on_failure is not None:
+                on_failure(name, exc)
+            else:
+                print(
+                    f"WARNING: Could not reconfigure {name} to UTF-8: {exc}",
+                    file=sys.stderr,
+                )
diff --git a/mempalace/cli.py b/mempalace/cli.py
index 7052e1f..0ab3d0f 100644
--- a/mempalace/cli.py
+++ b/mempalace/cli.py
@@ -938,42 +938,17 @@ def cmd_compress(args):
 def _reconfigure_stdio_utf8_on_windows():
     """Decode stdio as UTF-8 on Windows for the primary `mempalace` CLI.
 
-    Without this, Python defaults stdio to the system ANSI codepage
-    (cp1252/cp1251/cp950 depending on locale). That mojibakes non-ASCII
-    content piped in (`mempalace search ... < query.txt`) or piped out
-    (`mempalace search "..." > out.txt`) when verbatim drawer text or
-    wing/room names contain non-Latin characters.
-
-    Per-stream errors policy:
-      stdin  -- surrogateescape: malformed bytes from a redirected file
-                survive as lone surrogates instead of crashing the read.
-      stdout -- replace: ``mempalace search`` prints verbatim drawer
-                text. A drawer that round-tripped a filename through
-                surrogateescape can hold a lone surrogate, which would
-                otherwise raise ``UnicodeEncodeError`` mid-print and
-                lose the rest of the search result block.
-      stderr -- replace: same hazard for logger output that quotes
-                user-supplied path or content.
+    Thin wrapper around the shared helper in ``mempalace._stdio``. The CLI
+    overrides stdout/stderr to ``replace`` because ``mempalace search``
+    prints verbatim drawer text that may carry surrogate halves
+    round-tripped from filenames -- ``strict`` would crash mid-print and
+    lose the rest of the search result block. stdin keeps the default
+    ``surrogateescape`` so a redirected non-UTF-8 file does not kill the
+    read on the first bad byte.
     """
-    if sys.platform != "win32":
-        return
-    policies = (
-        ("stdin", "surrogateescape"),
-        ("stdout", "replace"),
-        ("stderr", "replace"),
-    )
-    for name, errors in policies:
-        stream = getattr(sys, name, None)
-        reconfigure = getattr(stream, "reconfigure", None)
-        if reconfigure is None:
-            continue
-        try:
-            reconfigure(encoding="utf-8", errors=errors)
-        except Exception as exc:
-            print(
-                f"WARNING: Could not reconfigure {name} to UTF-8: {exc}",
-                file=sys.stderr,
-            )
+    from ._stdio import reconfigure_stdio_utf8_on_windows
+
+    reconfigure_stdio_utf8_on_windows(stdout_errors="replace", stderr_errors="replace")
 
 
 def main():
diff --git a/mempalace/fact_checker.py b/mempalace/fact_checker.py
index 1844c45..403d913 100644
--- a/mempalace/fact_checker.py
+++ b/mempalace/fact_checker.py
@@ -306,40 +306,15 @@ def _edit_distance(s1: str, s2: str) -> int:
 def _reconfigure_stdio_utf8_on_windows():
     """Decode --stdin payload as UTF-8 on Windows.
 
-    Without this, Python defaults stdio to the system ANSI codepage
-    (cp1252/cp1251/cp950 depending on locale), which mojibakes
-    non-ASCII fact text before pattern parsing sees it.
-
-    Per-stream errors policy mirrors the primary CLI helper in
-    ``mempalace/cli.py``:
-      stdin  -- surrogateescape: malformed input bytes survive as lone
-                surrogates instead of crashing the read.
-      stdout -- replace: extracted fact text can include surrogate
-                halves round-tripped from filenames; replace prevents
-                a UnicodeEncodeError mid-print.
-      stderr -- replace: same protection for warning lines.
+    Thin wrapper around the shared helper in ``mempalace._stdio``. Mirrors
+    the primary CLI policy: stdout/stderr use ``replace`` because
+    extracted fact text can include surrogate halves round-tripped from
+    filenames -- ``strict`` would raise UnicodeEncodeError mid-print.
+    stdin keeps the default ``surrogateescape``.
     """
-    import sys
+    from ._stdio import reconfigure_stdio_utf8_on_windows
 
-    if sys.platform != "win32":
-        return
-    policies = (
-        ("stdin", "surrogateescape"),
-        ("stdout", "replace"),
-        ("stderr", "replace"),
-    )
-    for name, errors in policies:
-        stream = getattr(sys, name, None)
-        reconfigure = getattr(stream, "reconfigure", None)
-        if reconfigure is None:
-            continue
-        try:
-            reconfigure(encoding="utf-8", errors=errors)
-        except Exception as exc:
-            print(
-                f"WARNING: Could not reconfigure {name} to UTF-8: {exc}",
-                file=sys.stderr,
-            )
+    reconfigure_stdio_utf8_on_windows(stdout_errors="replace", stderr_errors="replace")
 
 
 if __name__ == "__main__":

From f9d939ae1bd5edba6901f0fd3872cff7ab30f790 Mon Sep 17 00:00:00 2001
From: fatkobra <55045047+fatkobra@users.noreply.github.com>
Date: Mon, 4 May 2026 06:45:29 +0000
Subject: [PATCH 29/60] fix(storage): quarantine bloated HNSW link payloads

---
 mempalace/backends/chroma.py      | 147 +++++++++++++++++++-----------
 tests/test_hnsw_payload_health.py | 113 +++++++++++++++++++++++
 2 files changed, 205 insertions(+), 55 deletions(-)
 create mode 100644 tests/test_hnsw_payload_health.py

diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py
index d9b99a4..02684d2 100644
--- a/mempalace/backends/chroma.py
+++ b/mempalace/backends/chroma.py
@@ -29,6 +29,51 @@ _REQUIRED_OPERATORS = frozenset({"$eq", "$ne", "$in", "$nin", "$and", "$or", "$c
 _OPTIONAL_OPERATORS = frozenset({"$gt", "$gte", "$lt", "$lte"})
 _SUPPORTED_OPERATORS = _REQUIRED_OPERATORS | _OPTIONAL_OPERATORS
 
+# A healthy HNSW payload should keep link_lists.bin proportional to
+# data_level0.bin. When link_lists.bin grows orders of magnitude larger than
+# data_level0.bin, Chroma/HNSW can segfault while opening the segment even if
+# index_metadata.pickle is structurally valid.
+#
+# The report in #1218 showed ratios above 300x, while healthy snapshots were far below 1x.
+# Treat only >10x as corruption so normal flush lag or small segments do not get
+# quarantined.
+_HNSW_LINK_TO_DATA_MAX_RATIO = 10.0
+
+
+def _hnsw_link_to_data_ratio(seg_dir: str) -> Optional[float]:
+    """Return link_lists.bin / data_level0.bin size ratio for a segment.
+
+    ``None`` means the ratio is not meaningful, usually because one file is
+    missing or data_level0.bin is empty. ``float("inf")`` means the files were
+    present but could not be statted safely, which should be treated as
+    suspicious by callers.
+    """
+
+    link_path = os.path.join(seg_dir, "link_lists.bin")
+    data_path = os.path.join(seg_dir, "data_level0.bin")
+
+    if not (os.path.isfile(link_path) and os.path.isfile(data_path)):
+        return None
+
+    try:
+        data_size = os.path.getsize(data_path)
+        link_size = os.path.getsize(link_path)
+    except OSError:
+        return float("inf")
+
+    if data_size <= 0:
+        return None
+
+    return link_size / data_size
+
+
+def _hnsw_payload_appears_sane(seg_dir: str) -> bool:
+    """Return False when HNSW payload files are structurally implausible."""
+
+    ratio = _hnsw_link_to_data_ratio(seg_dir)
+    return ratio is None or ratio <= _HNSW_LINK_TO_DATA_MAX_RATIO
+
+
 # HNSW tuning to prevent link_lists.bin bloat on large mines (#344).
 #
 # With default params (batch_size=100, sync_threshold=1000, initial capacity
@@ -106,6 +151,9 @@ def _segment_appears_healthy(seg_dir: str) -> bool:
     files and quarantine_stale_hnsw would conservatively rename them
     out of the way (lazy rebuild on next open recovers).
     """
+    if not _hnsw_payload_appears_sane(seg_dir):
+        return False
+
     meta_path = os.path.join(seg_dir, "index_metadata.pickle")
     if not os.path.isfile(meta_path):
         # No metadata file yet — segment hasn't flushed (fresh / empty).
@@ -127,64 +175,35 @@ def _segment_appears_healthy(seg_dir: str) -> bool:
 
 
 def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> list[str]:
-    """Rename HNSW segment dirs that are both stale-by-mtime AND fail an
-    integrity sniff-test.
+    """Rename HNSW segment dirs that look unsafe to open.
 
-    Catches the segfault failure mode from #823 (semantic search stale
-    after ``add_drawer``), observed at neo-cortex-mcp#2 (SIGSEGV on
-    ``count()`` with chromadb 1.5.5), and acknowledged as by-design at
-    chroma-core/chroma#2594. Renaming a corrupt segment lets chromadb
-    rebuild lazily on next open instead of segfaulting.
+    This catches two classes of HNSW corruption before ChromaDB opens the
+    native segment reader:
 
-    Two-stage check:
+    1. stale-by-mtime segments whose ``index_metadata.pickle`` fails the
+       existing format sniff-test;
+    2. structurally impossible HNSW payloads where ``link_lists.bin`` is much
+       larger than ``data_level0.bin``.
 
-    1. **mtime gate.** If ``chroma.sqlite3`` is less than
-       ``stale_seconds`` newer than the segment's ``data_level0.bin``,
-       skip — chromadb is in normal write-path territory.
+    The second check is intentionally not gated by mtime. A segment with a
+    300x link/data ratio is unsafe regardless of whether its mtime is recent;
+    letting Chroma open it can SIGSEGV before Python fallback code runs.
 
-    2. **Integrity gate** (``_segment_appears_healthy``). Even when the
-       mtime gap exceeds the threshold, a segment whose
-       ``index_metadata.pickle`` passes a format sniff-test is healthy:
-       chromadb 1.5.x flushes HNSW state asynchronously and a clean
-       shutdown does NOT force-flush, so the on-disk HNSW is *always*
-       somewhat older than ``chroma.sqlite3``. Production observation
-       (2026-04-26 disks daemon): three of three segments quarantined
-       on every cold start, with 538-557s gaps, leaving the 151K-drawer
-       palace with vector_ranked=0 until rebuild. Renaming a healthy
-       segment based on mtime alone destroys a valid index — chromadb
-       creates an empty replacement, orphaning every drawer in sqlite
-       from vector recall until the operator runs ``mempalace repair
-       --mode rebuild`` (15+ min on a 151K palace).
-
-    Only segments that pass stage 1 (suspiciously stale) AND fail stage
-    2 (metadata file truncated, zero-filled, or absent-with-data) are
-    renamed to ``<uuid>.drift-<timestamp>``. The original directory is
-    renamed, not deleted, so recovery remains possible if the heuristic
-    misfires.
-
-    The default threshold (5 min) is advisory under daemon-strict; the
-    integrity gate is what actually distinguishes corruption from flush
-    lag. The threshold still matters for the cross-machine replication
-    case (#823), where it bounds how stale a Syncthing-replicated
-    segment can be before we look harder at it.
-
-    Args:
-        palace_path: path to the palace directory containing ``chroma.sqlite3``
-        stale_seconds: minimum mtime gap to *consider* a segment for quarantine
-
-    Returns:
-        List of paths that were quarantined (empty if nothing actually
-        looked corrupt).
+    The original directory is renamed, not deleted, so recovery remains
+    possible if the heuristic ever misfires.
     """
+
     db_path = os.path.join(palace_path, "chroma.sqlite3")
     if not os.path.isfile(db_path):
         return []
+
     try:
         sqlite_mtime = os.path.getmtime(db_path)
     except OSError:
         return []
 
     moved: list[str] = []
+
     try:
         entries = os.listdir(palace_path)
     except OSError:
@@ -193,29 +212,34 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> lis
     for name in entries:
         if "-" not in name or name.startswith(".") or ".drift-" in name:
             continue
+
         seg_dir = os.path.join(palace_path, name)
         if not os.path.isdir(seg_dir):
             continue
+
         hnsw_bin = os.path.join(seg_dir, "data_level0.bin")
         if not os.path.isfile(hnsw_bin):
             continue
+
         try:
             hnsw_mtime = os.path.getmtime(hnsw_bin)
         except OSError:
             continue
-        if sqlite_mtime - hnsw_mtime < stale_seconds:
+
+        payload_ratio = _hnsw_link_to_data_ratio(seg_dir)
+        payload_corrupt = payload_ratio is not None and payload_ratio > _HNSW_LINK_TO_DATA_MAX_RATIO
+
+        if not payload_corrupt and sqlite_mtime - hnsw_mtime < stale_seconds:
             continue
 
-        # Stage 2: integrity gate. mtime drift is necessary but not
-        # sufficient — chromadb's async flush makes drift the steady-
-        # state condition. A healthy segment metadata file proves
-        # chromadb can open the segment without segfault; don't
-        # quarantine a healthy index.
-        if _segment_appears_healthy(seg_dir):
+        # Stage 2: integrity gate. Mtime drift alone is not corruption because
+        # Chroma flushes HNSW asynchronously. A healthy metadata file proves the
+        # ordinary stale-by-mtime case is just flush lag.
+        if not payload_corrupt and _segment_appears_healthy(seg_dir):
             logger.info(
                 "HNSW mtime gap %.0fs on %s exceeds threshold but segment "
-                "metadata file is intact — flush-lag, not corruption. "
-                "Leaving in place.",
+                "metadata and payload size are intact — flush-lag, not "
+                "corruption. Leaving in place.",
                 sqlite_mtime - hnsw_mtime,
                 seg_dir,
             )
@@ -223,17 +247,30 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> lis
 
         stamp = _dt.datetime.now().strftime("%Y%m%d-%H%M%S")
         target = f"{seg_dir}.drift-{stamp}"
+
+        if payload_corrupt:
+            reason = (
+                f"link_lists.bin/data_level0.bin ratio {payload_ratio:.1f}x "
+                f"exceeds {_HNSW_LINK_TO_DATA_MAX_RATIO:.1f}x"
+            )
+        else:
+            reason = (
+                f"sqlite {sqlite_mtime - hnsw_mtime:.0f}s newer than HNSW "
+                "and integrity check failed"
+            )
+
         try:
             os.rename(seg_dir, target)
             moved.append(target)
             logger.warning(
-                "Quarantined corrupt HNSW segment %s (sqlite %.0fs newer than HNSW, integrity check failed); renamed to %s",
+                "Quarantined corrupt HNSW segment %s (%s); renamed to %s",
                 seg_dir,
-                sqlite_mtime - hnsw_mtime,
+                reason,
                 target,
             )
         except OSError:
             logger.exception("Failed to quarantine corrupt HNSW segment %s", seg_dir)
+
     return moved
 
 
diff --git a/tests/test_hnsw_payload_health.py b/tests/test_hnsw_payload_health.py
new file mode 100644
index 0000000..0af440a
--- /dev/null
+++ b/tests/test_hnsw_payload_health.py
@@ -0,0 +1,113 @@
+import os
+from pathlib import Path
+
+from mempalace.backends.chroma import (
+    _HNSW_LINK_TO_DATA_MAX_RATIO,
+    _hnsw_link_to_data_ratio,
+    _segment_appears_healthy,
+    quarantine_stale_hnsw,
+)
+
+
+def _write_segment(
+    seg_dir: Path,
+    *,
+    data_size: int = 100,
+    link_size: int = 100,
+    write_metadata: bool = True,
+) -> None:
+    seg_dir.mkdir(parents=True, exist_ok=True)
+    (seg_dir / "data_level0.bin").write_bytes(b"\0" * data_size)
+    (seg_dir / "link_lists.bin").write_bytes(b"\0" * link_size)
+
+    if write_metadata:
+        # Enough bytes to pass the existing pickle envelope sniff-test:
+        # starts with pickle protocol marker 0x80 and ends with STOP 0x2e.
+        (seg_dir / "index_metadata.pickle").write_bytes(b"\x80" + b"x" * 16 + b"\x2e")
+
+
+def test_hnsw_link_to_data_ratio_reports_payload_size_ratio(tmp_path):
+    seg_dir = tmp_path / "11111111-2222-3333-4444-555555555555"
+    _write_segment(seg_dir, data_size=100, link_size=250)
+
+    assert _hnsw_link_to_data_ratio(str(seg_dir)) == 2.5
+
+
+def test_segment_health_rejects_exploded_link_lists_even_with_valid_pickle(tmp_path):
+    seg_dir = tmp_path / "11111111-2222-3333-4444-555555555555"
+    _write_segment(
+        seg_dir,
+        data_size=100,
+        link_size=int(100 * (_HNSW_LINK_TO_DATA_MAX_RATIO + 1)),
+        write_metadata=True,
+    )
+
+    assert not _segment_appears_healthy(str(seg_dir))
+
+
+def test_segment_health_keeps_reasonable_payload_with_valid_pickle(tmp_path):
+    seg_dir = tmp_path / "11111111-2222-3333-4444-555555555555"
+    _write_segment(
+        seg_dir,
+        data_size=100,
+        link_size=int(100 * _HNSW_LINK_TO_DATA_MAX_RATIO),
+        write_metadata=True,
+    )
+
+    assert _segment_appears_healthy(str(seg_dir))
+
+
+def test_quarantine_catches_link_bloat_without_mtime_drift(tmp_path):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+
+    db_path = palace / "chroma.sqlite3"
+    db_path.write_text("sqlite placeholder")
+
+    seg_dir = palace / "11111111-2222-3333-4444-555555555555"
+    _write_segment(
+        seg_dir,
+        data_size=100,
+        link_size=int(100 * (_HNSW_LINK_TO_DATA_MAX_RATIO + 1)),
+        write_metadata=True,
+    )
+
+    # Make sqlite and HNSW mtimes identical. The old mtime-only gate would
+    # skip this segment even though the payload is structurally corrupt.
+    same_time = 1_700_000_000
+    os.utime(db_path, (same_time, same_time))
+    os.utime(seg_dir / "data_level0.bin", (same_time, same_time))
+
+    moved = quarantine_stale_hnsw(str(palace), stale_seconds=999_999)
+
+    assert len(moved) == 1
+    assert not seg_dir.exists()
+
+    moved_path = Path(moved[0])
+    assert moved_path.exists()
+    assert moved_path.name.startswith("11111111-2222-3333-4444-555555555555.drift-")
+
+
+def test_quarantine_leaves_reasonable_payload_in_place(tmp_path):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+
+    db_path = palace / "chroma.sqlite3"
+    db_path.write_text("sqlite placeholder")
+
+    seg_dir = palace / "11111111-2222-3333-4444-555555555555"
+    _write_segment(
+        seg_dir,
+        data_size=100,
+        link_size=100,
+        write_metadata=True,
+    )
+
+    same_time = 1_700_000_000
+    os.utime(db_path, (same_time, same_time))
+    os.utime(seg_dir / "data_level0.bin", (same_time, same_time))
+
+    moved = quarantine_stale_hnsw(str(palace), stale_seconds=999_999)
+
+    assert moved == []
+    assert seg_dir.exists()

From 4f36145c2e51aa53c705c67a3f692174230bbcf9 Mon Sep 17 00:00:00 2001
From: Arnold Wender <arnold.wender@gmail.com>
Date: Sun, 26 Apr 2026 13:01:55 +0200
Subject: [PATCH 30/60] fix(entity_registry): atomic write to prevent partial
 corruption on crash
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

EntityRegistry.save() called Path.write_text() directly, which truncates
the target file and then writes — so a crash mid-write (power loss, OOM,
filesystem-full mid-flush) leaves an empty or half-written
entity_registry.json. The whole people/projects map is lost; the system
falls back to an empty registry on next load.

Switch to the standard atomic-write pattern: serialize to a sibling
.tmp file in the same directory (so os.replace stays on one filesystem),
fsync, chmod 0o600, then os.replace over the target. The replace is
atomic on POSIX and Windows, so any crash leaves the previous registry
intact instead of a truncated file.

Tests cover: no leftover .tmp on success, and previous content preserved
when os.replace itself raises mid-save.
---
 mempalace/entity_registry.py  | 15 ++++++++++--
 tests/test_entity_registry.py | 46 +++++++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+), 2 deletions(-)

diff --git a/mempalace/entity_registry.py b/mempalace/entity_registry.py
index 78d8a8b..d77b6c1 100644
--- a/mempalace/entity_registry.py
+++ b/mempalace/entity_registry.py
@@ -16,6 +16,7 @@ Usage:
 """
 
 import json
+import os
 import re
 import urllib.request
 import urllib.parse
@@ -320,11 +321,21 @@ class EntityRegistry:
             self._path.parent.chmod(0o700)
         except (OSError, NotImplementedError):
             pass
-        self._path.write_text(json.dumps(self._data, indent=2), encoding="utf-8")
+        # Atomic write: serialize to a sibling temp file in the same dir
+        # (so os.replace stays on one filesystem), fsync, then rename over
+        # the target. A crash mid-write leaves the previous registry intact
+        # instead of a half-written file or an empty file from the truncate.
+        payload = json.dumps(self._data, indent=2)
+        tmp_path = self._path.with_name(self._path.name + ".tmp")
+        with open(tmp_path, "w", encoding="utf-8") as f:
+            f.write(payload)
+            f.flush()
+            os.fsync(f.fileno())
         try:
-            self._path.chmod(0o600)
+            tmp_path.chmod(0o600)
         except (OSError, NotImplementedError):
             pass
+        os.replace(tmp_path, self._path)
 
     @staticmethod
     def _empty() -> dict:
diff --git a/tests/test_entity_registry.py b/tests/test_entity_registry.py
index c857a07..a5f237c 100644
--- a/tests/test_entity_registry.py
+++ b/tests/test_entity_registry.py
@@ -2,6 +2,8 @@
 
 from unittest.mock import patch
 
+import pytest
+
 from mempalace.entity_registry import (
     COMMON_ENGLISH_WORDS,
     PERSON_CONTEXT_PATTERNS,
@@ -71,6 +73,50 @@ def test_save_creates_file(tmp_path):
     assert (tmp_path / "entity_registry.json").exists()
 
 
+def test_save_is_atomic_does_not_leave_tmp(tmp_path):
+    # Atomic write must not leave the .tmp sidecar file after a successful save.
+    registry = EntityRegistry.load(config_dir=tmp_path)
+    registry.save()
+    leftover = list(tmp_path.glob("entity_registry.json.tmp*"))
+    assert leftover == [], f"atomic write leaked tmp file(s): {leftover}"
+
+
+def test_save_preserves_previous_on_serialization_failure(tmp_path, monkeypatch):
+    # If serialization fails mid-write, the previous registry must remain
+    # intact — this is the whole point of atomic write vs truncating in place.
+    registry = EntityRegistry.load(config_dir=tmp_path)
+    registry.seed(
+        mode="personal",
+        people=[{"name": "Alice", "relationship": "friend", "context": "personal"}],
+        projects=[],
+    )
+    registry.save()
+    target = tmp_path / "entity_registry.json"
+    original = target.read_text(encoding="utf-8")
+
+    # Force os.replace to raise — simulates filesystem full / permission flip
+    # AFTER the temp file is written but BEFORE the rename completes.
+    import os as _os
+
+    real_replace = _os.replace
+
+    def boom(src, dst):
+        raise OSError("simulated rename failure")
+
+    monkeypatch.setattr(_os, "replace", boom)
+    with pytest.raises(OSError):
+        registry.seed(
+            mode="personal",
+            people=[{"name": "Bob", "relationship": "friend", "context": "personal"}],
+            projects=[],
+        )
+        registry.save()
+
+    # Restore os.replace before reading so the assertion can rely on it.
+    monkeypatch.setattr(_os, "replace", real_replace)
+    assert target.read_text(encoding="utf-8") == original
+
+
 # ── seed ────────────────────────────────────────────────────────────────
 
 
From 2e441d17a22f9e14dcb9d8576551ea867e56cda8 Mon Sep 17 00:00:00 2001
From: Arnold Wender <arnold.wender@gmail.com>
Date: Tue, 28 Apr 2026 09:43:27 +0200
Subject: [PATCH 31/60] fix(entity_registry): fsync parent dir after rename for
 ext4 durability
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Without this, on ext4 (and similar) filesystems the rename ack does not
guarantee durability across power loss — a crash can revert to a state
where the temp file is present and the target is at the old version.

Suggested by @jphein on #1215.
---
 mempalace/entity_registry.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/mempalace/entity_registry.py b/mempalace/entity_registry.py
index d77b6c1..c8ac517 100644
--- a/mempalace/entity_registry.py
+++ b/mempalace/entity_registry.py
@@ -336,6 +336,20 @@ class EntityRegistry:
         except (OSError, NotImplementedError):
             pass
         os.replace(tmp_path, self._path)
+        # On ext4 (and similar) the rename's durability across power loss
+        # requires an additional fsync on the parent directory. Without it,
+        # the kernel can ack the rename and a crash reverts to the state
+        # where the temp file is present and the target is at the old version.
+        try:
+            dir_fd = os.open(str(self._path.parent), os.O_RDONLY)
+            try:
+                os.fsync(dir_fd)
+            finally:
+                os.close(dir_fd)
+        except OSError:
+            # Windows and some special filesystems reject directory fds — they
+            # have different durability semantics on rename anyway.
+            pass
 
     @staticmethod
     def _empty() -> dict:

From bb40a529fde600ba01b49e6ead06820968bba446 Mon Sep 17 00:00:00 2001
From: fatkobra <55045047+fatkobra@users.noreply.github.com>
Date: Tue, 5 May 2026 09:01:05 +0000
Subject: [PATCH 32/60] fix(migrate): verify write roundtrip before bailout

---
 mempalace/migrate.py  |  73 +++++++++++++++++++++++++++---
 tests/test_migrate.py | 101 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 167 insertions(+), 7 deletions(-)

diff --git a/mempalace/migrate.py b/mempalace/migrate.py
index 76aa054..5b74591 100644
--- a/mempalace/migrate.py
+++ b/mempalace/migrate.py
@@ -22,6 +22,7 @@ import errno
 import os
 import shutil
 import sqlite3
+import uuid
 from collections import defaultdict
 from datetime import datetime
 
@@ -155,6 +156,55 @@ def confirm_destructive_action(
     return True
 
 
+def _result_ids(result) -> list:
+    """Return ids from either the backend typed result or raw Chroma dict."""
+
+    if isinstance(result, dict):
+        return list(result.get("ids") or [])
+
+    return list(getattr(result, "ids", []) or [])
+
+
+def collection_write_roundtrip_works(col) -> bool:
+    """Return True only if the collection can upsert, read, and delete.
+
+    Some ChromaDB 0.6.x -> 1.5.x migrated collections remain readable while
+    writes and deletes silently no-op. A plain ``count()`` probe misses that
+    failure mode, so migrate must verify an actual write round-trip before
+    deciding that no rebuild is needed.
+    """
+
+    probe_id = f"_mempalace_migrate_probe_{uuid.uuid4().hex}"
+    probe_doc = "mempalace migrate write round-trip probe"
+    probe_meta = {
+        "wing": "_mempalace_probe",
+        "room": "_mempalace_probe",
+        "source_file": "mempalace_migrate_probe",
+        "chunk_index": 0,
+    }
+
+    try:
+        col.upsert(
+            ids=[probe_id],
+            documents=[probe_doc],
+            metadatas=[probe_meta],
+        )
+
+        after_upsert = col.get(ids=[probe_id], include=[])
+        if probe_id not in _result_ids(after_upsert):
+            return False
+
+        col.delete(ids=[probe_id])
+
+        after_delete = col.get(ids=[probe_id], include=[])
+        if probe_id in _result_ids(after_delete):
+            return False
+
+        return True
+    except Exception:
+        return False
+
+
 def migrate(palace_path: str, dry_run: bool = False, confirm: bool = False):
     """Migrate a palace to the currently installed ChromaDB version."""
     from .backends.chroma import ChromaBackend
@@ -179,16 +229,27 @@ def migrate(palace_path: str, dry_run: bool = False, confirm: bool = False):
     print(f"  Source:    ChromaDB {source_version}")
     print(f"  Target:    ChromaDB {target_version}")
 
-    # Try reading with current chromadb first
+    # Try reading and writing with current chromadb first.
+    #
+    # A plain count() is not enough: some 0.6.x -> 1.5.x migrated collections
+    # are readable but silently drop upsert/delete operations. In that state,
+    # migrate must rebuild from SQLite instead of returning "No migration needed."
     try:
         col = ChromaBackend().get_collection(palace_path, "mempalace_drawers")
         count = col.count()
-        print(f"\n  Palace is already readable by chromadb {target_version}.")
-        print(f"  {count} drawers found. No migration needed.")
-        return True
+
+        if collection_write_roundtrip_works(col):
+            print(f"\n Palace is already readable and writable by chromadb {target_version}.")
+            print(f" {count} drawers found. No migration needed.")
+            return True
+
+        print(
+            f"\n Palace is readable by chromadb {target_version}, but write/delete verification failed."
+        )
+        print(" Rebuilding from SQLite to restore native write/delete behavior...")
     except Exception:
-        print(f"\n  Palace is NOT readable by chromadb {target_version}.")
-        print("  Extracting from SQLite directly...")
+        print(f"\n Palace is NOT readable by chromadb {target_version}.")
+        print(" Extracting from SQLite directly...")
 
     # Extract all drawers via raw SQL
     drawers = extract_drawers_from_sqlite(db_path)
diff --git a/tests/test_migrate.py b/tests/test_migrate.py
index 4701048..ba12ff5 100644
--- a/tests/test_migrate.py
+++ b/tests/test_migrate.py
@@ -4,7 +4,7 @@ import os
 from types import SimpleNamespace
 from unittest.mock import MagicMock, patch
 
-from mempalace.migrate import _restore_stale_palace, migrate
+from mempalace.migrate import collection_write_roundtrip_works, _restore_stale_palace, migrate
 
 
 def test_migrate_requires_palace_database(tmp_path, capsys):
@@ -101,3 +101,102 @@ def test_restore_stale_palace_logs_and_swallows_on_failure(tmp_path, capsys):
     assert "CRITICAL" in out
     assert os.fspath(palace_path) in out
     assert os.fspath(stale_path) in out
+
+
+class _FakeGetResult:
+    def __init__(self, ids):
+        self.ids = ids
+
+
+class _WritableFakeCollection:
+    def __init__(self):
+        self.ids = set()
+        self.deleted = []
+
+    def upsert(self, *, ids, documents, metadatas):
+        self.ids.update(ids)
+
+    def get(self, *, ids, include=None):
+        return _FakeGetResult([drawer_id for drawer_id in ids if drawer_id in self.ids])
+
+    def delete(self, *, ids=None, where=None):
+        for drawer_id in ids or []:
+            self.ids.discard(drawer_id)
+            self.deleted.append(drawer_id)
+
+
+class _SilentWriteDropCollection(_WritableFakeCollection):
+    def upsert(self, *, ids, documents, metadatas):
+        return None
+
+
+class _SilentDeleteDropCollection(_WritableFakeCollection):
+    def delete(self, *, ids=None, where=None):
+        self.deleted.extend(ids or [])
+
+
+def test_collection_write_roundtrip_works_when_probe_persists_and_deletes():
+    col = _WritableFakeCollection()
+
+    assert collection_write_roundtrip_works(col) is True
+    assert col.ids == set()
+    assert len(col.deleted) == 1
+
+
+def test_collection_write_roundtrip_fails_when_upsert_silently_drops():
+    col = _SilentWriteDropCollection()
+
+    assert collection_write_roundtrip_works(col) is False
+    assert col.ids == set()
+
+
+def test_collection_write_roundtrip_fails_when_delete_silently_drops():
+    col = _SilentDeleteDropCollection()
+
+    assert collection_write_roundtrip_works(col) is False
+    assert len(col.ids) == 1
+
+
+def test_migrate_dry_run_rebuilds_when_collection_is_readable_but_not_writable(tmp_path, capsys):
+    palace_dir = tmp_path / "palace"
+    palace_dir.mkdir()
+    (palace_dir / "chroma.sqlite3").write_text("db")
+
+    fake_col = MagicMock()
+    fake_col.count.return_value = 102
+
+    drawers = [
+        {
+            "id": "id1",
+            "document": "hello",
+            "metadata": {"wing": "test-wing", "room": "general"},
+        }
+    ]
+
+    with (
+        patch("mempalace.migrate.detect_chromadb_version", return_value="1.x"),
+        patch("mempalace.backends.chroma.ChromaBackend") as mock_backend,
+        patch(
+            "mempalace.migrate.collection_write_roundtrip_works", return_value=False
+        ) as mock_probe,
+        patch(
+            "mempalace.migrate.extract_drawers_from_sqlite", return_value=drawers
+        ) as mock_extract,
+    ):
+        mock_backend.backend_version.return_value = "1.5.8"
+        mock_backend.return_value.get_collection.return_value = fake_col
+
+        result = migrate(str(palace_dir), dry_run=True)
+
+    out = capsys.readouterr().out
+
+    assert result is True
+    mock_probe.assert_called_once_with(fake_col)
+    mock_extract.assert_called_once_with(
+        os.path.join(os.path.abspath(os.fspath(palace_dir)), "chroma.sqlite3")
+    )
+
+    assert "readable by chromadb 1.5.8, but write/delete verification failed" in out
+    assert "Rebuilding from SQLite" in out
+    assert "Extracted 1 drawers from SQLite" in out
+    assert "DRY RUN" in out

From 2c0ef2c04e8e1987c98dac87c4a5950f9f86c0f3 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Wed, 6 May 2026 01:38:57 -0300
Subject: [PATCH 33/60] docs(changelog): document v3.3.5 fixes from #1214 #1105
 #1215 #1107 #1282 #1167 #1160

Bundled CHANGELOG entries for the seven Tier-1 PRs merged today, including
the behavior-change call-out for #1167 (KG date validators now reject
non-ISO inputs that previously produced silent empty results).
---
 CHANGELOG.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d3982fe..ba6822a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,13 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 ### Bug Fixes
 
 - **`mempalace_diary_read` silently dropped entries on agent-name case mismatch.** `tool_diary_write` stored the `agent` metadata verbatim after `sanitize_name`, which preserves case, while `tool_diary_read` filtered by exact match. Writing as `"Claude"` and reading as `"claude"` (or vice-versa) returned zero rows. Both endpoints now lowercase `agent_name` immediately after sanitization, so reads are case-insensitive and the default per-agent wing slug is stable across casings. **Behavior change:** entries written prior to this fix under mixed-case agent names will not match the new lowercase filter; run `mempalace repair` if you need to migrate legacy diary metadata. (#1243)
+- **Knowledge-graph triples with `valid_to < valid_from` were silently invisible.** `KnowledgeGraph.query_entity()` filters with `valid_from <= as_of AND valid_to >= as_of`, so an inverted interval matches no `as_of` and the row is durably stored but unreachable — a P0 data-integrity foot-gun any caller that mixes up the two date params can hit. `add_triple()` now rejects inverted intervals at write time with a clear `ValueError` naming both bounds. Open intervals (one bound only) and point-in-time facts (`valid_from == valid_to`) remain accepted unchanged. (#1214)
+- **`ChromaBackend.close_palace()` / `close()` did not release the SQLite file lock.** Evicted clients sat in `_clients` without `close()`, and chromadb 1.5.x retains the rust-side SQLite lock until GC. Reopening the same palace path after `shutil.rmtree` + recreate within one process failed with `SQLITE_READONLY_DBMOVED` (code 1032). New `_close_client()` helper now calls `PersistentClient.close()` (with a try/except fallback for older chromadb) on `close_palace()`, on whole-backend `close()`, and on the `_client()` invalidation path that detects a missing `chroma.sqlite3`. The mtime/inode auto-invalidation branch is intentionally left alone — callers there may still hold a live `ChromaCollection`. (#1067, #1105)
+- **`EntityRegistry.save()` could leave a corrupt or empty `entity_registry.json` on crash.** `Path.write_text()` is not atomic — kernel sees `open('w')` (truncate), `write`, `close`, and any failure between truncate and full-flush (power loss, OOM, FS-full, kill -9) wipes the months-of-mining people/projects map silently (the registry's `load()` swallows `JSONDecodeError`). Save now writes to a sibling `.tmp` in the same directory, `fsync`s, `chmod 0o600`s, then `os.replace()`s into place — atomic on POSIX and Windows. The previous registry stays intact on any crash before the rename returns. (#1215)
+- **`mempalace compress` crashed on large palaces.** `regenerate_closets` fetched all closet_llm drawers in a single `col.get()`, which trips `SQLITE_MAX_VARIABLE_NUMBER` on palaces above ~32k drawers. Mirrors the #851 fix in `miner.py`: drawer fetch is now paginated at `batch_size=5000`. Per-source aggregation works across batches, so the LLM regeneration call still groups chunks correctly. (#1073, #1107)
+- **CLI and `fact_checker --stdin` mojibaked non-ASCII content on Windows.** Python defaults `sys.stdin`/`stdout`/`stderr` to the system ANSI codepage (cp1252/cp1251/cp950), so `mempalace search > out.txt` and piped fact_checker invocations corrupted Cyrillic / CJK drawer text at the process boundary. New `mempalace/_stdio.py` helper reconfigures all three streams to UTF-8 on `sys.platform == "win32"`, with per-stream `errors` policy: `surrogateescape` on stdin (preserves bad bytes from redirected files for the consumer's parser), `replace` on stdout/stderr (substitutes U+FFFD instead of `UnicodeEncodeError`-ing mid-print). With this, all three user-facing console_scripts (`mcp_server`, `hooks_cli`, `cli`/`fact_checker`) now reconfigure identically on Windows. (#1282)
+- **MCP knowledge-graph tools forwarded malformed date strings to SQLite.** `tool_kg_query` (`as_of`), `tool_kg_add` (`valid_from`), and `tool_kg_invalidate` (`ended`) accepted any string and produced empty result sets on natural-language inputs like `"March 2026"` or `"yesterday"` — callers (especially LLM agents) could not distinguish "no fact at this time" from "your date format was unrecognized." New `sanitize_iso_date()` validator in `config.py` accepts `YYYY`, `YYYY-MM`, `YYYY-MM-DD` (and passes through `None`/`""`); all three tools call it before values reach the storage layer. **Behavior change:** previously-silent date typos now raise a clear `ValueError` naming the offending field; full ISO-8601 with time (`YYYY-MM-DDTHH:MM:SS`, timezone offsets) is not yet accepted — file an issue if you have a use case. (#1164, #1167)
+- **MCP server's `_kg` was a module-level singleton.** Multi-tenant hosts that rotate `MEMPALACE_PALACE_PATH` between tool calls hit the wrong sqlite file, because the KG was constructed once at import time while the ChromaDB side was already per-call via `_get_client()`. The KG is now resolved per-call through a lazy per-path cache (`_kg_by_path` keyed by `os.path.abspath`, with a double-checked-locking init under `_kg_cache_lock`). `tool_reconnect` drains and `close()`s cached KGs alongside the existing chroma reconnect. A `_call_kg` retry guard catches `sqlite3.ProgrammingError` once after a reconnect race. (#1136, #1160)
 
 ---
 

From d1e27b8c42f3892046bb20f950d7b2e04726e230 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Wed, 6 May 2026 01:47:46 -0300
Subject: [PATCH 34/60] style: ruff format new test files (CI lint)

---
 tests/test_chroma_collection_lock.py | 18 ++++++------------
 tests/test_palace_locks.py           |  6 +++---
 2 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/tests/test_chroma_collection_lock.py b/tests/test_chroma_collection_lock.py
index b5d30fb..536b5e8 100644
--- a/tests/test_chroma_collection_lock.py
+++ b/tests/test_chroma_collection_lock.py
@@ -255,12 +255,8 @@ def test_concurrent_writers_serialize(tmp_path, monkeypatch):
     ctx = _get_mp_context()
     result_q = ctx.Queue()
 
-    p1 = ctx.Process(
-        target=_slow_writer_target, args=(palace, str(tmp_path), 1, result_q)
-    )
-    p2 = ctx.Process(
-        target=_slow_writer_target, args=(palace, str(tmp_path), 2, result_q)
-    )
+    p1 = ctx.Process(target=_slow_writer_target, args=(palace, str(tmp_path), 1, result_q))
+    p2 = ctx.Process(target=_slow_writer_target, args=(palace, str(tmp_path), 2, result_q))
     p1.start()
     # Tiny stagger so p1 wins the race deterministically; without it the
     # OS scheduler can pick either, which is also a valid outcome but
@@ -272,9 +268,7 @@ def test_concurrent_writers_serialize(tmp_path, monkeypatch):
 
     outcomes = [result_q.get(timeout=1) for _ in range(2)]
     statuses = sorted(o[0] for o in outcomes)
-    assert statuses == ["busy", "ok"], (
-        f"expected one ok + one busy, got {outcomes}"
-    )
+    assert statuses == ["busy", "ok"], f"expected one ok + one busy, got {outcomes}"
 
 
 def test_read_path_does_not_acquire_lock(tmp_path, monkeypatch):
@@ -319,9 +313,9 @@ def test_read_path_does_not_acquire_lock(tmp_path, monkeypatch):
             if method is None:
                 continue
             src = inspect.getsource(method)
-            assert "_write_lock" not in src, (
-                f"{read_attr} must NOT acquire the write lock (read path)"
-            )
+            assert (
+                "_write_lock" not in src
+            ), f"{read_attr} must NOT acquire the write lock (read path)"
     finally:
         open(release, "w").close()
         holder.join(timeout=5)
diff --git a/tests/test_palace_locks.py b/tests/test_palace_locks.py
index 39aa50c..d239757 100644
--- a/tests/test_palace_locks.py
+++ b/tests/test_palace_locks.py
@@ -194,9 +194,9 @@ def test_reentrant_same_thread_passes_through(tmp_path, monkeypatch):
         child = ctx.Process(target=_try_acquire_expect_busy, args=(palace, result_q))
         child.start()
         child.join(timeout=5)
-        assert result_q.get(timeout=1) == "busy", (
-            "outer lock should still be held by parent after inner re-entrant exit"
-        )
+        assert (
+            result_q.get(timeout=1) == "busy"
+        ), "outer lock should still be held by parent after inner re-entrant exit"
 
 
 def _try_acquire_expect_busy(palace_path, result_q):

From f854da779fef5634e5719a45b59b3c18acb6270b Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Wed, 6 May 2026 01:57:44 -0300
Subject: [PATCH 35/60] fix(lint): hoist hooks_cli_mod import to top of
 test_hooks_cli (E402)

The alias was placed below an explanatory comment block introduced by
#1305, which trips ruff E402 (module-level import not at top of file).
Moved next to the existing 'from mempalace.hooks_cli import (...)' line.

CI lint went red on develop after #1305 merged with the failing check;
this re-greens it so subsequent PRs do not inherit the failure.
---
 tests/test_hooks_cli.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/test_hooks_cli.py b/tests/test_hooks_cli.py
index 487acf7..19ecbaf 100644
--- a/tests/test_hooks_cli.py
+++ b/tests/test_hooks_cli.py
@@ -8,6 +8,7 @@ from unittest.mock import MagicMock, patch
 
 import pytest
 
+import mempalace.hooks_cli as hooks_cli_mod
 from mempalace.hooks_cli import (
     SAVE_INTERVAL,
     _count_human_messages,
@@ -969,9 +970,6 @@ def test_stop_hook_rejects_injected_stop_hook_active(tmp_path):
 # STATE_DIR.mkdir() on its own.
 
 
-import mempalace.hooks_cli as hooks_cli_mod
-
-
 def _redirect_palace_root(monkeypatch, tmp_path):
     """Point PALACE_ROOT and STATE_DIR at a tmp location that does NOT exist."""
     fake_root = tmp_path / "absent-mempalace"

From 733e4353321678ea1a55c093ad1b3d1d5e8c5313 Mon Sep 17 00:00:00 2001
From: Chris Antenesse <cantenesse@predictap.com>
Date: Sat, 18 Apr 2026 15:08:01 -0500
Subject: [PATCH 36/60] fix(searcher): guard against None metadata/doc in
 search result loops

ChromaDB can return None entries in metadatas/documents lists under
partial-flush, mid-delete, upgrade-boundary, and interrupted-mine
states. Add `meta = meta or {}` and `doc = doc or ""` guards in the
three result loops (search display, closet hybrid, drawer scored) so
.get() and .strip() calls never crash on None.

Fixes #1007, #1011
---
 mempalace/searcher.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mempalace/searcher.py b/mempalace/searcher.py
index d615623..16ea4eb 100644
--- a/mempalace/searcher.py
+++ b/mempalace/searcher.py
@@ -340,7 +340,7 @@ def search(query: str, palace_path: str, wing: str = None, room: str = None, n_r
     # `_hybrid_rank`; do the same here so CLI results match what agents
     # see via `mempalace_search`.
     hits = [
-        {"text": doc, "distance": float(dist), "metadata": meta or {}}
+        {"text": doc or "", "distance": float(dist), "metadata": meta or {}}
         for doc, meta, dist in zip(docs, metas, dists)
     ]
     hits = _hybrid_rank(hits, query)
@@ -809,6 +809,8 @@ def search_memories(
         _first_or_empty(drawer_results, "metadatas"),
         _first_or_empty(drawer_results, "distances"),
     ):
+        meta = meta or {}
+        doc = doc or ""
         # Filter on raw distance before rounding to avoid precision loss.
         if max_distance > 0.0 and dist > max_distance:
             continue

From 5347c2c71c782d04fc2023504f58dc52c2369187 Mon Sep 17 00:00:00 2001
From: eldar702 <eldarshlomi7@gmail.com>
Date: Sun, 19 Apr 2026 11:08:45 +0300
Subject: [PATCH 37/60] fix(searcher): clamp effective_distance to valid cosine
 range [0, 2]
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

``search_memories`` computes ``effective_dist = dist - boost`` where
``boost`` can be as large as ``CLOSET_RANK_BOOSTS[0] == 0.40`` for a
rank-0 closet hit. When the raw drawer distance is small — any
near-exact match — the subtraction goes negative.

Two downstream effects:

1. Line 418 returns ``round(max(0.0, 1 - effective_dist), 3)`` as
   ``similarity``. With ``effective_dist = -0.30`` that yields
   ``similarity = 1.30``, outside the documented ``[0, 1]`` range.
   The ``max(0.0, ...)`` only prevents negative similarities; it does
   not cap above 1.
2. Line 427 stores ``_sort_key: effective_dist`` and line 435 sorts
   ``scored`` ascending by that key. A negative key drops *below* the
   rest, so the strongest hybrid matches end up sorting after weaker
   ones — ranking inversion under the exact conditions hybrid retrieval
   is supposed to serve best.

Clamp ``effective_dist`` to the valid cosine-distance range ``[0, 2]``.
The boost still wins (closet-backed hit still ranks first), it just no
longer flips the order.

Test added: mock drawer_col (base dist 0.08 / 0.35 for two sources) +
closet_col (rank-0 closet for the 0.08 source) → assert all hits have
``0 <= similarity <= 1`` and ``0 <= effective_distance <= 2``, and that
the closet-boosted source still ranks first.

Relationship to other PRs:

* **#988** clamps the output ``similarity`` alone. That does not fix
  the sort-key inversion or the invalid ``effective_distance`` in the
  returned dict. This PR clamps at the arithmetic source so both
  downstream users of the value stay in range.
* Orthogonal to **#979** (``tool_check_duplicate`` negative similarity).
---
 mempalace/searcher.py  |  7 +++++-
 tests/test_searcher.py | 57 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/mempalace/searcher.py b/mempalace/searcher.py
index d615623..e99fc5d 100644
--- a/mempalace/searcher.py
+++ b/mempalace/searcher.py
@@ -825,7 +825,12 @@ def search_memories(
                 matched_via = "drawer+closet"
                 closet_preview = c_preview
 
-        effective_dist = dist - boost
+        # Clamp to the valid cosine-distance range [0, 2]. When a strong
+        # closet boost (up to 0.40) exceeds the raw distance, the subtraction
+        # can go negative — which (a) yields ``similarity > 1.0`` downstream
+        # and (b) makes the sort key land *below* ordinary positive distances,
+        # inverting the ranking so the best hybrid matches sort last.
+        effective_dist = max(0.0, min(2.0, dist - boost))
         entry = {
             "text": doc,
             "wing": meta.get("wing", "unknown"),
diff --git a/tests/test_searcher.py b/tests/test_searcher.py
index 6b85832..ad60641 100644
--- a/tests/test_searcher.py
+++ b/tests/test_searcher.py
@@ -120,6 +120,63 @@ class TestSearchMemories:
         assert none_hit["wing"] == "unknown"
         assert none_hit["room"] == "unknown"
 
+    def test_effective_distance_clamped_to_valid_cosine_range(self):
+        """A strong closet boost (up to 0.40) applied to a low-distance drawer
+        can drive ``dist - boost`` negative. That violates the cosine-distance
+        invariant ``[0, 2]``: the API returns ``similarity > 1.0`` and the
+        internal ``_sort_key`` sinks below ordinary positive distances,
+        inverting the ranking so the best hybrid matches sort last.
+
+        With the clamp, ``effective_distance`` stays in ``[0, 2]``,
+        ``similarity`` stays in ``[0, 1]``, and the sort order is stable.
+        """
+        # Drawer a.md gets a tiny base distance (0.08) — nearly exact match.
+        # Drawer b.md gets a larger base distance (0.35).
+        drawers_col = MagicMock()
+        drawers_col.query.return_value = {
+            "documents": [["doc-a", "doc-b"]],
+            "metadatas": [[
+                {"source_file": "a.md", "wing": "w", "room": "r", "chunk_index": 0},
+                {"source_file": "b.md", "wing": "w", "room": "r", "chunk_index": 0},
+            ]],
+            "distances": [[0.08, 0.35]],
+            "ids": [["d-a", "d-b"]],
+        }
+        # A strong closet at rank 0 points at a.md → boost = 0.40,
+        # which exceeds a.md's base distance and would go negative without
+        # the clamp. No closet for b.md.
+        closets_col = MagicMock()
+        closets_col.query.return_value = {
+            "documents": [["closet-preview-a"]],
+            "metadatas": [[{"source_file": "a.md"}]],
+            "distances": [[0.2]],  # within CLOSET_DISTANCE_CAP (1.5)
+            "ids": [["c-a"]],
+        }
+
+        with (
+            patch("mempalace.searcher.get_collection", return_value=drawers_col),
+            patch("mempalace.searcher.get_closets_collection", return_value=closets_col),
+        ):
+            result = search_memories("query", "/fake/path", n_results=5)
+
+        hits = result["results"]
+        assert hits, "should return results"
+
+        # Invariants on every hit.
+        for h in hits:
+            assert 0.0 <= h["similarity"] <= 1.0, (
+                f"similarity out of range: {h['similarity']} for {h['source_file']}"
+            )
+            assert 0.0 <= h["effective_distance"] <= 2.0, (
+                f"effective_distance out of range: {h['effective_distance']} "
+                f"for {h['source_file']}"
+            )
+
+        # With the clamp, the closet-boosted a.md still ranks ahead of b.md —
+        # the boost still wins, but it no longer flips the ranking.
+        assert hits[0]["source_file"] == "a.md"
+        assert hits[0]["matched_via"] == "drawer+closet"
+
 
 # ── BM25 internals: None / empty document safety ─────────────────────
 

From aac8437979a562ab5b0c14541a22da4996e03250 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Wed, 6 May 2026 01:53:59 -0300
Subject: [PATCH 38/60] style: ruff format tests/test_searcher.py (CI lint)

---
 tests/test_searcher.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/tests/test_searcher.py b/tests/test_searcher.py
index ad60641..4f0b4c0 100644
--- a/tests/test_searcher.py
+++ b/tests/test_searcher.py
@@ -135,10 +135,12 @@ class TestSearchMemories:
         drawers_col = MagicMock()
         drawers_col.query.return_value = {
             "documents": [["doc-a", "doc-b"]],
-            "metadatas": [[
-                {"source_file": "a.md", "wing": "w", "room": "r", "chunk_index": 0},
-                {"source_file": "b.md", "wing": "w", "room": "r", "chunk_index": 0},
-            ]],
+            "metadatas": [
+                [
+                    {"source_file": "a.md", "wing": "w", "room": "r", "chunk_index": 0},
+                    {"source_file": "b.md", "wing": "w", "room": "r", "chunk_index": 0},
+                ]
+            ],
             "distances": [[0.08, 0.35]],
             "ids": [["d-a", "d-b"]],
         }
@@ -164,9 +166,9 @@ class TestSearchMemories:
 
         # Invariants on every hit.
         for h in hits:
-            assert 0.0 <= h["similarity"] <= 1.0, (
-                f"similarity out of range: {h['similarity']} for {h['source_file']}"
-            )
+            assert (
+                0.0 <= h["similarity"] <= 1.0
+            ), f"similarity out of range: {h['similarity']} for {h['source_file']}"
             assert 0.0 <= h["effective_distance"] <= 2.0, (
                 f"effective_distance out of range: {h['effective_distance']} "
                 f"for {h['source_file']}"

From 0fdb480e12caaf894838886ca6db2ad68e3d67b6 Mon Sep 17 00:00:00 2001
From: Oleksii Pylypchuk <alpi@keemail.me>
Date: Sat, 18 Apr 2026 01:55:43 +0300
Subject: [PATCH 39/60] fix(mcp): handle null JSON-RPC request payloads safely

When the MCP client sends a malformed or null top-level request, prevent the AttributeError on request.get() by explicitly validating that the request is a dictionary. Returns standard JSON-RPC Error -32600 (Invalid Request) instead of crashing the server.
---
 mempalace/mcp_server.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index ca71f60..71a96df 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -1968,6 +1968,8 @@ SUPPORTED_PROTOCOL_VERSIONS = [
 
 
 def handle_request(request):
+    if not isinstance(request, dict):
+        return {"jsonrpc": "2.0", "error": {"code": -32600, "message": "Invalid Request"}}
     method = request.get("method") or ""
     params = request.get("params") or {}
     req_id = request.get("id")

From 55d79dc8cd03d408644b05cb44aa277edaf2eb86 Mon Sep 17 00:00:00 2001
From: Oleksii Pylypchuk <alpi@keemail.me>
Date: Sat, 18 Apr 2026 16:53:53 +0300
Subject: [PATCH 40/60] fix: include null id in JSON-RPC invalid request error
 responses and add validation tests

---
 mempalace/mcp_server.py  | 2 +-
 tests/test_mcp_server.py | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index 71a96df..227e91a 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -1969,7 +1969,7 @@ SUPPORTED_PROTOCOL_VERSIONS = [
 
 def handle_request(request):
     if not isinstance(request, dict):
-        return {"jsonrpc": "2.0", "error": {"code": -32600, "message": "Invalid Request"}}
+        return {"jsonrpc": "2.0", "id": None, "error": {"code": -32600, "message": "Invalid Request"}}
     method = request.get("method") or ""
     params = request.get("params") or {}
     req_id = request.get("id")
diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index b036afd..0a020d3 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -190,6 +190,13 @@ class TestHandleRequest:
         resp = handle_request({"method": None, "id": 99, "params": {}})
         assert resp["error"]["code"] == -32601
 
+    @pytest.mark.parametrize("payload", [None, [], "plain", 42, True])
+    def test_handle_request_invalid_payload_returns_jsonrpc_error(self, payload):
+        from mempalace.mcp_server import handle_request
+
+        resp = handle_request(payload)
+        assert resp == {"jsonrpc": "2.0", "id": None, "error": {"code": -32600, "message": "Invalid Request"}}
+
     def test_tools_call_dispatches(self, monkeypatch, config, palace_path, seeded_kg):
         _patch_mcp_server(monkeypatch, config, seeded_kg)
         from mempalace.mcp_server import handle_request

From a85d432b544060795ff57796d713c597122f6293 Mon Sep 17 00:00:00 2001
From: Oleksii Pylypchuk <alpi@keemail.me>
Date: Sat, 18 Apr 2026 22:05:34 +0300
Subject: [PATCH 41/60] feat: add validation for missing name parameter in
 tools/call requests

---
 mempalace/mcp_server.py  |  6 ++++++
 tests/test_mcp_server.py | 14 ++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index 227e91a..25439f2 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -2007,6 +2007,12 @@ def handle_request(request):
             },
         }
     elif method == "tools/call":
+        if not isinstance(params, dict) or "name" not in params:
+            return {
+                "jsonrpc": "2.0",
+                "id": req_id,
+                "error": {"code": -32602, "message": "Invalid params: 'name' is required for tools/call"},
+            }
         tool_name = params.get("name")
         tool_args = params.get("arguments") or {}
         if tool_name not in TOOLS:
diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index 0a020d3..fc56b34 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -148,6 +148,20 @@ class TestHandleRequest:
         )
         assert resp["error"]["code"] == -32601
 
+    def test_tools_call_missing_params(self):
+        from mempalace.mcp_server import handle_request
+
+        for bad_params in [None, {}, {"arguments": {}}]:
+            resp = handle_request(
+                {
+                    "method": "tools/call",
+                    "id": 15,
+                    "params": bad_params,
+                }
+            )
+            assert resp["error"]["code"] == -32602
+            assert "Invalid params" in resp["error"]["message"]
+
     def test_unknown_method(self):
         from mempalace.mcp_server import handle_request
 

From 869ab3809570fa578e9dd634ec31b0b1817dddc7 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Wed, 6 May 2026 01:54:11 -0300
Subject: [PATCH 42/60] style: ruff format mcp_server.py + test_mcp_server.py
 (CI lint)

---
 mempalace/mcp_server.py  | 11 +++++++++--
 tests/test_mcp_server.py |  6 +++++-
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index 25439f2..30a0bea 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -1969,7 +1969,11 @@ SUPPORTED_PROTOCOL_VERSIONS = [
 
 def handle_request(request):
     if not isinstance(request, dict):
-        return {"jsonrpc": "2.0", "id": None, "error": {"code": -32600, "message": "Invalid Request"}}
+        return {
+            "jsonrpc": "2.0",
+            "id": None,
+            "error": {"code": -32600, "message": "Invalid Request"},
+        }
     method = request.get("method") or ""
     params = request.get("params") or {}
     req_id = request.get("id")
@@ -2011,7 +2015,10 @@ def handle_request(request):
             return {
                 "jsonrpc": "2.0",
                 "id": req_id,
-                "error": {"code": -32602, "message": "Invalid params: 'name' is required for tools/call"},
+                "error": {
+                    "code": -32602,
+                    "message": "Invalid params: 'name' is required for tools/call",
+                },
             }
         tool_name = params.get("name")
         tool_args = params.get("arguments") or {}
diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index fc56b34..c073830 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -209,7 +209,11 @@ class TestHandleRequest:
         from mempalace.mcp_server import handle_request
 
         resp = handle_request(payload)
-        assert resp == {"jsonrpc": "2.0", "id": None, "error": {"code": -32600, "message": "Invalid Request"}}
+        assert resp == {
+            "jsonrpc": "2.0",
+            "id": None,
+            "error": {"code": -32600, "message": "Invalid Request"},
+        }
 
     def test_tools_call_dispatches(self, monkeypatch, config, palace_path, seeded_kg):
         _patch_mcp_server(monkeypatch, config, seeded_kg)

From 7b49478ef7049d124ae6f35e55da2f77cadfddf6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=BB=84=E7=A5=96=E9=91=AB=28940219=29?= <940219@nd.com.cn>
Date: Fri, 1 May 2026 20:46:36 +0800
Subject: [PATCH 43/60] fix: MCP server JSON output ensure_ascii=False for
 non-ASCII support

Without ensure_ascii=False, non-ASCII characters (e.g. Chinese) in tool
results and JSON-RPC responses are escaped as \uXXXX, which causes
downstream MCP clients to receive escaped text instead of the original
characters. This affects all platforms, not just Windows.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 mempalace/mcp_server.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index ca71f60..f0973d3 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -2053,7 +2053,7 @@ def handle_request(request):
             return {
                 "jsonrpc": "2.0",
                 "id": req_id,
-                "result": {"content": [{"type": "text", "text": json.dumps(result, indent=2)}]},
+                "result": {"content": [{"type": "text", "text": json.dumps(result, indent=2, ensure_ascii=False)}]},
             }
         except Exception:
             logger.exception(f"Tool error in {tool_name}")
@@ -2114,7 +2114,7 @@ def main():
             request = json.loads(line)
             response = handle_request(request)
             if response is not None:
-                sys.stdout.write(json.dumps(response) + "\n")
+                sys.stdout.write(json.dumps(response, ensure_ascii=False) + "\n")
                 sys.stdout.flush()
         except KeyboardInterrupt:
             break

From 74288f1cdd92f6717186865caa1412d43a13fbc7 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Wed, 6 May 2026 01:54:24 -0300
Subject: [PATCH 44/60] style: ruff format mcp_server.py (CI lint)

---
 mempalace/mcp_server.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index f0973d3..fa69aa5 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -2053,7 +2053,11 @@ def handle_request(request):
             return {
                 "jsonrpc": "2.0",
                 "id": req_id,
-                "result": {"content": [{"type": "text", "text": json.dumps(result, indent=2, ensure_ascii=False)}]},
+                "result": {
+                    "content": [
+                        {"type": "text", "text": json.dumps(result, indent=2, ensure_ascii=False)}
+                    ]
+                },
             }
         except Exception:
             logger.exception(f"Tool error in {tool_name}")

From eef053d75093645faae6348c08b17d1f6d733a5e Mon Sep 17 00:00:00 2001
From: bobo-xxx <111567133+bobo-xxx@users.noreply.github.com>
Date: Sat, 18 Apr 2026 12:40:58 +0800
Subject: [PATCH 45/60] fix(mcp_server): clamp similarity to [0,1] to avoid
 negative values

---
 mempalace/mcp_server.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index ca71f60..dbe4497 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -740,7 +740,7 @@ def tool_check_duplicate(content: str, threshold: float = 0.9):
         if results["ids"] and results["ids"][0]:
             for i, drawer_id in enumerate(results["ids"][0]):
                 dist = results["distances"][0][i]
-                similarity = round(1 - dist, 3)
+                similarity = round(max(0.0, 1 - dist), 3)
                 if similarity >= threshold:
                     # Chroma 1.5.x can return None for partially-flushed rows;
                     # coerce to empty sentinels so downstream .get() is safe.

From f2bed9284fccaec51dd78266f9113db41a9e966f Mon Sep 17 00:00:00 2001
From: bobo-xxx <111567133+bobo-xxx@users.noreply.github.com>
Date: Sat, 18 Apr 2026 12:41:12 +0800
Subject: [PATCH 46/60] fix(layers): clamp similarity to [0,1] to avoid
 negative values

---
 mempalace/layers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mempalace/layers.py b/mempalace/layers.py
index b20c656..d549afe 100644
--- a/mempalace/layers.py
+++ b/mempalace/layers.py
@@ -287,7 +287,7 @@ class Layer3:
         for i, (doc, meta, dist) in enumerate(zip(docs, metas, dists), 1):
             meta = meta or {}
             doc = doc or ""
-            similarity = round(1 - dist, 3)
+            similarity = round(max(0.0, 1 - dist), 3)
             wing_name = meta.get("wing", "?")
             room_name = meta.get("room", "?")
             source = Path(meta.get("source_file", "")).name if meta.get("source_file") else ""

From b68485dfd4673848bcd44064a0c8764d89f6b48a Mon Sep 17 00:00:00 2001
From: Anthony Clendenen <anthony.clendenen@gmail.com>
Date: Thu, 23 Apr 2026 13:33:28 -0700
Subject: [PATCH 47/60] fix(closet_llm): reject non-http(s) endpoints

LLMConfig accepted any URL scheme from LLM_ENDPOINT / --endpoint,
so a misconfigured endpoint such as file:///etc/passwd would be
passed straight to urllib.request.urlopen. Validate the scheme at
construction time and raise ValueError on anything other than
http/https, preserving the "privacy by architecture" guarantee.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 mempalace/closet_llm.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/mempalace/closet_llm.py b/mempalace/closet_llm.py
index 6274f79..50000c8 100644
--- a/mempalace/closet_llm.py
+++ b/mempalace/closet_llm.py
@@ -40,6 +40,7 @@ import json
 import os
 import re
 import time
+import urllib.parse
 import urllib.request
 import urllib.error
 from datetime import datetime
@@ -101,6 +102,14 @@ class LLMConfig:
         self.endpoint = (endpoint or os.environ.get("LLM_ENDPOINT", "")).rstrip("/")
         self.key = key or os.environ.get("LLM_KEY", "")
         self.model = model or os.environ.get("LLM_MODEL", "")
+        if self.endpoint:
+            # Privacy-by-architecture: reject file:// and other non-HTTP schemes
+            # so a misconfigured endpoint cannot exfiltrate local files.
+            scheme = urllib.parse.urlparse(self.endpoint).scheme.lower()
+            if scheme not in ("http", "https"):
+                raise ValueError(
+                    f"LLM_ENDPOINT must use http:// or https:// (got scheme {scheme!r})"
+                )
 
     def missing(self) -> list:
         missing = []

From ca5899e361a1bc8823145f6d1efad22f22639409 Mon Sep 17 00:00:00 2001
From: Anthony Clendenen <anthony.clendenen@gmail.com>
Date: Thu, 23 Apr 2026 13:33:38 -0700
Subject: [PATCH 48/60] refactor: fix ruff bugbear and silent-except findings

- B904: chain OSError/collection errors with "raise ... from e" in
  normalize.py and searcher.py so the original traceback is preserved.
- B007: rename unused loop variables to _name in dedup, dialect, layers,
  and room_detector_local.
- S110/S112: replace bare "try/except/pass" and "try/except/continue"
  with logger.debug(..., exc_info=True) in mcp_server, searcher,
  palace, palace_graph, miner, convo_miner, and fact_checker so
  background failures are observable without changing behaviour.

A module-level logger ("mempalace_mcp", matching mcp_server/searcher)
is added to the five files that didn't already have one. Configured
ruff checks (E/F/W/C901) and ruff --select B, S110, S112 all pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 mempalace/convo_miner.py         |  5 ++++-
 mempalace/dedup.py               |  2 +-
 mempalace/dialect.py             |  2 +-
 mempalace/fact_checker.py        |  4 ++++
 mempalace/layers.py              |  2 +-
 mempalace/mcp_server.py          |  4 ++--
 mempalace/miner.py               |  5 ++++-
 mempalace/normalize.py           |  4 ++--
 mempalace/palace.py              |  7 +++++--
 mempalace/palace_graph.py        |  2 +-
 mempalace/room_detector_local.py |  2 +-
 mempalace/searcher.py            | 10 ++++++----
 12 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/mempalace/convo_miner.py b/mempalace/convo_miner.py
index 2cf57e4..915b4d1 100644
--- a/mempalace/convo_miner.py
+++ b/mempalace/convo_miner.py
@@ -11,6 +11,7 @@ Same palace as project mining. Different ingest strategy.
 import os
 import sys
 import hashlib
+import logging
 from pathlib import Path
 from datetime import datetime
 from collections import defaultdict
@@ -24,6 +25,8 @@ from .palace import (
     mine_lock,
 )
 
+logger = logging.getLogger("mempalace_mcp")
+
 
 # Cached hall keywords — avoids re-reading config per drawer
 _HALL_KEYWORDS_CACHE = None
@@ -331,7 +334,7 @@ def _file_chunks_locked(collection, source_file, chunks, wing, room, agent, extr
         try:
             collection.delete(where={"source_file": source_file})
         except Exception:
-            pass
+            logger.debug("Stale-drawer purge failed for %s", source_file, exc_info=True)
 
         # Batch chunks into bounded upserts so large transcripts keep most of
         # the embedding speedup without one huge Chroma/SQLite request. Keep
diff --git a/mempalace/dedup.py b/mempalace/dedup.py
index 6b1bac1..5e57aff 100644
--- a/mempalace/dedup.py
+++ b/mempalace/dedup.py
@@ -89,7 +89,7 @@ def dedup_source_group(col, drawer_ids, threshold=DEFAULT_THRESHOLD, dry_run=Tru
     kept = []
     to_delete = []
 
-    for did, doc, meta in items:
+    for did, doc, _meta in items:
         if not doc or len(doc) < 20:
             to_delete.append(did)
             continue
diff --git a/mempalace/dialect.py b/mempalace/dialect.py
index b72c52c..e6e214c 100644
--- a/mempalace/dialect.py
+++ b/mempalace/dialect.py
@@ -873,7 +873,7 @@ class Dialect:
 
         for date_key in sorted(by_date.keys()):
             lines.append(f"=MOMENTS[{date_key}]=")
-            for z, fnum in by_date[date_key]:
+            for z, _fnum in by_date[date_key]:
                 entities = []
                 for p in z.get("people", []):
                     code = self.encode_entity(p)
diff --git a/mempalace/fact_checker.py b/mempalace/fact_checker.py
index 403d913..8f1c3ba 100644
--- a/mempalace/fact_checker.py
+++ b/mempalace/fact_checker.py
@@ -27,6 +27,7 @@ Usage:
 
 from __future__ import annotations
 
+import logging
 import os
 import re
 from datetime import datetime, timezone
@@ -35,6 +36,8 @@ from datetime import datetime, timezone
 # ~/.mempalace/known_entities.json on every check_text call.
 from .miner import _load_known_entities_raw
 
+logger = logging.getLogger("mempalace_mcp")
+
 
 # Narrow detection patterns — parse "X is Y's Z" and "X's Z is Y".
 # Names are captured greedily as word sequences (letters + optional
@@ -214,6 +217,7 @@ def _check_kg_contradictions(text: str, palace_path: str) -> list:
         try:
             facts = kg.query_entity(subject, direction="outgoing")
         except Exception:
+            logger.debug("KG lookup failed for subject %r", subject, exc_info=True)
             continue
         if not facts:
             continue
diff --git a/mempalace/layers.py b/mempalace/layers.py
index d549afe..b92890a 100644
--- a/mempalace/layers.py
+++ b/mempalace/layers.py
@@ -157,7 +157,7 @@ class Layer1:
             lines.append(room_line)
             total_len += len(room_line)
 
-            for imp, meta, doc in entries:
+            for _imp, meta, doc in entries:
                 source = Path(meta.get("source_file", "")).name if meta.get("source_file") else ""
 
                 # Truncate doc to keep L1 compact
diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index 46982bb..58f9ba9 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -900,7 +900,7 @@ def tool_add_drawer(
         if existing and existing["ids"]:
             return {"success": True, "reason": "already_exists", "drawer_id": drawer_id}
     except Exception:
-        pass
+        logger.debug("Idempotency pre-check failed for %s", drawer_id, exc_info=True)
 
     try:
         col.upsert(
@@ -1418,7 +1418,7 @@ def tool_hook_settings(silent_save: bool = None, desktop_toast: bool = None):
     try:
         config = MempalaceConfig()
     except Exception:
-        pass
+        logger.debug("Could not re-read config after update", exc_info=True)
 
     result = {
         "success": True,
diff --git a/mempalace/miner.py b/mempalace/miner.py
index ba0c630..88734c9 100644
--- a/mempalace/miner.py
+++ b/mempalace/miner.py
@@ -12,6 +12,7 @@ import sys
 import shlex
 import hashlib
 import fnmatch
+import logging
 from pathlib import Path
 from datetime import datetime
 from collections import defaultdict
@@ -31,6 +32,8 @@ from .palace import (
     upsert_closet_lines,
 )
 
+logger = logging.getLogger("mempalace_mcp")
+
 READABLE_EXTENSIONS = {
     ".txt",
     ".md",
@@ -842,7 +845,7 @@ def process_file(
         try:
             collection.delete(where={"source_file": source_file})
         except Exception:
-            pass
+            logger.debug("Stale-drawer purge failed for %s", source_file, exc_info=True)
 
         # Batch chunks into bounded upserts so the embedding model sees many
         # chunks per forward pass without building one huge Chroma/SQLite
diff --git a/mempalace/normalize.py b/mempalace/normalize.py
index 4252afa..ca62cca 100644
--- a/mempalace/normalize.py
+++ b/mempalace/normalize.py
@@ -118,14 +118,14 @@ def normalize(filepath: str) -> str:
     try:
         file_size = os.path.getsize(filepath)
     except OSError as e:
-        raise IOError(f"Could not read {filepath}: {e}")
+        raise IOError(f"Could not read {filepath}: {e}") from e
     if file_size > 500 * 1024 * 1024:  # 500 MB safety limit
         raise IOError(f"File too large ({file_size // (1024 * 1024)} MB): {filepath}")
     try:
         with open(filepath, "r", encoding="utf-8", errors="replace") as f:
             content = f.read()
     except OSError as e:
-        raise IOError(f"Could not read {filepath}: {e}")
+        raise IOError(f"Could not read {filepath}: {e}") from e
 
     if not content.strip():
         return content
diff --git a/mempalace/palace.py b/mempalace/palace.py
index 97f67ff..e5f6411 100644
--- a/mempalace/palace.py
+++ b/mempalace/palace.py
@@ -6,12 +6,15 @@ Consolidates collection access patterns used by both miners and the MCP server.
 
 import contextlib
 import hashlib
+import logging
 import os
 import re
 import threading
 
 from .backends.chroma import ChromaBackend
 
+logger = logging.getLogger("mempalace_mcp")
+
 SKIP_DIRS = {
     ".git",
     "node_modules",
@@ -229,7 +232,7 @@ def purge_file_closets(closets_col, source_file: str) -> None:
     try:
         closets_col.delete(where={"source_file": source_file})
     except Exception:
-        pass
+        logger.debug("Closet purge failed for %s", source_file, exc_info=True)
 
 
 def upsert_closet_lines(closets_col, closet_id_base, lines, metadata):
@@ -307,7 +310,7 @@ def mine_lock(source_file: str):
 
                 fcntl.flock(lf, fcntl.LOCK_UN)
         except Exception:
-            pass
+            logger.debug("Mine-lock release failed", exc_info=True)
         lf.close()
 
 
diff --git a/mempalace/palace_graph.py b/mempalace/palace_graph.py
index 3296cd5..0fff763 100644
--- a/mempalace/palace_graph.py
+++ b/mempalace/palace_graph.py
@@ -575,7 +575,7 @@ def follow_tunnels(wing: str, room: str, col=None, config=None):
                     if did and did in drawer_map:
                         c["drawer_preview"] = drawer_map[did][:300]
             except Exception:
-                pass
+                logger.debug("Drawer preview hydration failed", exc_info=True)
 
     return connections
 
diff --git a/mempalace/room_detector_local.py b/mempalace/room_detector_local.py
index 31d5b05..8e3fc20 100644
--- a/mempalace/room_detector_local.py
+++ b/mempalace/room_detector_local.py
@@ -202,7 +202,7 @@ def detect_rooms_from_files(project_dir: str) -> list:
 
     SKIP_DIRS = {".git", "node_modules", "__pycache__", ".venv", "venv", "dist", "build"}
 
-    for root, dirs, filenames in os.walk(project_path):
+    for _root, dirs, filenames in os.walk(project_path):
         dirs[:] = [d for d in dirs if d not in SKIP_DIRS]
         for filename in filenames:
             name_lower = filename.lower().replace("-", "_").replace(" ", "_")
diff --git a/mempalace/searcher.py b/mempalace/searcher.py
index ddddc46..536610e 100644
--- a/mempalace/searcher.py
+++ b/mempalace/searcher.py
@@ -245,7 +245,7 @@ def _expand_with_neighbors(drawers_col, matched_doc: str, matched_meta: dict, ra
         all_meta = drawers_col.get(where={"source_file": src}, include=["metadatas"])
         total_drawers = len(all_meta.ids) if all_meta.ids else None
     except Exception:
-        pass
+        logger.debug("total_drawers lookup failed for %s", src, exc_info=True)
 
     return {
         "text": combined_text,
@@ -297,10 +297,10 @@ def search(query: str, palace_path: str, wing: str = None, room: str = None, n_r
     """
     try:
         col = get_collection(palace_path, create=False)
-    except Exception:
+    except Exception as e:
         print(f"\n  No palace found at {palace_path}")
         print("  Run: mempalace init <dir> then mempalace mine <dir>")
-        raise SearchError(f"No palace found at {palace_path}")
+        raise SearchError(f"No palace found at {palace_path}") from e
 
     # Alert the user if this palace predates hnsw:space=cosine being set on
     # creation — their similarity scores will be junk until they run repair.
@@ -795,7 +795,8 @@ def search_memories(
             if source and source not in closet_boost_by_source:
                 closet_boost_by_source[source] = (rank, cdist, cdoc[:200])
     except Exception:
-        pass  # no closets yet — hybrid degrades to pure drawer search
+        # No closets yet — hybrid degrades to pure drawer search.
+        logger.debug("Closet collection unavailable; using drawer-only search", exc_info=True)
 
     # Rank-based boost. The ordinal signal ("which closet matched best") is
     # more reliable than absolute distance on narrative content, where
@@ -877,6 +878,7 @@ def search_memories(
                 include=["documents", "metadatas"],
             )
         except Exception:
+            logger.debug("Neighbor fetch failed for %s", full_source, exc_info=True)
             continue
         docs = source_drawers.documents
         metas_ = source_drawers.metadatas

From a7c4ed24d7fd222cafb25af915fa3cef87e92b27 Mon Sep 17 00:00:00 2001
From: Brian potter <brian@potterdigital.com>
Date: Sat, 2 May 2026 00:25:46 -0500
Subject: [PATCH 49/60] fix(repair): add --mode from-sqlite to recover palaces
 with corrupt HNSW (#1308)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both `--mode legacy` and the inline `cli.cmd_repair` rebuild path
call `Collection.count()` as their first read — the same call that
raises `chromadb.errors.InternalError: Failed to apply logs to the
hnsw segment writer` on the corruption class reported in #1308.
Repair would print "Cannot recover — palace may need to be re-mined
from source files" even though the underlying SQLite tables were
fully intact.

The new `--mode from-sqlite` reads `(id, document, metadata)` rows
directly from `chroma.sqlite3` via `segments` → `embeddings` →
`embedding_metadata` joins, never opens a chromadb client against
the corrupt palace, and re-upserts everything into a fresh palace.

  - `--source PATH` extracts from a corrupt palace already moved aside
  - `--archive-existing` handles the in-place case by renaming the
    existing palace to `<palace>.pre-rebuild-<timestamp>` first
  - Partial-rebuild failures raise `RebuildPartialError` with the
    archive path so users can recover; CLI exits non-zero
  - In-place mode calls `SharedSystemClient.clear_system_cache()` to
    drop chromadb's process-wide System registry (cross-palace use
    does not, to limit blast radius for library callers)
  - Source validation runs before any destructive moves

Verified end-to-end recovering a 52,300-row real-world corrupt
palace.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 CHANGELOG.md         |   1 +
 mempalace/cli.py     |  68 +++++++-
 mempalace/repair.py  | 358 ++++++++++++++++++++++++++++++++++++++++++-
 tests/test_repair.py | 291 +++++++++++++++++++++++++++++++++++
 4 files changed, 714 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3011aab..1972c03 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,6 +18,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 - **CLI and `fact_checker --stdin` mojibaked non-ASCII content on Windows.** Python defaults `sys.stdin`/`stdout`/`stderr` to the system ANSI codepage (cp1252/cp1251/cp950), so `mempalace search > out.txt` and piped fact_checker invocations corrupted Cyrillic / CJK drawer text at the process boundary. New `mempalace/_stdio.py` helper reconfigures all three streams to UTF-8 on `sys.platform == "win32"`, with per-stream `errors` policy: `surrogateescape` on stdin (preserves bad bytes from redirected files for the consumer's parser), `replace` on stdout/stderr (substitutes U+FFFD instead of `UnicodeEncodeError`-ing mid-print). With this, all three user-facing console_scripts (`mcp_server`, `hooks_cli`, `cli`/`fact_checker`) now reconfigure identically on Windows. (#1282)
 - **MCP knowledge-graph tools forwarded malformed date strings to SQLite.** `tool_kg_query` (`as_of`), `tool_kg_add` (`valid_from`), and `tool_kg_invalidate` (`ended`) accepted any string and produced empty result sets on natural-language inputs like `"March 2026"` or `"yesterday"` — callers (especially LLM agents) could not distinguish "no fact at this time" from "your date format was unrecognized." New `sanitize_iso_date()` validator in `config.py` accepts `YYYY`, `YYYY-MM`, `YYYY-MM-DD` (and passes through `None`/`""`); all three tools call it before values reach the storage layer. **Behavior change:** previously-silent date typos now raise a clear `ValueError` naming the offending field; full ISO-8601 with time (`YYYY-MM-DDTHH:MM:SS`, timezone offsets) is not yet accepted — file an issue if you have a use case. (#1164, #1167)
 - **MCP server's `_kg` was a module-level singleton.** Multi-tenant hosts that rotate `MEMPALACE_PALACE_PATH` between tool calls hit the wrong sqlite file, because the KG was constructed once at import time while the ChromaDB side was already per-call via `_get_client()`. The KG is now resolved per-call through a lazy per-path cache (`_kg_by_path` keyed by `os.path.abspath`, with a double-checked-locking init under `_kg_cache_lock`). `tool_reconnect` drains and `close()`s cached KGs alongside the existing chroma reconnect. A `_call_kg` retry guard catches `sqlite3.ProgrammingError` once after a reconnect race. (#1136, #1160)
+- **`mempalace repair` can now recover palaces whose HNSW segment writer is stuck on `apply_logs`.** Both the existing `--mode legacy` rebuild and the inline `cli.cmd_repair` path call `Collection.count()` as their first read — exactly the call that raises `chromadb.errors.InternalError: Failed to apply logs to the hnsw segment writer` on the corruption class introduced upstream and reported in #1308. Repair would print `Cannot recover — palace may need to be re-mined from source files` even though the underlying SQLite tables were fully intact (the corruption lives in the on-disk index files, not the data layer). New `--mode from-sqlite` reads `(id, document, metadata)` rows directly from `chroma.sqlite3` via a `segments` → `embeddings` → `embedding_metadata` join, never opens a chromadb client against the corrupt palace, and re-upserts everything into a fresh palace at `--palace`. `--source PATH` extracts from a corrupt palace already moved aside; `--archive-existing` handles the in-place case by renaming the existing palace to `<palace>.pre-rebuild-<timestamp>` before reading from it. Documents are re-embedded under the user's configured embedding function (the original HNSW vectors live in the corrupt `data_level0.bin` and cannot be recovered, but the embedding model is deterministic so search results remain semantically equivalent). Verified end-to-end on a 52,300-row real-world corrupt palace. (#1308)
 
 ---
 
diff --git a/mempalace/cli.py b/mempalace/cli.py
index 0ab3d0f..468b765 100644
--- a/mempalace/cli.py
+++ b/mempalace/cli.py
@@ -673,6 +673,48 @@ def cmd_repair(args):
         )
         return
 
+    if getattr(args, "mode", "legacy") == "from-sqlite":
+        from .migrate import confirm_destructive_action
+        from .repair import RebuildPartialError, rebuild_from_sqlite
+
+        source_path = getattr(args, "source", None)
+        source_path = (
+            os.path.abspath(os.path.expanduser(source_path)) if source_path else palace_path
+        )
+        archive_existing = getattr(args, "archive_existing", False)
+
+        # Gate any path that touches the user's existing palace dir
+        # behind confirm_destructive_action. The legacy mode already
+        # gates; from-sqlite needs the same protection because:
+        # (a) --archive-existing renames the existing palace,
+        # (b) --source PATH writes into --palace dir which the user
+        #     may not realize is also a palace.
+        # No prompt when source != dest AND dest does not exist (pure
+        # extract-into-fresh-dir case is non-destructive to existing
+        # palaces).
+        is_destructive_to_dest = source_path == palace_path or os.path.exists(palace_path)
+        if is_destructive_to_dest and not confirm_destructive_action(
+            "Rebuild from SQLite", palace_path, assume_yes=getattr(args, "yes", False)
+        ):
+            return
+
+        try:
+            rebuild_from_sqlite(
+                source_palace=source_path,
+                dest_palace=palace_path,
+                archive_existing_dest=archive_existing,
+            )
+        except RebuildPartialError as exc:
+            # The error itself was already printed by rebuild_from_sqlite
+            # with recovery instructions; surface a non-zero exit so
+            # scripts and CI gates see the failure.
+            print(
+                "\n  Rebuild partial — see message above. "
+                f"Failed in collection: {exc.failed_collection}"
+            )
+            sys.exit(1)
+        return
+
     db_path = os.path.join(palace_path, "chroma.sqlite3")
 
     if not os.path.isdir(palace_path):
@@ -1213,11 +1255,31 @@ def main():
     )
     p_repair.add_argument(
         "--mode",
-        choices=["legacy", "max-seq-id"],
+        choices=["legacy", "max-seq-id", "from-sqlite"],
         default="legacy",
         help=(
-            "legacy: full-palace rebuild (default). "
-            "max-seq-id: un-poison max_seq_id rows corrupted by the legacy 0.6.x shim."
+            "legacy: full-palace rebuild via the chromadb client (default). "
+            "max-seq-id: un-poison max_seq_id rows corrupted by the legacy 0.6.x shim. "
+            "from-sqlite: rebuild by reading rows directly from chroma.sqlite3, "
+            "bypassing the chromadb client. Use when legacy mode bails because the "
+            "chromadb client cannot open the collection."
+        ),
+    )
+    p_repair.add_argument(
+        "--source",
+        default=None,
+        help=(
+            "Source palace path for --mode from-sqlite (defaults to --palace). "
+            "Use when extracting from an archived corrupt palace into a new location."
+        ),
+    )
+    p_repair.add_argument(
+        "--archive-existing",
+        action="store_true",
+        help=(
+            "For --mode from-sqlite when --source equals --palace: rename the "
+            "existing palace to <palace>.pre-rebuild-<timestamp> before "
+            "rebuilding so the corrupt copy is preserved."
         ),
     )
     p_repair.add_argument(
diff --git a/mempalace/repair.py b/mempalace/repair.py
index 1cd1556..7e98f0f 100644
--- a/mempalace/repair.py
+++ b/mempalace/repair.py
@@ -34,14 +34,21 @@ import os
 import shutil
 import sqlite3
 import time
+from collections import defaultdict
 from datetime import datetime
-from typing import Optional
+from typing import Iterator, Optional
 
 from .backends.chroma import ChromaBackend, hnsw_capacity_status
 
 
 COLLECTION_NAME = "mempalace_drawers"
 
+# Collections rebuilt by ``rebuild_from_sqlite``. Order matters for the
+# upsert pass — drawers carry the bulk of the data, closets are the AAAK
+# index layer and reference drawer IDs by string in their documents (no
+# foreign-key validation, so ordering is informational, not load-bearing).
+RECOVERABLE_COLLECTIONS = ("mempalace_drawers", "mempalace_closets")
+
 
 def _get_palace_path():
     """Resolve palace path from config."""
@@ -436,6 +443,355 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
     print(f"\n{'=' * 55}\n")
 
 
+class RebuildPartialError(Exception):
+    """Raised when ``rebuild_from_sqlite`` fails partway through upserts.
+
+    Carries enough state for the user (or CLI) to recover: the
+    per-collection counts that succeeded, the collection that failed,
+    the dest path holding the partial palace, and the archive path
+    (when an in-place rebuild had moved the original aside). Re-raises
+    the underlying chromadb error as ``__cause__``.
+    """
+
+    def __init__(
+        self,
+        message: str,
+        *,
+        partial_counts: dict[str, int],
+        failed_collection: str,
+        dest_palace: str,
+        archive_path: Optional[str],
+    ):
+        super().__init__(message)
+        self.message = message
+        self.partial_counts = partial_counts
+        self.failed_collection = failed_collection
+        self.dest_palace = dest_palace
+        self.archive_path = archive_path
+
+
+def _rebuild_one_collection(
+    *,
+    backend: ChromaBackend,
+    source_palace: str,
+    dest_palace: str,
+    collection_name: str,
+    batch_size: int,
+    archive_path: Optional[str],
+    counts_so_far: dict[str, int],
+) -> int:
+    """Stream rows for one collection from SQLite and upsert into a
+    freshly-created collection at ``dest_palace``. Returns rows
+    upserted. Raises :class:`RebuildPartialError` (with the underlying
+    chromadb exception as ``__cause__``) on any upsert failure so the
+    caller can stop the loop and print recovery instructions instead of
+    silently shipping a partial palace.
+    """
+    col = backend.create_collection(dest_palace, collection_name)
+
+    ids: list[str] = []
+    docs: list[str] = []
+    metas: list[dict] = []
+    upserted = 0
+
+    def _flush() -> int:
+        nonlocal upserted
+        if not ids:
+            return upserted
+        col.upsert(ids=list(ids), documents=list(docs), metadatas=list(metas))
+        upserted += len(ids)
+        print(f"    upserted {upserted}")
+        ids.clear()
+        docs.clear()
+        metas.clear()
+        return upserted
+
+    try:
+        for emb_id, doc, meta in extract_via_sqlite(source_palace, collection_name):
+            ids.append(emb_id)
+            docs.append(doc or "")
+            # chromadb 1.5.x rejects None entries in the metadatas list
+            # but accepts empty dicts. Mempalace drawers always carry at
+            # least wing/room, so this branch is defensive — corruption
+            # in embedding_metadata could yield an emb_id with no rows.
+            metas.append(meta if meta else {})
+            if len(ids) >= batch_size:
+                _flush()
+        _flush()
+    except Exception as exc:  # noqa: BLE001 — chromadb raises many shapes
+        partial = dict(counts_so_far)
+        partial[collection_name] = upserted
+        msg_parts = [
+            f"Upsert failed in collection {collection_name!r} after {upserted} rows: {exc!r}",
+            f"Partial palace left at: {dest_palace}",
+        ]
+        if archive_path is not None:
+            msg_parts.append(f"Original palace archived at: {archive_path}")
+            msg_parts.append(
+                "  Recover by removing the partial dest and re-running with "
+                f"--source {archive_path}"
+            )
+        else:
+            msg_parts.append("  Source palace is unchanged. Remove the partial dest and re-run.")
+        message = "\n  ".join(msg_parts)
+        print(f"\n  ERROR: {message}")
+        raise RebuildPartialError(
+            message,
+            partial_counts=partial,
+            failed_collection=collection_name,
+            dest_palace=dest_palace,
+            archive_path=archive_path,
+        ) from exc
+
+    return upserted
+
+
+def extract_via_sqlite(palace_path: str, collection_name: str) -> Iterator[tuple[str, str, dict]]:
+    """Yield ``(embedding_id, document, metadata)`` for every row in
+    ``collection_name``'s metadata segment by reading ``chroma.sqlite3``
+    directly.
+
+    Bypasses the chromadb client entirely — never opens a
+    ``PersistentClient``, never imports hnswlib, never invokes the
+    HNSW segment writer. This is the recovery path for palaces where
+    ``Collection.count()`` / ``Collection.get()`` raise ``InternalError``
+    because the compactor cannot apply WAL logs to the HNSW segment
+    (#1308). The drawer rows are still on disk in
+    ``embeddings`` + ``embedding_metadata``; the corruption lives in the
+    on-disk index files, not the SQLite tables.
+
+    Resolution rule for chromadb's typed metadata columns: each
+    ``embedding_metadata`` row stores its value in exactly one of
+    ``string_value`` / ``int_value`` / ``float_value`` / ``bool_value``;
+    we pick the first non-NULL column in that order. Rows where every
+    typed column is NULL are dropped (chromadb never writes that shape).
+    The ``chroma:document`` key is removed from the metadata dict and
+    returned as the document; this matches how chromadb itself stores
+    ``add(documents=...)``.
+
+    Silent on missing palace, missing ``chroma.sqlite3``, or unknown
+    collection name — yields nothing. Callers that need to distinguish
+    "empty collection" from "collection not present" should query
+    :func:`sqlite_drawer_count` first.
+    """
+    sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
+    if not os.path.isfile(sqlite_path):
+        return
+
+    conn = sqlite3.connect(f"file:{sqlite_path}?mode=ro", uri=True)
+    try:
+        seg_row = conn.execute(
+            """
+            SELECT s.id FROM segments s
+            JOIN collections c ON s.collection = c.id
+            WHERE c.name = ? AND s.scope = 'METADATA'
+            """,
+            (collection_name,),
+        ).fetchone()
+        if not seg_row:
+            return
+        segment_id = seg_row[0]
+
+        per_id: dict[str, dict] = defaultdict(dict)
+        order: list[str] = []
+        for emb_id, key, sv, iv, fv, bv in conn.execute(
+            """
+            SELECT e.embedding_id, em.key, em.string_value, em.int_value,
+                   em.float_value, em.bool_value
+            FROM embedding_metadata em
+            JOIN embeddings e ON em.id = e.id
+            WHERE e.segment_id = ?
+            ORDER BY em.id
+            """,
+            (segment_id,),
+        ):
+            if emb_id not in per_id:
+                order.append(emb_id)
+            if sv is not None:
+                per_id[emb_id][key] = sv
+            elif iv is not None:
+                per_id[emb_id][key] = iv
+            elif fv is not None:
+                per_id[emb_id][key] = fv
+            elif bv is not None:
+                per_id[emb_id][key] = bool(bv)
+
+        for emb_id in order:
+            kv = per_id[emb_id]
+            doc = kv.pop("chroma:document", "")
+            yield emb_id, doc, kv
+    finally:
+        conn.close()
+
+
+def rebuild_from_sqlite(
+    source_palace: str,
+    dest_palace: str,
+    *,
+    archive_existing_dest: bool = False,
+    batch_size: int = 1000,
+) -> dict[str, int]:
+    """Rebuild a palace by reading drawers from ``source_palace``'s
+    ``chroma.sqlite3`` and upserting them into a fresh palace at
+    ``dest_palace``.
+
+    Recovery path for the #1308 failure mode: the chromadb client raises
+    ``InternalError: Failed to apply logs to the hnsw segment writer``
+    on every operation that touches the index (``count``, ``get``,
+    ``query``), but the underlying SQLite tables are intact. Both the
+    legacy ``rebuild_index`` and the inline ``cli.cmd_repair`` path call
+    ``Collection.count()`` as their first read — exactly the call that
+    fails — so neither can recover this class of corruption. This
+    function bypasses the chromadb read path entirely via
+    :func:`extract_via_sqlite`.
+
+    Re-embeds documents at upsert time using the configured embedding
+    function; the original HNSW vectors are not preserved (they live in
+    the corrupt ``data_level0.bin`` / ``link_lists.bin``, not in
+    SQLite). Acceptable for a corruption-recovery flow because the
+    embedding model is deterministic — same model + same document text
+    yields semantically equivalent search results.
+
+    ``archive_existing_dest`` controls behavior when ``dest_palace``
+    already exists:
+
+    * ``False`` (default) — refuse with a clear message. Callers must
+      manually move the existing palace aside first.
+    * ``True`` — rename ``dest_palace`` to
+      ``<dest_palace>.pre-rebuild-<timestamp>`` and read from there
+      instead. Used by the in-place CLI flow where ``--source`` defaults
+      to the same path as ``--palace``.
+
+    Returns a ``{collection_name: row_count}`` dict so callers (CLI,
+    tests) can verify the per-collection rebuild count without parsing
+    stdout. Returns ``{}`` on validation failures (missing source,
+    refusing to overwrite). Raises :class:`RebuildPartialError` if a
+    chromadb upsert fails partway through; the dest palace is left in
+    place so the user can inspect what landed, and the in-place archive
+    (when applicable) is reported in the error so the user can re-run
+    against it.
+
+    .. warning::
+
+       In-place mode (``source_palace == dest_palace`` with
+       ``archive_existing_dest=True``) calls
+       ``chromadb.api.client.SharedSystemClient.clear_system_cache()`` to
+       drop chromadb's process-wide System registry — required because
+       an existing cached System built against the original palace will
+       refuse ``create_collection`` after the dir is renamed (chromadb
+       still thinks the collections exist). This invalidates any
+       PersistentClient instances held elsewhere in the same process for
+       *any* palace, not just this one. Do not call this function from
+       inside a long-running mempalace process (MCP server, daemon)
+       while other callers hold live ``PersistentClient`` references —
+       use the CLI in a separate process instead. Cross-palace use
+       (``source != dest``) does not touch the cache.
+
+    Note on metadata fidelity: the resolution rule
+    (``string_value`` → ``int_value`` → ``float_value`` → ``bool_value``)
+    matches the precedent in :mod:`mempalace.migrate`. ChromaDB 0.4.x
+    occasionally wrote booleans as ``int_value=0/1``; those will
+    round-trip as ``int`` rather than ``bool`` after this rebuild. This
+    is a known divergence and matches the existing migrate-path
+    behavior.
+    """
+    source_palace = os.path.abspath(os.path.expanduser(source_palace))
+    dest_palace = os.path.abspath(os.path.expanduser(dest_palace))
+
+    src_db = os.path.join(source_palace, "chroma.sqlite3")
+
+    in_place = source_palace == dest_palace
+
+    print(f"\n{'=' * 55}")
+    print("  MemPalace Repair — Rebuild from SQLite")
+    print(f"{'=' * 55}\n")
+    print(f"  Source: {source_palace}")
+    print(f"  Dest:   {dest_palace}")
+
+    # Validate source BEFORE any destructive moves. An earlier draft
+    # archived the dest first and surfaced the missing-chroma.sqlite3
+    # error after — leaving the user with a renamed dir to manually undo
+    # when the archive itself was empty. Validate first so a user error
+    # (--source pointing at a non-palace dir) bails cleanly.
+    if in_place:
+        if not archive_existing_dest:
+            print(
+                "\n  Source and dest are the same path. Pass "
+                "archive_existing_dest=True (CLI: --archive-existing) to move "
+                "the existing palace aside, or pass a different source_palace= "
+                "(CLI: --source)."
+            )
+            return {}
+        if not os.path.isfile(src_db):
+            print(f"\n  Source palace has no chroma.sqlite3 at {src_db}")
+            return {}
+    else:
+        if not os.path.isfile(src_db):
+            print(f"\n  Source palace has no chroma.sqlite3 at {src_db}")
+            return {}
+        if os.path.exists(dest_palace):
+            print(
+                f"\n  Refusing to rebuild into existing path: {dest_palace}\n"
+                "  Move it aside, pass a different dest, or set "
+                "archive_existing_dest=True if rebuilding in place "
+                "(source_palace == dest_palace)."
+            )
+            return {}
+
+    archive_path: Optional[str] = None
+    if in_place:
+        ts = datetime.now().strftime("%Y%m%d-%H%M%S")
+        archive_path = f"{dest_palace}.pre-rebuild-{ts}"
+        print(f"  Archiving {dest_palace} → {archive_path}")
+        shutil.move(dest_palace, archive_path)
+        source_palace = archive_path
+        src_db = os.path.join(source_palace, "chroma.sqlite3")
+
+        # In-place only: drop chromadb's process-wide System registry so
+        # the new client at dest_palace builds a fresh System. Without
+        # this, ``create_collection`` raises "Collection already exists"
+        # because the cached System still holds the pre-rename schema.
+        # Cross-palace mode does not need this and would needlessly
+        # invalidate other callers' clients (see docstring warning).
+        try:
+            from chromadb.api.client import SharedSystemClient
+
+            SharedSystemClient.clear_system_cache()
+        except Exception as exc:  # noqa: BLE001
+            print(
+                f"  Warning: could not clear chromadb system cache ({exc!r}); "
+                "in-place rebuild may fail with 'Collection already exists'."
+            )
+
+    os.makedirs(dest_palace, exist_ok=True)
+
+    backend = ChromaBackend()
+    counts: dict[str, int] = {}
+
+    for cname in RECOVERABLE_COLLECTIONS:
+        print(f"\n  [{cname}]")
+        upserted = _rebuild_one_collection(
+            backend=backend,
+            source_palace=source_palace,
+            dest_palace=dest_palace,
+            collection_name=cname,
+            batch_size=batch_size,
+            archive_path=archive_path,
+            counts_so_far=counts,
+        )
+        counts[cname] = upserted
+        if upserted == 0:
+            print(f"    no rows found for {cname} in source palace")
+        else:
+            print(f"    done: {upserted} rows in {cname}")
+
+    print(f"\n  Rebuild complete. {sum(counts.values())} total rows.")
+    if archive_path is not None:
+        print(f"  Original palace archived at: {archive_path}")
+    print(f"{'=' * 55}\n")
+    return counts
+
+
 def status(palace_path=None) -> dict:
     """Read-only health check: compare sqlite vs HNSW element counts.
 
diff --git a/tests/test_repair.py b/tests/test_repair.py
index bc770dd..35e6a44 100644
--- a/tests/test_repair.py
+++ b/tests/test_repair.py
@@ -682,3 +682,294 @@ def test_max_seq_id_rollback_on_verification_failure(tmp_path, monkeypatch):
     # A backup file is still present — caller can roll back from it.
     leftover = [fn for fn in os.listdir(palace) if "max-seq-id-backup-" in fn]
     assert leftover
+
+
+# ── extract_via_sqlite + rebuild_from_sqlite (#1308) ──────────────────
+#
+# These tests build real chromadb palaces in tmp_path rather than mocking
+# the SQLite layer. The bug class they guard against is "extraction sees
+# different rows than chromadb stored" — the only honest check is to let
+# chromadb actually write rows and then read them back via the SQLite
+# bypass. Mocking the SQLite cursor would defeat the test.
+
+
+def _seed_palace(palace_path, collection_name, rows):
+    """Build a real chromadb palace at ``palace_path`` and add ``rows``.
+
+    ``rows`` is a list of ``(id, document, metadata)`` tuples. Returns
+    the populated collection so callers can assert on the writer's view
+    of state before the SQLite read.
+    """
+    from mempalace.backends.chroma import ChromaBackend
+
+    backend = ChromaBackend()
+    col = backend.create_collection(str(palace_path), collection_name)
+    col.upsert(
+        ids=[r[0] for r in rows],
+        documents=[r[1] for r in rows],
+        metadatas=[r[2] for r in rows],
+    )
+    return col
+
+
+def test_extract_via_sqlite_returns_all_rows_with_metadata(tmp_path):
+    """Round-trip: a chromadb palace with N upserted rows returns those
+    same N rows when read via the SQLite bypass.
+
+    Catches: anyone who breaks the segments/embeddings/embedding_metadata
+    JOIN, swaps the metadata vs vector segment, or changes how the
+    document is stored under the ``chroma:document`` key.
+    """
+    rows = [
+        (f"drawer_{i:03d}", f"document body {i}", {"wing": "test_wing", "room": f"r{i % 3}"})
+        for i in range(25)
+    ]
+    _seed_palace(tmp_path, "mempalace_drawers", rows)
+
+    extracted = list(repair.extract_via_sqlite(str(tmp_path), "mempalace_drawers"))
+
+    assert len(extracted) == 25
+    by_id = {emb_id: (doc, meta) for emb_id, doc, meta in extracted}
+    assert set(by_id) == {r[0] for r in rows}
+    for emb_id, doc, meta in rows:
+        got_doc, got_meta = by_id[emb_id]
+        assert got_doc == doc, f"document mangled for {emb_id}"
+        assert got_meta == meta, f"metadata mangled for {emb_id}: {got_meta!r}"
+
+
+def test_extract_via_sqlite_preserves_typed_metadata(tmp_path):
+    """Chromadb stores int / float / bool / string in distinct typed
+    columns. Extraction must round-trip the original type, not coerce
+    everything to string.
+
+    Catches: a regression where the SELECT order changes and ints come
+    back as None, or where the column-resolution rule prefers the wrong
+    column.
+    """
+    rows = [
+        (
+            "drawer_typed",
+            "doc",
+            {
+                "wing": "w",
+                "chunk_index": 7,  # int
+                "score": 0.42,  # float
+                "is_active": True,  # bool
+            },
+        ),
+    ]
+    _seed_palace(tmp_path, "mempalace_drawers", rows)
+
+    extracted = list(repair.extract_via_sqlite(str(tmp_path), "mempalace_drawers"))
+    assert len(extracted) == 1
+    _, _, meta = extracted[0]
+
+    assert meta["chunk_index"] == 7 and isinstance(meta["chunk_index"], int)
+    assert meta["score"] == 0.42 and isinstance(meta["score"], float)
+    assert meta["is_active"] is True
+    assert meta["wing"] == "w"
+
+
+def test_extract_via_sqlite_unknown_collection_yields_nothing(tmp_path):
+    """Asking for a collection that isn't in the palace must return an
+    empty iterator, not silently fall back to another collection's
+    metadata segment. Seeds two real collections and queries for a third
+    name so a regression that drops the WHERE c.name=? filter would leak
+    rows from the seeded collections rather than passing.
+    """
+    _seed_palace(tmp_path, "mempalace_drawers", [("d1", "doc", {"wing": "w"})])
+    _seed_palace(tmp_path, "mempalace_closets", [("c1", "abbrev", {"wing": "w"})])
+    assert list(repair.extract_via_sqlite(str(tmp_path), "not_a_real_collection")) == []
+
+
+def test_extract_via_sqlite_missing_palace_yields_nothing(tmp_path):
+    """No chroma.sqlite3 → empty iterator, no exception. Callers depend
+    on this when probing speculatively."""
+    empty = tmp_path / "no_palace_here"
+    empty.mkdir()
+    assert list(repair.extract_via_sqlite(str(empty), "mempalace_drawers")) == []
+
+
+def test_rebuild_from_sqlite_roundtrips_via_real_chromadb(tmp_path):
+    """End-to-end: seed source palace, rebuild into a fresh dest, then
+    open dest with a fresh ChromaBackend and verify ``count()`` and
+    metadata filters return the original rows. Also asserts a closet
+    document round-trips so a future regression that re-embeds with the
+    wrong EF or swaps drawer/closet content would fail here.
+
+    This is the single most important regression guard. If
+    ``rebuild_from_sqlite`` silently drops rows or mangles metadata, no
+    other test in this file would catch it because they all stop at the
+    extraction layer.
+    """
+    from mempalace.backends.chroma import ChromaBackend
+
+    source = tmp_path / "source"
+    dest = tmp_path / "dest"
+
+    rows = [
+        (f"drawer_{i:03d}", f"body {i}", {"wing": "alpha" if i % 2 else "beta", "room": "r0"})
+        for i in range(40)
+    ]
+    _seed_palace(source, "mempalace_drawers", rows)
+    _seed_palace(
+        source,
+        "mempalace_closets",
+        [("closet_x", "abbrev pointer →drawer_001", {"wing": "alpha"})],
+    )
+
+    counts = repair.rebuild_from_sqlite(str(source), str(dest))
+    assert counts == {"mempalace_drawers": 40, "mempalace_closets": 1}
+
+    backend = ChromaBackend()
+    drawers = backend.get_collection(str(dest), "mempalace_drawers")
+    assert drawers.count() == 40
+    alpha = drawers.get(where={"wing": "alpha"})
+    assert len(alpha["ids"]) == 20
+
+    # Spot-check that document text round-trips for one specific drawer
+    # — protects against a regression where extraction or upsert order
+    # silently swaps document bodies between IDs.
+    one = drawers.get(ids=["drawer_007"], include=["documents", "metadatas"])
+    assert one["documents"] == ["body 7"]
+    assert one["metadatas"][0]["wing"] == "alpha"
+
+    # Closets: the AAAK index layer. Re-embedded with the same EF so a
+    # known closet ID and its document body must come back intact.
+    closets = backend.get_collection(str(dest), "mempalace_closets")
+    assert closets.count() == 1
+    closet_row = closets.get(ids=["closet_x"], include=["documents", "metadatas"])
+    assert closet_row["documents"] == ["abbrev pointer →drawer_001"]
+    assert closet_row["metadatas"][0] == {"wing": "alpha"}
+
+
+def test_rebuild_from_sqlite_refuses_existing_dest(tmp_path):
+    """Refuse to write into a directory that already exists when source
+    and dest differ. Without this, an unattended re-run would silently
+    interleave a partial rebuild with whatever's already at dest.
+    """
+    source = tmp_path / "source"
+    dest = tmp_path / "dest"
+    _seed_palace(source, "mempalace_drawers", [("d1", "doc", {"wing": "w"})])
+    dest.mkdir()
+    # Drop a marker file so we can prove the dir wasn't touched.
+    (dest / "marker.txt").write_text("preexisting")
+
+    counts = repair.rebuild_from_sqlite(str(source), str(dest))
+    assert counts == {}
+    assert (dest / "marker.txt").read_text() == "preexisting"
+    assert not (dest / "chroma.sqlite3").exists()
+
+
+def test_rebuild_from_sqlite_in_place_archives_when_opted_in(tmp_path):
+    """In-place rebuild (source == dest) with ``archive_existing_dest=True``
+    must move the original aside to ``<dest>.pre-rebuild-<ts>`` and read
+    from the archive — the original drawer rows must survive in the new
+    palace, AND the archive itself must still contain the original rows.
+
+    Catches: a refactor that moves the original out but then reads from
+    the now-empty original location, producing an empty rebuild; also
+    catches a swap that empties the archive after reading.
+    """
+    palace = tmp_path / "palace"
+    rows = [(f"d{i}", f"body {i}", {"wing": "w", "room": "r"}) for i in range(15)]
+    _seed_palace(palace, "mempalace_drawers", rows)
+
+    counts = repair.rebuild_from_sqlite(str(palace), str(palace), archive_existing_dest=True)
+    assert counts["mempalace_drawers"] == 15
+
+    archives = [p for p in tmp_path.iterdir() if p.name.startswith("palace.pre-rebuild-")]
+    assert len(archives) == 1
+    assert (archives[0] / "chroma.sqlite3").exists()
+    # Archive must still hold the same row count via the SQLite bypass —
+    # proves the archive wasn't silently truncated as a side effect.
+    archived_rows = list(repair.extract_via_sqlite(str(archives[0]), "mempalace_drawers"))
+    assert len(archived_rows) == 15
+
+    from mempalace.backends.chroma import ChromaBackend
+
+    rebuilt = ChromaBackend().get_collection(str(palace), "mempalace_drawers")
+    assert rebuilt.count() == 15
+
+
+def test_rebuild_from_sqlite_in_place_refuses_without_archive_flag(tmp_path):
+    """Source == dest without archive flag must abort untouched. The
+    most catastrophic possible regression of this code path is silently
+    deleting the only copy of the user's data."""
+    palace = tmp_path / "palace"
+    _seed_palace(palace, "mempalace_drawers", [("d1", "doc", {"wing": "w"})])
+    sqlite_before = (palace / "chroma.sqlite3").stat().st_size
+
+    counts = repair.rebuild_from_sqlite(str(palace), str(palace))
+    assert counts == {}
+    # Same file, untouched.
+    assert (palace / "chroma.sqlite3").stat().st_size == sqlite_before
+    archives = [p for p in tmp_path.iterdir() if "pre-rebuild" in p.name]
+    assert archives == []
+
+
+def test_rebuild_from_sqlite_source_missing_chroma_db(tmp_path):
+    """Source dir exists but has no chroma.sqlite3 → returns empty,
+    leaves dest untouched."""
+    source = tmp_path / "source"
+    source.mkdir()
+    (source / "stray_file").write_text("not a palace")
+    dest = tmp_path / "dest"
+
+    counts = repair.rebuild_from_sqlite(str(source), str(dest))
+    assert counts == {}
+    assert not dest.exists()
+
+
+def test_rebuild_from_sqlite_in_place_validates_source_before_archiving(tmp_path):
+    """In-place + archive_existing_dest=True with a dir that lacks
+    chroma.sqlite3 must NOT rename the dir before bailing. An earlier
+    revision archived first and validated second, leaving the user with
+    a renamed empty dir to manually undo. Catches that ordering bug.
+    """
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    (palace / "marker.txt").write_text("not a real palace")
+
+    counts = repair.rebuild_from_sqlite(str(palace), str(palace), archive_existing_dest=True)
+    assert counts == {}
+    # No archive created — original dir still in place with its marker.
+    assert palace.exists()
+    assert (palace / "marker.txt").read_text() == "not a real palace"
+    archives = [p for p in tmp_path.iterdir() if "pre-rebuild" in p.name]
+    assert archives == []
+
+
+def test_rebuild_from_sqlite_raises_on_upsert_failure(tmp_path, monkeypatch):
+    """Mid-batch upsert failure must raise ``RebuildPartialError`` and
+    surface the failed collection + archive path so the user can recover.
+    Without this, an unattended script gets exit-code-zero on a partial
+    rebuild and the user discovers the data loss only when search starts
+    returning fewer hits.
+    """
+    palace = tmp_path / "palace"
+    rows = [(f"d{i}", f"body {i}", {"wing": "w", "room": "r"}) for i in range(5)]
+    _seed_palace(palace, "mempalace_drawers", rows)
+
+    # Make the very first upsert raise so we don't depend on batch
+    # boundary behavior. Patching ChromaCollection.upsert (the wrapper
+    # mempalace's backend returns) keeps the failure path realistic.
+    # ``monkeypatch`` is pytest's built-in fixture that auto-restores
+    # the original attribute when the test exits, so we don't need to
+    # undo this manually.
+    from mempalace.backends.chroma import ChromaCollection
+
+    def boom(self, **kwargs):
+        raise RuntimeError("simulated chromadb upsert failure")
+
+    monkeypatch.setattr(ChromaCollection, "upsert", boom)
+
+    with pytest.raises(repair.RebuildPartialError) as excinfo:
+        repair.rebuild_from_sqlite(str(palace), str(palace), archive_existing_dest=True)
+
+    err = excinfo.value
+    assert err.failed_collection == "mempalace_drawers"
+    assert err.partial_counts.get("mempalace_drawers") == 0
+    assert err.archive_path is not None
+    assert os.path.isfile(os.path.join(err.archive_path, "chroma.sqlite3"))
+    assert err.dest_palace == os.path.abspath(str(palace))

From cb6bfd523149f976ae10264c80a8dd4ff733b25c Mon Sep 17 00:00:00 2001
From: Brian potter <brian@potterdigital.com>
Date: Sat, 2 May 2026 12:11:52 -0500
Subject: [PATCH 50/60] chore: gitignore .envrc for direnv users

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 1619ba8..ba8ec10 100644
--- a/.gitignore
+++ b/.gitignore
@@ -36,3 +36,4 @@ venv/
 
 # ChromaDB local data
 *.sqlite3-journal
+.envrc

From d92c741084c7a3d276ecd31c483931ff468bdd35 Mon Sep 17 00:00:00 2001
From: Brian potter <brian@potterdigital.com>
Date: Sat, 2 May 2026 12:12:08 -0500
Subject: [PATCH 51/60] fix(repair): address PR #1310 review feedback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Five small hardening fixes for the from-sqlite rebuild path, all from
mjc's review on #1310:

- repair.py: drawers collection name now resolves from
  MempalaceConfig().collection_name via _drawers_collection_name() (closets
  stays fixed by design — AAAK index references drawer IDs by string).
  Lines up with the broader configured-collection work in #1312 so that
  PR can rebase cleanly on top.
- repair.py: create_collection() moved inside the try block in
  _rebuild_one_collection so a Chroma "Collection already exists" failure
  surfaces as RebuildPartialError with archive_path, not an unstructured
  exception that strands the user without recovery instructions.
- repair.py: rebuild_from_sqlite wraps backend lifetime in try/finally
  with backend.close() so PersistentClient handles to dest_palace are
  released on every exit path. The from-sqlite path post-dates #1285's
  lifecycle hardening of the legacy rebuild, so this needed its own
  cleanup.
- cli.py: cmd_repair (from-sqlite mode) now exits non-zero when
  rebuild_from_sqlite returns {} (validation refusal sentinel), so
  unattended scripts/CI distinguish "invalid inputs" from a successful
  rebuild that legitimately found zero rows.
- tests/test_repair.py: test_extract_via_sqlite_returns_all_rows_with_metadata
  now asserts every backing segment is scope='METADATA', locking in the
  segment-layout assumption against future regressions that point the
  JOIN at the VECTOR segment.

New test coverage:
- test_rebuild_from_sqlite_honors_configured_drawer_collection_name
- test_cmd_repair_from_sqlite_validation_refusal_exits_nonzero
- test_cmd_repair_from_sqlite_success_does_not_exit

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 mempalace/cli.py     |  11 +++-
 mempalace/repair.py  | 121 ++++++++++++++++++++++++++++++++-----------
 tests/test_cli.py    |  61 ++++++++++++++++++++++
 tests/test_repair.py |  89 +++++++++++++++++++++++++++++++
 4 files changed, 250 insertions(+), 32 deletions(-)

diff --git a/mempalace/cli.py b/mempalace/cli.py
index 468b765..95915eb 100644
--- a/mempalace/cli.py
+++ b/mempalace/cli.py
@@ -699,7 +699,7 @@ def cmd_repair(args):
             return
 
         try:
-            rebuild_from_sqlite(
+            counts = rebuild_from_sqlite(
                 source_palace=source_path,
                 dest_palace=palace_path,
                 archive_existing_dest=archive_existing,
@@ -713,6 +713,15 @@ def cmd_repair(args):
                 f"Failed in collection: {exc.failed_collection}"
             )
             sys.exit(1)
+        # An empty counts dict is rebuild_from_sqlite's documented signal
+        # for a validation refusal (missing source, existing dest,
+        # in-place without --archive-existing). The library already
+        # printed an actionable message; exit non-zero so unattended
+        # scripts/CI distinguish "invalid inputs" from a successful
+        # rebuild that legitimately found zero rows (which still returns
+        # a populated dict with 0-valued counts).
+        if not counts:
+            sys.exit(1)
         return
 
     db_path = os.path.join(palace_path, "chroma.sqlite3")
diff --git a/mempalace/repair.py b/mempalace/repair.py
index 7e98f0f..90fb0cf 100644
--- a/mempalace/repair.py
+++ b/mempalace/repair.py
@@ -43,11 +43,46 @@ from .backends.chroma import ChromaBackend, hnsw_capacity_status
 
 COLLECTION_NAME = "mempalace_drawers"
 
-# Collections rebuilt by ``rebuild_from_sqlite``. Order matters for the
-# upsert pass — drawers carry the bulk of the data, closets are the AAAK
-# index layer and reference drawer IDs by string in their documents (no
-# foreign-key validation, so ordering is informational, not load-bearing).
-RECOVERABLE_COLLECTIONS = ("mempalace_drawers", "mempalace_closets")
+# The closets collection (AAAK index layer) is intentionally fixed —
+# closets reference drawer IDs by string and live alongside drawers in the
+# same palace; renaming the closets collection per-deployment would break
+# cross-palace AAAK lookups. Drawer collection name comes from config
+# (see ``_recoverable_collections``).
+CLOSETS_COLLECTION_NAME = "mempalace_closets"
+
+
+def _drawers_collection_name() -> str:
+    """Resolve the drawers collection name from user config, falling back
+    to the module default ``COLLECTION_NAME`` if config is unreadable.
+
+    Recovery flows must honor ``MempalaceConfig().collection_name`` so a
+    user with a non-default drawer collection (e.g. multi-palace setups)
+    rebuilds the right rows. Closets remain fixed — see
+    ``CLOSETS_COLLECTION_NAME``.
+    """
+    try:
+        from .config import MempalaceConfig
+
+        return MempalaceConfig().collection_name or COLLECTION_NAME
+    except Exception:
+        return COLLECTION_NAME
+
+
+def _recoverable_collections() -> tuple[str, ...]:
+    """Collections rebuilt by ``rebuild_from_sqlite``, in upsert order.
+
+    Drawers first (bulk data), then closets (AAAK index layer that
+    references drawer IDs by string in their documents — no
+    foreign-key validation, so ordering is informational, not
+    load-bearing).
+    """
+    return (_drawers_collection_name(), CLOSETS_COLLECTION_NAME)
+
+
+# Back-compat alias for callers that imported the constant. New code
+# should call ``_recoverable_collections()`` so config changes are picked
+# up at call time.
+RECOVERABLE_COLLECTIONS = (COLLECTION_NAME, CLOSETS_COLLECTION_NAME)
 
 
 def _get_palace_path():
@@ -487,12 +522,11 @@ def _rebuild_one_collection(
     caller can stop the loop and print recovery instructions instead of
     silently shipping a partial palace.
     """
-    col = backend.create_collection(dest_palace, collection_name)
-
     ids: list[str] = []
     docs: list[str] = []
     metas: list[dict] = []
     upserted = 0
+    col = None
 
     def _flush() -> int:
         nonlocal upserted
@@ -507,6 +541,14 @@ def _rebuild_one_collection(
         return upserted
 
     try:
+        # ``create_collection`` lives inside the try so a Chroma-side
+        # "Collection already exists" failure (which can happen when the
+        # process-wide System cache still holds a pre-archive schema) is
+        # reported as a structured ``RebuildPartialError`` carrying
+        # ``archive_path`` — instead of an unstructured exception that
+        # strands the user without recovery instructions.
+        col = backend.create_collection(dest_palace, collection_name)
+
         for emb_id, doc, meta in extract_via_sqlite(source_palace, collection_name):
             ids.append(emb_id)
             docs.append(doc or "")
@@ -664,8 +706,14 @@ def rebuild_from_sqlite(
 
     Returns a ``{collection_name: row_count}`` dict so callers (CLI,
     tests) can verify the per-collection rebuild count without parsing
-    stdout. Returns ``{}`` on validation failures (missing source,
-    refusing to overwrite). Raises :class:`RebuildPartialError` if a
+    stdout. A successful rebuild always returns a dict with one key per
+    recoverable collection (values may be ``0`` when a collection is
+    legitimately empty in the source). The empty dict ``{}`` is reserved
+    for validation refusals (missing source DB, refusing to overwrite an
+    existing dest, in-place mode without ``archive_existing_dest``); CLI
+    callers should treat ``{}`` as an error and exit non-zero so CI and
+    scripts can distinguish "invalid inputs" from "successful recovery
+    that found zero rows." Raises :class:`RebuildPartialError` if a
     chromadb upsert fails partway through; the dest palace is left in
     place so the user can inspect what landed, and the in-place archive
     (when applicable) is reported in the error so the user can re-run
@@ -765,31 +813,42 @@ def rebuild_from_sqlite(
 
     os.makedirs(dest_palace, exist_ok=True)
 
+    # Backend lifetime is wrapped in try/finally so the dest palace's
+    # PersistentClient handle (opened lazily inside ``create_collection``
+    # / ``get_collection``) is released on every exit path: success,
+    # ``RebuildPartialError``, or any unexpected exception. Without this,
+    # a long-running process that calls ``rebuild_from_sqlite`` would
+    # leak SQLite/HNSW file handles into Chroma's ``SharedSystemClient``
+    # cache, surfacing later as "Collection already exists" on the next
+    # in-place rebuild or as a Windows file-lock failure on cleanup
+    # (cf. #1285's lifecycle hardening for the legacy rebuild path).
     backend = ChromaBackend()
     counts: dict[str, int] = {}
+    try:
+        for cname in _recoverable_collections():
+            print(f"\n  [{cname}]")
+            upserted = _rebuild_one_collection(
+                backend=backend,
+                source_palace=source_palace,
+                dest_palace=dest_palace,
+                collection_name=cname,
+                batch_size=batch_size,
+                archive_path=archive_path,
+                counts_so_far=counts,
+            )
+            counts[cname] = upserted
+            if upserted == 0:
+                print(f"    no rows found for {cname} in source palace")
+            else:
+                print(f"    done: {upserted} rows in {cname}")
 
-    for cname in RECOVERABLE_COLLECTIONS:
-        print(f"\n  [{cname}]")
-        upserted = _rebuild_one_collection(
-            backend=backend,
-            source_palace=source_palace,
-            dest_palace=dest_palace,
-            collection_name=cname,
-            batch_size=batch_size,
-            archive_path=archive_path,
-            counts_so_far=counts,
-        )
-        counts[cname] = upserted
-        if upserted == 0:
-            print(f"    no rows found for {cname} in source palace")
-        else:
-            print(f"    done: {upserted} rows in {cname}")
-
-    print(f"\n  Rebuild complete. {sum(counts.values())} total rows.")
-    if archive_path is not None:
-        print(f"  Original palace archived at: {archive_path}")
-    print(f"{'=' * 55}\n")
-    return counts
+        print(f"\n  Rebuild complete. {sum(counts.values())} total rows.")
+        if archive_path is not None:
+            print(f"  Original palace archived at: {archive_path}")
+        print(f"{'=' * 55}\n")
+        return counts
+    finally:
+        backend.close()
 
 
 def status(palace_path=None) -> dict:
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 6b4b7b3..71ca63d 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1097,3 +1097,64 @@ def test_reconfigure_stdio_is_noop_off_windows():
         _reconfigure_stdio_utf8_on_windows()
 
     assert stdin.reconfigure_calls == []
+
+
+# ── cmd_repair: from-sqlite mode exit codes ──────────────────────────
+
+
+@patch("mempalace.cli.MempalaceConfig")
+def test_cmd_repair_from_sqlite_validation_refusal_exits_nonzero(mock_config_cls, tmp_path, capsys):
+    """When ``rebuild_from_sqlite`` returns ``{}`` for a validation
+    refusal (missing source DB, in-place without --archive-existing,
+    refusing to overwrite an existing dest), the CLI must surface a
+    non-zero exit so unattended scripts and CI distinguish "invalid
+    inputs" from "successful recovery that found zero rows."
+
+    Catches: a regression where the CLI treats the validation-refusal
+    sentinel as success, leaving CI green on a no-op repair that should
+    have alerted an operator.
+    """
+    palace_dir = tmp_path / "palace"
+    palace_dir.mkdir()
+    mock_config_cls.return_value.palace_path = str(palace_dir)
+
+    args = argparse.Namespace(
+        palace=str(palace_dir),
+        mode="from-sqlite",
+        source=None,
+        archive_existing=False,
+        yes=True,
+    )
+    with patch("mempalace.repair.rebuild_from_sqlite", return_value={}):
+        with pytest.raises(SystemExit) as excinfo:
+            cmd_repair(args)
+    assert excinfo.value.code == 1
+
+
+@patch("mempalace.cli.MempalaceConfig")
+def test_cmd_repair_from_sqlite_success_does_not_exit(mock_config_cls, tmp_path):
+    """A successful from-sqlite rebuild — even one that finds zero rows
+    in a legitimately empty source palace — must NOT call ``sys.exit``.
+    A populated counts dict (with ``0`` values) is the success signal;
+    only the empty dict ``{}`` is reserved for validation refusal.
+
+    Catches: a regression where ``if not counts`` is replaced by
+    ``if not sum(counts.values())`` or similar, conflating "empty source"
+    with "validation refused" and breaking idempotent recovery scripts.
+    """
+    palace_dir = tmp_path / "palace"
+    palace_dir.mkdir()
+    mock_config_cls.return_value.palace_path = str(palace_dir)
+
+    args = argparse.Namespace(
+        palace=str(palace_dir),
+        mode="from-sqlite",
+        source=None,
+        archive_existing=False,
+        yes=True,
+    )
+    # Zero rows but per-collection keys present → success, no exit.
+    fake_counts = {"mempalace_drawers": 0, "mempalace_closets": 0}
+    with patch("mempalace.repair.rebuild_from_sqlite", return_value=fake_counts):
+        # Should return cleanly; no SystemExit raised.
+        cmd_repair(args)
diff --git a/tests/test_repair.py b/tests/test_repair.py
index 35e6a44..8ca72fb 100644
--- a/tests/test_repair.py
+++ b/tests/test_repair.py
@@ -719,6 +719,13 @@ def test_extract_via_sqlite_returns_all_rows_with_metadata(tmp_path):
     Catches: anyone who breaks the segments/embeddings/embedding_metadata
     JOIN, swaps the metadata vs vector segment, or changes how the
     document is stored under the ``chroma:document`` key.
+
+    Also asserts every embedding row underlying the extraction lives in
+    a ``segments.scope = 'METADATA'`` segment. Document + metadata rows
+    are stored under METADATA in Chroma's segment layout while HNSW
+    files live under ``VECTOR``; locking that assumption in here means a
+    future refactor that accidentally points the JOIN at ``VECTOR``
+    fails this test instead of silently regressing the recovery path.
     """
     rows = [
         (f"drawer_{i:03d}", f"document body {i}", {"wing": "test_wing", "room": f"r{i % 3}"})
@@ -736,6 +743,35 @@ def test_extract_via_sqlite_returns_all_rows_with_metadata(tmp_path):
         assert got_doc == doc, f"document mangled for {emb_id}"
         assert got_meta == meta, f"metadata mangled for {emb_id}: {got_meta!r}"
 
+    # Lock the segment-scope assumption directly against Chroma's on-disk
+    # layout so a future change that points the extraction JOIN at the
+    # VECTOR segment cannot pass this test. Query each extracted row's
+    # backing segment scope via the same SQLite tables ``extract_via_sqlite``
+    # reads from.
+    sqlite_path = os.path.join(str(tmp_path), "chroma.sqlite3")
+    conn = sqlite3.connect(f"file:{sqlite_path}?mode=ro", uri=True)
+    try:
+        scopes = {
+            scope
+            for (scope,) in conn.execute(
+                """
+                SELECT DISTINCT s.scope
+                FROM embeddings e
+                JOIN segments s ON e.segment_id = s.id
+                JOIN collections c ON s.collection = c.id
+                WHERE c.name = ? AND e.embedding_id IN ({})
+                """.format(",".join("?" * len(extracted))),
+                ("mempalace_drawers", *(emb_id for emb_id, _, _ in extracted)),
+            )
+        }
+    finally:
+        conn.close()
+    assert scopes == {"METADATA"}, (
+        f"extraction is reading from segments scoped {scopes!r}; only "
+        "'METADATA' should back the document/metadata rows. If Chroma's "
+        "segment layout changed, update extract_via_sqlite's WHERE clause."
+    )
+
 
 def test_extract_via_sqlite_preserves_typed_metadata(tmp_path):
     """Chromadb stores int / float / bool / string in distinct typed
@@ -973,3 +1009,56 @@ def test_rebuild_from_sqlite_raises_on_upsert_failure(tmp_path, monkeypatch):
     assert err.archive_path is not None
     assert os.path.isfile(os.path.join(err.archive_path, "chroma.sqlite3"))
     assert err.dest_palace == os.path.abspath(str(palace))
+
+
+def test_rebuild_from_sqlite_honors_configured_drawer_collection_name(tmp_path, monkeypatch):
+    """A user with a non-default drawers collection name (set via
+    ``MempalaceConfig().collection_name``) must have THAT collection
+    rebuilt — not the hardcoded ``mempalace_drawers``.
+
+    Catches: a regression where the recovery path silently rebuilds the
+    default-name collection on a custom-named palace, leaving the user's
+    actual data unrebuilt while reporting "rebuild complete." This is
+    the failure mode reviewer mjc flagged on PR #1310 as needing to line
+    up with the configured-collection-name work in #1312. Closets stay
+    fixed (``mempalace_closets``) by design — the AAAK index references
+    drawer IDs by string and is not per-deployment configurable.
+
+    Strategy: monkeypatch the lazy resolver so the test is hermetic and
+    does not depend on the global config file or env state.
+    """
+    from mempalace.backends.chroma import ChromaBackend
+
+    custom_drawers = "custom_drawers_xyz"
+    monkeypatch.setattr(repair, "_drawers_collection_name", lambda: custom_drawers)
+
+    source = tmp_path / "source"
+    dest = tmp_path / "dest"
+
+    drawer_rows = [(f"d{i}", f"body {i}", {"wing": "alpha"}) for i in range(3)]
+    closet_rows = [("closet_a", "abbrev →d0", {"wing": "alpha"})]
+    _seed_palace(source, custom_drawers, drawer_rows)
+    _seed_palace(source, "mempalace_closets", closet_rows)
+
+    counts = repair.rebuild_from_sqlite(str(source), str(dest))
+
+    # Rebuilt under the custom name, not under the default "mempalace_drawers".
+    assert counts == {custom_drawers: 3, "mempalace_closets": 1}
+
+    backend = ChromaBackend()
+    rebuilt_drawers = backend.get_collection(str(dest), custom_drawers)
+    assert rebuilt_drawers.count() == 3
+
+    # Default-name collection must NOT exist in dest — proves we did not
+    # silently fall back to the hardcoded name during rebuild.
+    try:
+        rebuilt_default = backend.get_collection(str(dest), "mempalace_drawers")
+        # If get_collection returns without raising, count() should be 0
+        # (chromadb may auto-create on get with some EFs); a non-zero
+        # count would mean we wrote rows to the wrong collection.
+        assert rebuilt_default.count() == 0, (
+            "rebuild leaked rows into the default-name collection on a "
+            "custom-name palace — recovery wrote to the wrong collection."
+        )
+    except Exception:
+        pass  # Expected: collection wasn't created.

From e334e257bf4bb634cf4f31b4087f84f8b467938b Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Wed, 6 May 2026 04:52:18 -0300
Subject: [PATCH 52/60] fix(mcp): retry _get_collection once on transient
 failure (#1286)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A transient chromadb exception inside `_get_collection` was swallowed by
the bare `except Exception: return None`, leaving every subsequent tool
call hitting the same poisoned cache silently. The fix wraps the body
in a `for attempt in range(2)` loop: on attempt 0 failure, log via
`logger.exception(...)` and clear `_client_cache` / `_collection_cache`
/ `_metadata_cache` so the next iteration forces `_get_client()` to
rebuild from scratch — that path now re-runs `quarantine_stale_hnsw`
(per #1322), so the second attempt heals the common stale-handle case
automatically. If both attempts fail, return `None` (matches the prior
contract for permanent failures).

Two new tests in `tests/test_mcp_server.py::TestCacheInvalidation`:
- `test_get_collection_retries_once_on_exception` — first attempt raises
  via a monkeypatched `_get_client`, second attempt succeeds; assert the
  caller gets the collection back, not None.
- `test_get_collection_returns_none_after_two_failures` — both attempts
  fail, assert we exhaust the loop and return None (no infinite retry).

Surgical extraction from PR #1286, which carried the same fix idea
(plus a fork-sync bundle that couldn't be merged); credit to the
original author below.

Co-authored-by: Jeffrey Hein <jp@jphein.com>
---
 mempalace/mcp_server.py  | 148 +++++++++++++++++++++++----------------
 tests/test_mcp_server.py |  65 +++++++++++++++++
 2 files changed, 152 insertions(+), 61 deletions(-)

diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index 58f9ba9..bbb9c93 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -326,68 +326,94 @@ def _get_client():
 
 
 def _get_collection(create=False):
-    """Return the ChromaDB collection, caching the client between calls."""
-    global _collection_cache, _metadata_cache, _metadata_cache_time
-    try:
-        client = _get_client()
-        # ChromaDB 1.x persists the EF *identity* (its ``name()``) with the
-        # collection but not the EF *instance/configuration*. So a reader or
-        # writer that omits ``embedding_function=`` silently gets chromadb's
-        # built-in ``DefaultEmbeddingFunction`` — its ``name()`` matches the
-        # one we spoof in ``mempalace.embedding`` (both report ``"default"``,
-        # the identity check passes), but the *provider list* is chromadb's
-        # default rather than the user's resolved device. On bleeding-edge
-        # interpreters (#1299: python 3.14 + chromadb 1.5.x on Apple Silicon)
-        # that default provider selection can SIGSEGV the host process on
-        # first ``col.add()``. The miner / Stop hook ingest path avoids this
-        # because it routes through ``ChromaBackend.get_collection``, which
-        # resolves the EF via ``ChromaBackend._resolve_embedding_function``;
-        # the MCP server bypassed that abstraction. Resolve the EF inside the
-        # branches that actually open a collection so warm-cache reads stay
-        # zero-cost. Reuse the backend helper so the two call sites can't
-        # drift on logging or fallback semantics.
-        if create:
-            ef = ChromaBackend._resolve_embedding_function()
-            ef_kwargs = {"embedding_function": ef} if ef is not None else {}
-            # hnsw:num_threads=1 disables ChromaDB's multi-threaded ParallelFor
-            # HNSW insert path, which has a race in repairConnectionsForUpdate /
-            # addPoint (see issues #974, #965). Set via metadata on fresh
-            # collections and re-applied via _pin_hnsw_threads() for legacy
-            # palaces whose collections were created before this fix (the
-            # runtime config does not persist cross-process in chromadb 1.5.x,
-            # so the retrofit runs every time _get_collection opens a cache).
-            #
-            # ChromaDB 1.5.x's Rust binding SIGSEGVs when get_or_create_collection
-            # is called with metadata that differs from what's stored. The split
-            # below skips the metadata-comparison codepath for existing
-            # collections, mirroring the backend-layer fix from #1262.
-            try:
+    """Return the ChromaDB collection, caching the client between calls.
+
+    On failure, log the exception and retry once after clearing the client
+    and collection caches. Tools were silently returning ``None`` when a
+    cached client/collection went stale — typically after the chromadb
+    rust bindings invalidated a handle following an out-of-band write —
+    leaving the LLM with no diagnostic and no recovery path. The retry
+    forces ``_get_client()`` to rebuild from scratch (which re-runs
+    ``quarantine_stale_hnsw`` per #1322), so the second attempt heals the
+    common stale-handle / stale-HNSW case automatically.
+    """
+    global _client_cache, _collection_cache, _metadata_cache, _metadata_cache_time
+    for attempt in range(2):
+        try:
+            client = _get_client()
+            # ChromaDB 1.x persists the EF *identity* (its ``name()``) with the
+            # collection but not the EF *instance/configuration*. So a reader or
+            # writer that omits ``embedding_function=`` silently gets chromadb's
+            # built-in ``DefaultEmbeddingFunction`` — its ``name()`` matches the
+            # one we spoof in ``mempalace.embedding`` (both report ``"default"``,
+            # the identity check passes), but the *provider list* is chromadb's
+            # default rather than the user's resolved device. On bleeding-edge
+            # interpreters (#1299: python 3.14 + chromadb 1.5.x on Apple Silicon)
+            # that default provider selection can SIGSEGV the host process on
+            # first ``col.add()``. The miner / Stop hook ingest path avoids this
+            # because it routes through ``ChromaBackend.get_collection``, which
+            # resolves the EF via ``ChromaBackend._resolve_embedding_function``;
+            # the MCP server bypassed that abstraction. Resolve the EF inside the
+            # branches that actually open a collection so warm-cache reads stay
+            # zero-cost. Reuse the backend helper so the two call sites can't
+            # drift on logging or fallback semantics.
+            if create:
+                ef = ChromaBackend._resolve_embedding_function()
+                ef_kwargs = {"embedding_function": ef} if ef is not None else {}
+                # hnsw:num_threads=1 disables ChromaDB's multi-threaded ParallelFor
+                # HNSW insert path, which has a race in repairConnectionsForUpdate /
+                # addPoint (see issues #974, #965). Set via metadata on fresh
+                # collections and re-applied via _pin_hnsw_threads() for legacy
+                # palaces whose collections were created before this fix (the
+                # runtime config does not persist cross-process in chromadb 1.5.x,
+                # so the retrofit runs every time _get_collection opens a cache).
+                #
+                # ChromaDB 1.5.x's Rust binding SIGSEGVs when get_or_create_collection
+                # is called with metadata that differs from what's stored. The split
+                # below skips the metadata-comparison codepath for existing
+                # collections, mirroring the backend-layer fix from #1262.
+                try:
+                    raw = client.get_collection(_config.collection_name, **ef_kwargs)
+                except _ChromaNotFoundError:
+                    raw = client.create_collection(
+                        _config.collection_name,
+                        metadata={
+                            "hnsw:space": "cosine",
+                            "hnsw:num_threads": 1,
+                            **_HNSW_BLOAT_GUARD,
+                        },
+                        **ef_kwargs,
+                    )
+                _pin_hnsw_threads(raw)
+                _collection_cache = ChromaCollection(raw, palace_path=_config.palace_path)
+                _metadata_cache = None
+                _metadata_cache_time = 0
+            elif _collection_cache is None:
+                ef = ChromaBackend._resolve_embedding_function()
+                ef_kwargs = {"embedding_function": ef} if ef is not None else {}
                 raw = client.get_collection(_config.collection_name, **ef_kwargs)
-            except _ChromaNotFoundError:
-                raw = client.create_collection(
-                    _config.collection_name,
-                    metadata={
-                        "hnsw:space": "cosine",
-                        "hnsw:num_threads": 1,
-                        **_HNSW_BLOAT_GUARD,
-                    },
-                    **ef_kwargs,
-                )
-            _pin_hnsw_threads(raw)
-            _collection_cache = ChromaCollection(raw, palace_path=_config.palace_path)
-            _metadata_cache = None
-            _metadata_cache_time = 0
-        elif _collection_cache is None:
-            ef = ChromaBackend._resolve_embedding_function()
-            ef_kwargs = {"embedding_function": ef} if ef is not None else {}
-            raw = client.get_collection(_config.collection_name, **ef_kwargs)
-            _pin_hnsw_threads(raw)
-            _collection_cache = ChromaCollection(raw, palace_path=_config.palace_path)
-            _metadata_cache = None
-            _metadata_cache_time = 0
-        return _collection_cache
-    except Exception:
-        return None
+                _pin_hnsw_threads(raw)
+                _collection_cache = ChromaCollection(raw, palace_path=_config.palace_path)
+                _metadata_cache = None
+                _metadata_cache_time = 0
+            return _collection_cache
+        except Exception:
+            logger.exception(
+                "_get_collection attempt %d/2 failed (palace=%s, create=%s)",
+                attempt + 1,
+                _config.palace_path,
+                create,
+            )
+            if attempt == 0:
+                # Reset all caches so the next attempt forces _get_client()
+                # to rebuild the chromadb client from scratch — that path
+                # re-runs quarantine_stale_hnsw (#1322) and reopens the
+                # collection cleanly, healing the common stale-handle case.
+                _client_cache = None
+                _collection_cache = None
+                _metadata_cache = None
+                _metadata_cache_time = 0
+    return None
 
 
 def _no_palace():
diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index c073830..ae20bf3 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -1259,6 +1259,71 @@ class TestCacheInvalidation:
             assert "embedding_function" in kwargs
             assert kwargs["embedding_function"] is not None
 
+    def test_get_collection_retries_once_on_exception(self, monkeypatch, config, palace_path, kg):
+        """Regression: a transient failure inside _get_collection must trigger
+        one retry after clearing the client/collection caches, not silently
+        return None.
+
+        Before this fix, a stale chromadb handle (e.g. the rust bindings
+        invalidating after an out-of-band write) would raise inside the
+        single ``try`` block, get swallowed by ``except Exception: return
+        None``, and every subsequent tool call would hit the same poisoned
+        cache returning None. The retry forces ``_get_client()`` to rebuild
+        the client (which re-runs ``quarantine_stale_hnsw`` per #1322), so
+        the second attempt heals the common stale-handle case.
+        """
+        _patch_mcp_server(monkeypatch, config, kg)
+        _client, _col = _get_collection(palace_path, create=True)
+        del _client
+        from mempalace import mcp_server
+
+        # Force a cold cache so the first call goes through the open path.
+        mcp_server._client_cache = None
+        mcp_server._collection_cache = None
+
+        real_get_client = mcp_server._get_client
+        attempts = {"count": 0}
+
+        def flaky_get_client():
+            attempts["count"] += 1
+            if attempts["count"] == 1:
+                raise RuntimeError("simulated transient chromadb failure")
+            return real_get_client()
+
+        monkeypatch.setattr(mcp_server, "_get_client", flaky_get_client)
+
+        col = mcp_server._get_collection()
+
+        # Both attempts ran and the second succeeded.
+        assert attempts["count"] == 2
+        assert col is not None
+
+    def test_get_collection_returns_none_after_two_failures(
+        self, monkeypatch, config, palace_path, kg
+    ):
+        """If both attempts fail, return None (matches the prior contract for
+        permanent failures — only the transient case is now self-healing)."""
+        _patch_mcp_server(monkeypatch, config, kg)
+        _client, _col = _get_collection(palace_path, create=True)
+        del _client
+        from mempalace import mcp_server
+
+        mcp_server._client_cache = None
+        mcp_server._collection_cache = None
+
+        attempts = {"count": 0}
+
+        def always_fails():
+            attempts["count"] += 1
+            raise RuntimeError("permanent chromadb failure")
+
+        monkeypatch.setattr(mcp_server, "_get_client", always_fails)
+
+        col = mcp_server._get_collection()
+
+        assert attempts["count"] == 2
+        assert col is None
+
 
 class TestKGLazyCache:
     """Lazy per-path KnowledgeGraph cache (issue #1136)."""

From bddba59ae3ffe1fab6a281b7090f9945287a98d6 Mon Sep 17 00:00:00 2001
From: MillaJ <232237854+milla-jovovich@users.noreply.github.com>
Date: Wed, 6 May 2026 12:35:01 -0700
Subject: [PATCH 53/60] docs: add 30-day expiry callout + ship 4 auto-save
 tools
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a brief [!IMPORTANT] callout at the top of the README pointing
users to the urgent announcement at #1388. Claude Code auto-deletes
local JSONL transcripts after 30 days; users without the auto-save
hooks wired are losing transcript data off the rolling window.

Ships 4 small standalone tools at tools/:
- backup_claude_jsonls.sh — rsync ~/.claude/projects/ to a safe folder
- render_jsonl.py — convert JSONL transcripts to readable text
- find_orphan_claude_jsonls.sh — scan backup locations for orphan
  Claude Code transcripts (multi-line shape detection + topic preview)
- save.md — Claude Code slash command for manual /save into MemPalace

Tools verified by independent agent against v3.3.4 source.
Read-only on user data. POSIX bash + Python stdlib only.
---
 README.md                          |   4 +
 tools/backup_claude_jsonls.sh      |  39 ++++++++++
 tools/find_orphan_claude_jsonls.sh | 115 +++++++++++++++++++++++++++++
 tools/render_jsonl.py              |  71 ++++++++++++++++++
 tools/save.md                      |  26 +++++++
 5 files changed, 255 insertions(+)
 create mode 100755 tools/backup_claude_jsonls.sh
 create mode 100755 tools/find_orphan_claude_jsonls.sh
 create mode 100755 tools/render_jsonl.py
 create mode 100644 tools/save.md

diff --git a/README.md b/README.md
index 8157fca..d82bcd2 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,10 @@
 > domain — including `mempalace.tech` — is an impostor and may distribute
 > malware. Details and timeline: [docs/HISTORY.md](docs/HISTORY.md).
 
+> [!IMPORTANT]
+> **🚨 Claude Code sessions expire in 30 days w/out auto-save hooks wired!** **[Read this →](https://github.com/MemPalace/mempalace/discussions/1388)**
+
+
 <div align="center">
 
 <img src="assets/mempalace_logo.png" alt="MemPalace" width="240">
diff --git a/tools/backup_claude_jsonls.sh b/tools/backup_claude_jsonls.sh
new file mode 100755
index 0000000..f252de0
--- /dev/null
+++ b/tools/backup_claude_jsonls.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+# backup_claude_jsonls.sh
+#
+# Claude Code stores every conversation as a JSONL transcript at
+#   ~/.claude/projects/<encoded-project>/<session-uuid>.jsonl
+# Anthropic auto-deletes those files after 30 DAYS:
+#   https://docs.claude.com/en/docs/claude-code/data-usage
+#
+# This script copies them, read-only, into ~/Documents/Claude_JSONL_Backup/
+# so the 30-day clock no longer applies. Re-run any time — rsync is incremental.
+# It NEVER deletes, modifies, or touches files inside ~/.claude/.
+
+set -eu
+
+SRC="${HOME}/.claude/projects/"
+DST="${HOME}/Documents/Claude_JSONL_Backup/"
+
+[ -d "$SRC" ] || { echo "ERROR: $SRC does not exist."; exit 1; }
+mkdir -p "$DST"
+
+echo "Backing up $SRC -> $DST"
+rsync -a --times "$SRC" "$DST"
+
+src_count=$(find "$SRC" -type f -name '*.jsonl' | wc -l | tr -d ' ')
+dst_count=$(find "$DST" -type f -name '*.jsonl' | wc -l | tr -d ' ')
+oldest=$(find "$DST" -type f -name '*.jsonl' -exec stat -f '%Sm %N' -t '%Y-%m-%d' {} \; 2>/dev/null \
+        || find "$DST" -type f -name '*.jsonl' -printf '%TY-%Tm-%Td %p\n' 2>/dev/null)
+oldest_date=$(echo "$oldest" | sort | head -n 1 | awk '{print $1}')
+newest_date=$(echo "$oldest" | sort | tail -n 1 | awk '{print $1}')
+
+echo "Source JSONL count : $src_count"
+echo "Backup JSONL count : $dst_count"
+echo "Oldest backup file : ${oldest_date:-n/a}"
+echo "Newest backup file : ${newest_date:-n/a}"
+
+if [ "$src_count" -ne "$dst_count" ]; then
+  echo "FAIL: count mismatch ($src_count vs $dst_count)"; exit 2
+fi
+echo "OK: backup verified."
diff --git a/tools/find_orphan_claude_jsonls.sh b/tools/find_orphan_claude_jsonls.sh
new file mode 100755
index 0000000..43523f5
--- /dev/null
+++ b/tools/find_orphan_claude_jsonls.sh
@@ -0,0 +1,115 @@
+#!/usr/bin/env bash
+# find_orphan_claude_jsonls.sh — v3 (multi-line shape + verb-aware preview)
+# -----------------------------------------------------------------------------
+# Finds Claude Code conversation transcripts (.jsonl) that may have survived in
+# backup/sync locations. Claude Code stores transcripts at
+# ~/.claude/projects/<encoded>/<session>.jsonl and auto-deletes them locally
+# after 30 days. If your machine syncs to iCloud, Dropbox, Google Drive,
+# OneDrive, Time Machine, or you copied transcripts elsewhere manually, those
+# copies still exist. This script finds them and shows a topic preview from
+# the first substantive user message — strips leading filler interjections
+# ("ok so", "oh", "well", "hey") so previews surface the actual content.
+#
+# Read-only. Safe to re-run.
+# -----------------------------------------------------------------------------
+set -eu
+
+LOCATIONS=(
+  "$HOME/Library/Mobile Documents" "$HOME/Dropbox" "$HOME/Google Drive"
+  "$HOME/OneDrive" "$HOME/Documents" "$HOME/Desktop" "/Volumes"
+)
+
+TMP="$(mktemp)"; trap 'rm -f "$TMP" "$TMP.s"' EXIT
+
+printf "Scanning backup locations" >&2
+for loc in "${LOCATIONS[@]}"; do
+  [ -d "$loc" ] || continue
+  printf "." >&2
+  while IFS= read -r -d '' f; do
+    # Combined: shape detection (multi-line) + verb-aware topic preview
+    if preview="$(python3 - "$f" 2>/dev/null <<'PYEOF'
+import json, sys, re
+
+# Single-word/short greetings — message gets skipped entirely if it is just one of these
+GREETINGS = {'hi','hey','hello','thanks','thank you','ok','okay','yes','no',
+             'sure','cool','great','good','done','yep','nope','perfect','copy'}
+
+# Leading filler — interjections that get STRIPPED from the start of a message
+# before the preview is taken. Iterative — handles "ok so well, then..." → "then..."
+LEADING_FILLER = re.compile(
+    r'^(?:ok(?:ay)?|so|oh|well|anyway|btw|hmm+|um+|uh+|hey|hi|hello|right|'
+    r'yes|no|sure|cool|great|good|listen|look|wait|actually|alright|gotcha|'
+    r'yeah|yep|nope|nah)\b[\s,!.?:;-]*',
+    re.IGNORECASE
+)
+
+path = sys.argv[1]
+shape_ok = False
+preview = ""
+try:
+    with open(path, 'r', errors='replace') as fh:
+        for i, line in enumerate(fh):
+            if i >= 30: break
+            try:
+                d = json.loads(line)
+            except Exception:
+                continue
+            if not isinstance(d, dict): continue
+            # Shape check — accept if any line in first 30 has session fields
+            if not shape_ok and 'sessionId' in d and 'timestamp' in d and 'message' in d:
+                shape_ok = True
+            # Preview — first user message after stripping leading filler
+            if not preview:
+                role = d.get('type', '') or d.get('message', {}).get('role', '')
+                if role == 'user':
+                    content = d.get('message', {}).get('content', '')
+                    if isinstance(content, list):
+                        text = ' '.join(
+                            c.get('text', '') for c in content
+                            if isinstance(c, dict) and c.get('type') == 'text'
+                        )
+                    elif isinstance(content, str):
+                        text = content
+                    else:
+                        text = ''
+                    text = re.sub(r'\s+', ' ', text).strip()
+                    # Skip messages that are pure greetings
+                    if text.lower() in GREETINGS:
+                        continue
+                    # Iteratively strip leading filler tokens until stable
+                    prev_text = None
+                    while prev_text != text:
+                        prev_text = text
+                        text = LEADING_FILLER.sub('', text).strip()
+                    # Skip if what remains is too short
+                    if len(text) < 20:
+                        continue
+                    preview = text[:80] + ('...' if len(text) > 80 else '')
+            if shape_ok and preview: break
+except Exception:
+    pass
+if shape_ok:
+    print(preview if preview else "(no preview — first 30 lines were greetings or short)")
+    sys.exit(0)
+sys.exit(1)
+PYEOF
+)"; then
+      mtime="$(stat -f '%Sm' -t '%Y-%m-%d' "$f" 2>/dev/null || stat -c '%y' "$f" 2>/dev/null | cut -d' ' -f1)"
+      size="$(stat -f '%z' "$f" 2>/dev/null || stat -c '%s' "$f" 2>/dev/null)"
+      printf '%s\t%s\t%s\t%s\n' "$mtime" "$size" "$f" "$preview" >>"$TMP"
+    fi
+  done < <(find "$loc" -type f -name '*.jsonl' -print0 2>/dev/null)
+done
+printf "\n" >&2
+
+count=$(wc -l <"$TMP" | tr -d ' ')
+if [ "$count" -eq 0 ]; then
+  echo "No orphan Claude Code transcripts found in scanned backup locations."
+  exit 0
+fi
+sort -k1,1 "$TMP" >"$TMP.s"
+oldest="$(head -n 1 "$TMP.s" | cut -f1)"
+newest="$(tail -n 1 "$TMP.s" | cut -f1)"
+echo "Found $count orphan Claude Code transcript(s). Oldest: $oldest  Newest: $newest"
+echo "----------------------------------------------------------------------"
+awk -F'\t' '{ printf "%s  %10s  %s\n              \"%s\"\n\n", $1, $2, $3, $4 }' "$TMP.s"
diff --git a/tools/render_jsonl.py b/tools/render_jsonl.py
new file mode 100755
index 0000000..3d74c00
--- /dev/null
+++ b/tools/render_jsonl.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+"""render_jsonl.py — turn one Claude Code JSONL transcript into readable text.
+
+Claude Code stores conversations at ~/.claude/projects/<proj>/<uuid>.jsonl and
+Anthropic auto-deletes them after 30 days
+(https://docs.claude.com/en/docs/claude-code/data-usage). This script renders a
+JSONL into a clean .txt so you can keep / read / share it without the tooling.
+
+Usage:
+    python3 render_jsonl.py <input.jsonl> [output.txt]
+
+Stdlib only. Python 3.9+. Read-only on the input.
+"""
+import json, sys
+from pathlib import Path
+
+def extract_text(content):
+    if isinstance(content, str):
+        return content.strip()
+    if isinstance(content, list):
+        parts = []
+        for blk in content:
+            if isinstance(blk, dict) and blk.get("type") == "text":
+                t = (blk.get("text") or "").strip()
+                if t:
+                    parts.append(t)
+        return "\n".join(parts)
+    return ""
+
+def main():
+    if len(sys.argv) < 2:
+        print(__doc__); sys.exit(1)
+    src = Path(sys.argv[1])
+    if not src.is_file():
+        print(f"ERROR: not a file: {src}"); sys.exit(1)
+    out = open(sys.argv[2], "w", encoding="utf-8") if len(sys.argv) > 2 else sys.stdout
+
+    turns, stamps = [], []
+    for raw in src.read_text(encoding="utf-8", errors="replace").splitlines():
+        if not raw.strip():
+            continue
+        try:
+            obj = json.loads(raw)
+        except json.JSONDecodeError:
+            continue
+        role = obj.get("type") or (obj.get("message") or {}).get("role")
+        if role not in ("user", "assistant"):
+            continue
+        msg = obj.get("message") or obj
+        text = extract_text(msg.get("content"))
+        if not text:
+            continue
+        ts = obj.get("timestamp") or ""
+        if ts: stamps.append(ts)
+        turns.append((ts, role, text))
+
+    header = [
+        f"# Claude Code transcript: {src}",
+        f"# Total turns: {len(turns)}",
+        f"# Date range : {min(stamps) if stamps else 'n/a'}  ->  {max(stamps) if stamps else 'n/a'}",
+        "#" + "-" * 70, "",
+    ]
+    out.write("\n".join(header))
+    for ts, role, text in turns:
+        out.write(f"\n[{ts}] {role.upper()}\n{text}\n\n{'-'*72}\n")
+    if out is not sys.stdout:
+        out.close()
+        print(f"Wrote {len(turns)} turns to {sys.argv[2]}")
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/save.md b/tools/save.md
new file mode 100644
index 0000000..914156b
--- /dev/null
+++ b/tools/save.md
@@ -0,0 +1,26 @@
+---
+description: Save the current Claude Code session into MemPalace. Idempotent — won't dupe.
+---
+
+# /save
+
+Save the current Claude Code session into MemPalace. Run this when you
+want a checkpoint. Safe to run repeatedly — drawer IDs are content-hashed
+so re-running on the same session overwrites in place, no duplicates.
+
+Behavior:
+
+1. Find the current session's JSONL transcript path (Claude Code passes
+   it via the conversation context — look for `~/.claude/projects/` paths).
+2. Run via bash:
+
+   ```
+   mempalace mine "<TRANSCRIPT_PATH>" --mode convos --wing claude_imports
+   ```
+
+3. If the user supplied an argument after `/save`, use it as the wing name
+   instead of `claude_imports` (e.g. `/save my_research` →
+   `--wing my_research`).
+4. Report back: how many drawers were filed, into which wing/room.
+
+Requires `mempalace` to be installed (`pip install mempalace`).

From 921ff5a6faf753130ee5b6e9666daf6eec2bfc65 Mon Sep 17 00:00:00 2001
From: MillaJ <232237854+milla-jovovich@users.noreply.github.com>
Date: Wed, 6 May 2026 15:39:08 -0700
Subject: [PATCH 54/60] fix(tools/render_jsonl): split chained statements per
 ruff 0.4.x

Addresses CI lint feedback on PR #1391. No behavior change.
- Split `import json, sys` into separate lines (E401)
- Split chained `print(...); sys.exit(1)` into two lines (E702, two occurrences)
- Split inline `if ts: stamps.append(ts)` into two lines (E701)

Verified: `ruff check tools/render_jsonl.py` reports "All checks passed!"
Tool still renders correctly (3 turns from a real JSONL test, identical output to pre-fix).
---
 tools/render_jsonl.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/tools/render_jsonl.py b/tools/render_jsonl.py
index 3d74c00..3372ee1 100755
--- a/tools/render_jsonl.py
+++ b/tools/render_jsonl.py
@@ -11,7 +11,8 @@ Usage:
 
 Stdlib only. Python 3.9+. Read-only on the input.
 """
-import json, sys
+import json
+import sys
 from pathlib import Path
 
 def extract_text(content):
@@ -29,10 +30,12 @@ def extract_text(content):
 
 def main():
     if len(sys.argv) < 2:
-        print(__doc__); sys.exit(1)
+        print(__doc__)
+        sys.exit(1)
     src = Path(sys.argv[1])
     if not src.is_file():
-        print(f"ERROR: not a file: {src}"); sys.exit(1)
+        print(f"ERROR: not a file: {src}")
+        sys.exit(1)
     out = open(sys.argv[2], "w", encoding="utf-8") if len(sys.argv) > 2 else sys.stdout
 
     turns, stamps = [], []
@@ -51,7 +54,8 @@ def main():
         if not text:
             continue
         ts = obj.get("timestamp") or ""
-        if ts: stamps.append(ts)
+        if ts:
+            stamps.append(ts)
         turns.append((ts, role, text))
 
     header = [

From 7c679ba6250fd8cc57af24a22ce9e19b67b20429 Mon Sep 17 00:00:00 2001
From: MillaJ <232237854+milla-jovovich@users.noreply.github.com>
Date: Wed, 6 May 2026 16:12:34 -0700
Subject: [PATCH 55/60] fix(tools/render_jsonl): apply ruff format
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Earlier commit fixed ruff lint but missed the formatter check.
This applies `ruff format` — adds standard PEP8 blank lines between
functions, splits one inline list. No behavior change.

Verified: both `ruff format --check` and `ruff check` pass cleanly.
Tool still renders correctly.
---
 tools/render_jsonl.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tools/render_jsonl.py b/tools/render_jsonl.py
index 3372ee1..2bec0da 100755
--- a/tools/render_jsonl.py
+++ b/tools/render_jsonl.py
@@ -11,10 +11,12 @@ Usage:
 
 Stdlib only. Python 3.9+. Read-only on the input.
 """
+
 import json
 import sys
 from pathlib import Path
 
+
 def extract_text(content):
     if isinstance(content, str):
         return content.strip()
@@ -28,6 +30,7 @@ def extract_text(content):
         return "\n".join(parts)
     return ""
 
+
 def main():
     if len(sys.argv) < 2:
         print(__doc__)
@@ -62,7 +65,8 @@ def main():
         f"# Claude Code transcript: {src}",
         f"# Total turns: {len(turns)}",
         f"# Date range : {min(stamps) if stamps else 'n/a'}  ->  {max(stamps) if stamps else 'n/a'}",
-        "#" + "-" * 70, "",
+        "#" + "-" * 70,
+        "",
     ]
     out.write("\n".join(header))
     for ts, role, text in turns:
@@ -71,5 +75,6 @@ def main():
         out.close()
         print(f"Wrote {len(turns)} turns to {sys.argv[2]}")
 
+
 if __name__ == "__main__":
     main()

From 670aba974f7fd537bc48bab63c90599972bb304c Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Thu, 7 May 2026 07:37:25 -0300
Subject: [PATCH 56/60] test(repair): close ChromaBackend in _seed_palace to
 release Windows file locks

The helper opened a chromadb PersistentClient via ChromaBackend and never
closed it, leaving rust-side SQLite/HNSW file locks alive after the
helper returned. On Windows that blocks the in-place archive rename
inside rebuild_from_sqlite with WinError 32 on data_level0.bin,
causing test_rebuild_from_sqlite_in_place_archives_when_opted_in and
test_rebuild_from_sqlite_raises_on_upsert_failure to fail in the
test-windows CI job. No test consumes the returned collection, so
closing the backend in a try/finally is safe and drops the return.
---
 tests/test_repair.py | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/tests/test_repair.py b/tests/test_repair.py
index 8ca72fb..0277d9d 100644
--- a/tests/test_repair.py
+++ b/tests/test_repair.py
@@ -696,20 +696,24 @@ def test_max_seq_id_rollback_on_verification_failure(tmp_path, monkeypatch):
 def _seed_palace(palace_path, collection_name, rows):
     """Build a real chromadb palace at ``palace_path`` and add ``rows``.
 
-    ``rows`` is a list of ``(id, document, metadata)`` tuples. Returns
-    the populated collection so callers can assert on the writer's view
-    of state before the SQLite read.
+    ``rows`` is a list of ``(id, document, metadata)`` tuples.
     """
     from mempalace.backends.chroma import ChromaBackend
 
     backend = ChromaBackend()
-    col = backend.create_collection(str(palace_path), collection_name)
-    col.upsert(
-        ids=[r[0] for r in rows],
-        documents=[r[1] for r in rows],
-        metadatas=[r[2] for r in rows],
-    )
-    return col
+    try:
+        col = backend.create_collection(str(palace_path), collection_name)
+        col.upsert(
+            ids=[r[0] for r in rows],
+            documents=[r[1] for r in rows],
+            metadatas=[r[2] for r in rows],
+        )
+    finally:
+        # Release chromadb's rust-side SQLite/HNSW file locks before the
+        # caller proceeds. Without this, an in-place rebuild on Windows
+        # fails with WinError 32 on data_level0.bin during the archive
+        # rename (cf. PR #1310 test-windows job).
+        backend.close()
 
 
 def test_extract_via_sqlite_returns_all_rows_with_metadata(tmp_path):

From 7cf9b1758222a525e9285f979844797d92c1d59e Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Thu, 7 May 2026 07:53:28 -0300
Subject: [PATCH 57/60] fix(repair): quote ChromaBackend annotation for Python
 3.9 compatibility

`backend: ChromaBackend | None = None` evaluates the X | None union
eagerly at function-definition time, which Python 3.9 rejects with
TypeError: unsupported operand type(s) for |: 'ABCMeta' and 'NoneType'
since the new union syntax is 3.10+. Quoting matches the existing
forward-reference style in repair.py (sqlite_drawer_count, etc.) and
defers evaluation, restoring 3.9 compatibility.
---
 mempalace/repair.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mempalace/repair.py b/mempalace/repair.py
index 0585405..d7af0d9 100644
--- a/mempalace/repair.py
+++ b/mempalace/repair.py
@@ -593,7 +593,7 @@ def status(palace_path=None) -> dict:
 # ---------------------------------------------------------------------------
 
 
-def _close_chroma_handles(palace_path: str, backend: ChromaBackend | None = None) -> None:
+def _close_chroma_handles(palace_path: str, backend: "ChromaBackend | None" = None) -> None:
     """Drop ChromaBackend + chromadb singleton caches so OS mmap handles release.
 
     When ``backend`` is provided, close the live instance so rollback/restore

From 5488e7bb2286e5d5770d68c2719acefe92efba41 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Thu, 7 May 2026 08:56:41 -0300
Subject: [PATCH 58/60] fix(miner): harden Windows mine against ONNX bad_alloc
 + silent partial exits
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three small changes that together address the failure modes in #1296:

1. Add pnpm-lock.yaml and yarn.lock to SKIP_FILENAMES, mirroring the
   existing package-lock.json rule. A 24K-line pnpm-lock.yaml produced
   ~1124 chunks in one batch and tripped onnxruntime bad_alloc on
   Windows; pnpm/yarn lockfiles are no more useful to mine than npm's.

2. Skip any file that produces more than MAX_CHUNKS_PER_FILE (500)
   chunks, with a clear log line. Catches the broader class — generated
   CSV/JSON, build artifacts, etc. — that the named-file SKIP list will
   never fully cover. The cap is conservative (500 chunks * 800 chars ≈
   400 KB of source) so legitimate hand-written content still mines.

3. Print a partial-progress summary on any exception in _mine_impl, not
   just KeyboardInterrupt, then re-raise. Without this, an arbitrary
   exception (ONNX bad_alloc, chromadb HNSW error, OS fault) propagates
   silently — the operator sees only the last progress line and assumes
   the mine succeeded. The new path mirrors the KeyboardInterrupt
   summary (files_processed, drawers_filed, last_file) plus the
   exception type and message, then re-raises so the original traceback
   surfaces and the exit code is non-zero.

Tests cover: SKIP_FILENAMES contents, the chunk-cap path returning
(0, room) with no upserts, and the new mine-aborted summary surfacing
both the partial counters and the exception class.
---
 mempalace/miner.py  | 34 ++++++++++++++++++++
 tests/test_miner.py | 77 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 111 insertions(+)

diff --git a/mempalace/miner.py b/mempalace/miner.py
index 88734c9..6aeddd4 100644
--- a/mempalace/miner.py
+++ b/mempalace/miner.py
@@ -66,6 +66,8 @@ SKIP_FILENAMES = {
     "mempal.yml",
     ".gitignore",
     "package-lock.json",
+    "pnpm-lock.yaml",
+    "yarn.lock",
 }
 
 CHUNK_SIZE = 800  # chars per drawer
@@ -73,6 +75,13 @@ CHUNK_OVERLAP = 100  # overlap between chunks
 MIN_CHUNK_SIZE = 50  # skip tiny chunks
 DRAWER_UPSERT_BATCH_SIZE = 1000
 MAX_FILE_SIZE = 500 * 1024 * 1024  # 500 MB — skip files larger than this.
+# A single file producing more chunks than this is almost always a generated
+# artifact (CSV/JSON dump, lockfile not in SKIP_FILENAMES, etc.). Embedding
+# thousands of chunks from one file in one batch has triggered ONNX runtime
+# `bad allocation` errors on Windows (#1296). The cap is conservative: a
+# 500-chunk file at CHUNK_SIZE=800 is ~400 KB of source, which covers most
+# legitimate hand-written content while bounding the worst-case batch.
+MAX_CHUNKS_PER_FILE = 500
 # Long Claude Code sessions and large transcript exports routinely exceed
 # 10 MB. The cap exists as a defensive rail against pathological binary
 # files, not as a limit on legitimate text. Per-drawer size is bounded
@@ -825,6 +834,13 @@ def process_file(
     room = detect_room(filepath, content, rooms, project_path)
     chunks = chunk_text(content, source_file)
 
+    if len(chunks) > MAX_CHUNKS_PER_FILE:
+        print(
+            f"  ! [skip] {filepath.name[:50]:50} produced {len(chunks)} chunks "
+            f"(> {MAX_CHUNKS_PER_FILE}); add to SKIP_FILENAMES or .gitignore"
+        )
+        return 0, room
+
     if dry_run:
         print(f"    [DRY RUN] {filepath.name} -> room:{room} ({len(chunks)} drawers)")
         return len(chunks), room
@@ -1167,6 +1183,24 @@ def _mine_impl(
             "already-filed drawers are\n  upserted idempotently and will not duplicate.\n"
         )
         sys.exit(130)
+    except Exception as exc:
+        # Without this, an arbitrary exception (ONNX bad_alloc, chromadb HNSW
+        # error, OS fault) propagates and the process exits with no completion
+        # banner — the operator sees only the final progress line and assumes
+        # the mine succeeded (#1296). Print the partial-progress summary the
+        # way we do for KeyboardInterrupt, then re-raise so the original
+        # traceback still surfaces and the exit code is non-zero.
+        print("\n\n  Mine aborted by exception.")
+        print(f"    files_processed: {files_processed}/{len(files)}")
+        print(f"    drawers_filed:   {total_drawers}")
+        print(f"    last_file:       {last_file or '<none>'}")
+        print(f"    error:           {type(exc).__name__}: {exc}")
+        print(
+            f"\n  Re-run `mempalace mine {shlex.quote(project_dir)}` after addressing "
+            "the cause — already-filed\n  drawers are upserted idempotently and will "
+            "not duplicate.\n"
+        )
+        raise
     finally:
         # Clean up the hooks-side PID lock if it points at us. Stale
         # entries already pass _pid_alive() == False on POSIX, but
diff --git a/tests/test_miner.py b/tests/test_miner.py
index 10124ee..10dd33d 100644
--- a/tests/test_miner.py
+++ b/tests/test_miner.py
@@ -699,6 +699,83 @@ def test_mine_keyboard_interrupt_quotes_path_with_spaces_in_resume_hint(tmp_path
     assert f"mempalace mine {shlex.quote(str(project_root))}" in out
 
 
+def test_skip_filenames_includes_lockfiles():
+    """pnpm-lock.yaml and yarn.lock must be skipped alongside package-lock.json
+    so a Windows mine over a typical JS monorepo doesn't OOM the ONNX embedder
+    on a 24K-line lockfile (#1296)."""
+    from mempalace import miner
+
+    assert "package-lock.json" in miner.SKIP_FILENAMES
+    assert "pnpm-lock.yaml" in miner.SKIP_FILENAMES
+    assert "yarn.lock" in miner.SKIP_FILENAMES
+
+
+def test_process_file_skips_when_chunks_exceed_max(tmp_path, monkeypatch):
+    """A file producing more than MAX_CHUNKS_PER_FILE chunks must be skipped
+    with a clear message and zero upserts. Generated artifacts (CSVs, lock
+    files not in SKIP_FILENAMES) hit this — the cap is what prevents ONNX
+    bad_alloc on Windows when the embedder is asked to swallow thousands of
+    chunks in one batch (#1296)."""
+    from unittest.mock import MagicMock
+
+    from mempalace import miner
+
+    monkeypatch.setattr(miner, "MAX_CHUNKS_PER_FILE", 5)
+    over_cap = [{"content": f"chunk {i}", "chunk_index": i} for i in range(7)]
+    monkeypatch.setattr(miner, "chunk_text", lambda content, source_file: over_cap)
+
+    source = tmp_path / "huge.csv"
+    source.write_text("col1,col2\n" + "x,y\n" * 500, encoding="utf-8")
+    col = MagicMock()
+    col.get.return_value = {"ids": []}
+
+    drawers, room = miner.process_file(
+        source,
+        tmp_path,
+        col,
+        "wing",
+        [{"name": "general", "description": "General"}],
+        "agent",
+        False,
+    )
+
+    assert drawers == 0
+    col.upsert.assert_not_called()
+
+
+def test_mine_arbitrary_exception_prints_summary_and_reraises(tmp_path, capsys):
+    """A non-KeyboardInterrupt exception mid-mine must surface a summary
+    banner before propagating, so users don't see a silent exit-0 with no
+    completion message (#1296 Failure 2). Re-raise preserves the traceback
+    and yields a non-zero exit code."""
+    import pytest
+    from unittest.mock import patch
+
+    project_root = tmp_path / "proj"
+    project_root.mkdir()
+    _make_minable_project(project_root, n_files=4)
+    palace_path = project_root / "palace"
+
+    call_count = {"n": 0}
+
+    def fake_process_file(*args, **kwargs):
+        call_count["n"] += 1
+        if call_count["n"] == 2:
+            raise RuntimeError("simulated ONNX bad_alloc")
+        return (1, "general")
+
+    with patch("mempalace.miner.process_file", side_effect=fake_process_file):
+        with pytest.raises(RuntimeError, match="simulated ONNX bad_alloc"):
+            mine(str(project_root), str(palace_path))
+
+    out = capsys.readouterr().out
+    assert "Mine aborted by exception." in out
+    assert "files_processed: 1/" in out
+    assert "drawers_filed:" in out
+    assert "RuntimeError: simulated ONNX bad_alloc" in out
+    assert "upserted idempotently" in out
+
+
 def test_mine_cleans_up_pid_file_on_interrupt(tmp_path):
     """Our own PID entry in mine.pid is removed in the finally clause."""
     import pytest

From ec6d2dde0170387804730d57f13ea17db6c7c5f8 Mon Sep 17 00:00:00 2001
From: Mika Cohen <mjc@kernel.org>
Date: Sat, 2 May 2026 00:16:29 -0600
Subject: [PATCH 59/60] fix: use configured collection in recovery paths

---
 mempalace/cli.py            |  8 +++-
 mempalace/config.py         |  8 ++++
 mempalace/mcp_server.py     | 61 ++++++++++++++++++++----
 mempalace/palace.py         |  7 ++-
 mempalace/repair.py         | 59 ++++++++++++++++--------
 mempalace/searcher.py       | 47 ++++++++++++-------
 tests/test_backends.py      | 23 ++++++++++
 tests/test_cli.py           | 44 ++++++++++++++++++
 tests/test_hnsw_capacity.py |  2 +
 tests/test_mcp_server.py    | 45 ++++++++++++++++--
 tests/test_repair.py        | 92 +++++++++++++++++++++++++++++++++++++
 tests/test_searcher.py      | 26 +++++++++--
 12 files changed, 369 insertions(+), 53 deletions(-)

diff --git a/mempalace/cli.py b/mempalace/cli.py
index 740de96..ac00283 100644
--- a/mempalace/cli.py
+++ b/mempalace/cli.py
@@ -663,8 +663,10 @@ def cmd_repair(args):
         check_extraction_safety,
     )
 
+    config = MempalaceConfig()
+    collection_name = config.collection_name
     palace_path = os.path.abspath(
-        os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path
+        os.path.expanduser(args.palace) if args.palace else config.palace_path
     )
 
     if getattr(args, "mode", "legacy") == "max-seq-id":
@@ -749,7 +751,7 @@ def cmd_repair(args):
 
     # Try to read existing drawers
     try:
-        col = backend.get_collection(palace_path, "mempalace_drawers")
+        col = backend.get_collection(palace_path, collection_name)
         total = col.count()
         print(f"  Drawers found: {total}")
     except Exception as e:
@@ -784,6 +786,7 @@ def cmd_repair(args):
             palace_path,
             len(all_ids),
             confirm_truncation_ok=getattr(args, "confirm_truncation_ok", False),
+            collection_name=collection_name,
         )
     except TruncationDetected as e:
         print(e.message)
@@ -810,6 +813,7 @@ def cmd_repair(args):
             all_docs,
             all_metas,
             batch_size,
+            collection_name=collection_name,
             progress=print,
         )
     except RebuildCollectionError as e:
diff --git a/mempalace/config.py b/mempalace/config.py
index 2252a49..fd32a17 100644
--- a/mempalace/config.py
+++ b/mempalace/config.py
@@ -7,6 +7,7 @@ Priority: env vars > config file (~/.mempalace/config.json) > defaults
 import json
 import os
 import re
+from functools import lru_cache
 from pathlib import Path
 
 
@@ -127,6 +128,13 @@ def sanitize_content(value: str, max_length: int = 100_000) -> str:
 DEFAULT_PALACE_PATH = os.path.expanduser("~/.mempalace/palace")
 DEFAULT_COLLECTION_NAME = "mempalace_drawers"
 
+
+@lru_cache(maxsize=1)
+def get_configured_collection_name() -> str:
+    """Return the configured drawer collection name without repeated config-file reads."""
+    return MempalaceConfig().collection_name
+
+
 DEFAULT_TOPIC_WINGS = [
     "emotions",
     "consciousness",
diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index bbb9c93..521cb07 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -193,7 +193,7 @@ def _refresh_vector_disabled_flag() -> None:
     """
     global _vector_disabled, _vector_disabled_reason, _vector_capacity_status
     try:
-        info = hnsw_capacity_status(_config.palace_path, "mempalace_drawers")
+        info = hnsw_capacity_status(_config.palace_path, _config.collection_name)
     except Exception:
         logger.debug("HNSW capacity probe raised", exc_info=True)
         return
@@ -490,6 +490,7 @@ def _tool_status_via_sqlite() -> dict:
     db_path = os.path.join(_config.palace_path, "chroma.sqlite3")
     if not os.path.isfile(db_path):
         return _no_palace()
+    collection_name = _config.collection_name
 
     wings: dict = {}
     rooms: dict = {}
@@ -503,8 +504,9 @@ def _tool_status_via_sqlite() -> dict:
                 FROM embeddings e
                 JOIN segments s ON e.segment_id = s.id
                 JOIN collections c ON s.collection = c.id
-                WHERE c.name = 'mempalace_drawers'
-                """
+                WHERE c.name = ?
+                """,
+                (collection_name,),
             ).fetchone()
             total = int(row[0]) if row and row[0] is not None else 0
             for key, target in (("wing", wings), ("room", rooms)):
@@ -515,12 +517,12 @@ def _tool_status_via_sqlite() -> dict:
                     JOIN embeddings e ON em.id = e.id
                     JOIN segments s ON e.segment_id = s.id
                     JOIN collections c ON s.collection = c.id
-                    WHERE c.name = 'mempalace_drawers'
+                    WHERE c.name = ?
                       AND em.key = ?
                       AND em.string_value IS NOT NULL
                     GROUP BY em.string_value
                     """,
-                    (key,),
+                    (collection_name, key),
                 ):
                     target[value] = count
         finally:
@@ -720,6 +722,7 @@ def tool_search(
         n_results=limit,
         max_distance=dist,
         vector_disabled=_vector_disabled,
+        collection_name=_config.collection_name,
     )
     if _vector_disabled:
         result["vector_disabled"] = True
@@ -922,8 +925,8 @@ def tool_add_drawer(
 
     # Idempotency: if the deterministic ID already exists, return success as a no-op.
     try:
-        existing = col.get(ids=[drawer_id])
-        if existing and existing["ids"]:
+        existing = col.get(ids=[drawer_id], include=[])
+        if existing.ids:
             return {"success": True, "reason": "already_exists", "drawer_id": drawer_id}
     except Exception:
         logger.debug("Idempotency pre-check failed for %s", drawer_id, exc_info=True)
@@ -943,6 +946,12 @@ def tool_add_drawer(
                 }
             ],
         )
+        inserted = col.get(ids=[drawer_id], include=[])
+        if not inserted.ids:
+            raise RuntimeError(
+                "Drawer write was acknowledged but the new ID is not readable. "
+                "The palace index may be stale; run reconnect or repair."
+            )
         _metadata_cache = None
         logger.info(f"Filed drawer: {drawer_id} → {wing}/{room}")
         return {"success": True, "drawer_id": drawer_id, "wing": wing, "room": room}
@@ -1506,6 +1515,30 @@ def tool_reconnect():
         _palace_db_mtime, \
         _vector_disabled, \
         _vector_disabled_reason
+    from . import palace as palace_module
+
+    close_errors = []
+    try:
+        palace_module._DEFAULT_BACKEND.close_palace(_config.palace_path)
+    except Exception as exc:
+        logger.debug("Failed to close shared palace backend during reconnect", exc_info=True)
+        close_errors.append(f"backend close_palace failed: {exc}")
+    try:
+        from chromadb.api.client import SharedSystemClient
+
+        clear_system_cache = getattr(SharedSystemClient, "clear_system_cache", None)
+        if callable(clear_system_cache):
+            clear_system_cache()
+        else:
+            logger.debug(
+                "SharedSystemClient.clear_system_cache is unavailable; skipping shared Chroma cache clear during reconnect"
+            )
+    except Exception as exc:
+        logger.debug(
+            "Failed to clear Chroma shared system cache during reconnect",
+            exc_info=True,
+        )
+        close_errors.append(f"shared Chroma cache clear failed: {exc}")
     _client_cache = None
     _collection_cache = None
     _palace_db_inode = 0
@@ -1527,12 +1560,24 @@ def tool_reconnect():
     try:
         col = _get_collection()
         if col is None:
-            return {
+            result = {
                 "success": False,
                 "message": "No palace found after reconnect",
                 "drawers": 0,
                 "vector_disabled": _vector_disabled,
             }
+            if close_errors:
+                result["error"] = "; ".join(close_errors)
+            return result
+        if close_errors:
+            return {
+                "success": False,
+                "message": "Reconnect reopened the palace but failed to fully reset cached handles",
+                "drawers": col.count(),
+                "vector_disabled": _vector_disabled,
+                "vector_disabled_reason": _vector_disabled_reason,
+                "error": "; ".join(close_errors),
+            }
         return {
             "success": True,
             "message": "Reconnected to palace",
diff --git a/mempalace/palace.py b/mempalace/palace.py
index e5f6411..dee5c8f 100644
--- a/mempalace/palace.py
+++ b/mempalace/palace.py
@@ -10,6 +10,7 @@ import logging
 import os
 import re
 import threading
+from typing import Optional
 
 from .backends.chroma import ChromaBackend
 
@@ -56,10 +57,14 @@ NORMALIZE_VERSION = 2
 
 def get_collection(
     palace_path: str,
-    collection_name: str = "mempalace_drawers",
+    collection_name: Optional[str] = None,
     create: bool = True,
 ):
     """Get the palace collection through the backend layer."""
+    if collection_name is None:
+        from .config import get_configured_collection_name
+
+        collection_name = get_configured_collection_name()
     return _DEFAULT_BACKEND.get_collection(
         palace_path,
         collection_name=collection_name,
diff --git a/mempalace/repair.py b/mempalace/repair.py
index 34d165c..b47bcd6 100644
--- a/mempalace/repair.py
+++ b/mempalace/repair.py
@@ -181,10 +181,12 @@ def _rebuild_collection_via_temp(
     all_docs,
     all_metas,
     batch_size: int,
+    collection_name: Optional[str] = None,
     progress=print,
 ) -> int:
     expected = len(all_ids)
-    temp_name = REPAIR_TEMP_COLLECTION
+    collection_name = collection_name or _drawers_collection_name()
+    temp_name = f"{collection_name}__repair_tmp"
     live_replaced = False
 
     try:
@@ -203,9 +205,9 @@ def _rebuild_collection_via_temp(
         _verify_collection_count(temp_col, expected, "temporary rebuild")
 
         progress("  Rebuilding live collection...")
-        backend.delete_collection(palace_path, COLLECTION_NAME)
+        backend.delete_collection(palace_path, collection_name)
         live_replaced = True
-        new_col = backend.create_collection(palace_path, COLLECTION_NAME)
+        new_col = backend.create_collection(palace_path, collection_name)
 
         rebuilt = 0
         for i in range(0, expected, batch_size):
@@ -230,7 +232,7 @@ def _rebuild_collection_via_temp(
         raise RebuildCollectionError(str(exc), live_replaced=live_replaced) from exc
 
 
-def scan_palace(palace_path=None, only_wing=None):
+def scan_palace(palace_path=None, only_wing=None, collection_name: Optional[str] = None):
     """Scan the palace for corrupt/unfetchable IDs.
 
     Probes in batches of 100, falls back to per-ID on failure.
@@ -239,14 +241,15 @@ def scan_palace(palace_path=None, only_wing=None):
     Returns (good_set, bad_set).
     """
     palace_path = palace_path or _get_palace_path()
+    collection_name = collection_name or _drawers_collection_name()
     print(f"\n  Palace: {palace_path}")
     print("  Loading...")
 
-    col = ChromaBackend().get_collection(palace_path, COLLECTION_NAME)
+    col = ChromaBackend().get_collection(palace_path, collection_name)
 
     where = {"wing": only_wing} if only_wing else None
     total = col.count()
-    print(f"  Collection: {COLLECTION_NAME}, total: {total:,}")
+    print(f"  Collection: {collection_name}, total: {total:,}")
     if only_wing:
         print(f"  Scanning wing: {only_wing}")
 
@@ -307,9 +310,10 @@ def scan_palace(palace_path=None, only_wing=None):
     return good_set, bad_set
 
 
-def prune_corrupt(palace_path=None, confirm=False):
+def prune_corrupt(palace_path=None, confirm=False, collection_name: Optional[str] = None):
     """Delete corrupt IDs listed in corrupt_ids.txt."""
     palace_path = palace_path or _get_palace_path()
+    collection_name = collection_name or _drawers_collection_name()
     bad_file = os.path.join(palace_path, "corrupt_ids.txt")
 
     if not os.path.exists(bad_file):
@@ -325,7 +329,7 @@ def prune_corrupt(palace_path=None, confirm=False):
         print("  Re-run with --confirm to actually delete.")
         return
 
-    col = ChromaBackend().get_collection(palace_path, COLLECTION_NAME)
+    col = ChromaBackend().get_collection(palace_path, collection_name)
     before = col.count()
     print(f"  Collection size before: {before:,}")
 
@@ -379,7 +383,10 @@ class TruncationDetected(Exception):
 
 
 def check_extraction_safety(
-    palace_path: str, extracted: int, confirm_truncation_ok: bool = False
+    palace_path: str,
+    extracted: int,
+    confirm_truncation_ok: bool = False,
+    collection_name: Optional[str] = None,
 ) -> None:
     """Cross-check that ``extracted`` matches the SQLite ground truth.
 
@@ -401,7 +408,8 @@ def check_extraction_safety(
     if confirm_truncation_ok:
         return
 
-    sqlite_count = sqlite_drawer_count(palace_path)
+    collection_name = collection_name or _drawers_collection_name()
+    sqlite_count = sqlite_drawer_count(palace_path, collection_name)
     cap_signal = extracted == CHROMADB_DEFAULT_GET_LIMIT
 
     if sqlite_count is not None and sqlite_count > extracted:
@@ -437,7 +445,7 @@ def check_extraction_safety(
         raise TruncationDetected(message, sqlite_count, extracted)
 
 
-def sqlite_drawer_count(palace_path: str) -> "int | None":
+def sqlite_drawer_count(palace_path: str, collection_name: Optional[str] = None) -> "int | None":
     """Count rows in ``chroma.sqlite3.embeddings`` for the drawers collection.
 
     Used as an independent ground-truth check against the chromadb
@@ -449,6 +457,7 @@ def sqlite_drawer_count(palace_path: str) -> "int | None":
     drift, missing tables, locked file). Callers treat ``None`` as
     "unknown" and fall back to the cap-detection check.
     """
+    collection_name = collection_name or _drawers_collection_name()
     sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
     if not os.path.exists(sqlite_path):
         return None
@@ -465,7 +474,7 @@ def sqlite_drawer_count(palace_path: str) -> "int | None":
                 JOIN collections c ON s.collection = c.id
                 WHERE c.name = ?
                 """,
-                (COLLECTION_NAME,),
+                (collection_name,),
             ).fetchone()
             return int(row[0]) if row and row[0] is not None else None
         finally:
@@ -477,7 +486,11 @@ def sqlite_drawer_count(palace_path: str) -> "int | None":
         return None
 
 
-def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
+def rebuild_index(
+    palace_path=None,
+    confirm_truncation_ok: bool = False,
+    collection_name: Optional[str] = None,
+):
     """Rebuild the HNSW index from scratch.
 
     1. Extract all drawers via ChromaDB get()
@@ -492,6 +505,7 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
     (typically only a concern for palaces sized at exactly 10 000 rows).
     """
     palace_path = palace_path or _get_palace_path()
+    collection_name = collection_name or _drawers_collection_name()
 
     if not os.path.isdir(palace_path):
         print(f"\n  No palace found at {palace_path}")
@@ -504,7 +518,7 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
 
     backend = ChromaBackend()
     try:
-        col = backend.get_collection(palace_path, COLLECTION_NAME)
+        col = backend.get_collection(palace_path, collection_name)
         total = col.count()
     except Exception as e:
         print(f"  Error reading palace: {e}")
@@ -528,7 +542,12 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
     # short of the SQLite ground truth (or when extraction == chromadb
     # default get() cap and the SQLite check couldn't run).
     try:
-        check_extraction_safety(palace_path, len(all_ids), confirm_truncation_ok)
+        check_extraction_safety(
+            palace_path,
+            len(all_ids),
+            confirm_truncation_ok,
+            collection_name=collection_name,
+        )
     except TruncationDetected as e:
         print(e.message)
         return
@@ -551,6 +570,7 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
             all_docs,
             all_metas,
             batch_size,
+            collection_name=collection_name,
             progress=print,
         )
     except RebuildCollectionError as e:
@@ -560,7 +580,7 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
             print(f"  Restoring from backup: {backup_path}")
             try:
                 _close_chroma_handles(palace_path, backend=backend)
-                _delete_collection_if_exists(backend, palace_path, COLLECTION_NAME)
+                _delete_collection_if_exists(backend, palace_path, collection_name)
                 shutil.copy2(backup_path, sqlite_path)
                 print("  Backup restored. Palace is back to pre-repair state.")
             except Exception as restore_error:
@@ -950,7 +970,7 @@ def rebuild_from_sqlite(
         backend.close()
 
 
-def status(palace_path=None) -> dict:
+def status(palace_path=None, collection_name: Optional[str] = None) -> dict:
     """Read-only health check: compare sqlite vs HNSW element counts.
 
     Catches the #1222 failure mode where chromadb's HNSW segment freezes
@@ -968,6 +988,7 @@ def status(palace_path=None) -> dict:
     ``status="unknown"`` when no palace exists at the given path.
     """
     palace_path = palace_path or _get_palace_path()
+    collection_name = collection_name or _drawers_collection_name()
     print(f"\n{'=' * 55}")
     print("  MemPalace Repair — Status")
     print(f"{'=' * 55}\n")
@@ -977,8 +998,8 @@ def status(palace_path=None) -> dict:
         print("  No palace found.\n")
         return {"status": "unknown", "message": "no palace at path"}
 
-    drawers = hnsw_capacity_status(palace_path, "mempalace_drawers")
-    closets = hnsw_capacity_status(palace_path, "mempalace_closets")
+    drawers = hnsw_capacity_status(palace_path, collection_name)
+    closets = hnsw_capacity_status(palace_path, CLOSETS_COLLECTION_NAME)
 
     for label, info in (("drawers", drawers), ("closets", closets)):
         print(f"\n  [{label}]")
diff --git a/mempalace/searcher.py b/mempalace/searcher.py
index b318d99..f644fda 100644
--- a/mempalace/searcher.py
+++ b/mempalace/searcher.py
@@ -382,6 +382,7 @@ def _bm25_only_via_sqlite(
     n_results: int = 5,
     max_candidates: int = 500,
     _include_internal: bool = False,
+    collection_name: str = None,
 ) -> dict:
     """BM25-only search reading drawers directly from chroma.sqlite3.
 
@@ -405,6 +406,10 @@ def _bm25_only_via_sqlite(
             "error": "No palace found",
             "hint": "Run: mempalace init <dir> && mempalace mine <dir>",
         }
+    if collection_name is None:
+        from .config import get_configured_collection_name
+
+        collection_name = get_configured_collection_name()
 
     def _metadata_filter_sql(row_id_expr: str) -> tuple[str, list[str]]:
         clauses = []
@@ -441,35 +446,43 @@ def _bm25_only_via_sqlite(
         # shorter than 3 chars (trigram tokenizer can't match them).
         tokens = [t for t in _tokenize(query) if len(t) >= 3]
         candidate_ids: list[int] = []
+        use_recency_fallback = not tokens
         if tokens:
             fts_query = " OR ".join(tokens)
             filter_sql, filter_params = _metadata_filter_sql("embedding_fulltext_search.rowid")
             try:
                 rows = conn.execute(
                     f"""
-                    SELECT rowid
+                    SELECT embedding_fulltext_search.rowid
                     FROM embedding_fulltext_search
+                    JOIN embeddings e ON e.id = embedding_fulltext_search.rowid
+                    JOIN segments s ON e.segment_id = s.id
+                    JOIN collections c ON s.collection = c.id
                     WHERE embedding_fulltext_search MATCH ?
+                      AND c.name = ?
                     {filter_sql}
                     LIMIT ?
                     """,
-                    (fts_query, *filter_params, max_candidates),
+                    (fts_query, collection_name, *filter_params, max_candidates),
                 ).fetchall()
                 candidate_ids = [r[0] for r in rows]
             except sqlite3.Error:
                 # FTS5 tokenizer mismatch or syntax error — fall through
                 # to the recency-window selector below.
                 logger.debug("FTS5 MATCH failed; using recency fallback", exc_info=True)
+                use_recency_fallback = True
 
-        if not candidate_ids:
-            # No FTS hits (or no usable tokens) — pull the most recent
-            # rows for the drawers segment so we can BM25-rank something
-            # rather than return empty-handed. Wrapped in try/except
-            # because the schema may differ on legacy palaces (older
-            # chromadb without ``created_at``, missing ``segments``
-            # rows after partial restore, etc.); on schema mismatch we
-            # fall back to ordering by primary-key id and finally to an
-            # empty result rather than letting search raise.
+        if not candidate_ids and use_recency_fallback:
+            # No usable FTS tokens, or FTS itself failed — pull the most
+            # recent rows for the drawers segment so we can BM25-rank
+            # something rather than return empty-handed. A clean FTS miss
+            # must stay empty, especially after wing/room filtering, because
+            # recency fallback would return unrelated scoped drawers.
+            # Wrapped in try/except because the schema may differ on legacy
+            # palaces (older chromadb without ``created_at``, missing
+            # ``segments`` rows after partial restore, etc.); on schema
+            # mismatch we fall back to ordering by primary-key id and finally
+            # to an empty result rather than letting search raise.
             try:
                 filter_sql, filter_params = _metadata_filter_sql("e.id")
                 rows = conn.execute(
@@ -478,12 +491,12 @@ def _bm25_only_via_sqlite(
                     FROM embeddings e
                     JOIN segments s ON e.segment_id = s.id
                     JOIN collections c ON s.collection = c.id
-                    WHERE c.name = 'mempalace_drawers'
+                    WHERE c.name = ?
                     {filter_sql}
                     ORDER BY e.created_at DESC
                     LIMIT ?
                     """,
-                    (*filter_params, max_candidates),
+                    (collection_name, *filter_params, max_candidates),
                 ).fetchall()
                 candidate_ids = [r[0] for r in rows]
             except sqlite3.Error:
@@ -499,12 +512,12 @@ def _bm25_only_via_sqlite(
                         FROM embeddings e
                         JOIN segments s ON e.segment_id = s.id
                         JOIN collections c ON s.collection = c.id
-                        WHERE c.name = 'mempalace_drawers'
+                        WHERE c.name = ?
                         {filter_sql}
                         ORDER BY e.id DESC
                         LIMIT ?
                         """,
-                        (*filter_params, max_candidates),
+                        (collection_name, *filter_params, max_candidates),
                     ).fetchall()
                     candidate_ids = [r[0] for r in rows]
                 except sqlite3.Error:
@@ -720,6 +733,7 @@ def search_memories(
     max_distance: float = 0.0,
     vector_disabled: bool = False,
     candidate_strategy: str = "vector",
+    collection_name: str = None,
 ) -> dict:
     """Programmatic search — returns a dict instead of printing.
 
@@ -770,10 +784,11 @@ def search_memories(
             wing=wing,
             room=room,
             n_results=n_results,
+            collection_name=collection_name,
         )
 
     try:
-        drawers_col = get_collection(palace_path, create=False)
+        drawers_col = get_collection(palace_path, collection_name=collection_name, create=False)
     except Exception as e:
         logger.error("No palace found at %s: %s", palace_path, e)
         return {
diff --git a/tests/test_backends.py b/tests/test_backends.py
index 4cd9480..06625fc 100644
--- a/tests/test_backends.py
+++ b/tests/test_backends.py
@@ -1194,3 +1194,26 @@ def test_chroma_backend_requarantines_after_inode_replacement(tmp_path, monkeypa
         ("invalid", str(palace)),
         ("stale", str(palace)),
     ]
+
+
+def test_palace_get_collection_uses_configured_collection_name(monkeypatch):
+    from mempalace import palace
+
+    captured = {}
+
+    def fake_get_collection(palace_path, collection_name=None, create=False):
+        captured["palace_path"] = palace_path
+        captured["collection_name"] = collection_name
+        captured["create"] = create
+        return object()
+
+    monkeypatch.setattr(palace._DEFAULT_BACKEND, "get_collection", fake_get_collection)
+    monkeypatch.setattr("mempalace.config.get_configured_collection_name", lambda: "custom_drawers")
+
+    palace.get_collection("/palace", create=False)
+
+    assert captured == {
+        "palace_path": "/palace",
+        "collection_name": "custom_drawers",
+        "create": False,
+    }
diff --git a/tests/test_cli.py b/tests/test_cli.py
index de00664..0b61b0c 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -776,6 +776,7 @@ def test_cmd_repair_error_reading(mock_config_cls, tmp_path, capsys):
     palace_dir.mkdir()
     (palace_dir / "chroma.sqlite3").write_text("db")
     mock_config_cls.return_value.palace_path = str(palace_dir)
+    mock_config_cls.return_value.collection_name = "mempalace_drawers"
     args = argparse.Namespace(palace=None)
     mock_backend = MagicMock()
     mock_backend.get_collection.side_effect = Exception("corrupt db")
@@ -791,6 +792,7 @@ def test_cmd_repair_zero_drawers(mock_config_cls, tmp_path, capsys):
     palace_dir.mkdir()
     (palace_dir / "chroma.sqlite3").write_text("db")
     mock_config_cls.return_value.palace_path = str(palace_dir)
+    mock_config_cls.return_value.collection_name = "mempalace_drawers"
     args = argparse.Namespace(palace=None)
     mock_col = MagicMock()
     mock_col.count.return_value = 0
@@ -807,6 +809,7 @@ def test_cmd_repair_success(mock_config_cls, tmp_path, capsys):
     palace_dir.mkdir()
     (palace_dir / "chroma.sqlite3").write_text("db")
     mock_config_cls.return_value.palace_path = str(palace_dir)
+    mock_config_cls.return_value.collection_name = "mempalace_drawers"
     args = argparse.Namespace(palace=None, yes=True)
     mock_col = MagicMock()
     mock_col.count.return_value = 2
@@ -836,12 +839,52 @@ def test_cmd_repair_success(mock_config_cls, tmp_path, capsys):
     mock_new_col.add.assert_not_called()
 
 
+@patch("mempalace.cli.MempalaceConfig")
+def test_cmd_repair_uses_configured_collection(mock_config_cls, tmp_path, capsys):
+    palace_dir = tmp_path / "palace"
+    palace_dir.mkdir()
+    (palace_dir / "chroma.sqlite3").write_text("db")
+    mock_config_cls.return_value.palace_path = str(palace_dir)
+    mock_config_cls.return_value.collection_name = "custom_drawers"
+    args = argparse.Namespace(palace=None, yes=True)
+    mock_col = MagicMock()
+    mock_col.count.return_value = 2
+    mock_col.get.return_value = {
+        "ids": ["id1", "id2"],
+        "documents": ["doc1", "doc2"],
+        "metadatas": [{"wing": "a"}, {"wing": "b"}],
+    }
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 2
+    mock_new_col = MagicMock()
+    mock_new_col.count.return_value = 2
+    mock_backend = _mock_backend_for(col=mock_col, new_col=mock_new_col)
+    mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
+
+    with patch("mempalace.backends.chroma.ChromaBackend", return_value=mock_backend):
+        cmd_repair(args)
+
+    out = capsys.readouterr().out
+    assert "Repair complete" in out
+    mock_backend.get_collection.assert_called_once_with(str(palace_dir), "custom_drawers")
+    assert mock_backend.create_collection.call_args_list == [
+        call(str(palace_dir), "custom_drawers__repair_tmp"),
+        call(str(palace_dir), "custom_drawers"),
+    ]
+    assert mock_backend.delete_collection.call_args_list == [
+        call(str(palace_dir), "custom_drawers__repair_tmp"),
+        call(str(palace_dir), "custom_drawers"),
+        call(str(palace_dir), "custom_drawers__repair_tmp"),
+    ]
+
+
 @patch("mempalace.cli.MempalaceConfig")
 def test_cmd_repair_restores_backup_on_live_rebuild_failure(mock_config_cls, tmp_path, capsys):
     palace_dir = tmp_path / "palace"
     palace_dir.mkdir()
     (palace_dir / "chroma.sqlite3").write_text("db")
     mock_config_cls.return_value.palace_path = str(palace_dir)
+    mock_config_cls.return_value.collection_name = "mempalace_drawers"
     args = argparse.Namespace(palace=None, yes=True)
     mock_col = MagicMock()
     mock_col.count.return_value = 2
@@ -875,6 +918,7 @@ def test_cmd_repair_aborts_without_confirmation(mock_config_cls, tmp_path, capsy
     palace_dir.mkdir()
     (palace_dir / "chroma.sqlite3").write_text("db")
     mock_config_cls.return_value.palace_path = str(palace_dir)
+    mock_config_cls.return_value.collection_name = "mempalace_drawers"
     args = argparse.Namespace(palace=None)
     mock_col = MagicMock()
     mock_col.count.return_value = 1
diff --git a/tests/test_hnsw_capacity.py b/tests/test_hnsw_capacity.py
index 912def8..53775b0 100644
--- a/tests/test_hnsw_capacity.py
+++ b/tests/test_hnsw_capacity.py
@@ -260,6 +260,7 @@ def test_mcp_probe_does_not_disable_vectors_for_unflushed_metadata(tmp_path, mon
 
     class _Cfg:
         palace_path = str(tmp_path)
+        collection_name = "mempalace_drawers"
 
     monkeypatch.setattr(mcp_server, "_config", _Cfg())
     monkeypatch.setattr(mcp_server, "_vector_disabled", True)
@@ -625,6 +626,7 @@ def test_tool_status_via_sqlite_returns_breakdown(palace_with_drawers, monkeypat
     # MempalaceConfig.
     class _Cfg:
         palace_path = str(palace_with_drawers)
+        collection_name = "mempalace_drawers"
 
     monkeypatch.setattr(mcp_server, "_config", _Cfg())
     monkeypatch.setattr(mcp_server, "_vector_disabled", True)
diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index ae20bf3..613cc23 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -484,6 +484,26 @@ class TestWriteTools:
         assert result2["success"] is True
         assert result2["reason"] == "already_exists"
 
+    def test_add_drawer_fails_when_readback_misses(self, monkeypatch, config, kg):
+        _patch_mcp_server(monkeypatch, config, kg)
+        from mempalace import mcp_server
+
+        class _FakeGetResult:
+            ids = []
+
+        class _FakeCol:
+            def get(self, **kwargs):
+                return _FakeGetResult()
+
+            def upsert(self, **kwargs):
+                return None
+
+        monkeypatch.setattr(mcp_server, "_get_collection", lambda create=False: _FakeCol())
+
+        result = mcp_server.tool_add_drawer("w", "r", "content")
+        assert result["success"] is False
+        assert "not readable" in result["error"]
+
     def test_add_drawer_shared_header_no_collision(self, monkeypatch, config, palace_path, kg):
         """Documents sharing a >100-char header must get distinct IDs (full-content hash)."""
         _patch_mcp_server(monkeypatch, config, kg)
@@ -503,9 +523,9 @@ class TestWriteTools:
 
         assert result1["success"] is True
         assert result2["success"] is True
-        assert (
-            result1["drawer_id"] != result2["drawer_id"]
-        ), "Documents with shared header but different content must have distinct drawer IDs"
+        assert result1["drawer_id"] != result2["drawer_id"], (
+            "Documents with shared header but different content must have distinct drawer IDs"
+        )
 
     def test_delete_drawer(self, monkeypatch, config, palace_path, seeded_collection, kg):
         _patch_mcp_server(monkeypatch, config, kg)
@@ -1158,6 +1178,25 @@ class TestCacheInvalidation:
         assert "Reconnected" in result["message"]
         assert isinstance(result["drawers"], int)
 
+    def test_reconnect_closes_shared_backend(self, monkeypatch, config, kg):
+        _patch_mcp_server(monkeypatch, config, kg)
+        from unittest.mock import MagicMock
+
+        from mempalace import mcp_server, palace
+
+        close_palace = MagicMock()
+        monkeypatch.setattr(palace._DEFAULT_BACKEND, "close_palace", close_palace)
+
+        class _FakeCol:
+            def count(self):
+                return 7
+
+        monkeypatch.setattr(mcp_server, "_get_collection", lambda create=False: _FakeCol())
+
+        result = mcp_server.tool_reconnect()
+        assert result["success"] is True
+        close_palace.assert_called_once_with(config.palace_path)
+
     def test_get_collection_create_true_avoids_get_or_create_on_reopen(
         self, monkeypatch, config, palace_path, kg
     ):
diff --git a/tests/test_repair.py b/tests/test_repair.py
index a60836a..8e9f95b 100644
--- a/tests/test_repair.py
+++ b/tests/test_repair.py
@@ -28,6 +28,16 @@ def test_get_palace_path_fallback():
         assert ".mempalace" in result
 
 
+def test_get_collection_name_from_config():
+    from mempalace.config import get_configured_collection_name
+
+    get_configured_collection_name.cache_clear()
+    with patch("mempalace.config.MempalaceConfig") as mock_config_cls:
+        mock_config_cls.return_value.collection_name = "custom_drawers"
+        assert repair._drawers_collection_name() == "custom_drawers"
+    get_configured_collection_name.cache_clear()
+
+
 # ── _paginate_ids ─────────────────────────────────────────────────────
 
 
@@ -330,6 +340,21 @@ def test_check_extraction_safety_passes_when_counts_match(tmp_path):
         repair.check_extraction_safety(str(tmp_path), 500)
 
 
+def test_check_extraction_safety_uses_configured_collection(tmp_path):
+    with patch("mempalace.repair.sqlite_drawer_count", return_value=500) as count:
+        repair.check_extraction_safety(str(tmp_path), 500, collection_name="custom_drawers")
+    count.assert_called_once_with(str(tmp_path), "custom_drawers")
+
+
+def test_check_extraction_safety_default_uses_configured_collection(tmp_path):
+    with (
+        patch("mempalace.repair._drawers_collection_name", return_value="custom_drawers"),
+        patch("mempalace.repair.sqlite_drawer_count", return_value=500) as count,
+    ):
+        repair.check_extraction_safety(str(tmp_path), 500)
+    count.assert_called_once_with(str(tmp_path), "custom_drawers")
+
+
 def test_check_extraction_safety_passes_when_sqlite_unreadable_and_under_cap(tmp_path):
     """SQLite check fails (None) but extraction is well under the cap → safe."""
     with patch("mempalace.repair.sqlite_drawer_count", return_value=None):
@@ -384,6 +409,73 @@ def test_sqlite_drawer_count_returns_none_on_unreadable_schema(tmp_path):
     assert repair.sqlite_drawer_count(str(tmp_path)) is None
 
 
+@patch("mempalace.repair.shutil")
+@patch("mempalace.repair.ChromaBackend")
+def test_rebuild_index_default_uses_configured_collection(mock_backend_cls, mock_shutil, tmp_path):
+    sqlite_path = tmp_path / "chroma.sqlite3"
+    sqlite_path.write_text("fake")
+    mock_col = MagicMock()
+    mock_col.count.return_value = 2
+    mock_col.get.return_value = {
+        "ids": ["id1", "id2"],
+        "documents": ["doc1", "doc2"],
+        "metadatas": [{"wing": "a"}, {"wing": "b"}],
+    }
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 2
+    mock_new_col = MagicMock()
+    mock_new_col.count.return_value = 2
+    mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
+    mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
+
+    with (
+        patch("mempalace.repair._drawers_collection_name", return_value="custom_drawers"),
+        patch("mempalace.repair.sqlite_drawer_count", return_value=2) as count,
+    ):
+        repair.rebuild_index(palace_path=str(tmp_path))
+
+    mock_backend.get_collection.assert_called_once_with(str(tmp_path), "custom_drawers")
+    count.assert_called_once_with(str(tmp_path), "custom_drawers")
+    assert mock_backend.create_collection.call_args_list == [
+        call(str(tmp_path), "custom_drawers__repair_tmp"),
+        call(str(tmp_path), "custom_drawers"),
+    ]
+    assert mock_backend.delete_collection.call_args_list == [
+        call(str(tmp_path), "custom_drawers__repair_tmp"),
+        call(str(tmp_path), "custom_drawers"),
+        call(str(tmp_path), "custom_drawers__repair_tmp"),
+    ]
+
+
+def test_status_default_uses_configured_drawer_collection(tmp_path):
+    with (
+        patch("mempalace.repair._drawers_collection_name", return_value="custom_drawers"),
+        patch("mempalace.repair.hnsw_capacity_status") as capacity_status,
+    ):
+        capacity_status.side_effect = [
+            {
+                "sqlite_count": 1,
+                "hnsw_count": 1,
+                "divergence": 0,
+                "diverged": False,
+                "status": "ok",
+                "message": "",
+            },
+            {
+                "sqlite_count": 0,
+                "hnsw_count": 0,
+                "divergence": 0,
+                "diverged": False,
+                "status": "ok",
+                "message": "",
+            },
+        ]
+        repair.status(palace_path=str(tmp_path))
+
+    assert capacity_status.call_args_list[0].args == (str(tmp_path), "custom_drawers")
+    assert capacity_status.call_args_list[1].args == (str(tmp_path), "mempalace_closets")
+
+
 @patch("mempalace.repair.shutil")
 @patch("mempalace.repair.ChromaBackend")
 def test_rebuild_index_aborts_on_truncation_signal(mock_backend_cls, mock_shutil, tmp_path):
diff --git a/tests/test_searcher.py b/tests/test_searcher.py
index 4f0b4c0..60bef9f 100644
--- a/tests/test_searcher.py
+++ b/tests/test_searcher.py
@@ -84,6 +84,24 @@ class TestSearchMemories:
         assert "error" in result
         assert "query failed" in result["error"]
 
+    def test_search_memories_vector_path_uses_explicit_collection_name(self):
+        mock_col = MagicMock()
+        mock_col.query.return_value = {
+            "documents": [[]],
+            "metadatas": [[]],
+            "distances": [[]],
+            "ids": [[]],
+        }
+
+        with patch("mempalace.searcher.get_collection", return_value=mock_col) as get_collection:
+            search_memories("test", "/fake/path", collection_name="custom_drawers")
+
+        get_collection.assert_called_once_with(
+            "/fake/path",
+            collection_name="custom_drawers",
+            create=False,
+        )
+
     def test_search_memories_filters_in_result(self, palace_path, seeded_collection):
         result = search_memories("test", palace_path, wing="project", room="backend")
         assert result["filters"]["wing"] == "project"
@@ -102,7 +120,7 @@ class TestSearchMemories:
             "ids": [["d1", "d2"]],
         }
 
-        def mock_get_collection(path, create=False):
+        def mock_get_collection(path, collection_name=None, create=False):
             # First call: drawers. Second call: closets — raise so hybrid
             # degrades to pure drawer search (the catch block covers it).
             if not hasattr(mock_get_collection, "_called"):
@@ -309,9 +327,9 @@ class TestSearchCLI:
         captured = capsys.readouterr()
         first_block, _, _ = captured.out.partition("[2]")
         # Lexical match must rank first
-        assert (
-            "b.md" in first_block
-        ), f"expected lexical match 'b.md' at rank 1, got:\n{captured.out}"
+        assert "b.md" in first_block, (
+            f"expected lexical match 'b.md' at rank 1, got:\n{captured.out}"
+        )
         # Non-zero bm25 reported
         assert "bm25=" in first_block
         assert "bm25=0.0" not in first_block

From e9aee194335a446787f2e2a007ff28e11269f385 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Thu, 7 May 2026 09:10:22 -0300
Subject: [PATCH 60/60] fix(tests): apply ruff format after rebase resolution

The collection_name plumbing rebase produced a few unformatted blocks
in test_mcp_server.py and test_searcher.py; bringing them in line with
the 0.4.x CI pin so test-windows / lint stay green.
---
 tests/test_mcp_server.py | 6 +++---
 tests/test_searcher.py   | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index 613cc23..1f47192 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -523,9 +523,9 @@ class TestWriteTools:
 
         assert result1["success"] is True
         assert result2["success"] is True
-        assert result1["drawer_id"] != result2["drawer_id"], (
-            "Documents with shared header but different content must have distinct drawer IDs"
-        )
+        assert (
+            result1["drawer_id"] != result2["drawer_id"]
+        ), "Documents with shared header but different content must have distinct drawer IDs"
 
     def test_delete_drawer(self, monkeypatch, config, palace_path, seeded_collection, kg):
         _patch_mcp_server(monkeypatch, config, kg)
diff --git a/tests/test_searcher.py b/tests/test_searcher.py
index 60bef9f..f4d46a0 100644
--- a/tests/test_searcher.py
+++ b/tests/test_searcher.py
@@ -327,9 +327,9 @@ class TestSearchCLI:
         captured = capsys.readouterr()
         first_block, _, _ = captured.out.partition("[2]")
         # Lexical match must rank first
-        assert "b.md" in first_block, (
-            f"expected lexical match 'b.md' at rank 1, got:\n{captured.out}"
-        )
+        assert (
+            "b.md" in first_block
+        ), f"expected lexical match 'b.md' at rank 1, got:\n{captured.out}"
         # Non-zero bm25 reported
         assert "bm25=" in first_block
         assert "bm25=0.0" not in first_block