merge: develop into fix/1274-missing-hnsw-metadata-gate

Two conflicts, both because #1339 (bloated link payloads) merged into develop after this branch was authored: - mempalace/backends/chroma.py: _segment_appears_healthy now stacks three checks — bloated-link from #1339 (top), missing-metadata-with- data-floor from this branch (middle), pickle format sniff (bottom). All three are complementary; #1339 catches structural payload corruption, this branch catches pickle truncation, the original catches pickle protocol-byte corruption. - tests/test_backends.py: kept both new imports (_segment_appears_healthy from this branch, quarantine_invalid_hnsw_metadata from #1285). Local: 1618 tests pass, ruff lint+format clean against 0.4.x CI pin.
2026-05-07 10:35:19 -03:00
parent eff844b168 88a2ebb85a
commit a1e19081d9
49 changed files with 5338 additions and 363 deletions
@@ -40,8 +40,9 @@ def _patch_mcp_config(monkeypatch, palace_path, tmp_path):

    import mempalace.mcp_server as mcp_mod

+    kg = KnowledgeGraph(db_path=str(tmp_path / "kg.sqlite3"))
    monkeypatch.setattr(mcp_mod, "_config", cfg)
-    monkeypatch.setattr(mcp_mod, "_kg", KnowledgeGraph(db_path=str(tmp_path / "kg.sqlite3")))
+    monkeypatch.setattr(mcp_mod, "_get_kg", lambda: kg)


 def _get_rss_mb():
@@ -84,8 +84,9 @@ class TestToolStatusMemoryProfile:

        cfg = MempalaceConfig(config_dir=str(tmp_path / "cfg"))
        monkeypatch.setattr(cfg, "_file_config", {"palace_path": palace_path})
+        kg = KnowledgeGraph(db_path=str(tmp_path / "kg.sqlite3"))
        monkeypatch.setattr(mcp_mod, "_config", cfg)
-        monkeypatch.setattr(mcp_mod, "_kg", KnowledgeGraph(db_path=str(tmp_path / "kg.sqlite3")))
+        monkeypatch.setattr(mcp_mod, "_get_kg", lambda: kg)

        from mempalace.mcp_server import tool_status

@@ -1,4 +1,6 @@
 import os
+import pickle
+import shutil
 import sqlite3
 from pathlib import Path

@@ -20,6 +22,7 @@ from mempalace.backends.chroma import (
    _fix_blob_seq_ids,
    _pin_hnsw_threads,
    _segment_appears_healthy,
+    quarantine_invalid_hnsw_metadata,
    quarantine_stale_hnsw,
 )

@@ -208,6 +211,52 @@ def test_query_empty_preserves_embeddings_outer_shape_when_requested():
    assert not_requested.embeddings is None


+def test_chroma_close_palace_releases_sqlite_lock_for_reopen(tmp_path):
+    """close_palace must release chromadb's rust-side SQLite file lock so
+    a fresh PersistentClient on the same path after shutil.rmtree can
+    write without hitting SQLITE_READONLY_DBMOVED."""
+    backend = ChromaBackend()
+    palace_path = tmp_path / "palace-a"
+    ref = PalaceRef(id=str(palace_path), local_path=str(palace_path))
+
+    col = backend.get_collection(palace=ref, collection_name="mempalace_drawers", create=True)
+    col.upsert(documents=["hello"], ids=["a"], metadatas=[{"k": "v"}])
+
+    backend.close_palace(ref)
+    shutil.rmtree(palace_path)
+
+    col = backend.get_collection(palace=ref, collection_name="mempalace_drawers", create=True)
+    col.upsert(documents=["world"], ids=["b"], metadatas=[{"k": "v2"}])
+    assert col.count() == 1
+
+
+def test_chroma_close_releases_all_cached_clients(tmp_path):
+    """close() must release every cached client's SQLite file lock so any
+    of their palace paths can be reopened by a fresh backend in the same
+    process."""
+    backend = ChromaBackend()
+    palace_a = tmp_path / "palace-a"
+    palace_b = tmp_path / "palace-b"
+    ref_a = PalaceRef(id=str(palace_a), local_path=str(palace_a))
+    ref_b = PalaceRef(id=str(palace_b), local_path=str(palace_b))
+
+    for ref in (ref_a, ref_b):
+        backend.get_collection(palace=ref, collection_name="mempalace_drawers", create=True).upsert(
+            documents=["x"], ids=["x"], metadatas=[{"k": "v"}]
+        )
+
+    backend.close()
+
+    for path in (palace_a, palace_b):
+        shutil.rmtree(path)
+        ref = PalaceRef(id=str(path), local_path=str(path))
+        fresh = ChromaBackend()
+        col = fresh.get_collection(palace=ref, collection_name="mempalace_drawers", create=True)
+        col.upsert(documents=["y"], ids=["y"], metadatas=[{"k": "v2"}])
+        assert col.count() == 1
+        fresh.close()
+
+
 def test_chroma_cache_invalidates_when_db_file_missing(tmp_path):
    """A palace rebuild that removes chroma.sqlite3 must drop the stale cache.

@@ -756,7 +805,10 @@ def test_make_client_quarantines_only_on_first_call_per_palace(tmp_path, monkeyp
    """Quarantine fires on first ``make_client()`` for a palace, then is
    skipped on subsequent calls — prevents runtime thrash where a daemon's
    own steady writes bump ``chroma.sqlite3`` faster than HNSW flushes,
-    making the mtime heuristic falsely trigger every reconnect."""
+    making the mtime heuristic falsely trigger every reconnect.
+
+    Invalid metadata quarantine shares the same cold-start gate here; the
+    more aggressive refresh path lives in ``_client()``."""
    from mempalace.backends.chroma import ChromaBackend

    palace_path = str(tmp_path / "palace")
@@ -783,6 +835,34 @@ def test_make_client_quarantines_only_on_first_call_per_palace(tmp_path, monkeyp
    ], "quarantine_stale_hnsw should fire once per palace per process, not on every reconnect"


+def test_make_client_gates_invalid_metadata_on_first_call(tmp_path, monkeypatch):
+    """Invalid metadata quarantine is gated on the first make_client() call."""
+    from mempalace.backends.chroma import ChromaBackend
+
+    palace_path = str(tmp_path / "palace")
+    os.makedirs(palace_path, exist_ok=True)
+    (Path(palace_path) / "chroma.sqlite3").write_text("")
+
+    monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set())
+
+    calls: list[str] = []
+
+    def _invalid(path, *args, **kwargs):
+        calls.append(path)
+        return []
+
+    def _stale(path, stale_seconds=300.0):
+        return []
+
+    monkeypatch.setattr("mempalace.backends.chroma.quarantine_invalid_hnsw_metadata", _invalid)
+    monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _stale)
+
+    ChromaBackend.make_client(palace_path)
+    ChromaBackend.make_client(palace_path)
+
+    assert calls == [palace_path]
+
+
 def test_make_client_quarantines_each_palace_independently(tmp_path, monkeypatch):
    """Two distinct palaces each get one quarantine attempt — the gate is
    keyed by palace path, not global."""
@@ -920,3 +1000,268 @@ def test_get_collection_applies_retrofit_on_existing_palace(tmp_path):
    )

    assert wrapper._collection.configuration_json["hnsw"]["num_threads"] == 1
+
+
+def test_quarantine_invalid_hnsw_metadata_renames_missing_dimensionality(tmp_path):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    seg = palace / "abcd-1234-5678"
+    seg.mkdir()
+    with open(seg / "index_metadata.pickle", "wb") as f:
+        pickle.dump({"dimensionality": None, "id_to_label": {"a": 1}}, f)
+
+    moved = quarantine_invalid_hnsw_metadata(str(palace))
+
+    assert len(moved) == 1
+    assert ".corrupt-" in moved[0]
+    assert not seg.exists()
+
+
+def test_quarantine_invalid_hnsw_metadata_allows_uninitialized_segment(tmp_path):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    seg = palace / "abcd-1234-5678"
+    seg.mkdir()
+    with open(seg / "index_metadata.pickle", "wb") as f:
+        pickle.dump({"dimensionality": None, "id_to_label": {}}, f)
+
+    moved = quarantine_invalid_hnsw_metadata(str(palace))
+
+    assert moved == []
+    assert seg.exists()
+
+
+def test_quarantine_invalid_hnsw_metadata_rejects_non_dict_id_to_label(tmp_path):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    seg = palace / "abcd-1234-5678"
+    seg.mkdir()
+    with open(seg / "index_metadata.pickle", "wb") as f:
+        pickle.dump({"dimensionality": 8, "id_to_label": ["a", "b"]}, f)
+
+    moved = quarantine_invalid_hnsw_metadata(str(palace))
+
+    assert len(moved) == 1
+    assert ".corrupt-" in moved[0]
+    assert not seg.exists()
+
+
+def test_quarantine_invalid_hnsw_metadata_rejects_non_schema_payload(tmp_path):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    seg = palace / "abcd-1234-5678"
+    seg.mkdir()
+    with open(seg / "index_metadata.pickle", "wb") as f:
+        pickle.dump(["not", "a", "metadata", "object"], f)
+
+    moved = quarantine_invalid_hnsw_metadata(str(palace))
+
+    assert len(moved) == 1
+    assert ".corrupt-" in moved[0]
+    assert not seg.exists()
+
+
+def _dangerous_pickle_payload_executed():
+    raise AssertionError("unsafe pickle payload executed")
+
+
+class _DangerousPickle:
+    def __reduce__(self):
+        return (_dangerous_pickle_payload_executed, ())
+
+
+def test_quarantine_invalid_hnsw_metadata_rejects_unsafe_pickle(tmp_path):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    seg = palace / "abcd-1234-5678"
+    seg.mkdir()
+    with open(seg / "index_metadata.pickle", "wb") as f:
+        pickle.dump(_DangerousPickle(), f)
+
+    moved = quarantine_invalid_hnsw_metadata(str(palace))
+
+    assert len(moved) == 1
+    assert ".corrupt-" in moved[0]
+    assert not seg.exists()
+
+
+def test_quarantine_invalid_hnsw_metadata_skips_transient_read_errors(tmp_path, monkeypatch):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    seg = palace / "abcd-1234-5678"
+    seg.mkdir()
+    meta = seg / "index_metadata.pickle"
+    meta.write_bytes(b"partial")
+
+    monkeypatch.setattr(
+        "mempalace.backends.chroma._SafePersistentDataUnpickler.load",
+        lambda path: (_ for _ in ()).throw(EOFError("flush in progress")),
+    )
+
+    moved = quarantine_invalid_hnsw_metadata(str(palace))
+
+    assert moved == []
+    assert seg.exists()
+
+
+def test_quarantine_invalid_hnsw_metadata_skips_truncated_pickle(tmp_path, monkeypatch):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    seg = palace / "abcd-1234-5678"
+    seg.mkdir()
+    meta = seg / "index_metadata.pickle"
+    meta.write_bytes(b"partial")
+
+    monkeypatch.setattr(
+        "mempalace.backends.chroma._SafePersistentDataUnpickler.load",
+        lambda path: (_ for _ in ()).throw(pickle.UnpicklingError("pickle data was truncated")),
+    )
+
+    moved = quarantine_invalid_hnsw_metadata(str(palace))
+
+    assert moved == []
+    assert seg.exists()
+
+
+def test_chroma_backend_preflights_metadata_before_persistent_client(tmp_path, monkeypatch):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    calls = []
+
+    def _record(name):
+        def inner(path, *args, **kwargs):
+            calls.append((name, path))
+            return [] if name != "blob" else None
+
+        return inner
+
+    monkeypatch.setattr("mempalace.backends.chroma._fix_blob_seq_ids", _record("blob"))
+    monkeypatch.setattr(
+        "mempalace.backends.chroma.quarantine_invalid_hnsw_metadata", _record("invalid")
+    )
+    monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _record("stale"))
+
+    class DummyClient:
+        pass
+
+    monkeypatch.setattr(
+        "mempalace.backends.chroma.chromadb.PersistentClient", lambda path: DummyClient()
+    )
+
+    backend = ChromaBackend()
+    backend._client(str(palace))
+
+    assert calls == [
+        ("blob", str(palace)),
+        ("invalid", str(palace)),
+        ("stale", str(palace)),
+    ]
+
+
+def test_chroma_backend_stale_quarantine_is_cold_start_only_on_refresh(tmp_path, monkeypatch):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    (palace / "chroma.sqlite3").write_text("")
+    calls = []
+
+    def _record(name):
+        def inner(path, *args, **kwargs):
+            calls.append((name, path))
+            return [] if name != "blob" else None
+
+        return inner
+
+    monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set())
+    monkeypatch.setattr("mempalace.backends.chroma._fix_blob_seq_ids", _record("blob"))
+    monkeypatch.setattr(
+        "mempalace.backends.chroma.quarantine_invalid_hnsw_metadata", _record("invalid")
+    )
+    monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _record("stale"))
+
+    class DummyClient:
+        pass
+
+    monkeypatch.setattr(
+        "mempalace.backends.chroma.chromadb.PersistentClient", lambda path: DummyClient()
+    )
+
+    backend = ChromaBackend()
+    stats = iter([(1, 1.0), (1, 1.0), (1, 2.0), (1, 2.0)])
+    monkeypatch.setattr(backend, "_db_stat", lambda path: next(stats))
+
+    backend._client(str(palace))
+    backend._client(str(palace))
+
+    assert calls == [
+        ("blob", str(palace)),
+        ("invalid", str(palace)),
+        ("stale", str(palace)),
+        ("blob", str(palace)),
+    ]
+
+
+def test_chroma_backend_requarantines_after_inode_replacement(tmp_path, monkeypatch):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    (palace / "chroma.sqlite3").write_text("")
+    calls = []
+
+    def _record(name):
+        def inner(path, *args, **kwargs):
+            calls.append((name, path))
+            return [] if name != "blob" else None
+
+        return inner
+
+    monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set())
+    monkeypatch.setattr("mempalace.backends.chroma._fix_blob_seq_ids", _record("blob"))
+    monkeypatch.setattr(
+        "mempalace.backends.chroma.quarantine_invalid_hnsw_metadata", _record("invalid")
+    )
+    monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _record("stale"))
+
+    class DummyClient:
+        pass
+
+    monkeypatch.setattr(
+        "mempalace.backends.chroma.chromadb.PersistentClient", lambda path: DummyClient()
+    )
+
+    backend = ChromaBackend()
+    stats = iter([(1, 1.0), (1, 1.0), (2, 2.0), (2, 2.0)])
+    monkeypatch.setattr(backend, "_db_stat", lambda path: next(stats))
+
+    backend._client(str(palace))
+    backend._client(str(palace))
+
+    assert calls == [
+        ("blob", str(palace)),
+        ("invalid", str(palace)),
+        ("stale", str(palace)),
+        ("blob", str(palace)),
+        ("invalid", str(palace)),
+        ("stale", str(palace)),
+    ]
+
+
+def test_palace_get_collection_uses_configured_collection_name(monkeypatch):
+    from mempalace import palace
+
+    captured = {}
+
+    def fake_get_collection(palace_path, collection_name=None, create=False):
+        captured["palace_path"] = palace_path
+        captured["collection_name"] = collection_name
+        captured["create"] = create
+        return object()
+
+    monkeypatch.setattr(palace._DEFAULT_BACKEND, "get_collection", fake_get_collection)
+    monkeypatch.setattr("mempalace.config.get_configured_collection_name", lambda: "custom_drawers")
+
+    palace.get_collection("/palace", create=False)
+
+    assert captured == {
+        "palace_path": "/palace",
+        "collection_name": "custom_drawers",
+        "create": False,
+    }
@@ -0,0 +1,321 @@
+"""Tests for ChromaCollection's palace-write-lock integration.
+
+Closes the gap left by ``mine_palace_lock`` only protecting the
+``mempalace mine`` pipeline: MCP/direct writers that call
+``ChromaCollection.add/upsert/update/delete`` must also serialize against
+mine and against each other to avoid the multi-threaded HNSW corruption
+documented in #974/#965.
+
+Property tested:
+
+* ``ChromaCollection(c, palace_path=p)`` wraps every write with
+  ``mine_palace_lock(p)``.
+* Writes raise ``MineAlreadyRunning`` when another holder owns the lock
+  (instead of silently racing into the underlying chromadb call).
+* Re-entrant composition with ``miner.mine()`` does not self-deadlock:
+  ``with mine_palace_lock(p): col.upsert(...)`` runs to completion.
+* ``ChromaCollection(c)`` (no palace_path) preserves legacy no-lock
+  behaviour for tests/callers that build the adapter directly without
+  going through ``ChromaBackend``.
+
+POSIX-only: ``mine_palace_lock`` uses ``fcntl`` on Unix and ``msvcrt`` on
+Windows; the contention semantics differ enough that the cross-process
+tests are skipped on Windows runners.
+"""
+
+from __future__ import annotations
+
+import multiprocessing
+import os
+import time
+
+import pytest
+
+from mempalace.backends.chroma import ChromaCollection
+from mempalace.palace import MineAlreadyRunning, mine_palace_lock
+
+
+def _get_mp_context():
+    """Same start-method picker as test_palace_locks.py."""
+    start_method = "spawn" if os.name == "nt" else "fork"
+    return multiprocessing.get_context(start_method)
+
+
+# ---------------------------------------------------------------------------
+# Fakes
+# ---------------------------------------------------------------------------
+
+
+class _FakeChromaCollection:
+    """Records calls; never blocks. Stand-in for chromadb.Collection."""
+
+    def __init__(self):
+        self.adds: list[dict] = []
+        self.upserts: list[dict] = []
+        self.updates: list[dict] = []
+        self.deletes: list[dict] = []
+
+    def add(self, **kwargs):
+        self.adds.append(kwargs)
+
+    def upsert(self, **kwargs):
+        self.upserts.append(kwargs)
+
+    def update(self, **kwargs):
+        self.updates.append(kwargs)
+
+    def delete(self, **kwargs):
+        self.deletes.append(kwargs)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _hold_lock(palace_path: str, ready_flag: str, release_flag: str) -> int:
+    """Acquire ``mine_palace_lock``, signal readiness, wait for release.
+
+    Mirrors the helper in ``test_palace_locks.py`` so the contention
+    semantics match across both test files.
+    """
+    try:
+        with mine_palace_lock(palace_path):
+            open(ready_flag, "w").close()
+            for _ in range(500):
+                if os.path.exists(release_flag):
+                    return 0
+                time.sleep(0.01)
+            return 0
+    except MineAlreadyRunning:
+        return 1
+
+
+# ---------------------------------------------------------------------------
+# Tests — opt-in lock wiring
+# ---------------------------------------------------------------------------
+
+
+def test_palace_path_none_skips_lock(tmp_path, monkeypatch):
+    """Legacy callers (``ChromaCollection(c)``) keep no-lock behaviour.
+
+    A ``ChromaCollection`` built without ``palace_path`` must not touch the
+    lock infrastructure at all. This guards against regressions where a
+    test or third-party caller relies on the historical bare-write path.
+    """
+    monkeypatch.setenv("HOME", str(tmp_path))
+    fake = _FakeChromaCollection()
+    col = ChromaCollection(fake)  # no palace_path -> no lock
+
+    # Hold the lock in a child process. Without palace_path, the parent
+    # write must still succeed (the lock does not gate this caller).
+    palace = str(tmp_path / "palace")
+    ready = str(tmp_path / "ready")
+    release = str(tmp_path / "release")
+    ctx = _get_mp_context()
+    holder = ctx.Process(target=_hold_lock, args=(palace, ready, release))
+    holder.start()
+    try:
+        for _ in range(500):
+            if os.path.exists(ready):
+                break
+            time.sleep(0.01)
+        assert os.path.exists(ready), "holder failed to acquire lock"
+
+        col.upsert(documents=["doc"], ids=["id-1"])
+        assert fake.upserts == [{"documents": ["doc"], "ids": ["id-1"]}]
+    finally:
+        open(release, "w").close()
+        holder.join(timeout=5)
+
+
+def test_writer_blocks_during_mine(tmp_path, monkeypatch):
+    """A held ``mine_palace_lock`` causes ``ChromaCollection`` writes to raise.
+
+    This is the property that closes the MCP-bypass gap: when a mine is in
+    flight, MCP/direct writes raise ``MineAlreadyRunning`` rather than
+    silently entering chromadb's write path concurrent with mine.
+    """
+    monkeypatch.setenv("HOME", str(tmp_path))
+    palace = str(tmp_path / "palace")
+    ready = str(tmp_path / "ready")
+    release = str(tmp_path / "release")
+
+    ctx = _get_mp_context()
+    holder = ctx.Process(target=_hold_lock, args=(palace, ready, release))
+    holder.start()
+    try:
+        for _ in range(500):
+            if os.path.exists(ready):
+                break
+            time.sleep(0.01)
+        assert os.path.exists(ready), "holder failed to acquire lock"
+
+        fake = _FakeChromaCollection()
+        col = ChromaCollection(fake, palace_path=palace)
+
+        with pytest.raises(MineAlreadyRunning):
+            col.upsert(documents=["doc"], ids=["id-1"])
+        with pytest.raises(MineAlreadyRunning):
+            col.add(documents=["doc"], ids=["id-2"])
+        with pytest.raises(MineAlreadyRunning):
+            col.update(ids=["id-3"], documents=["doc"])
+        with pytest.raises(MineAlreadyRunning):
+            col.delete(ids=["id-4"])
+
+        # The fake must have received NO calls — the lock must gate
+        # before reaching the underlying chromadb layer.
+        assert fake.upserts == []
+        assert fake.adds == []
+        assert fake.updates == []
+        assert fake.deletes == []
+    finally:
+        open(release, "w").close()
+        holder.join(timeout=5)
+
+
+def test_reentrant_inside_mine_passes_through(tmp_path, monkeypatch):
+    """``ChromaCollection.upsert`` inside ``mine_palace_lock`` does not deadlock.
+
+    ``miner.mine()`` already holds ``mine_palace_lock(palace_path)`` for the
+    full mine pipeline; ``_mine_body`` then calls
+    ``collection.upsert(...)``. With the per-thread re-entrant guard in
+    ``mine_palace_lock``, the inner acquire is a pass-through and the
+    underlying chromadb call runs immediately.
+    """
+    monkeypatch.setenv("HOME", str(tmp_path))
+    palace = str(tmp_path / "palace")
+    fake = _FakeChromaCollection()
+    col = ChromaCollection(fake, palace_path=palace)
+
+    with mine_palace_lock(palace):
+        # If the re-entrant guard were missing, this would self-deadlock on
+        # the underlying flock. We rely on pytest-timeout (configured in
+        # pyproject.toml) to enforce this in CI; the assertion just confirms
+        # the call landed.
+        col.upsert(documents=["d"], ids=["i"], metadatas=[{"k": "v"}])
+        col.add(documents=["d2"], ids=["i2"])
+        col.update(ids=["i"], documents=["d-updated"])
+        col.delete(ids=["i2"])
+
+    assert len(fake.upserts) == 1
+    assert len(fake.adds) == 1
+    assert len(fake.updates) == 1
+    assert len(fake.deletes) == 1
+
+
+class _SlowFakeChromaCollection(_FakeChromaCollection):
+    """Fake whose write methods hold the caller for ``hold_seconds``.
+
+    Used to keep ``mine_palace_lock`` acquired long enough for a sibling
+    process to contend deterministically.
+    """
+
+    def __init__(self, hold_seconds: float = 0.3):
+        super().__init__()
+        self._hold = hold_seconds
+
+    def upsert(self, **kwargs):
+        time.sleep(self._hold)
+        super().upsert(**kwargs)
+
+
+def _slow_writer_target(palace_path, tmp_path_str, pid, result_q):
+    """Subprocess target: try a slow upsert, report ok/busy."""
+    os.environ["HOME"] = tmp_path_str
+    # Fresh import inside child so HOME monkeypatch routes the lock dir.
+    from mempalace.backends.chroma import ChromaCollection as _CC
+    from mempalace.palace import MineAlreadyRunning as _MAR
+
+    fake = _SlowFakeChromaCollection(hold_seconds=0.3)
+    col = _CC(fake, palace_path=palace_path)
+    try:
+        col.upsert(documents=[f"d{pid}"], ids=[f"i{pid}"])
+        result_q.put(("ok", pid))
+    except _MAR:
+        result_q.put(("busy", pid))
+
+
+def test_concurrent_writers_serialize(tmp_path, monkeypatch):
+    """Two processes calling ``ChromaCollection.upsert`` against the same
+    palace must be serialized: at most one enters chromadb at a time, the
+    other raises ``MineAlreadyRunning``.
+
+    This is the property that prevents the parallel HNSW insert race that
+    drives #974/#965 — under concurrent MCP write fan-out, exactly one
+    writer reaches chromadb and the rest fail loudly instead of corrupting
+    the index.
+
+    The slow fake holds the lock for 0.3s per writer, large enough for the
+    second process to contend even on slow CI runners.
+    """
+    monkeypatch.setenv("HOME", str(tmp_path))
+    palace = str(tmp_path / "palace")
+
+    ctx = _get_mp_context()
+    result_q = ctx.Queue()
+
+    p1 = ctx.Process(target=_slow_writer_target, args=(palace, str(tmp_path), 1, result_q))
+    p2 = ctx.Process(target=_slow_writer_target, args=(palace, str(tmp_path), 2, result_q))
+    p1.start()
+    # Tiny stagger so p1 wins the race deterministically; without it the
+    # OS scheduler can pick either, which is also a valid outcome but
+    # makes the assertion brittle on slow CI.
+    time.sleep(0.05)
+    p2.start()
+    p1.join(timeout=5)
+    p2.join(timeout=5)
+
+    outcomes = [result_q.get(timeout=1) for _ in range(2)]
+    statuses = sorted(o[0] for o in outcomes)
+    assert statuses == ["busy", "ok"], f"expected one ok + one busy, got {outcomes}"
+
+
+def test_read_path_does_not_acquire_lock(tmp_path, monkeypatch):
+    """``query`` / ``get`` / ``count`` must not be gated by the write lock.
+
+    Read traffic is the dominant workload (semantic search, MCP get, etc.)
+    and serializing it against mine would tank latency for no correctness
+    benefit. This test pins that property: with another process holding
+    the write lock, reads must still complete instantly.
+    """
+    monkeypatch.setenv("HOME", str(tmp_path))
+    palace = str(tmp_path / "palace")
+    ready = str(tmp_path / "ready")
+    release = str(tmp_path / "release")
+
+    ctx = _get_mp_context()
+    holder = ctx.Process(target=_hold_lock, args=(palace, ready, release))
+    holder.start()
+    try:
+        for _ in range(500):
+            if os.path.exists(ready):
+                break
+            time.sleep(0.01)
+        assert os.path.exists(ready), "holder failed to acquire lock"
+
+        # _FakeChromaCollection doesn't implement query/get/count; we only
+        # need to confirm the wrapper does not call into mine_palace_lock
+        # for reads, which we assert by observing the wrapped methods are
+        # NOT in ChromaCollection's _write_lock path. A direct check via
+        # source inspection is more honest than mocking the entire chroma
+        # surface here.
+        import inspect
+
+        from mempalace.backends.chroma import ChromaCollection as _CC
+
+        for write_attr in ("add", "upsert", "update", "delete"):
+            src = inspect.getsource(getattr(_CC, write_attr))
+            assert "_write_lock" in src, f"{write_attr} should acquire write lock"
+
+        for read_attr in ("query", "get", "count"):
+            method = getattr(_CC, read_attr, None)
+            if method is None:
+                continue
+            src = inspect.getsource(method)
+            assert (
+                "_write_lock" not in src
+            ), f"{read_attr} must NOT acquire the write lock (read path)"
+    finally:
+        open(release, "w").close()
+        holder.join(timeout=5)
@@ -4,7 +4,7 @@ import argparse
 import shlex
 import sys
 from pathlib import Path
-from unittest.mock import MagicMock, patch
+from unittest.mock import MagicMock, call, patch

 import pytest

@@ -776,6 +776,7 @@ def test_cmd_repair_error_reading(mock_config_cls, tmp_path, capsys):
    palace_dir.mkdir()
    (palace_dir / "chroma.sqlite3").write_text("db")
    mock_config_cls.return_value.palace_path = str(palace_dir)
+    mock_config_cls.return_value.collection_name = "mempalace_drawers"
    args = argparse.Namespace(palace=None)
    mock_backend = MagicMock()
    mock_backend.get_collection.side_effect = Exception("corrupt db")
@@ -791,6 +792,7 @@ def test_cmd_repair_zero_drawers(mock_config_cls, tmp_path, capsys):
    palace_dir.mkdir()
    (palace_dir / "chroma.sqlite3").write_text("db")
    mock_config_cls.return_value.palace_path = str(palace_dir)
+    mock_config_cls.return_value.collection_name = "mempalace_drawers"
    args = argparse.Namespace(palace=None)
    mock_col = MagicMock()
    mock_col.count.return_value = 0
@@ -807,6 +809,7 @@ def test_cmd_repair_success(mock_config_cls, tmp_path, capsys):
    palace_dir.mkdir()
    (palace_dir / "chroma.sqlite3").write_text("db")
    mock_config_cls.return_value.palace_path = str(palace_dir)
+    mock_config_cls.return_value.collection_name = "mempalace_drawers"
    args = argparse.Namespace(palace=None, yes=True)
    mock_col = MagicMock()
    mock_col.count.return_value = 2
@@ -815,13 +818,98 @@ def test_cmd_repair_success(mock_config_cls, tmp_path, capsys):
        "documents": ["doc1", "doc2"],
        "metadatas": [{"wing": "a"}, {"wing": "b"}],
    }
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 2
    mock_new_col = MagicMock()
+    mock_new_col.count.return_value = 2
    mock_backend = _mock_backend_for(col=mock_col, new_col=mock_new_col)
+    mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
    with patch("mempalace.backends.chroma.ChromaBackend", return_value=mock_backend):
        cmd_repair(args)
    out = capsys.readouterr().out
    assert "Repair complete" in out
    assert "2 drawers rebuilt" in out
+    assert mock_backend.delete_collection.call_args_list == [
+        call(str(palace_dir), "mempalace_drawers__repair_tmp"),
+        call(str(palace_dir), "mempalace_drawers"),
+        call(str(palace_dir), "mempalace_drawers__repair_tmp"),
+    ]
+    mock_temp_col.upsert.assert_called_once()
+    mock_new_col.upsert.assert_called_once()
+    mock_new_col.add.assert_not_called()
+
+
+@patch("mempalace.cli.MempalaceConfig")
+def test_cmd_repair_uses_configured_collection(mock_config_cls, tmp_path, capsys):
+    palace_dir = tmp_path / "palace"
+    palace_dir.mkdir()
+    (palace_dir / "chroma.sqlite3").write_text("db")
+    mock_config_cls.return_value.palace_path = str(palace_dir)
+    mock_config_cls.return_value.collection_name = "custom_drawers"
+    args = argparse.Namespace(palace=None, yes=True)
+    mock_col = MagicMock()
+    mock_col.count.return_value = 2
+    mock_col.get.return_value = {
+        "ids": ["id1", "id2"],
+        "documents": ["doc1", "doc2"],
+        "metadatas": [{"wing": "a"}, {"wing": "b"}],
+    }
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 2
+    mock_new_col = MagicMock()
+    mock_new_col.count.return_value = 2
+    mock_backend = _mock_backend_for(col=mock_col, new_col=mock_new_col)
+    mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
+
+    with patch("mempalace.backends.chroma.ChromaBackend", return_value=mock_backend):
+        cmd_repair(args)
+
+    out = capsys.readouterr().out
+    assert "Repair complete" in out
+    mock_backend.get_collection.assert_called_once_with(str(palace_dir), "custom_drawers")
+    assert mock_backend.create_collection.call_args_list == [
+        call(str(palace_dir), "custom_drawers__repair_tmp"),
+        call(str(palace_dir), "custom_drawers"),
+    ]
+    assert mock_backend.delete_collection.call_args_list == [
+        call(str(palace_dir), "custom_drawers__repair_tmp"),
+        call(str(palace_dir), "custom_drawers"),
+        call(str(palace_dir), "custom_drawers__repair_tmp"),
+    ]
+
+
+@patch("mempalace.cli.MempalaceConfig")
+def test_cmd_repair_restores_backup_on_live_rebuild_failure(mock_config_cls, tmp_path, capsys):
+    palace_dir = tmp_path / "palace"
+    palace_dir.mkdir()
+    (palace_dir / "chroma.sqlite3").write_text("db")
+    mock_config_cls.return_value.palace_path = str(palace_dir)
+    mock_config_cls.return_value.collection_name = "mempalace_drawers"
+    args = argparse.Namespace(palace=None, yes=True)
+    mock_col = MagicMock()
+    mock_col.count.return_value = 2
+    mock_col.get.return_value = {
+        "ids": ["id1", "id2"],
+        "documents": ["doc1", "doc2"],
+        "metadatas": [{"wing": "a"}, {"wing": "b"}],
+    }
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 2
+    mock_backend = _mock_backend_for(col=mock_col)
+    mock_backend.create_collection.side_effect = [mock_temp_col, RuntimeError("live build failed")]
+    with patch("mempalace.backends.chroma.ChromaBackend", return_value=mock_backend):
+        with pytest.raises(SystemExit) as excinfo:
+            cmd_repair(args)
+    out = capsys.readouterr().out
+    assert excinfo.value.code == 1
+    assert "Repair failed" in out
+    assert "restoring from backup" in out
+    mock_backend.close_palace.assert_called_once_with(str(palace_dir))
+    assert mock_backend.delete_collection.call_args_list == [
+        call(str(palace_dir), "mempalace_drawers__repair_tmp"),
+        call(str(palace_dir), "mempalace_drawers"),
+        call(str(palace_dir), "mempalace_drawers__repair_tmp"),
+    ]


@patch("mempalace.cli.MempalaceConfig")
@@ -830,6 +918,7 @@ def test_cmd_repair_aborts_without_confirmation(mock_config_cls, tmp_path, capsy
    palace_dir.mkdir()
    (palace_dir / "chroma.sqlite3").write_text("db")
    mock_config_cls.return_value.palace_path = str(palace_dir)
+    mock_config_cls.return_value.collection_name = "mempalace_drawers"
    args = argparse.Namespace(palace=None)
    mock_col = MagicMock()
    mock_col.count.return_value = 1
@@ -1042,3 +1131,119 @@ def test_cmd_repair_trailing_slash_does_not_recurse():
    palace_path = os.path.expanduser(args.palace).rstrip(os.sep)
    backup_path = palace_path + ".backup"
    assert not backup_path.startswith(palace_path + os.sep)
+
+
+# ── stdio reconfigure on Windows ─────────────────────────────────────
+
+
+class _ReconfigurableStringIO:
+    def __init__(self):
+        self.reconfigure_calls = []
+
+    def reconfigure(self, **kwargs):
+        self.reconfigure_calls.append(kwargs)
+
+
+def test_reconfigures_stdio_to_utf8_on_windows():
+    """Windows `mempalace` CLI must decode/encode stdio as UTF-8.
+
+    Without this, piped non-ASCII input (`mempalace search ... < q.txt`)
+    or piped non-ASCII output (`mempalace search "..." > out.txt`) is
+    mojibaked through the system ANSI codepage on non-Latin Windows
+    locales (cp1252/cp1251/cp950).
+    """
+    from mempalace.cli import _reconfigure_stdio_utf8_on_windows
+
+    stdin = _ReconfigurableStringIO()
+    stdout = _ReconfigurableStringIO()
+    stderr = _ReconfigurableStringIO()
+    with (
+        patch.object(sys, "platform", "win32"),
+        patch.object(sys, "stdin", stdin),
+        patch.object(sys, "stdout", stdout),
+        patch.object(sys, "stderr", stderr),
+    ):
+        _reconfigure_stdio_utf8_on_windows()
+
+    # Per-stream errors policy: stdin survives bad bytes via
+    # surrogateescape so a redirected non-UTF-8 file does not crash
+    # the read; stdout/stderr use replace so a drawer carrying a
+    # round-tripped surrogate half does not crash mid-print.
+    assert stdin.reconfigure_calls == [{"encoding": "utf-8", "errors": "surrogateescape"}]
+    assert stdout.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}]
+    assert stderr.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}]
+
+
+def test_reconfigure_stdio_is_noop_off_windows():
+    """Linux/macOS already default to UTF-8 stdio -- helper must not touch streams."""
+    from mempalace.cli import _reconfigure_stdio_utf8_on_windows
+
+    stdin = _ReconfigurableStringIO()
+    with (
+        patch.object(sys, "platform", "linux"),
+        patch.object(sys, "stdin", stdin),
+    ):
+        _reconfigure_stdio_utf8_on_windows()
+
+    assert stdin.reconfigure_calls == []
+
+
+# ── cmd_repair: from-sqlite mode exit codes ──────────────────────────
+
+
+@patch("mempalace.cli.MempalaceConfig")
+def test_cmd_repair_from_sqlite_validation_refusal_exits_nonzero(mock_config_cls, tmp_path, capsys):
+    """When ``rebuild_from_sqlite`` returns ``{}`` for a validation
+    refusal (missing source DB, in-place without --archive-existing,
+    refusing to overwrite an existing dest), the CLI must surface a
+    non-zero exit so unattended scripts and CI distinguish "invalid
+    inputs" from "successful recovery that found zero rows."
+
+    Catches: a regression where the CLI treats the validation-refusal
+    sentinel as success, leaving CI green on a no-op repair that should
+    have alerted an operator.
+    """
+    palace_dir = tmp_path / "palace"
+    palace_dir.mkdir()
+    mock_config_cls.return_value.palace_path = str(palace_dir)
+
+    args = argparse.Namespace(
+        palace=str(palace_dir),
+        mode="from-sqlite",
+        source=None,
+        archive_existing=False,
+        yes=True,
+    )
+    with patch("mempalace.repair.rebuild_from_sqlite", return_value={}):
+        with pytest.raises(SystemExit) as excinfo:
+            cmd_repair(args)
+    assert excinfo.value.code == 1
+
+
+@patch("mempalace.cli.MempalaceConfig")
+def test_cmd_repair_from_sqlite_success_does_not_exit(mock_config_cls, tmp_path):
+    """A successful from-sqlite rebuild — even one that finds zero rows
+    in a legitimately empty source palace — must NOT call ``sys.exit``.
+    A populated counts dict (with ``0`` values) is the success signal;
+    only the empty dict ``{}`` is reserved for validation refusal.
+
+    Catches: a regression where ``if not counts`` is replaced by
+    ``if not sum(counts.values())`` or similar, conflating "empty source"
+    with "validation refused" and breaking idempotent recovery scripts.
+    """
+    palace_dir = tmp_path / "palace"
+    palace_dir.mkdir()
+    mock_config_cls.return_value.palace_path = str(palace_dir)
+
+    args = argparse.Namespace(
+        palace=str(palace_dir),
+        mode="from-sqlite",
+        source=None,
+        archive_existing=False,
+        yes=True,
+    )
+    # Zero rows but per-collection keys present → success, no exit.
+    fake_counts = {"mempalace_drawers": 0, "mempalace_closets": 0}
+    with patch("mempalace.repair.rebuild_from_sqlite", return_value=fake_counts):
+        # Should return cleanly; no SystemExit raised.
+        cmd_repair(args)
@@ -296,6 +296,182 @@ class TestRegenerateClosets:
            assert meta.get("generated_by", "").startswith("llm:")
            assert meta.get("normalize_version") == NORMALIZE_VERSION

+    def test_regen_paginates_drawer_fetch(self, tmp_path):
+        """Regression for #1073: drawers_col.get must be paginated at
+        batch_size=5000. A single get(limit=total, ...) on a palace with
+        more than SQLite's SQLITE_MAX_VARIABLE_NUMBER (32766) drawers
+        blows up inside chromadb. Matches the miner.status pattern
+        introduced in #851 (see #802, #850, #1073)."""
+        from mempalace import closet_llm as closet_llm_mod
+
+        palace = str(tmp_path / "palace")
+
+        # Build a fake collection: 12_000 drawers across 3 source files,
+        # enough to force 3 batches of batch_size=5000 (5000 + 5000 + 2000).
+        n_drawers = 12_000
+        ids = [f"d{i:05d}" for i in range(n_drawers)]
+        docs = [f"doc body {i}" for i in range(n_drawers)]
+        metas = [
+            {
+                "wing": "w",
+                "room": "r",
+                "source_file": f"/src/file_{i % 3}.md",
+                "entities": "",
+            }
+            for i in range(n_drawers)
+        ]
+
+        get_calls: list = []
+
+        class FakeDrawersCol:
+            def count(self):
+                return n_drawers
+
+            def get(self, limit=None, offset=0, include=None, **kwargs):
+                get_calls.append({"limit": limit, "offset": offset, "include": include})
+                end = min(offset + (limit or n_drawers), n_drawers)
+                return {
+                    "ids": ids[offset:end],
+                    "documents": docs[offset:end],
+                    "metadatas": metas[offset:end],
+                }
+
+        class FakeClosetsCol:
+            """Accept the purge + upsert calls the success path makes."""
+
+            def get(self, *a, **kw):
+                return {"ids": [], "documents": [], "metadatas": []}
+
+            def delete(self, *a, **kw):
+                return None
+
+            def upsert(self, *a, **kw):
+                return None
+
+        fake_drawers = FakeDrawersCol()
+        fake_closets = FakeClosetsCol()
+
+        def fake_urlopen(req, timeout=None):
+            return _FakeResp(
+                {
+                    "choices": [
+                        {"message": {"content": '{"topics":["t1"],"quotes":[],"summary":""}'}}
+                    ],
+                    "usage": {"prompt_tokens": 1, "completion_tokens": 1},
+                }
+            )
+
+        cfg = LLMConfig(endpoint="http://local/v1", model="m")
+
+        with (
+            patch.object(closet_llm_mod, "get_collection", return_value=fake_drawers),
+            patch.object(closet_llm_mod, "get_closets_collection", return_value=fake_closets),
+            patch.object(closet_llm_mod, "purge_file_closets", return_value=None),
+            patch.object(closet_llm_mod, "upsert_closet_lines", return_value=None),
+            patch("urllib.request.urlopen", side_effect=fake_urlopen),
+        ):
+            result = regenerate_closets(palace, cfg=cfg, dry_run=True)
+
+        # Three paginated calls: (limit=5000, offset=0), (5000, 5000), (5000, 10000).
+        assert len(get_calls) == 3, f"expected 3 batched fetches, got {len(get_calls)}"
+        for call in get_calls:
+            assert (
+                call["limit"] == 5000
+            ), f"batch must be 5000 — got {call['limit']} (would risk SQLITE_MAX_VARIABLE_NUMBER)"
+            # include must still request both documents and metadatas
+            assert "documents" in call["include"]
+            assert "metadatas" in call["include"]
+        assert [c["offset"] for c in get_calls] == [0, 5000, 10_000]
+
+        # by_source aggregation must be preserved exactly across batches:
+        # 12_000 drawers, 3 source files → 4_000 drawers each.
+        # dry_run=True short-circuits LLM calls but still walks by_source.
+        assert result.get("processed", 0) == 0  # dry_run
+        # Verify no single call tried to pull more than batch_size.
+        assert max(c["limit"] for c in get_calls) <= 5000
+
+    def test_regen_by_source_aggregates_across_batches(self, tmp_path):
+        """Pagination must not change the by_source grouping — drawers for
+        the same source_file split across batches still land in one group."""
+        from mempalace import closet_llm as closet_llm_mod
+
+        palace = str(tmp_path / "palace")
+
+        # 7_500 drawers, alternating between two source files → forces
+        # splits across the 5000/2500 boundary. Each source ends up with
+        # 3_750 drawers after regrouping.
+        n_drawers = 7_500
+        ids = [f"d{i:05d}" for i in range(n_drawers)]
+        docs = [f"body-{i}" for i in range(n_drawers)]
+        metas = [
+            {
+                "wing": "w",
+                "room": "r",
+                "source_file": f"/src/file_{i % 2}.md",
+                "entities": "",
+            }
+            for i in range(n_drawers)
+        ]
+
+        captured_sources: dict = {}
+
+        class FakeDrawersCol:
+            def count(self):
+                return n_drawers
+
+            def get(self, limit=None, offset=0, include=None, **kwargs):
+                end = min(offset + (limit or n_drawers), n_drawers)
+                return {
+                    "ids": ids[offset:end],
+                    "documents": docs[offset:end],
+                    "metadatas": metas[offset:end],
+                }
+
+        class FakeClosetsCol:
+            def get(self, *a, **kw):
+                return {"ids": [], "documents": [], "metadatas": []}
+
+            def delete(self, *a, **kw):
+                return None
+
+            def upsert(self, *a, **kw):
+                return None
+
+        # Hook _call_llm to inspect what regenerate_closets aggregated
+        # per source before the HTTP boundary.
+        real_call_llm = closet_llm_mod._call_llm
+
+        def spying_call_llm(cfg, source_file, wing, room, content):
+            captured_sources[source_file] = content
+            return (
+                {"topics": ["t"], "quotes": [], "summary": ""},
+                {"prompt_tokens": 1, "completion_tokens": 1},
+            )
+
+        cfg = LLMConfig(endpoint="http://local/v1", model="m")
+
+        with (
+            patch.object(closet_llm_mod, "get_collection", return_value=FakeDrawersCol()),
+            patch.object(closet_llm_mod, "get_closets_collection", return_value=FakeClosetsCol()),
+            patch.object(closet_llm_mod, "purge_file_closets", return_value=None),
+            patch.object(closet_llm_mod, "upsert_closet_lines", return_value=None),
+            patch.object(closet_llm_mod, "_call_llm", side_effect=spying_call_llm),
+        ):
+            regenerate_closets(palace, cfg=cfg)
+
+        # Both sources survived the pagination boundary.
+        assert set(captured_sources.keys()) == {"/src/file_0.md", "/src/file_1.md"}
+        # Each source accumulated exactly 3_750 drawer bodies, concatenated
+        # with the "\n\n" separator the regenerate path uses.
+        for source, content in captured_sources.items():
+            assert content.count("\n\n") == 3_749, (
+                f"{source}: expected 3_750 chunks joined (3_749 separators), "
+                f"got {content.count(chr(10) + chr(10)) + 1}"
+            )
+
+        # Silence unused-var lint.
+        assert real_call_llm is not None
+
    def test_regen_uses_basename_not_split_slash(self, tmp_path, monkeypatch):
        """Regression: the old closet_id base used ``source.split('/')[-1]``
        which silently degrades on Windows paths (``C:\\proj\\a.md`` →
@@ -3,7 +3,13 @@ import json
 import tempfile

 import pytest
-from mempalace.config import MempalaceConfig, normalize_wing_name, sanitize_kg_value, sanitize_name
+from mempalace.config import (
+    MempalaceConfig,
+    normalize_wing_name,
+    sanitize_iso_date,
+    sanitize_kg_value,
+    sanitize_name,
+)


 def test_default_config():
@@ -212,3 +218,69 @@ def test_kg_value_rejects_null_bytes():
 def test_kg_value_rejects_over_length():
    with pytest.raises(ValueError):
        sanitize_kg_value("a" * 129)
+
+
+# --- sanitize_iso_date ---
+
+
+def test_iso_date_rejects_year_only():
+    # Partial dates re-introduce silent empty result sets via lexicographic
+    # TEXT comparison in KG queries (e.g. "2026-01-01" <= "2026" is False).
+    with pytest.raises(ValueError):
+        sanitize_iso_date("2026")
+
+
+def test_iso_date_rejects_year_month():
+    with pytest.raises(ValueError):
+        sanitize_iso_date("2026-03")
+
+
+def test_iso_date_accepts_full_date():
+    assert sanitize_iso_date("2026-03-15") == "2026-03-15"
+
+
+def test_iso_date_passes_through_none():
+    assert sanitize_iso_date(None) is None
+
+
+def test_iso_date_passes_through_empty_string():
+    assert sanitize_iso_date("") == ""
+
+
+def test_iso_date_strips_whitespace():
+    assert sanitize_iso_date("  2026-03-15  ") == "2026-03-15"
+
+
+def test_iso_date_rejects_natural_language():
+    with pytest.raises(ValueError):
+        sanitize_iso_date("March 2026")
+
+
+def test_iso_date_rejects_abbreviated_month():
+    with pytest.raises(ValueError):
+        sanitize_iso_date("Jan 2025")
+
+
+def test_iso_date_rejects_us_format():
+    with pytest.raises(ValueError):
+        sanitize_iso_date("03/15/2026")
+
+
+def test_iso_date_rejects_invalid_month():
+    with pytest.raises(ValueError):
+        sanitize_iso_date("2026-13")
+
+
+def test_iso_date_rejects_invalid_day():
+    with pytest.raises(ValueError):
+        sanitize_iso_date("2026-02-32")
+
+
+def test_iso_date_rejects_non_string():
+    with pytest.raises(ValueError):
+        sanitize_iso_date(20260315)
+
+
+def test_iso_date_error_names_field():
+    with pytest.raises(ValueError, match="valid_from"):
+        sanitize_iso_date("yesterday", "valid_from")
@@ -2,6 +2,8 @@

 from unittest.mock import patch

+import pytest
+
 from mempalace.entity_registry import (
    COMMON_ENGLISH_WORDS,
    PERSON_CONTEXT_PATTERNS,
@@ -71,6 +73,50 @@ def test_save_creates_file(tmp_path):
    assert (tmp_path / "entity_registry.json").exists()


+def test_save_is_atomic_does_not_leave_tmp(tmp_path):
+    # Atomic write must not leave the .tmp sidecar file after a successful save.
+    registry = EntityRegistry.load(config_dir=tmp_path)
+    registry.save()
+    leftover = list(tmp_path.glob("entity_registry.json.tmp*"))
+    assert leftover == [], f"atomic write leaked tmp file(s): {leftover}"
+
+
+def test_save_preserves_previous_on_serialization_failure(tmp_path, monkeypatch):
+    # If serialization fails mid-write, the previous registry must remain
+    # intact — this is the whole point of atomic write vs truncating in place.
+    registry = EntityRegistry.load(config_dir=tmp_path)
+    registry.seed(
+        mode="personal",
+        people=[{"name": "Alice", "relationship": "friend", "context": "personal"}],
+        projects=[],
+    )
+    registry.save()
+    target = tmp_path / "entity_registry.json"
+    original = target.read_text(encoding="utf-8")
+
+    # Force os.replace to raise — simulates filesystem full / permission flip
+    # AFTER the temp file is written but BEFORE the rename completes.
+    import os as _os
+
+    real_replace = _os.replace
+
+    def boom(src, dst):
+        raise OSError("simulated rename failure")
+
+    monkeypatch.setattr(_os, "replace", boom)
+    with pytest.raises(OSError):
+        registry.seed(
+            mode="personal",
+            people=[{"name": "Bob", "relationship": "friend", "context": "personal"}],
+            projects=[],
+        )
+        registry.save()
+
+    # Restore os.replace before reading so the assertion can rely on it.
+    monkeypatch.setattr(_os, "replace", real_replace)
+    assert target.read_text(encoding="utf-8") == original
+
+
 # ── seed ────────────────────────────────────────────────────────────────


@@ -286,3 +286,66 @@ class TestCLI:
        assert "similar_name" in out
        # Silence unused import warning.
        _ = (MagicMock, patch, fact_checker)
+
+    def test_reconfigures_stdio_to_utf8_on_windows(self):
+        """Windows fact_checker --stdin must decode payload as UTF-8.
+
+        Without this, Python defaults stdio to the system ANSI codepage
+        (cp1252/cp1251/cp950), which mojibakes non-ASCII text before
+        pattern parsing sees it.
+        """
+        import io
+        import sys
+
+        from mempalace.fact_checker import _reconfigure_stdio_utf8_on_windows
+
+        class _ReconfigurableStringIO(io.StringIO):
+            def __init__(self, initial_value=""):
+                super().__init__(initial_value)
+                self.reconfigure_calls = []
+
+            def reconfigure(self, **kwargs):
+                self.reconfigure_calls.append(kwargs)
+
+        stdin = _ReconfigurableStringIO()
+        stdout = _ReconfigurableStringIO()
+        stderr = _ReconfigurableStringIO()
+        with (
+            patch.object(sys, "platform", "win32"),
+            patch.object(sys, "stdin", stdin),
+            patch.object(sys, "stdout", stdout),
+            patch.object(sys, "stderr", stderr),
+        ):
+            _reconfigure_stdio_utf8_on_windows()
+
+        # Per-stream errors policy: stdin uses surrogateescape so a stray
+        # malformed byte from a redirected file does not crash the read,
+        # stdout/stderr use replace so an extracted fact carrying a
+        # surrogate half does not crash mid-print.
+        assert stdin.reconfigure_calls == [{"encoding": "utf-8", "errors": "surrogateescape"}]
+        assert stdout.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}]
+        assert stderr.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}]
+
+    def test_reconfigure_stdio_is_noop_off_windows(self):
+        """Linux/macOS already default to UTF-8 stdio -- helper must not touch streams."""
+        import io
+        import sys
+
+        from mempalace.fact_checker import _reconfigure_stdio_utf8_on_windows
+
+        class _ReconfigurableStringIO(io.StringIO):
+            def __init__(self):
+                super().__init__()
+                self.reconfigure_calls = []
+
+            def reconfigure(self, **kwargs):
+                self.reconfigure_calls.append(kwargs)
+
+        stdin = _ReconfigurableStringIO()
+        with (
+            patch.object(sys, "platform", "linux"),
+            patch.object(sys, "stdin", stdin),
+        ):
+            _reconfigure_stdio_utf8_on_windows()
+
+        assert stdin.reconfigure_calls == []
@@ -238,14 +238,40 @@ def test_capacity_status_tolerates_flush_lag(tmp_path):
    assert info["status"] == "ok"


-def test_capacity_status_flags_unflushed_with_large_sqlite(tmp_path):
-    """No pickle + many sqlite rows is its own divergence signal."""
+def test_capacity_status_does_not_flag_unflushed_with_large_sqlite(tmp_path):
+    """No pickle + many sqlite rows is inconclusive, not divergence."""
    seg = "seg-noflush"
    _seed_chroma_db(str(tmp_path), sqlite_count=10_000, segment_id=seg)
    info = hnsw_capacity_status(str(tmp_path), COLLECTION)
-    assert info["diverged"] is True
+    assert info["diverged"] is False
+    assert info["status"] == "unknown"
+    assert info["divergence"] is None
    assert info["hnsw_count"] is None
-    assert "never flushed" in info["message"]
+    assert "capacity unavailable" in info["message"]
+    assert "leaving vector search enabled" in info["message"]
+
+
+def test_mcp_probe_does_not_disable_vectors_for_unflushed_metadata(tmp_path, monkeypatch):
+    """The MCP preflight must not route all searches to BM25 on this signal."""
+    from mempalace import mcp_server
+
+    seg = "seg-mcp-noflush"
+    _seed_chroma_db(str(tmp_path), sqlite_count=10_000, segment_id=seg)
+
+    class _Cfg:
+        palace_path = str(tmp_path)
+        collection_name = "mempalace_drawers"
+
+    monkeypatch.setattr(mcp_server, "_config", _Cfg())
+    monkeypatch.setattr(mcp_server, "_vector_disabled", True)
+    monkeypatch.setattr(mcp_server, "_vector_disabled_reason", "old divergence")
+
+    mcp_server._refresh_vector_disabled_flag()
+
+    assert mcp_server._vector_disabled is False
+    assert mcp_server._vector_disabled_reason == ""
+    assert mcp_server._vector_capacity_status["status"] == "unknown"
+    assert "leaving vector search enabled" in mcp_server._vector_capacity_status["message"]


 def test_capacity_status_quiet_for_empty_palace(tmp_path):
@@ -372,6 +398,17 @@ def _seed_drawers(palace: str, segment_id: str, drawers: list[tuple[str, dict, s
        conn.close()


+def _set_drawer_created_at(palace: str, timestamps: dict[int, str]) -> None:
+    db_path = os.path.join(palace, "chroma.sqlite3")
+    conn = sqlite3.connect(db_path)
+    try:
+        for emb_id, created_at in timestamps.items():
+            conn.execute("UPDATE embeddings SET created_at = ? WHERE id = ?", (created_at, emb_id))
+        conn.commit()
+    finally:
+        conn.close()
+
+
@pytest.fixture
 def palace_with_drawers(tmp_path):
    seg = "seg-bm25"
@@ -417,6 +454,122 @@ def test_bm25_fallback_filters_by_wing(palace_with_drawers):
    assert all(r["wing"] == "design" for r in out["results"])


+def test_bm25_fallback_applies_wing_before_fts_candidate_limit(tmp_path):
+    seg = "seg-bm25-fts-limit"
+    _seed_chroma_db(str(tmp_path), sqlite_count=0, segment_id=seg)
+    _seed_drawers(
+        str(tmp_path),
+        seg,
+        [
+            (
+                "shared token outside target wing",
+                {"wing": "ops", "room": "incidents", "source_file": "/x/ops.md"},
+                "d-1",
+            ),
+            (
+                "shared token inside target wing",
+                {"wing": "project", "room": "diary", "source_file": "/x/project.md"},
+                "d-2",
+            ),
+        ],
+    )
+
+    out = _bm25_only_via_sqlite("shared token", str(tmp_path), wing="project", max_candidates=1)
+
+    assert out["total_before_filter"] == 1
+    assert len(out["results"]) == 1
+    assert out["results"][0]["wing"] == "project"
+
+
+def test_bm25_fallback_applies_room_before_fts_candidate_limit(tmp_path):
+    seg = "seg-bm25-room-limit"
+    _seed_chroma_db(str(tmp_path), sqlite_count=0, segment_id=seg)
+    _seed_drawers(
+        str(tmp_path),
+        seg,
+        [
+            (
+                "shared token wrong room",
+                {"wing": "project", "room": "scratch", "source_file": "/x/scratch.md"},
+                "d-1",
+            ),
+            (
+                "shared token right room",
+                {"wing": "project", "room": "diary", "source_file": "/x/diary.md"},
+                "d-2",
+            ),
+        ],
+    )
+
+    out = _bm25_only_via_sqlite(
+        "shared token",
+        str(tmp_path),
+        wing="project",
+        room="diary",
+        max_candidates=1,
+    )
+
+    assert out["total_before_filter"] == 1
+    assert len(out["results"]) == 1
+    assert out["results"][0]["wing"] == "project"
+    assert out["results"][0]["room"] == "diary"
+
+
+def test_bm25_fallback_applies_wing_before_recency_candidate_limit(tmp_path):
+    seg = "seg-bm25-recency-limit"
+    _seed_chroma_db(str(tmp_path), sqlite_count=0, segment_id=seg)
+    _seed_drawers(
+        str(tmp_path),
+        seg,
+        [
+            (
+                "target drawer for short query",
+                {"wing": "project", "room": "diary", "source_file": "/x/project.md"},
+                "d-1",
+            ),
+            (
+                "newer drawer outside target wing",
+                {"wing": "ops", "room": "incidents", "source_file": "/x/ops.md"},
+                "d-2",
+            ),
+        ],
+    )
+    _set_drawer_created_at(
+        str(tmp_path),
+        {
+            1: "2026-01-01 00:00:00",
+            2: "2026-02-01 00:00:00",
+        },
+    )
+
+    out = _bm25_only_via_sqlite("a", str(tmp_path), wing="project", max_candidates=1)
+
+    assert out["total_before_filter"] == 1
+    assert len(out["results"]) == 1
+    assert out["results"][0]["wing"] == "project"
+
+
+def test_bm25_fallback_returns_empty_when_filtered_wing_has_no_candidates(tmp_path):
+    seg = "seg-bm25-empty-filter"
+    _seed_chroma_db(str(tmp_path), sqlite_count=0, segment_id=seg)
+    _seed_drawers(
+        str(tmp_path),
+        seg,
+        [
+            (
+                "shared token outside target wing",
+                {"wing": "ops", "room": "incidents", "source_file": "/x/ops.md"},
+                "d-1",
+            ),
+        ],
+    )
+
+    out = _bm25_only_via_sqlite("shared token", str(tmp_path), wing="project", max_candidates=1)
+
+    assert out["total_before_filter"] == 0
+    assert out["results"] == []
+
+
 def test_bm25_fallback_no_palace(tmp_path):
    out = _bm25_only_via_sqlite("anything", str(tmp_path))
    assert "error" in out
@@ -473,6 +626,7 @@ def test_tool_status_via_sqlite_returns_breakdown(palace_with_drawers, monkeypat
    # MempalaceConfig.
    class _Cfg:
        palace_path = str(palace_with_drawers)
+        collection_name = "mempalace_drawers"

    monkeypatch.setattr(mcp_server, "_config", _Cfg())
    monkeypatch.setattr(mcp_server, "_vector_disabled", True)
@@ -0,0 +1,113 @@
+import os
+from pathlib import Path
+
+from mempalace.backends.chroma import (
+    _HNSW_LINK_TO_DATA_MAX_RATIO,
+    _hnsw_link_to_data_ratio,
+    _segment_appears_healthy,
+    quarantine_stale_hnsw,
+)
+
+
+def _write_segment(
+    seg_dir: Path,
+    *,
+    data_size: int = 100,
+    link_size: int = 100,
+    write_metadata: bool = True,
+) -> None:
+    seg_dir.mkdir(parents=True, exist_ok=True)
+    (seg_dir / "data_level0.bin").write_bytes(b"\0" * data_size)
+    (seg_dir / "link_lists.bin").write_bytes(b"\0" * link_size)
+
+    if write_metadata:
+        # Enough bytes to pass the existing pickle envelope sniff-test:
+        # starts with pickle protocol marker 0x80 and ends with STOP 0x2e.
+        (seg_dir / "index_metadata.pickle").write_bytes(b"\x80" + b"x" * 16 + b"\x2e")
+
+
+def test_hnsw_link_to_data_ratio_reports_payload_size_ratio(tmp_path):
+    seg_dir = tmp_path / "11111111-2222-3333-4444-555555555555"
+    _write_segment(seg_dir, data_size=100, link_size=250)
+
+    assert _hnsw_link_to_data_ratio(str(seg_dir)) == 2.5
+
+
+def test_segment_health_rejects_exploded_link_lists_even_with_valid_pickle(tmp_path):
+    seg_dir = tmp_path / "11111111-2222-3333-4444-555555555555"
+    _write_segment(
+        seg_dir,
+        data_size=100,
+        link_size=int(100 * (_HNSW_LINK_TO_DATA_MAX_RATIO + 1)),
+        write_metadata=True,
+    )
+
+    assert not _segment_appears_healthy(str(seg_dir))
+
+
+def test_segment_health_keeps_reasonable_payload_with_valid_pickle(tmp_path):
+    seg_dir = tmp_path / "11111111-2222-3333-4444-555555555555"
+    _write_segment(
+        seg_dir,
+        data_size=100,
+        link_size=int(100 * _HNSW_LINK_TO_DATA_MAX_RATIO),
+        write_metadata=True,
+    )
+
+    assert _segment_appears_healthy(str(seg_dir))
+
+
+def test_quarantine_catches_link_bloat_without_mtime_drift(tmp_path):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+
+    db_path = palace / "chroma.sqlite3"
+    db_path.write_text("sqlite placeholder")
+
+    seg_dir = palace / "11111111-2222-3333-4444-555555555555"
+    _write_segment(
+        seg_dir,
+        data_size=100,
+        link_size=int(100 * (_HNSW_LINK_TO_DATA_MAX_RATIO + 1)),
+        write_metadata=True,
+    )
+
+    # Make sqlite and HNSW mtimes identical. The old mtime-only gate would
+    # skip this segment even though the payload is structurally corrupt.
+    same_time = 1_700_000_000
+    os.utime(db_path, (same_time, same_time))
+    os.utime(seg_dir / "data_level0.bin", (same_time, same_time))
+
+    moved = quarantine_stale_hnsw(str(palace), stale_seconds=999_999)
+
+    assert len(moved) == 1
+    assert not seg_dir.exists()
+
+    moved_path = Path(moved[0])
+    assert moved_path.exists()
+    assert moved_path.name.startswith("11111111-2222-3333-4444-555555555555.drift-")
+
+
+def test_quarantine_leaves_reasonable_payload_in_place(tmp_path):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+
+    db_path = palace / "chroma.sqlite3"
+    db_path.write_text("sqlite placeholder")
+
+    seg_dir = palace / "11111111-2222-3333-4444-555555555555"
+    _write_segment(
+        seg_dir,
+        data_size=100,
+        link_size=100,
+        write_metadata=True,
+    )
+
+    same_time = 1_700_000_000
+    os.utime(db_path, (same_time, same_time))
+    os.utime(seg_dir / "data_level0.bin", (same_time, same_time))
+
+    moved = quarantine_stale_hnsw(str(palace), stale_seconds=999_999)
+
+    assert moved == []
+    assert seg_dir.exists()
@@ -8,6 +8,7 @@ from unittest.mock import MagicMock, patch

 import pytest

+import mempalace.hooks_cli as hooks_cli_mod
 from mempalace.hooks_cli import (
    SAVE_INTERVAL,
    _count_human_messages,
@@ -959,3 +960,108 @@ def test_stop_hook_rejects_injected_stop_hook_active(tmp_path):
    # The injected value is not "true"/"1"/"yes", so the hook should NOT pass through.
    # Save must have been attempted.
    assert mock_save.called
+
+
+# --- Absent palace root: hooks must not recreate ~/.mempalace ---
+#
+# When the user removes ~/.mempalace (e.g. `rm -rf`), that is the strongest
+# possible "do not auto-capture" signal. Hooks must short-circuit BEFORE
+# touching disk — including before the log-line that previously triggered
+# STATE_DIR.mkdir() on its own.
+
+
+def _redirect_palace_root(monkeypatch, tmp_path):
+    """Point PALACE_ROOT and STATE_DIR at a tmp location that does NOT exist."""
+    fake_root = tmp_path / "absent-mempalace"
+    monkeypatch.setattr(hooks_cli_mod, "PALACE_ROOT", fake_root)
+    monkeypatch.setattr(hooks_cli_mod, "STATE_DIR", fake_root / "hook_state")
+    monkeypatch.setattr(hooks_cli_mod, "_state_dir_initialized", False)
+    return fake_root
+
+
+def test_hook_stop_does_not_create_palace_dir_when_absent(tmp_path, monkeypatch):
+    fake_root = _redirect_palace_root(monkeypatch, tmp_path)
+    transcript = tmp_path / "t.jsonl"
+    transcript.write_text("")
+    buf = io.StringIO()
+    with contextlib.redirect_stdout(buf):
+        hook_stop(
+            {"session_id": "absent", "transcript_path": str(transcript), "stop_hook_active": False},
+            "claude-code",
+        )
+    assert json.loads(buf.getvalue() or "{}") == {}
+    assert not fake_root.exists()
+
+
+def test_hook_precompact_does_not_create_palace_dir_when_absent(tmp_path, monkeypatch):
+    fake_root = _redirect_palace_root(monkeypatch, tmp_path)
+    transcript = tmp_path / "t.jsonl"
+    transcript.write_text("")
+    buf = io.StringIO()
+    with contextlib.redirect_stdout(buf):
+        hook_precompact(
+            {"session_id": "absent", "transcript_path": str(transcript)},
+            "claude-code",
+        )
+    assert json.loads(buf.getvalue() or "{}") == {}
+    assert not fake_root.exists()
+
+
+def test_hook_session_start_does_not_create_palace_dir_when_absent(tmp_path, monkeypatch):
+    fake_root = _redirect_palace_root(monkeypatch, tmp_path)
+    buf = io.StringIO()
+    with contextlib.redirect_stdout(buf):
+        hook_session_start({"session_id": "absent"}, "claude-code")
+    assert json.loads(buf.getvalue() or "{}") == {}
+    assert not fake_root.exists()
+
+
+def test_log_does_not_create_palace_dir_when_absent(tmp_path, monkeypatch):
+    fake_root = _redirect_palace_root(monkeypatch, tmp_path)
+    _log("test message")
+    assert not fake_root.exists()
+
+
+def test_existing_dir_proceeds_normally(tmp_path, monkeypatch):
+    """Regression: when PALACE_ROOT exists, hooks must proceed (no short-circuit)."""
+    fake_root = tmp_path / "present-mempalace"
+    fake_root.mkdir()
+    monkeypatch.setattr(hooks_cli_mod, "PALACE_ROOT", fake_root)
+    monkeypatch.setattr(hooks_cli_mod, "STATE_DIR", fake_root / "hook_state")
+    monkeypatch.setattr(hooks_cli_mod, "_state_dir_initialized", False)
+    _log("test message")
+    # _log should have created the state dir under the existing palace root
+    assert (fake_root / "hook_state").exists()
+    assert (fake_root / "hook_state" / "hook.log").is_file()
+
+
+def test_regular_file_at_palace_root_treated_as_absent(tmp_path, monkeypatch):
+    """A regular file at ~/.mempalace must be treated the same as absent.
+
+    ``Path.exists()`` returns True for a regular file, which would let the
+    kill-switch be bypassed and crash later when ``STATE_DIR.mkdir()`` runs
+    on ``NotADirectoryError``. ``_palace_root_exists()`` must use
+    ``is_dir()`` so a stray file (or broken symlink) short-circuits cleanly.
+    """
+    fake_root = tmp_path / "file-not-dir"
+    fake_root.write_text("oops, this is a file not a directory")
+    monkeypatch.setattr(hooks_cli_mod, "PALACE_ROOT", fake_root)
+    monkeypatch.setattr(hooks_cli_mod, "STATE_DIR", fake_root / "hook_state")
+    monkeypatch.setattr(hooks_cli_mod, "_state_dir_initialized", False)
+
+    # _palace_root_exists() is the source of truth — it must return False.
+    assert hooks_cli_mod._palace_root_exists() is False
+
+    # Hooks must short-circuit (return {} on stdout) and not touch disk.
+    buf = io.StringIO()
+    with contextlib.redirect_stdout(buf):
+        hook_session_start({"session_id": "file-at-root"}, "claude-code")
+    assert json.loads(buf.getvalue() or "{}") == {}
+
+    # _log must also short-circuit — it must NOT try to mkdir a path under a
+    # regular file (which would raise NotADirectoryError).
+    _log("test message")  # would raise if not short-circuited
+
+    # The stray file is left untouched; we never try to convert it.
+    assert fake_root.is_file()
+    assert fake_root.read_text() == "oops, this is a file not a directory"
@@ -5,6 +5,8 @@ Covers: entity CRUD, triple CRUD, temporal queries, invalidation,
 timeline, stats, and edge cases (duplicate triples, ID collisions).
 """

+import pytest
+

 class TestEntityOperations:
    def test_add_entity(self, kg):
@@ -45,6 +47,38 @@ class TestTripleOperations:
        tid2 = kg.add_triple("Alice", "works_at", "Acme")
        assert tid1 != tid2  # new triple since old one was closed

+    def test_add_triple_rejects_inverted_interval(self, kg):
+        # valid_to before valid_from would never satisfy
+        # `valid_from <= as_of AND valid_to >= as_of` — silently invisible
+        # to every query. Reject at write time instead.
+        with pytest.raises(ValueError, match="before valid_from"):
+            kg.add_triple(
+                "Alice",
+                "worked_at",
+                "Acme",
+                valid_from="2026-03-01",
+                valid_to="2026-02-01",
+            )
+
+    def test_add_triple_accepts_equal_dates(self, kg):
+        # Same-day intervals are valid (point-in-time facts).
+        tid = kg.add_triple(
+            "Alice",
+            "joined",
+            "Acme",
+            valid_from="2026-03-15",
+            valid_to="2026-03-15",
+        )
+        assert tid.startswith("t_alice_joined_acme_")
+
+    def test_add_triple_allows_only_one_bound(self, kg):
+        # The guard only fires when BOTH bounds are set.
+        tid1 = kg.add_triple("Alice", "knows", "Bob", valid_from="2026-01-01")
+        assert tid1.startswith("t_alice_knows_bob_")
+        kg.invalidate("Alice", "knows", "Bob", ended="2026-02-01")
+        tid2 = kg.add_triple("Alice", "knew", "Bob", valid_to="2026-03-01")
+        assert tid2.startswith("t_alice_knew_bob_")
+

 class TestQueries:
    def test_query_outgoing(self, seeded_kg):
@@ -655,3 +655,72 @@ def test_memory_stack_status_with_palace(tmp_path):

    assert result["total_drawers"] == 42
    assert result["L0_identity"]["exists"] is True
+
+
+# ── Layer1 / Layer2 None-metadata guards ───────────────────────────────
+#
+# Chroma 1.5.x can return ``None`` inside the ``metadatas`` / ``documents``
+# lists for partially-flushed rows. The Layer1.generate() and
+# Layer2.retrieve() loops previously called ``meta.get(...)`` without
+# coercing, raising ``AttributeError: 'NoneType' object has no attribute
+# 'get'`` and blowing up the whole wake-up render. These tests guard that
+# the loops tolerate the None entries and render the rest of the result.
+
+
+def test_layer1_handles_none_metadata():
+    """Layer1.generate tolerates None entries in the metadatas list."""
+    docs = ["important memory", "another memory"]
+    metas = [{"room": "decisions", "source_file": "a.txt"}, None]
+    mock_col = _mock_chromadb_for_layer(docs, metas)
+
+    with (
+        patch("mempalace.layers.MempalaceConfig") as mock_cfg,
+        patch("mempalace.layers._get_collection", return_value=mock_col),
+    ):
+        mock_cfg.return_value.palace_path = "/fake"
+        layer = Layer1(palace_path="/fake")
+        # Should not raise AttributeError on the None entry.
+        result = layer.generate()
+
+    assert "ESSENTIAL STORY" in result
+    assert "important memory" in result
+
+
+def test_layer1_handles_none_document():
+    """Layer1.generate tolerates None entries in the documents list."""
+    docs = ["first doc", None]
+    metas = [
+        {"room": "r", "source_file": "a.txt"},
+        {"room": "r", "source_file": "b.txt"},
+    ]
+    mock_col = _mock_chromadb_for_layer(docs, metas)
+
+    with (
+        patch("mempalace.layers.MempalaceConfig") as mock_cfg,
+        patch("mempalace.layers._get_collection", return_value=mock_col),
+    ):
+        mock_cfg.return_value.palace_path = "/fake"
+        layer = Layer1(palace_path="/fake")
+        result = layer.generate()
+
+    assert result  # Render succeeded despite the None document.
+
+
+def test_layer2_handles_none_metadata():
+    """Layer2.retrieve tolerates None entries in the metadatas list."""
+    mock_col = MagicMock()
+    mock_col.get.return_value = {
+        "documents": ["first doc", "second doc"],
+        "metadatas": [{"room": "r", "source_file": "a.txt"}, None],
+    }
+
+    with (
+        patch("mempalace.layers.MempalaceConfig") as mock_cfg,
+        patch("mempalace.layers._get_collection", return_value=mock_col),
+    ):
+        mock_cfg.return_value.palace_path = "/fake"
+        layer = Layer2(palace_path="/fake")
+        # Should not raise AttributeError on the None entry.
+        result = layer.retrieve()
+
+    assert "L2 — ON-DEMAND" in result
@@ -8,7 +8,9 @@ via monkeypatch to avoid touching real data.

 from datetime import datetime
 import json
+import os
 import sys
+from unittest.mock import MagicMock

 import pytest

@@ -18,7 +20,7 @@ def _patch_mcp_server(monkeypatch, config, kg):
    from mempalace import mcp_server

    monkeypatch.setattr(mcp_server, "_config", config)
-    monkeypatch.setattr(mcp_server, "_kg", kg)
+    monkeypatch.setattr(mcp_server, "_get_kg", lambda: kg)


 def _get_collection(palace_path, create=False):
@@ -146,6 +148,20 @@ class TestHandleRequest:
        )
        assert resp["error"]["code"] == -32601

+    def test_tools_call_missing_params(self):
+        from mempalace.mcp_server import handle_request
+
+        for bad_params in [None, {}, {"arguments": {}}]:
+            resp = handle_request(
+                {
+                    "method": "tools/call",
+                    "id": 15,
+                    "params": bad_params,
+                }
+            )
+            assert resp["error"]["code"] == -32602
+            assert "Invalid params" in resp["error"]["message"]
+
    def test_unknown_method(self):
        from mempalace.mcp_server import handle_request

@@ -188,6 +204,17 @@ class TestHandleRequest:
        resp = handle_request({"method": None, "id": 99, "params": {}})
        assert resp["error"]["code"] == -32601

+    @pytest.mark.parametrize("payload", [None, [], "plain", 42, True])
+    def test_handle_request_invalid_payload_returns_jsonrpc_error(self, payload):
+        from mempalace.mcp_server import handle_request
+
+        resp = handle_request(payload)
+        assert resp == {
+            "jsonrpc": "2.0",
+            "id": None,
+            "error": {"code": -32600, "message": "Invalid Request"},
+        }
+
    def test_tools_call_dispatches(self, monkeypatch, config, palace_path, seeded_kg):
        _patch_mcp_server(monkeypatch, config, seeded_kg)
        from mempalace.mcp_server import handle_request
@@ -457,6 +484,26 @@ class TestWriteTools:
        assert result2["success"] is True
        assert result2["reason"] == "already_exists"

+    def test_add_drawer_fails_when_readback_misses(self, monkeypatch, config, kg):
+        _patch_mcp_server(monkeypatch, config, kg)
+        from mempalace import mcp_server
+
+        class _FakeGetResult:
+            ids = []
+
+        class _FakeCol:
+            def get(self, **kwargs):
+                return _FakeGetResult()
+
+            def upsert(self, **kwargs):
+                return None
+
+        monkeypatch.setattr(mcp_server, "_get_collection", lambda create=False: _FakeCol())
+
+        result = mcp_server.tool_add_drawer("w", "r", "content")
+        assert result["success"] is False
+        assert "not readable" in result["error"]
+
    def test_add_drawer_shared_header_no_collision(self, monkeypatch, config, palace_path, kg):
        """Documents sharing a >100-char header must get distinct IDs (full-content hash)."""
        _patch_mcp_server(monkeypatch, config, kg)
@@ -495,6 +542,41 @@ class TestWriteTools:
        result = tool_delete_drawer("nonexistent_drawer")
        assert result["success"] is False

+    def test_check_duplicate_handles_none_metadata(self, monkeypatch, config, kg):
+        """tool_check_duplicate must tolerate None entries in the result lists
+        that ChromaDB 1.5.x returns for partially-flushed rows.
+
+        Previously ``meta = results["metadatas"][0][i]`` was unguarded and
+        raised ``AttributeError: 'NoneType' object has no attribute 'get'``
+        the moment the first matching drawer came back with None metadata —
+        surfacing to the MCP client as the uninformative
+        ``"Duplicate check failed"`` because the broad ``except Exception``
+        wrapper swallows the real cause.
+        """
+        _patch_mcp_server(monkeypatch, config, kg)
+        from mempalace import mcp_server
+
+        mock_col = MagicMock()
+        mock_col.query.return_value = {
+            "ids": [["d1", "d2"]],
+            "distances": [[0.05, 0.05]],
+            "metadatas": [[{"wing": "w", "room": "r"}, None]],
+            "documents": [["first doc", None]],
+        }
+        monkeypatch.setattr(mcp_server, "_get_collection", lambda: mock_col)
+
+        result = mcp_server.tool_check_duplicate("any content", threshold=0.5)
+
+        # Both entries land in matches (above threshold), None ones rendered
+        # with sentinel values rather than crashing the whole response.
+        assert result.get("is_duplicate") is True
+        assert len(result["matches"]) == 2
+        # The None-metadata entry falls back to sentinels.
+        none_entry = result["matches"][1]
+        assert none_entry["wing"] == "?"
+        assert none_entry["room"] == "?"
+        assert none_entry["content"] == ""
+
    def test_check_duplicate(self, monkeypatch, config, palace_path, seeded_collection, kg):
        _patch_mcp_server(monkeypatch, config, kg)
        from mempalace.mcp_server import tool_check_duplicate
@@ -788,6 +870,59 @@ class TestKGTools:
        result = tool_kg_stats()
        assert result["entities"] >= 4

+    # --- Date validation at the MCP boundary (issue #1164) ---
+
+    def test_kg_add_rejects_invalid_valid_from(self, monkeypatch, config, palace_path, kg):
+        _patch_mcp_server(monkeypatch, config, kg)
+        from mempalace.mcp_server import tool_kg_add
+
+        result = tool_kg_add(
+            subject="Alice",
+            predicate="likes",
+            object="coffee",
+            valid_from="Jan 2025",
+        )
+        assert result["success"] is False
+        assert "valid_from" in result["error"]
+        assert "ISO-8601" in result["error"]
+
+    def test_kg_query_rejects_invalid_as_of(self, monkeypatch, config, palace_path, seeded_kg):
+        _patch_mcp_server(monkeypatch, config, seeded_kg)
+        from mempalace.mcp_server import tool_kg_query
+
+        result = tool_kg_query(entity="Max", as_of="March 2026")
+        assert "error" in result
+        assert "as_of" in result["error"]
+
+    def test_kg_invalidate_rejects_invalid_ended(self, monkeypatch, config, palace_path, seeded_kg):
+        _patch_mcp_server(monkeypatch, config, seeded_kg)
+        from mempalace.mcp_server import tool_kg_invalidate
+
+        result = tool_kg_invalidate(
+            subject="Max",
+            predicate="does",
+            object="chess",
+            ended="yesterday",
+        )
+        assert result["success"] is False
+        assert "ended" in result["error"]
+
+    def test_kg_query_rejects_partial_iso_dates(self, monkeypatch, config, palace_path, seeded_kg):
+        _patch_mcp_server(monkeypatch, config, seeded_kg)
+        from mempalace.mcp_server import tool_kg_query
+
+        # Partial ISO dates are rejected: KG queries compare TEXT dates
+        # lexicographically, so "2026-01-01" <= "2026" is False, which
+        # silently excludes facts. Reject at the boundary — only YYYY-MM-DD
+        # produces correct results.
+        for value in ("2026", "2026-03"):
+            result = tool_kg_query(entity="Max", as_of=value)
+            assert "error" in result, f"accepted partial date {value!r}: {result}"
+
+        # Full ISO-8601 dates still pass.
+        result = tool_kg_query(entity="Max", as_of="2026-03-15")
+        assert "error" not in result, f"rejected valid date: {result}"
+

 # ── Diary Tools ─────────────────────────────────────────────────────────

@@ -1043,6 +1178,25 @@ class TestCacheInvalidation:
        assert "Reconnected" in result["message"]
        assert isinstance(result["drawers"], int)

+    def test_reconnect_closes_shared_backend(self, monkeypatch, config, kg):
+        _patch_mcp_server(monkeypatch, config, kg)
+        from unittest.mock import MagicMock
+
+        from mempalace import mcp_server, palace
+
+        close_palace = MagicMock()
+        monkeypatch.setattr(palace._DEFAULT_BACKEND, "close_palace", close_palace)
+
+        class _FakeCol:
+            def count(self):
+                return 7
+
+        monkeypatch.setattr(mcp_server, "_get_collection", lambda create=False: _FakeCol())
+
+        result = mcp_server.tool_reconnect()
+        assert result["success"] is True
+        close_palace.assert_called_once_with(config.palace_path)
+
    def test_get_collection_create_true_avoids_get_or_create_on_reopen(
        self, monkeypatch, config, palace_path, kg
    ):
@@ -1143,3 +1297,296 @@ class TestCacheInvalidation:
        for kwargs in captured["get"]:
            assert "embedding_function" in kwargs
            assert kwargs["embedding_function"] is not None
+
+    def test_get_collection_retries_once_on_exception(self, monkeypatch, config, palace_path, kg):
+        """Regression: a transient failure inside _get_collection must trigger
+        one retry after clearing the client/collection caches, not silently
+        return None.
+
+        Before this fix, a stale chromadb handle (e.g. the rust bindings
+        invalidating after an out-of-band write) would raise inside the
+        single ``try`` block, get swallowed by ``except Exception: return
+        None``, and every subsequent tool call would hit the same poisoned
+        cache returning None. The retry forces ``_get_client()`` to rebuild
+        the client (which re-runs ``quarantine_stale_hnsw`` per #1322), so
+        the second attempt heals the common stale-handle case.
+        """
+        _patch_mcp_server(monkeypatch, config, kg)
+        _client, _col = _get_collection(palace_path, create=True)
+        del _client
+        from mempalace import mcp_server
+
+        # Force a cold cache so the first call goes through the open path.
+        mcp_server._client_cache = None
+        mcp_server._collection_cache = None
+
+        real_get_client = mcp_server._get_client
+        attempts = {"count": 0}
+
+        def flaky_get_client():
+            attempts["count"] += 1
+            if attempts["count"] == 1:
+                raise RuntimeError("simulated transient chromadb failure")
+            return real_get_client()
+
+        monkeypatch.setattr(mcp_server, "_get_client", flaky_get_client)
+
+        col = mcp_server._get_collection()
+
+        # Both attempts ran and the second succeeded.
+        assert attempts["count"] == 2
+        assert col is not None
+
+    def test_get_collection_returns_none_after_two_failures(
+        self, monkeypatch, config, palace_path, kg
+    ):
+        """If both attempts fail, return None (matches the prior contract for
+        permanent failures — only the transient case is now self-healing)."""
+        _patch_mcp_server(monkeypatch, config, kg)
+        _client, _col = _get_collection(palace_path, create=True)
+        del _client
+        from mempalace import mcp_server
+
+        mcp_server._client_cache = None
+        mcp_server._collection_cache = None
+
+        attempts = {"count": 0}
+
+        def always_fails():
+            attempts["count"] += 1
+            raise RuntimeError("permanent chromadb failure")
+
+        monkeypatch.setattr(mcp_server, "_get_client", always_fails)
+
+        col = mcp_server._get_collection()
+
+        assert attempts["count"] == 2
+        assert col is None
+
+
+class TestKGLazyCache:
+    """Lazy per-path KnowledgeGraph cache (issue #1136)."""
+
+    def test_lazy_init_no_import_side_effect(self, tmp_path):
+        """Importing mcp_server must not create knowledge_graph.sqlite3.
+
+        Runs in a fresh subprocess with HOME pointed at tmp_path so the
+        assertion targets a clean filesystem, independent of conftest's
+        session-level HOME patch.
+        """
+        import subprocess
+        import sys
+
+        kg_file = tmp_path / ".mempalace" / "knowledge_graph.sqlite3"
+        env = {k: v for k, v in os.environ.items() if not k.startswith("MEMPAL")}
+        env["HOME"] = str(tmp_path)
+        env["USERPROFILE"] = str(tmp_path)
+        result = subprocess.run(
+            [sys.executable, "-c", "import mempalace.mcp_server"],
+            env=env,
+            capture_output=True,
+            text=True,
+            timeout=30,
+        )
+        assert result.returncode == 0, f"import failed: {result.stderr}"
+        assert not kg_file.exists(), f"import created sqlite file at {kg_file} as a side effect"
+
+    def test_get_kg_returns_same_instance(self, tmp_path, monkeypatch):
+        """Two calls with the same resolved path return the same KG."""
+        from mempalace import mcp_server
+
+        monkeypatch.setattr(mcp_server, "_kg_by_path", {})
+        monkeypatch.setattr(mcp_server, "_palace_flag_given", True)
+        monkeypatch.setenv("MEMPALACE_PALACE_PATH", str(tmp_path))
+
+        kg1 = mcp_server._get_kg()
+        kg2 = mcp_server._get_kg()
+        assert kg1 is kg2
+        assert len(mcp_server._kg_by_path) == 1
+
+    def test_get_kg_different_paths_different_instances(self, tmp_path, monkeypatch):
+        """Different palace paths map to different KG instances."""
+        from mempalace import mcp_server
+
+        tmp_a = tmp_path / "a"
+        tmp_b = tmp_path / "b"
+        tmp_a.mkdir()
+        tmp_b.mkdir()
+
+        monkeypatch.setattr(mcp_server, "_kg_by_path", {})
+        monkeypatch.setattr(mcp_server, "_palace_flag_given", True)
+
+        monkeypatch.setenv("MEMPALACE_PALACE_PATH", str(tmp_a))
+        kg_a = mcp_server._get_kg()
+        monkeypatch.setenv("MEMPALACE_PALACE_PATH", str(tmp_b))
+        kg_b = mcp_server._get_kg()
+
+        assert kg_a is not kg_b
+        assert len(mcp_server._kg_by_path) == 2
+
+    def test_multi_tenant_env_switch(self, tmp_path, monkeypatch):
+        """The issue #1136 acceptance scenario.
+
+        Rotating MEMPALACE_PALACE_PATH between MCP tool calls must route
+        each call to the correct tenant's KG sqlite file.
+        """
+        from mempalace import mcp_server
+
+        tmp_a = tmp_path / "tenant_a"
+        tmp_b = tmp_path / "tenant_b"
+        tmp_a.mkdir()
+        tmp_b.mkdir()
+
+        monkeypatch.setattr(mcp_server, "_kg_by_path", {})
+        monkeypatch.setattr(mcp_server, "_palace_flag_given", True)
+
+        monkeypatch.setenv("MEMPALACE_PALACE_PATH", str(tmp_a))
+        add_result = mcp_server.tool_kg_add(
+            subject="alice_secret",
+            predicate="owns",
+            object="repo_a",
+        )
+        assert add_result.get("success") is True, add_result
+
+        monkeypatch.setenv("MEMPALACE_PALACE_PATH", str(tmp_b))
+        query_b = mcp_server.tool_kg_query(entity="alice_secret")
+        assert query_b.get("count", 0) == 0, f"tenant B leaked tenant A's fact: {query_b}"
+
+        monkeypatch.setenv("MEMPALACE_PALACE_PATH", str(tmp_a))
+        query_a = mcp_server.tool_kg_query(entity="alice_secret")
+        assert query_a.get("count", 0) >= 1, f"tenant A lost its own fact: {query_a}"
+
+    def test_cache_thread_safe(self, tmp_path, monkeypatch):
+        """Concurrent _get_kg() for the same path yields one instance."""
+        import concurrent.futures
+        from mempalace import mcp_server
+
+        monkeypatch.setattr(mcp_server, "_kg_by_path", {})
+        monkeypatch.setattr(mcp_server, "_palace_flag_given", True)
+        monkeypatch.setenv("MEMPALACE_PALACE_PATH", str(tmp_path))
+
+        with concurrent.futures.ThreadPoolExecutor(max_workers=16) as pool:
+            results = list(pool.map(lambda _: mcp_server._get_kg(), range(16)))
+
+        ids = {id(kg) for kg in results}
+        assert len(ids) == 1, f"expected 1 unique instance, got {len(ids)}"
+        assert len(mcp_server._kg_by_path) == 1
+
+    def test_tool_reconnect_drains_kg_cache(self, monkeypatch):
+        """``tool_reconnect`` must close cached KG instances and clear the dict.
+
+        Without this, an external replacement of ``knowledge_graph.sqlite3``
+        leaves the server pinned to a stale ``sqlite3.Connection``.
+        """
+        from mempalace import mcp_server
+
+        class _FakeKG:
+            def __init__(self):
+                self.closed = False
+
+            def close(self):
+                self.closed = True
+
+        fake_a = _FakeKG()
+        fake_b = _FakeKG()
+        monkeypatch.setattr(mcp_server, "_kg_by_path", {"/a": fake_a, "/b": fake_b})
+        # Bypass real ChromaDB so the test isolates KG-cache behaviour.
+        monkeypatch.setattr(mcp_server, "_get_collection", lambda: None)
+
+        mcp_server.tool_reconnect()
+
+        assert fake_a.closed is True
+        assert fake_b.closed is True
+        assert mcp_server._kg_by_path == {}
+
+    def test_tool_reconnect_swallows_kg_close_errors(self, monkeypatch):
+        """A failing ``close()`` on one cached KG must not block cache clearing."""
+        from mempalace import mcp_server
+
+        class _BoomKG:
+            def close(self):
+                raise RuntimeError("boom")
+
+        monkeypatch.setattr(mcp_server, "_kg_by_path", {"/a": _BoomKG()})
+        monkeypatch.setattr(mcp_server, "_get_collection", lambda: None)
+
+        mcp_server.tool_reconnect()
+
+        assert mcp_server._kg_by_path == {}
+
+    def test_call_kg_retries_after_concurrent_close(self, monkeypatch):
+        """A KG closed mid-handler must trigger a one-shot retry with a fresh
+        instance — not surface a -32000 to the MCP client."""
+        import sqlite3 as _sqlite3
+
+        from mempalace import mcp_server
+
+        path = "/fake/palace/knowledge_graph.sqlite3"
+        monkeypatch.setattr(mcp_server, "_resolve_kg_path", lambda: path)
+
+        class _ClosedKG:
+            def query_entity(self, entity, **kwargs):
+                raise _sqlite3.ProgrammingError("Cannot operate on a closed database")
+
+        class _FreshKG:
+            def query_entity(self, entity, **kwargs):
+                return [{"entity": entity}]
+
+        cache = {os.path.abspath(path): _ClosedKG()}
+        monkeypatch.setattr(mcp_server, "_kg_by_path", cache)
+
+        # Second _get_kg() call (after the cache eviction) constructs a new
+        # KG. Patch the constructor so we don't open a real sqlite file.
+        monkeypatch.setattr(mcp_server, "KnowledgeGraph", lambda **_: _FreshKG())
+
+        result = mcp_server._call_kg(lambda kg: kg.query_entity("Alice"))
+        assert result == [{"entity": "Alice"}]
+        # The closed instance must be evicted; the fresh one must be cached.
+        assert isinstance(cache[os.path.abspath(path)], _FreshKG)
+
+    def test_call_kg_does_not_retry_on_other_errors(self, monkeypatch):
+        """Non-ProgrammingError exceptions must propagate without retry —
+        we don't want the retry guard masking real bugs."""
+        from mempalace import mcp_server
+
+        path = "/fake/palace/knowledge_graph.sqlite3"
+        monkeypatch.setattr(mcp_server, "_resolve_kg_path", lambda: path)
+
+        calls = {"count": 0}
+
+        class _FailingKG:
+            def query_entity(self, entity, **kwargs):
+                calls["count"] += 1
+                raise ValueError("bad input")
+
+        monkeypatch.setattr(mcp_server, "_kg_by_path", {os.path.abspath(path): _FailingKG()})
+        monkeypatch.setattr(mcp_server, "KnowledgeGraph", lambda **_: _FailingKG())
+
+        with pytest.raises(ValueError, match="bad input"):
+            mcp_server._call_kg(lambda kg: kg.query_entity("Alice"))
+        assert calls["count"] == 1, "non-ProgrammingError must not trigger retry"
+
+    def test_call_kg_gives_up_after_one_retry(self, monkeypatch):
+        """If the second attempt also hits a closed DB, give up rather than
+        loop forever — a sustained close-stream is a different bug."""
+        import sqlite3 as _sqlite3
+
+        from mempalace import mcp_server
+
+        path = "/fake/palace/knowledge_graph.sqlite3"
+        monkeypatch.setattr(mcp_server, "_resolve_kg_path", lambda: path)
+
+        calls = {"count": 0}
+
+        class _AlwaysClosedKG:
+            def query_entity(self, entity, **kwargs):
+                calls["count"] += 1
+                raise _sqlite3.ProgrammingError("closed again")
+
+        cache = {}
+        monkeypatch.setattr(mcp_server, "_kg_by_path", cache)
+        monkeypatch.setattr(mcp_server, "KnowledgeGraph", lambda **_: _AlwaysClosedKG())
+
+        with pytest.raises(_sqlite3.ProgrammingError):
+            mcp_server._call_kg(lambda kg: kg.query_entity("Alice"))
+        assert calls["count"] == 2, "expected exactly one retry beyond the initial attempt"
@@ -4,7 +4,7 @@ import os
 from types import SimpleNamespace
 from unittest.mock import MagicMock, patch

-from mempalace.migrate import _restore_stale_palace, migrate
+from mempalace.migrate import collection_write_roundtrip_works, _restore_stale_palace, migrate


 def test_migrate_requires_palace_database(tmp_path, capsys):
@@ -101,3 +101,102 @@ def test_restore_stale_palace_logs_and_swallows_on_failure(tmp_path, capsys):
    assert "CRITICAL" in out
    assert os.fspath(palace_path) in out
    assert os.fspath(stale_path) in out
+
+
+class _FakeGetResult:
+    def __init__(self, ids):
+        self.ids = ids
+
+
+class _WritableFakeCollection:
+    def __init__(self):
+        self.ids = set()
+        self.deleted = []
+
+    def upsert(self, *, ids, documents, metadatas):
+        self.ids.update(ids)
+
+    def get(self, *, ids, include=None):
+        return _FakeGetResult([drawer_id for drawer_id in ids if drawer_id in self.ids])
+
+    def delete(self, *, ids=None, where=None):
+        for drawer_id in ids or []:
+            self.ids.discard(drawer_id)
+            self.deleted.append(drawer_id)
+
+
+class _SilentWriteDropCollection(_WritableFakeCollection):
+    def upsert(self, *, ids, documents, metadatas):
+        return None
+
+
+class _SilentDeleteDropCollection(_WritableFakeCollection):
+    def delete(self, *, ids=None, where=None):
+        self.deleted.extend(ids or [])
+
+
+def test_collection_write_roundtrip_works_when_probe_persists_and_deletes():
+    col = _WritableFakeCollection()
+
+    assert collection_write_roundtrip_works(col) is True
+    assert col.ids == set()
+    assert len(col.deleted) == 1
+
+
+def test_collection_write_roundtrip_fails_when_upsert_silently_drops():
+    col = _SilentWriteDropCollection()
+
+    assert collection_write_roundtrip_works(col) is False
+    assert col.ids == set()
+
+
+def test_collection_write_roundtrip_fails_when_delete_silently_drops():
+    col = _SilentDeleteDropCollection()
+
+    assert collection_write_roundtrip_works(col) is False
+    assert len(col.ids) == 1
+
+
+def test_migrate_dry_run_rebuilds_when_collection_is_readable_but_not_writable(tmp_path, capsys):
+    palace_dir = tmp_path / "palace"
+    palace_dir.mkdir()
+    (palace_dir / "chroma.sqlite3").write_text("db")
+
+    fake_col = MagicMock()
+    fake_col.count.return_value = 102
+
+    drawers = [
+        {
+            "id": "id1",
+            "document": "hello",
+            "metadata": {"wing": "test-wing", "room": "general"},
+        }
+    ]
+
+    with (
+        patch("mempalace.migrate.detect_chromadb_version", return_value="1.x"),
+        patch("mempalace.backends.chroma.ChromaBackend") as mock_backend,
+        patch(
+            "mempalace.migrate.collection_write_roundtrip_works", return_value=False
+        ) as mock_probe,
+        patch(
+            "mempalace.migrate.extract_drawers_from_sqlite", return_value=drawers
+        ) as mock_extract,
+    ):
+        mock_backend.backend_version.return_value = "1.5.8"
+        mock_backend.return_value.get_collection.return_value = fake_col
+
+        result = migrate(str(palace_dir), dry_run=True)
+
+    out = capsys.readouterr().out
+
+    assert result is True
+    mock_probe.assert_called_once_with(fake_col)
+    mock_extract.assert_called_once_with(
+        os.path.join(os.path.abspath(os.fspath(palace_dir)), "chroma.sqlite3")
+    )
+
+    assert "readable by chromadb 1.5.8, but write/delete verification failed" in out
+    assert "Rebuilding from SQLite" in out
+    assert "Extracted 1 drawers from SQLite" in out
+    assert "DRY RUN" in out
@@ -699,6 +699,83 @@ def test_mine_keyboard_interrupt_quotes_path_with_spaces_in_resume_hint(tmp_path
    assert f"mempalace mine {shlex.quote(str(project_root))}" in out


+def test_skip_filenames_includes_lockfiles():
+    """pnpm-lock.yaml and yarn.lock must be skipped alongside package-lock.json
+    so a Windows mine over a typical JS monorepo doesn't OOM the ONNX embedder
+    on a 24K-line lockfile (#1296)."""
+    from mempalace import miner
+
+    assert "package-lock.json" in miner.SKIP_FILENAMES
+    assert "pnpm-lock.yaml" in miner.SKIP_FILENAMES
+    assert "yarn.lock" in miner.SKIP_FILENAMES
+
+
+def test_process_file_skips_when_chunks_exceed_max(tmp_path, monkeypatch):
+    """A file producing more than MAX_CHUNKS_PER_FILE chunks must be skipped
+    with a clear message and zero upserts. Generated artifacts (CSVs, lock
+    files not in SKIP_FILENAMES) hit this — the cap is what prevents ONNX
+    bad_alloc on Windows when the embedder is asked to swallow thousands of
+    chunks in one batch (#1296)."""
+    from unittest.mock import MagicMock
+
+    from mempalace import miner
+
+    monkeypatch.setattr(miner, "MAX_CHUNKS_PER_FILE", 5)
+    over_cap = [{"content": f"chunk {i}", "chunk_index": i} for i in range(7)]
+    monkeypatch.setattr(miner, "chunk_text", lambda content, source_file: over_cap)
+
+    source = tmp_path / "huge.csv"
+    source.write_text("col1,col2\n" + "x,y\n" * 500, encoding="utf-8")
+    col = MagicMock()
+    col.get.return_value = {"ids": []}
+
+    drawers, room = miner.process_file(
+        source,
+        tmp_path,
+        col,
+        "wing",
+        [{"name": "general", "description": "General"}],
+        "agent",
+        False,
+    )
+
+    assert drawers == 0
+    col.upsert.assert_not_called()
+
+
+def test_mine_arbitrary_exception_prints_summary_and_reraises(tmp_path, capsys):
+    """A non-KeyboardInterrupt exception mid-mine must surface a summary
+    banner before propagating, so users don't see a silent exit-0 with no
+    completion message (#1296 Failure 2). Re-raise preserves the traceback
+    and yields a non-zero exit code."""
+    import pytest
+    from unittest.mock import patch
+
+    project_root = tmp_path / "proj"
+    project_root.mkdir()
+    _make_minable_project(project_root, n_files=4)
+    palace_path = project_root / "palace"
+
+    call_count = {"n": 0}
+
+    def fake_process_file(*args, **kwargs):
+        call_count["n"] += 1
+        if call_count["n"] == 2:
+            raise RuntimeError("simulated ONNX bad_alloc")
+        return (1, "general")
+
+    with patch("mempalace.miner.process_file", side_effect=fake_process_file):
+        with pytest.raises(RuntimeError, match="simulated ONNX bad_alloc"):
+            mine(str(project_root), str(palace_path))
+
+    out = capsys.readouterr().out
+    assert "Mine aborted by exception." in out
+    assert "files_processed: 1/" in out
+    assert "drawers_filed:" in out
+    assert "RuntimeError: simulated ONNX bad_alloc" in out
+    assert "upserted idempotently" in out
+
+
 def test_mine_cleans_up_pid_file_on_interrupt(tmp_path):
    """Our own PID entry in mine.pid is removed in the finally clause."""
    import pytest
@@ -135,19 +135,77 @@ def test_different_palaces_dont_conflict(tmp_path, monkeypatch):


 def test_palace_path_is_normalized(tmp_path, monkeypatch):
-    """Relative and absolute forms of the same path must use the same lock."""
+    """Relative and absolute forms of the same path must use the same lock.
+
+    Cross-process variant: a child holds the absolute form, a relative form
+    in the parent must hash to the same lock key and raise
+    ``MineAlreadyRunning``. (The same-thread case is now a re-entrant
+    pass-through by design — see ``test_reentrant_same_thread_passes_through``
+    — so we exercise the normalization invariant across a process boundary
+    where re-entrance does not apply.)
+    """
    monkeypatch.setenv("HOME", str(tmp_path))
    monkeypatch.chdir(tmp_path)
    os.makedirs(tmp_path / "palace", exist_ok=True)
    absolute = str(tmp_path / "palace")
-    relative = "palace"
+    ready = str(tmp_path / "ready")
+    release = str(tmp_path / "release")

-    # Hold the lock with the absolute form; attempting to re-acquire with
-    # the relative form (which resolves to the same absolute path) must fail.
-    with mine_palace_lock(absolute):
+    ctx = _get_mp_context()
+    holder = ctx.Process(target=_hold_lock, args=(absolute, ready, release))
+    holder.start()
+    try:
+        for _ in range(500):
+            if os.path.exists(ready):
+                break
+            time.sleep(0.01)
+        assert os.path.exists(ready), "holder failed to acquire lock in time"
+
+        # Parent holds CWD = tmp_path so "palace" is the same on-disk dir as
+        # the absolute form. The lock key is sha256(realpath+normcase) so the
+        # two forms must collide.
        with pytest.raises(MineAlreadyRunning):
-            with mine_palace_lock(relative):
+            with mine_palace_lock("palace"):
                pytest.fail("normalized path collision should have raised")
+    finally:
+        open(release, "w").close()
+        holder.join(timeout=5)
+
+
+def test_reentrant_same_thread_passes_through(tmp_path, monkeypatch):
+    """Same thread re-acquiring the same palace lock must not deadlock or raise.
+
+    This is the invariant that makes ``ChromaCollection`` write methods (which
+    take ``mine_palace_lock`` for MCP/direct-writer protection) compose with
+    ``miner.mine()`` (which already holds the lock for the entire mine
+    pipeline). Without the per-thread re-entrant guard the inner acquire
+    would self-deadlock on the outer flock.
+    """
+    monkeypatch.setenv("HOME", str(tmp_path))
+    palace = str(tmp_path / "palace")
+    with mine_palace_lock(palace):
+        # Re-enter from the same thread — must yield without raising or hanging.
+        with mine_palace_lock(palace):
+            pass
+        # After the inner exits, the outer is still held: confirm via a
+        # subprocess that tries to acquire and reports back.
+        ctx = _get_mp_context()
+        result_q = ctx.Queue()
+        child = ctx.Process(target=_try_acquire_expect_busy, args=(palace, result_q))
+        child.start()
+        child.join(timeout=5)
+        assert (
+            result_q.get(timeout=1) == "busy"
+        ), "outer lock should still be held by parent after inner re-entrant exit"
+
+
+def _try_acquire_expect_busy(palace_path, result_q):
+    """Helper: try to acquire, push 'busy' (raised) or 'free' (acquired) into queue."""
+    try:
+        with mine_palace_lock(palace_path):
+            result_q.put("free")
+    except MineAlreadyRunning:
+        result_q.put("busy")


 def test_mine_global_lock_is_alias_for_back_compat(tmp_path, monkeypatch):
@@ -2,7 +2,7 @@

 import os
 import sqlite3
-from unittest.mock import MagicMock, patch
+from unittest.mock import MagicMock, call, patch

 import pytest

@@ -28,6 +28,16 @@ def test_get_palace_path_fallback():
        assert ".mempalace" in result


+def test_get_collection_name_from_config():
+    from mempalace.config import get_configured_collection_name
+
+    get_configured_collection_name.cache_clear()
+    with patch("mempalace.config.MempalaceConfig") as mock_config_cls:
+        mock_config_cls.return_value.collection_name = "custom_drawers"
+        assert repair._drawers_collection_name() == "custom_drawers"
+    get_configured_collection_name.cache_clear()
+
+
 # ── _paginate_ids ─────────────────────────────────────────────────────


@@ -229,8 +239,11 @@ def test_rebuild_index_success(mock_backend_cls, mock_shutil, tmp_path):
    }

    mock_new_col = MagicMock()
+    mock_new_col.count.return_value = 2
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 2
    mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
-    mock_backend.create_collection.return_value = mock_new_col
+    mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]

    repair.rebuild_index(palace_path=str(tmp_path))

@@ -239,14 +252,74 @@ def test_rebuild_index_success(mock_backend_cls, mock_shutil, tmp_path):
    assert "chroma.sqlite3" in str(mock_shutil.copy2.call_args)

    # Verify: deleted and recreated (cosine is the backend default)
-    mock_backend.delete_collection.assert_called_once_with(str(tmp_path), "mempalace_drawers")
-    mock_backend.create_collection.assert_called_once_with(str(tmp_path), "mempalace_drawers")
+    assert mock_backend.create_collection.call_args_list == [
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+        call(str(tmp_path), "mempalace_drawers"),
+    ]
+    assert mock_backend.delete_collection.call_args_list == [
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+        call(str(tmp_path), "mempalace_drawers"),
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+    ]

    # Verify: used upsert not add
+    mock_temp_col.upsert.assert_called_once()
    mock_new_col.upsert.assert_called_once()
    mock_new_col.add.assert_not_called()


+@patch("mempalace.repair.shutil")
+@patch("mempalace.repair.ChromaBackend")
+def test_rebuild_index_ignores_missing_temp_collection_at_start(
+    mock_backend_cls, mock_shutil, tmp_path
+):
+    sqlite_path = tmp_path / "chroma.sqlite3"
+    sqlite_path.write_text("fake")
+
+    def _fake_copy2(src, dst):
+        with open(dst, "w") as handle:
+            handle.write("backup")
+
+    mock_shutil.copy2.side_effect = _fake_copy2
+
+    mock_col = MagicMock()
+    mock_col.count.return_value = 2
+    mock_col.get.return_value = {
+        "ids": ["id1", "id2"],
+        "documents": ["doc1", "doc2"],
+        "metadatas": [{"wing": "a"}, {"wing": "b"}],
+    }
+
+    mock_new_col = MagicMock()
+    mock_new_col.count.return_value = 2
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 2
+    mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
+    mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
+    mock_backend.delete_collection.side_effect = [
+        ValueError("Collection [mempalace_drawers__repair_tmp] does not exist"),
+        None,
+        None,
+    ]
+
+    repair.rebuild_index(palace_path=str(tmp_path))
+
+    assert mock_shutil.copy2.call_count == 1
+    assert mock_backend.delete_collection.call_args_list == [
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+        call(str(tmp_path), "mempalace_drawers"),
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+    ]
+
+
+def test_delete_collection_if_exists_reraises_unexpected_value_error():
+    mock_backend = MagicMock()
+    mock_backend.delete_collection.side_effect = ValueError("invalid collection name")
+
+    with pytest.raises(ValueError, match="invalid collection name"):
+        repair._delete_collection_if_exists(mock_backend, "/palace", "bad/name")
+
+
@patch("mempalace.repair.shutil")
@patch("mempalace.repair.ChromaBackend")
 def test_rebuild_index_error_reading(mock_backend_cls, mock_shutil, tmp_path):
@@ -267,6 +340,21 @@ def test_check_extraction_safety_passes_when_counts_match(tmp_path):
        repair.check_extraction_safety(str(tmp_path), 500)


+def test_check_extraction_safety_uses_configured_collection(tmp_path):
+    with patch("mempalace.repair.sqlite_drawer_count", return_value=500) as count:
+        repair.check_extraction_safety(str(tmp_path), 500, collection_name="custom_drawers")
+    count.assert_called_once_with(str(tmp_path), "custom_drawers")
+
+
+def test_check_extraction_safety_default_uses_configured_collection(tmp_path):
+    with (
+        patch("mempalace.repair._drawers_collection_name", return_value="custom_drawers"),
+        patch("mempalace.repair.sqlite_drawer_count", return_value=500) as count,
+    ):
+        repair.check_extraction_safety(str(tmp_path), 500)
+    count.assert_called_once_with(str(tmp_path), "custom_drawers")
+
+
 def test_check_extraction_safety_passes_when_sqlite_unreadable_and_under_cap(tmp_path):
    """SQLite check fails (None) but extraction is well under the cap → safe."""
    with patch("mempalace.repair.sqlite_drawer_count", return_value=None):
@@ -321,6 +409,73 @@ def test_sqlite_drawer_count_returns_none_on_unreadable_schema(tmp_path):
    assert repair.sqlite_drawer_count(str(tmp_path)) is None


+@patch("mempalace.repair.shutil")
+@patch("mempalace.repair.ChromaBackend")
+def test_rebuild_index_default_uses_configured_collection(mock_backend_cls, mock_shutil, tmp_path):
+    sqlite_path = tmp_path / "chroma.sqlite3"
+    sqlite_path.write_text("fake")
+    mock_col = MagicMock()
+    mock_col.count.return_value = 2
+    mock_col.get.return_value = {
+        "ids": ["id1", "id2"],
+        "documents": ["doc1", "doc2"],
+        "metadatas": [{"wing": "a"}, {"wing": "b"}],
+    }
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 2
+    mock_new_col = MagicMock()
+    mock_new_col.count.return_value = 2
+    mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
+    mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
+
+    with (
+        patch("mempalace.repair._drawers_collection_name", return_value="custom_drawers"),
+        patch("mempalace.repair.sqlite_drawer_count", return_value=2) as count,
+    ):
+        repair.rebuild_index(palace_path=str(tmp_path))
+
+    mock_backend.get_collection.assert_called_once_with(str(tmp_path), "custom_drawers")
+    count.assert_called_once_with(str(tmp_path), "custom_drawers")
+    assert mock_backend.create_collection.call_args_list == [
+        call(str(tmp_path), "custom_drawers__repair_tmp"),
+        call(str(tmp_path), "custom_drawers"),
+    ]
+    assert mock_backend.delete_collection.call_args_list == [
+        call(str(tmp_path), "custom_drawers__repair_tmp"),
+        call(str(tmp_path), "custom_drawers"),
+        call(str(tmp_path), "custom_drawers__repair_tmp"),
+    ]
+
+
+def test_status_default_uses_configured_drawer_collection(tmp_path):
+    with (
+        patch("mempalace.repair._drawers_collection_name", return_value="custom_drawers"),
+        patch("mempalace.repair.hnsw_capacity_status") as capacity_status,
+    ):
+        capacity_status.side_effect = [
+            {
+                "sqlite_count": 1,
+                "hnsw_count": 1,
+                "divergence": 0,
+                "diverged": False,
+                "status": "ok",
+                "message": "",
+            },
+            {
+                "sqlite_count": 0,
+                "hnsw_count": 0,
+                "divergence": 0,
+                "diverged": False,
+                "status": "ok",
+                "message": "",
+            },
+        ]
+        repair.status(palace_path=str(tmp_path))
+
+    assert capacity_status.call_args_list[0].args == (str(tmp_path), "custom_drawers")
+    assert capacity_status.call_args_list[1].args == (str(tmp_path), "mempalace_closets")
+
+
@patch("mempalace.repair.shutil")
@patch("mempalace.repair.ChromaBackend")
 def test_rebuild_index_aborts_on_truncation_signal(mock_backend_cls, mock_shutil, tmp_path):
@@ -365,19 +520,261 @@ def test_rebuild_index_proceeds_with_override(mock_backend_cls, mock_shutil, tmp
        },
        {"ids": [], "documents": [], "metadatas": []},
    ]
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 10_000
    mock_new_col = MagicMock()
+    mock_new_col.count.return_value = 10_000
    mock_backend.get_collection.return_value = mock_col
-    mock_backend.create_collection.return_value = mock_new_col
+    mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
    mock_backend_cls.return_value = mock_backend

    with patch("mempalace.repair.sqlite_drawer_count", return_value=67_580):
        repair.rebuild_index(palace_path=str(tmp_path), confirm_truncation_ok=True)

-    mock_backend.delete_collection.assert_called_once()
-    mock_backend.create_collection.assert_called_once()
+    assert mock_backend.delete_collection.call_count == 3
+    assert mock_backend.create_collection.call_count == 2
+    mock_temp_col.upsert.assert_called()
    mock_new_col.upsert.assert_called()


+@patch("mempalace.repair.shutil")
+@patch("mempalace.repair.ChromaBackend")
+def test_rebuild_index_stage_failure_leaves_live_collection_untouched(
+    mock_backend_cls, mock_shutil, tmp_path
+):
+    sqlite_path = tmp_path / "chroma.sqlite3"
+    sqlite_path.write_text("fake")
+
+    mock_col = MagicMock()
+    mock_col.count.return_value = 2
+    mock_col.get.return_value = {
+        "ids": ["id1", "id2"],
+        "documents": ["doc1", "doc2"],
+        "metadatas": [{"wing": "a"}, {"wing": "b"}],
+    }
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 1
+    mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
+    mock_backend.create_collection.return_value = mock_temp_col
+
+    with pytest.raises(repair.RebuildCollectionError) as excinfo:
+        repair.rebuild_index(palace_path=str(tmp_path))
+
+    assert excinfo.value.live_replaced is False
+    assert mock_shutil.copy2.call_count == 1
+    assert mock_backend.delete_collection.call_args_list == [
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+    ]
+
+
+@patch("mempalace.repair.shutil")
+@patch("mempalace.repair.ChromaBackend")
+def test_rebuild_index_live_failure_restores_backup(mock_backend_cls, mock_shutil, tmp_path):
+    sqlite_path = tmp_path / "chroma.sqlite3"
+    sqlite_path.write_text("fake")
+
+    def _fake_copy2(src, dst):
+        with open(dst, "w") as handle:
+            handle.write("backup")
+
+    mock_shutil.copy2.side_effect = _fake_copy2
+
+    mock_col = MagicMock()
+    mock_col.count.return_value = 2
+    mock_col.get.return_value = {
+        "ids": ["id1", "id2"],
+        "documents": ["doc1", "doc2"],
+        "metadatas": [{"wing": "a"}, {"wing": "b"}],
+    }
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 2
+    mock_new_col = MagicMock()
+    mock_new_col.upsert.side_effect = RuntimeError("live upsert failed")
+    active_backend = MagicMock()
+    active_backend.get_collection.return_value = mock_col
+    active_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
+    helper_backend = MagicMock()
+    mock_backend_cls.side_effect = [active_backend, helper_backend]
+
+    with pytest.raises(repair.RebuildCollectionError) as excinfo:
+        repair.rebuild_index(palace_path=str(tmp_path))
+
+    assert excinfo.value.live_replaced is True
+    assert mock_shutil.copy2.call_count == 2
+    assert active_backend.delete_collection.call_args_list == [
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+        call(str(tmp_path), "mempalace_drawers"),
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+        call(str(tmp_path), "mempalace_drawers"),
+    ]
+    active_backend.close_palace.assert_called_once_with(str(tmp_path))
+    helper_backend.close_palace.assert_not_called()
+
+
+@patch("mempalace.repair.shutil")
+@patch("mempalace.repair.ChromaBackend")
+def test_rebuild_index_live_delete_missing_still_restores_backup(
+    mock_backend_cls, mock_shutil, tmp_path
+):
+    sqlite_path = tmp_path / "chroma.sqlite3"
+    sqlite_path.write_text("fake")
+
+    def _fake_copy2(src, dst):
+        with open(dst, "w") as handle:
+            handle.write("backup")
+
+    mock_shutil.copy2.side_effect = _fake_copy2
+
+    mock_col = MagicMock()
+    mock_col.count.return_value = 2
+    mock_col.get.return_value = {
+        "ids": ["id1", "id2"],
+        "documents": ["doc1", "doc2"],
+        "metadatas": [{"wing": "a"}, {"wing": "b"}],
+    }
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 2
+    mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
+    mock_backend.create_collection.side_effect = [mock_temp_col, RuntimeError("create failed")]
+    mock_backend.delete_collection.side_effect = [
+        None,
+        None,
+        None,
+        repair.ChromaNotFoundError("missing"),
+    ]
+
+    with pytest.raises(repair.RebuildCollectionError) as excinfo:
+        repair.rebuild_index(palace_path=str(tmp_path))
+
+    assert excinfo.value.live_replaced is True
+    assert mock_shutil.copy2.call_count == 2
+    assert mock_backend.delete_collection.call_args_list == [
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+        call(str(tmp_path), "mempalace_drawers"),
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+        call(str(tmp_path), "mempalace_drawers"),
+    ]
+
+
+@patch("mempalace.repair.shutil")
+@patch("mempalace.repair.ChromaBackend")
+def test_rebuild_index_restore_failure_preserves_original_error(
+    mock_backend_cls, mock_shutil, tmp_path, capsys
+):
+    sqlite_path = tmp_path / "chroma.sqlite3"
+    sqlite_path.write_text("fake")
+
+    def _copy2_side_effect(src, dst):
+        if str(src).endswith(".backup"):
+            raise PermissionError("locked sqlite")
+        with open(dst, "w") as handle:
+            handle.write("backup")
+
+    mock_shutil.copy2.side_effect = _copy2_side_effect
+
+    mock_col = MagicMock()
+    mock_col.count.return_value = 2
+    mock_col.get.return_value = {
+        "ids": ["id1", "id2"],
+        "documents": ["doc1", "doc2"],
+        "metadatas": [{"wing": "a"}, {"wing": "b"}],
+    }
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 2
+    mock_new_col = MagicMock()
+    mock_new_col.upsert.side_effect = RuntimeError("live upsert failed")
+    mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
+    mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
+
+    with pytest.raises(repair.RebuildCollectionError) as excinfo:
+        repair.rebuild_index(palace_path=str(tmp_path))
+
+    out = capsys.readouterr().out
+    assert "locked sqlite" in out
+    assert "Manual restore required" in out
+    assert "live upsert failed" in str(excinfo.value)
+
+
+@patch("mempalace.repair.ChromaBackend")
+def test_rebuild_collection_via_temp_keeps_original_error_when_cleanup_fails(
+    mock_backend_cls,
+):
+    mock_col = MagicMock()
+    mock_col.count.return_value = 2
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 2
+    mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
+    mock_backend.create_collection.side_effect = [mock_temp_col, RuntimeError("live build failed")]
+    mock_backend.delete_collection.side_effect = [
+        None,
+        None,
+        RuntimeError("cleanup failed"),
+    ]
+
+    with pytest.raises(repair.RebuildCollectionError) as excinfo:
+        repair._rebuild_collection_via_temp(
+            mock_backend,
+            "/palace",
+            ["id1", "id2"],
+            ["doc1", "doc2"],
+            [{"wing": "a"}, {"wing": "b"}],
+            batch_size=5000,
+            progress=lambda *args, **kwargs: None,
+        )
+
+    assert "live build failed" in str(excinfo.value)
+    assert excinfo.value.live_replaced is True
+    assert mock_backend.delete_collection.call_args_list == [
+        call("/palace", "mempalace_drawers__repair_tmp"),
+        call("/palace", "mempalace_drawers"),
+        call("/palace", "mempalace_drawers__repair_tmp"),
+    ]
+
+
+@patch("mempalace.repair.shutil")
+@patch("mempalace.repair.ChromaBackend")
+def test_rebuild_index_ignores_temp_cleanup_failure_after_success(
+    mock_backend_cls, mock_shutil, tmp_path
+):
+    sqlite_path = tmp_path / "chroma.sqlite3"
+    sqlite_path.write_text("fake")
+
+    def _fake_copy2(src, dst):
+        with open(dst, "w") as handle:
+            handle.write("backup")
+
+    mock_shutil.copy2.side_effect = _fake_copy2
+
+    mock_col = MagicMock()
+    mock_col.count.return_value = 2
+    mock_col.get.return_value = {
+        "ids": ["id1", "id2"],
+        "documents": ["doc1", "doc2"],
+        "metadatas": [{"wing": "a"}, {"wing": "b"}],
+    }
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 2
+    mock_new_col = MagicMock()
+    mock_new_col.count.return_value = 2
+    mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
+    mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
+    mock_backend.delete_collection.side_effect = [
+        None,
+        None,
+        RuntimeError("cleanup failed"),
+    ]
+
+    repair.rebuild_index(palace_path=str(tmp_path))
+
+    assert mock_shutil.copy2.call_count == 1
+    assert mock_backend.delete_collection.call_args_list == [
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+        call(str(tmp_path), "mempalace_drawers"),
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+    ]
+
+
 # ── repair_max_seq_id ─────────────────────────────────────────────────


@@ -682,3 +1079,387 @@ def test_max_seq_id_rollback_on_verification_failure(tmp_path, monkeypatch):
    # A backup file is still present — caller can roll back from it.
    leftover = [fn for fn in os.listdir(palace) if "max-seq-id-backup-" in fn]
    assert leftover
+
+
+# ── extract_via_sqlite + rebuild_from_sqlite (#1308) ──────────────────
+#
+# These tests build real chromadb palaces in tmp_path rather than mocking
+# the SQLite layer. The bug class they guard against is "extraction sees
+# different rows than chromadb stored" — the only honest check is to let
+# chromadb actually write rows and then read them back via the SQLite
+# bypass. Mocking the SQLite cursor would defeat the test.
+
+
+def _seed_palace(palace_path, collection_name, rows):
+    """Build a real chromadb palace at ``palace_path`` and add ``rows``.
+
+    ``rows`` is a list of ``(id, document, metadata)`` tuples.
+    """
+    from mempalace.backends.chroma import ChromaBackend
+
+    backend = ChromaBackend()
+    try:
+        col = backend.create_collection(str(palace_path), collection_name)
+        col.upsert(
+            ids=[r[0] for r in rows],
+            documents=[r[1] for r in rows],
+            metadatas=[r[2] for r in rows],
+        )
+    finally:
+        # Release chromadb's rust-side SQLite/HNSW file locks before the
+        # caller proceeds. Without this, an in-place rebuild on Windows
+        # fails with WinError 32 on data_level0.bin during the archive
+        # rename (cf. PR #1310 test-windows job).
+        backend.close()
+
+
+def test_extract_via_sqlite_returns_all_rows_with_metadata(tmp_path):
+    """Round-trip: a chromadb palace with N upserted rows returns those
+    same N rows when read via the SQLite bypass.
+
+    Catches: anyone who breaks the segments/embeddings/embedding_metadata
+    JOIN, swaps the metadata vs vector segment, or changes how the
+    document is stored under the ``chroma:document`` key.
+
+    Also asserts every embedding row underlying the extraction lives in
+    a ``segments.scope = 'METADATA'`` segment. Document + metadata rows
+    are stored under METADATA in Chroma's segment layout while HNSW
+    files live under ``VECTOR``; locking that assumption in here means a
+    future refactor that accidentally points the JOIN at ``VECTOR``
+    fails this test instead of silently regressing the recovery path.
+    """
+    rows = [
+        (f"drawer_{i:03d}", f"document body {i}", {"wing": "test_wing", "room": f"r{i % 3}"})
+        for i in range(25)
+    ]
+    _seed_palace(tmp_path, "mempalace_drawers", rows)
+
+    extracted = list(repair.extract_via_sqlite(str(tmp_path), "mempalace_drawers"))
+
+    assert len(extracted) == 25
+    by_id = {emb_id: (doc, meta) for emb_id, doc, meta in extracted}
+    assert set(by_id) == {r[0] for r in rows}
+    for emb_id, doc, meta in rows:
+        got_doc, got_meta = by_id[emb_id]
+        assert got_doc == doc, f"document mangled for {emb_id}"
+        assert got_meta == meta, f"metadata mangled for {emb_id}: {got_meta!r}"
+
+    # Lock the segment-scope assumption directly against Chroma's on-disk
+    # layout so a future change that points the extraction JOIN at the
+    # VECTOR segment cannot pass this test. Query each extracted row's
+    # backing segment scope via the same SQLite tables ``extract_via_sqlite``
+    # reads from.
+    sqlite_path = os.path.join(str(tmp_path), "chroma.sqlite3")
+    conn = sqlite3.connect(f"file:{sqlite_path}?mode=ro", uri=True)
+    try:
+        scopes = {
+            scope
+            for (scope,) in conn.execute(
+                """
+                SELECT DISTINCT s.scope
+                FROM embeddings e
+                JOIN segments s ON e.segment_id = s.id
+                JOIN collections c ON s.collection = c.id
+                WHERE c.name = ? AND e.embedding_id IN ({})
+                """.format(",".join("?" * len(extracted))),
+                ("mempalace_drawers", *(emb_id for emb_id, _, _ in extracted)),
+            )
+        }
+    finally:
+        conn.close()
+    assert scopes == {"METADATA"}, (
+        f"extraction is reading from segments scoped {scopes!r}; only "
+        "'METADATA' should back the document/metadata rows. If Chroma's "
+        "segment layout changed, update extract_via_sqlite's WHERE clause."
+    )
+
+
+def test_extract_via_sqlite_preserves_typed_metadata(tmp_path):
+    """Chromadb stores int / float / bool / string in distinct typed
+    columns. Extraction must round-trip the original type, not coerce
+    everything to string.
+
+    Catches: a regression where the SELECT order changes and ints come
+    back as None, or where the column-resolution rule prefers the wrong
+    column.
+    """
+    rows = [
+        (
+            "drawer_typed",
+            "doc",
+            {
+                "wing": "w",
+                "chunk_index": 7,  # int
+                "score": 0.42,  # float
+                "is_active": True,  # bool
+            },
+        ),
+    ]
+    _seed_palace(tmp_path, "mempalace_drawers", rows)
+
+    extracted = list(repair.extract_via_sqlite(str(tmp_path), "mempalace_drawers"))
+    assert len(extracted) == 1
+    _, _, meta = extracted[0]
+
+    assert meta["chunk_index"] == 7 and isinstance(meta["chunk_index"], int)
+    assert meta["score"] == 0.42 and isinstance(meta["score"], float)
+    assert meta["is_active"] is True
+    assert meta["wing"] == "w"
+
+
+def test_extract_via_sqlite_unknown_collection_yields_nothing(tmp_path):
+    """Asking for a collection that isn't in the palace must return an
+    empty iterator, not silently fall back to another collection's
+    metadata segment. Seeds two real collections and queries for a third
+    name so a regression that drops the WHERE c.name=? filter would leak
+    rows from the seeded collections rather than passing.
+    """
+    _seed_palace(tmp_path, "mempalace_drawers", [("d1", "doc", {"wing": "w"})])
+    _seed_palace(tmp_path, "mempalace_closets", [("c1", "abbrev", {"wing": "w"})])
+    assert list(repair.extract_via_sqlite(str(tmp_path), "not_a_real_collection")) == []
+
+
+def test_extract_via_sqlite_missing_palace_yields_nothing(tmp_path):
+    """No chroma.sqlite3 → empty iterator, no exception. Callers depend
+    on this when probing speculatively."""
+    empty = tmp_path / "no_palace_here"
+    empty.mkdir()
+    assert list(repair.extract_via_sqlite(str(empty), "mempalace_drawers")) == []
+
+
+def test_rebuild_from_sqlite_roundtrips_via_real_chromadb(tmp_path):
+    """End-to-end: seed source palace, rebuild into a fresh dest, then
+    open dest with a fresh ChromaBackend and verify ``count()`` and
+    metadata filters return the original rows. Also asserts a closet
+    document round-trips so a future regression that re-embeds with the
+    wrong EF or swaps drawer/closet content would fail here.
+
+    This is the single most important regression guard. If
+    ``rebuild_from_sqlite`` silently drops rows or mangles metadata, no
+    other test in this file would catch it because they all stop at the
+    extraction layer.
+    """
+    from mempalace.backends.chroma import ChromaBackend
+
+    source = tmp_path / "source"
+    dest = tmp_path / "dest"
+
+    rows = [
+        (f"drawer_{i:03d}", f"body {i}", {"wing": "alpha" if i % 2 else "beta", "room": "r0"})
+        for i in range(40)
+    ]
+    _seed_palace(source, "mempalace_drawers", rows)
+    _seed_palace(
+        source,
+        "mempalace_closets",
+        [("closet_x", "abbrev pointer →drawer_001", {"wing": "alpha"})],
+    )
+
+    counts = repair.rebuild_from_sqlite(str(source), str(dest))
+    assert counts == {"mempalace_drawers": 40, "mempalace_closets": 1}
+
+    backend = ChromaBackend()
+    drawers = backend.get_collection(str(dest), "mempalace_drawers")
+    assert drawers.count() == 40
+    alpha = drawers.get(where={"wing": "alpha"})
+    assert len(alpha["ids"]) == 20
+
+    # Spot-check that document text round-trips for one specific drawer
+    # — protects against a regression where extraction or upsert order
+    # silently swaps document bodies between IDs.
+    one = drawers.get(ids=["drawer_007"], include=["documents", "metadatas"])
+    assert one["documents"] == ["body 7"]
+    assert one["metadatas"][0]["wing"] == "alpha"
+
+    # Closets: the AAAK index layer. Re-embedded with the same EF so a
+    # known closet ID and its document body must come back intact.
+    closets = backend.get_collection(str(dest), "mempalace_closets")
+    assert closets.count() == 1
+    closet_row = closets.get(ids=["closet_x"], include=["documents", "metadatas"])
+    assert closet_row["documents"] == ["abbrev pointer →drawer_001"]
+    assert closet_row["metadatas"][0] == {"wing": "alpha"}
+
+
+def test_rebuild_from_sqlite_refuses_existing_dest(tmp_path):
+    """Refuse to write into a directory that already exists when source
+    and dest differ. Without this, an unattended re-run would silently
+    interleave a partial rebuild with whatever's already at dest.
+    """
+    source = tmp_path / "source"
+    dest = tmp_path / "dest"
+    _seed_palace(source, "mempalace_drawers", [("d1", "doc", {"wing": "w"})])
+    dest.mkdir()
+    # Drop a marker file so we can prove the dir wasn't touched.
+    (dest / "marker.txt").write_text("preexisting")
+
+    counts = repair.rebuild_from_sqlite(str(source), str(dest))
+    assert counts == {}
+    assert (dest / "marker.txt").read_text() == "preexisting"
+    assert not (dest / "chroma.sqlite3").exists()
+
+
+def test_rebuild_from_sqlite_in_place_archives_when_opted_in(tmp_path):
+    """In-place rebuild (source == dest) with ``archive_existing_dest=True``
+    must move the original aside to ``<dest>.pre-rebuild-<ts>`` and read
+    from the archive — the original drawer rows must survive in the new
+    palace, AND the archive itself must still contain the original rows.
+
+    Catches: a refactor that moves the original out but then reads from
+    the now-empty original location, producing an empty rebuild; also
+    catches a swap that empties the archive after reading.
+    """
+    palace = tmp_path / "palace"
+    rows = [(f"d{i}", f"body {i}", {"wing": "w", "room": "r"}) for i in range(15)]
+    _seed_palace(palace, "mempalace_drawers", rows)
+
+    counts = repair.rebuild_from_sqlite(str(palace), str(palace), archive_existing_dest=True)
+    assert counts["mempalace_drawers"] == 15
+
+    archives = [p for p in tmp_path.iterdir() if p.name.startswith("palace.pre-rebuild-")]
+    assert len(archives) == 1
+    assert (archives[0] / "chroma.sqlite3").exists()
+    # Archive must still hold the same row count via the SQLite bypass —
+    # proves the archive wasn't silently truncated as a side effect.
+    archived_rows = list(repair.extract_via_sqlite(str(archives[0]), "mempalace_drawers"))
+    assert len(archived_rows) == 15
+
+    from mempalace.backends.chroma import ChromaBackend
+
+    rebuilt = ChromaBackend().get_collection(str(palace), "mempalace_drawers")
+    assert rebuilt.count() == 15
+
+
+def test_rebuild_from_sqlite_in_place_refuses_without_archive_flag(tmp_path):
+    """Source == dest without archive flag must abort untouched. The
+    most catastrophic possible regression of this code path is silently
+    deleting the only copy of the user's data."""
+    palace = tmp_path / "palace"
+    _seed_palace(palace, "mempalace_drawers", [("d1", "doc", {"wing": "w"})])
+    sqlite_before = (palace / "chroma.sqlite3").stat().st_size
+
+    counts = repair.rebuild_from_sqlite(str(palace), str(palace))
+    assert counts == {}
+    # Same file, untouched.
+    assert (palace / "chroma.sqlite3").stat().st_size == sqlite_before
+    archives = [p for p in tmp_path.iterdir() if "pre-rebuild" in p.name]
+    assert archives == []
+
+
+def test_rebuild_from_sqlite_source_missing_chroma_db(tmp_path):
+    """Source dir exists but has no chroma.sqlite3 → returns empty,
+    leaves dest untouched."""
+    source = tmp_path / "source"
+    source.mkdir()
+    (source / "stray_file").write_text("not a palace")
+    dest = tmp_path / "dest"
+
+    counts = repair.rebuild_from_sqlite(str(source), str(dest))
+    assert counts == {}
+    assert not dest.exists()
+
+
+def test_rebuild_from_sqlite_in_place_validates_source_before_archiving(tmp_path):
+    """In-place + archive_existing_dest=True with a dir that lacks
+    chroma.sqlite3 must NOT rename the dir before bailing. An earlier
+    revision archived first and validated second, leaving the user with
+    a renamed empty dir to manually undo. Catches that ordering bug.
+    """
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    (palace / "marker.txt").write_text("not a real palace")
+
+    counts = repair.rebuild_from_sqlite(str(palace), str(palace), archive_existing_dest=True)
+    assert counts == {}
+    # No archive created — original dir still in place with its marker.
+    assert palace.exists()
+    assert (palace / "marker.txt").read_text() == "not a real palace"
+    archives = [p for p in tmp_path.iterdir() if "pre-rebuild" in p.name]
+    assert archives == []
+
+
+def test_rebuild_from_sqlite_raises_on_upsert_failure(tmp_path, monkeypatch):
+    """Mid-batch upsert failure must raise ``RebuildPartialError`` and
+    surface the failed collection + archive path so the user can recover.
+    Without this, an unattended script gets exit-code-zero on a partial
+    rebuild and the user discovers the data loss only when search starts
+    returning fewer hits.
+    """
+    palace = tmp_path / "palace"
+    rows = [(f"d{i}", f"body {i}", {"wing": "w", "room": "r"}) for i in range(5)]
+    _seed_palace(palace, "mempalace_drawers", rows)
+
+    # Make the very first upsert raise so we don't depend on batch
+    # boundary behavior. Patching ChromaCollection.upsert (the wrapper
+    # mempalace's backend returns) keeps the failure path realistic.
+    # ``monkeypatch`` is pytest's built-in fixture that auto-restores
+    # the original attribute when the test exits, so we don't need to
+    # undo this manually.
+    from mempalace.backends.chroma import ChromaCollection
+
+    def boom(self, **kwargs):
+        raise RuntimeError("simulated chromadb upsert failure")
+
+    monkeypatch.setattr(ChromaCollection, "upsert", boom)
+
+    with pytest.raises(repair.RebuildPartialError) as excinfo:
+        repair.rebuild_from_sqlite(str(palace), str(palace), archive_existing_dest=True)
+
+    err = excinfo.value
+    assert err.failed_collection == "mempalace_drawers"
+    assert err.partial_counts.get("mempalace_drawers") == 0
+    assert err.archive_path is not None
+    assert os.path.isfile(os.path.join(err.archive_path, "chroma.sqlite3"))
+    assert err.dest_palace == os.path.abspath(str(palace))
+
+
+def test_rebuild_from_sqlite_honors_configured_drawer_collection_name(tmp_path, monkeypatch):
+    """A user with a non-default drawers collection name (set via
+    ``MempalaceConfig().collection_name``) must have THAT collection
+    rebuilt — not the hardcoded ``mempalace_drawers``.
+
+    Catches: a regression where the recovery path silently rebuilds the
+    default-name collection on a custom-named palace, leaving the user's
+    actual data unrebuilt while reporting "rebuild complete." This is
+    the failure mode reviewer mjc flagged on PR #1310 as needing to line
+    up with the configured-collection-name work in #1312. Closets stay
+    fixed (``mempalace_closets``) by design — the AAAK index references
+    drawer IDs by string and is not per-deployment configurable.
+
+    Strategy: monkeypatch the lazy resolver so the test is hermetic and
+    does not depend on the global config file or env state.
+    """
+    from mempalace.backends.chroma import ChromaBackend
+
+    custom_drawers = "custom_drawers_xyz"
+    monkeypatch.setattr(repair, "_drawers_collection_name", lambda: custom_drawers)
+
+    source = tmp_path / "source"
+    dest = tmp_path / "dest"
+
+    drawer_rows = [(f"d{i}", f"body {i}", {"wing": "alpha"}) for i in range(3)]
+    closet_rows = [("closet_a", "abbrev →d0", {"wing": "alpha"})]
+    _seed_palace(source, custom_drawers, drawer_rows)
+    _seed_palace(source, "mempalace_closets", closet_rows)
+
+    counts = repair.rebuild_from_sqlite(str(source), str(dest))
+
+    # Rebuilt under the custom name, not under the default "mempalace_drawers".
+    assert counts == {custom_drawers: 3, "mempalace_closets": 1}
+
+    backend = ChromaBackend()
+    rebuilt_drawers = backend.get_collection(str(dest), custom_drawers)
+    assert rebuilt_drawers.count() == 3
+
+    # Default-name collection must NOT exist in dest — proves we did not
+    # silently fall back to the hardcoded name during rebuild.
+    try:
+        rebuilt_default = backend.get_collection(str(dest), "mempalace_drawers")
+        # If get_collection returns without raising, count() should be 0
+        # (chromadb may auto-create on get with some EFs); a non-zero
+        # count would mean we wrote rows to the wrong collection.
+        assert rebuilt_default.count() == 0, (
+            "rebuild leaked rows into the default-name collection on a "
+            "custom-name palace — recovery wrote to the wrong collection."
+        )
+    except Exception:
+        pass  # Expected: collection wasn't created.
@@ -84,6 +84,24 @@ class TestSearchMemories:
        assert "error" in result
        assert "query failed" in result["error"]

+    def test_search_memories_vector_path_uses_explicit_collection_name(self):
+        mock_col = MagicMock()
+        mock_col.query.return_value = {
+            "documents": [[]],
+            "metadatas": [[]],
+            "distances": [[]],
+            "ids": [[]],
+        }
+
+        with patch("mempalace.searcher.get_collection", return_value=mock_col) as get_collection:
+            search_memories("test", "/fake/path", collection_name="custom_drawers")
+
+        get_collection.assert_called_once_with(
+            "/fake/path",
+            collection_name="custom_drawers",
+            create=False,
+        )
+
    def test_search_memories_filters_in_result(self, palace_path, seeded_collection):
        result = search_memories("test", palace_path, wing="project", room="backend")
        assert result["filters"]["wing"] == "project"
@@ -102,7 +120,7 @@ class TestSearchMemories:
            "ids": [["d1", "d2"]],
        }

-        def mock_get_collection(path, create=False):
+        def mock_get_collection(path, collection_name=None, create=False):
            # First call: drawers. Second call: closets — raise so hybrid
            # degrades to pure drawer search (the catch block covers it).
            if not hasattr(mock_get_collection, "_called"):
@@ -120,6 +138,65 @@ class TestSearchMemories:
        assert none_hit["wing"] == "unknown"
        assert none_hit["room"] == "unknown"

+    def test_effective_distance_clamped_to_valid_cosine_range(self):
+        """A strong closet boost (up to 0.40) applied to a low-distance drawer
+        can drive ``dist - boost`` negative. That violates the cosine-distance
+        invariant ``[0, 2]``: the API returns ``similarity > 1.0`` and the
+        internal ``_sort_key`` sinks below ordinary positive distances,
+        inverting the ranking so the best hybrid matches sort last.
+
+        With the clamp, ``effective_distance`` stays in ``[0, 2]``,
+        ``similarity`` stays in ``[0, 1]``, and the sort order is stable.
+        """
+        # Drawer a.md gets a tiny base distance (0.08) — nearly exact match.
+        # Drawer b.md gets a larger base distance (0.35).
+        drawers_col = MagicMock()
+        drawers_col.query.return_value = {
+            "documents": [["doc-a", "doc-b"]],
+            "metadatas": [
+                [
+                    {"source_file": "a.md", "wing": "w", "room": "r", "chunk_index": 0},
+                    {"source_file": "b.md", "wing": "w", "room": "r", "chunk_index": 0},
+                ]
+            ],
+            "distances": [[0.08, 0.35]],
+            "ids": [["d-a", "d-b"]],
+        }
+        # A strong closet at rank 0 points at a.md → boost = 0.40,
+        # which exceeds a.md's base distance and would go negative without
+        # the clamp. No closet for b.md.
+        closets_col = MagicMock()
+        closets_col.query.return_value = {
+            "documents": [["closet-preview-a"]],
+            "metadatas": [[{"source_file": "a.md"}]],
+            "distances": [[0.2]],  # within CLOSET_DISTANCE_CAP (1.5)
+            "ids": [["c-a"]],
+        }
+
+        with (
+            patch("mempalace.searcher.get_collection", return_value=drawers_col),
+            patch("mempalace.searcher.get_closets_collection", return_value=closets_col),
+        ):
+            result = search_memories("query", "/fake/path", n_results=5)
+
+        hits = result["results"]
+        assert hits, "should return results"
+
+        # Invariants on every hit.
+        for h in hits:
+            assert (
+                0.0 <= h["similarity"] <= 1.0
+            ), f"similarity out of range: {h['similarity']} for {h['source_file']}"
+            assert 0.0 <= h["effective_distance"] <= 2.0, (
+                f"effective_distance out of range: {h['effective_distance']} "
+                f"for {h['source_file']}"
+            )
+
+        # With the clamp, the closet-boosted a.md still ranks ahead of b.md —
+        # the boost still wins, but it no longer flips the ranking.
+        assert hits[0]["source_file"] == "a.md"
+        assert hits[0]["matched_via"] == "drawer+closet"
+

 # ── BM25 internals: None / empty document safety ─────────────────────