fix(backends/chroma): wire quarantine_stale_hnsw into _client() to prevent SIGSEGV on stale HNSW (#1121, #1132, #1263)

PR #1173 wired quarantine_stale_hnsw into the static make_client() helper
but not into the instance _client() method. As a result every non-MCP
entry point (CLI mining, search, repair, status) — which all use
get_collection / _get_or_create_collection / _client() — skipped the
cold-start quarantine pass and could SIGSEGV on a stale HNSW segment
left over from a partial flush, replicated palace, or crashed-mid-write.

Refactor: extract the (_fix_blob_seq_ids + gated quarantine_stale_hnsw)
pre-open pass into a single private static helper
ChromaBackend._prepare_palace_for_open(). Both make_client() and
_client() now route through it, so the _quarantined_paths once-per-
palace-per-process gate is preserved (no runtime thrash on hot paths)
and behaviour stays identical — the fix is purely about extending the
existing protection to the path that was missing it.

Tests:

- test_client_quarantines_corrupt_segment_on_first_open mirrors the
  existing make_client test and verifies _client() actually renames a
  corrupt segment on first open.
- test_client_quarantines_only_on_first_call_per_palace verifies the
  cache gate prevents re-running quarantine across repeated _client()
  calls — important because _client() is hit on every backend op.

Closes #1121. Closes #1132. Closes #1263.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
igorls
2026-05-02 22:56:36 -03:00
parent d07b730f08
commit 10733f1df4
2 changed files with 88 additions and 5 deletions
+27 -5
View File
@@ -993,7 +993,7 @@ class ChromaBackend(BaseBackend):
)
if cached is None or inode_changed or mtime_changed or mtime_appeared:
_fix_blob_seq_ids(palace_path)
ChromaBackend._prepare_palace_for_open(palace_path)
cached = chromadb.PersistentClient(path=palace_path)
self._clients[palace_path] = cached
# Re-stat after the client constructor runs: chromadb creates
@@ -1028,6 +1028,31 @@ class ChromaBackend(BaseBackend):
# safety property; locking would add cost without correctness gain.
_quarantined_paths: set[str] = set()
@staticmethod
def _prepare_palace_for_open(palace_path: str) -> None:
"""Run the pre-open safety pass shared by :meth:`make_client` and
:meth:`_client`.
Two steps, both required before constructing a ``PersistentClient``:
1. ``_fix_blob_seq_ids`` — repairs the BLOB seq_id quirk that bites
certain chromadb migrations.
2. ``quarantine_stale_hnsw`` — gated by :attr:`_quarantined_paths` so
it fires once per palace per process. This is the SIGSEGV
prevention path for stale HNSW segments (see #1121, #1132, #1263);
wiring it through this helper means CLI mining, search, repair,
and status all benefit, not just the legacy ``make_client``
callers.
Idempotent: safe to call from any code path that is about to open or
re-open a palace. The ``_quarantined_paths`` gate prevents thrash on
hot paths (e.g. ``_client()`` is called on every backend operation).
"""
_fix_blob_seq_ids(palace_path)
if palace_path not in ChromaBackend._quarantined_paths:
quarantine_stale_hnsw(palace_path)
ChromaBackend._quarantined_paths.add(palace_path)
@staticmethod
def make_client(palace_path: str):
"""Create a fresh ``PersistentClient`` (fixes BLOB seq_ids first).
@@ -1040,10 +1065,7 @@ class ChromaBackend(BaseBackend):
:attr:`_quarantined_paths` for the rationale (cold-start protection
vs. runtime thrash on steady-write daemons).
"""
_fix_blob_seq_ids(palace_path)
if palace_path not in ChromaBackend._quarantined_paths:
quarantine_stale_hnsw(palace_path)
ChromaBackend._quarantined_paths.add(palace_path)
ChromaBackend._prepare_palace_for_open(palace_path)
return chromadb.PersistentClient(path=palace_path)
@staticmethod
+61
View File
@@ -764,6 +764,67 @@ def test_make_client_quarantines_each_palace_independently(tmp_path, monkeypatch
assert calls == [palace_a, palace_b]
# ── _client() cold-start gate (#1121, #1132, #1263) ──────────────────────
def test_client_quarantines_corrupt_segment_on_first_open(tmp_path, monkeypatch):
"""The instance ``_client()`` path must run ``quarantine_stale_hnsw``
on first open, mirroring the ``make_client()`` static helper. Before
PR #1173's wiring was extended here, CLI mining / search / repair /
status all skipped the quarantine pass and would SIGSEGV on a stale
HNSW segment (#1121, #1132, #1263)."""
now = 1_700_000_000.0
palace, seg = _make_palace_with_segment(
tmp_path,
hnsw_mtime=now - 7200,
sqlite_mtime=now,
meta_bytes=_CORRUPT_META,
)
monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set())
backend = ChromaBackend()
try:
backend._client(str(palace))
finally:
backend.close()
assert not seg.exists(), "_client() should have quarantined the corrupt segment"
drift_dirs = [p for p in palace.iterdir() if ".drift-" in p.name]
assert len(drift_dirs) == 1
def test_client_quarantines_only_on_first_call_per_palace(tmp_path, monkeypatch):
"""Repeated ``_client()`` calls for the same palace re-run quarantine
at most once — the ``_quarantined_paths`` gate prevents runtime
thrash on hot paths (``_client()`` is hit on every backend op)."""
palace_path = str(tmp_path / "palace")
os.makedirs(palace_path, exist_ok=True)
(Path(palace_path) / "chroma.sqlite3").write_text("")
monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set())
calls: list[str] = []
def _spy(path, stale_seconds=300.0):
calls.append(path)
return []
monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _spy)
backend = ChromaBackend()
try:
backend._client(palace_path)
backend._client(palace_path)
backend._client(palace_path)
finally:
backend.close()
assert calls == [palace_path], (
"quarantine_stale_hnsw should fire once per palace per process from _client(), not on every call"
)
# ── _pin_hnsw_threads (per-process retrofit, separate from this PR's gate) ──