From 0c38deaab5a6c7edec92e1208fe18099b0ef0bdf Mon Sep 17 00:00:00 2001 From: jp Date: Sat, 18 Apr 2026 10:03:22 -0700 Subject: [PATCH] =?UTF-8?q?feat(backends):=20quarantine=5Fstale=5Fhnsw=20?= =?UTF-8?q?=E2=80=94=20recover=20from=20HNSW/sqlite=20drift?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a helper that renames HNSW segment directories whose `data_level0.bin` is significantly older than `chroma.sqlite3`. Drift between the on-disk HNSW graph and the live embeddings table is the root cause of a segfault class where the Rust graph-walk dereferences dangling neighbor pointers for entries in the metadata segment that no longer exist in the HNSW index, crashing in a background thread on `count()` or `query()`. Issue #823 describes the same drift as a silent-staleness symptom (semantic search returns stale results after `add_drawer` because `data_level0.bin` lags the sqlite metadata under the default `sync_threshold=1000`). Under heavier load or after an interrupted write, the same drift can escalate from "silent stale results" to "SIGSEGV on next open," which is the failure mode observed at neo-cortex-mcp#2 (chromadb 1.5.5, Python 3.12) and acknowledged at chroma-core/chroma#2594. On one 135K-drawer palace where `index_metadata.pickle` claimed 137,813 elements against 135,464 rows in sqlite (2,349-entry drift), fresh Python processes crashed in `col.count()` 17/20 times; after renaming the segment dir out of the way and letting ChromaDB rebuild lazily, the same 20-run check went to 0 crashes. The recovery path #823 suggests (export / recreate / reimport) is heavy — it re-embeds every drawer. This helper is lighter: rename the segment dir so ChromaDB reopens without it, and the indexer rebuilds lazily on the next write. The original directory is renamed (not deleted) so the operator can recover if the heuristic misfires. If `chroma.sqlite3` is more than `stale_seconds` (default 3600) newer than the segment's `data_level0.bin`, the segment is considered suspect. One hour is deliberately conservative — normal HNSW flush cadence is seconds to minutes, so an hour of drift implies a crashed mid-write, not routine lag. - Additive: exposes `quarantine_stale_hnsw(palace_path, stale_seconds)` as a helper. Not wired into `_client()` / startup on this PR — the goal is to land the primitive first so operators and higher layers can opt in. A follow-up could call it automatically on palace open behind an env var or config flag. - Closes #823 by giving operators a first-class recovery path without having to install `chromadb-ops` or re-mine. Four new tests in `tests/test_backends.py`: - renames drifted segment, preserves original files under `.drift-TS` suffix - leaves fresh segments alone - no-op on missing palace path / missing `chroma.sqlite3` - skips already-quarantined (`.drift-` suffixed) directories `pytest tests/test_backends.py` → 11 passed. `ruff check` / `ruff format --check` — clean. --- mempalace/backends/chroma.py | 83 ++++++++++++++++++++++++++++++++++++ tests/test_backends.py | 73 ++++++++++++++++++++++++++++++- 2 files changed, 155 insertions(+), 1 deletion(-) diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py index 835ce72..afd8083 100644 --- a/mempalace/backends/chroma.py +++ b/mempalace/backends/chroma.py @@ -1,5 +1,6 @@ """ChromaDB-backed MemPalace storage backend (RFC 001 reference implementation).""" +import datetime as _dt import logging import os import sqlite3 @@ -48,6 +49,88 @@ def _validate_where(where: Optional[dict]) -> None: stack.extend(x for x in v if isinstance(x, dict)) +def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 3600.0) -> list[str]: + """Rename HNSW segment dirs whose files are stale vs. chroma.sqlite3. + + When a ChromaDB 1.5.x PersistentClient opens a palace whose on-disk + HNSW segment is significantly older than ``chroma.sqlite3``, the Rust + graph-walk can dereference dangling neighbor pointers for entries that + exist in the metadata segment but not in the HNSW index, and segfault + in a background thread on the next ``count()`` or ``query(...)`` call. + + This is the same failure mode reported at #823 (semantic search stale + after ``add_drawer``), observed at neo-cortex-mcp#2 (SIGSEGV on + ``count()`` with chromadb 1.5.5), and acknowledged as by-design at + chroma-core/chroma#2594. On one fork palace (135K drawers), the drift + caused a 65–85% crash rate on fresh-process opens; fresh-process + crash rate dropped to 0% after the segment dir was renamed out of the + way and ChromaDB rebuilt lazily. + + Heuristic: if ``chroma.sqlite3`` is more than ``stale_seconds`` newer + than the segment's ``data_level0.bin``, the segment is considered + suspect and renamed to ``.drift-``. ChromaDB reopens + cleanly without it and writes fresh index files on next use. The + original directory is renamed, not deleted, so recovery remains + possible if the heuristic misfires. + + The default threshold (1h) is deliberately conservative — ChromaDB's + HNSW flush cadence means legitimate drift is normally on the order of + seconds to minutes. A segment that is more than an hour out of date is + almost certainly in a "crashed mid-write" state. + + Args: + palace_path: path to the palace directory containing ``chroma.sqlite3`` + stale_seconds: minimum mtime gap to treat a segment as stale + + Returns: + List of paths that were quarantined (empty if nothing drifted). + """ + db_path = os.path.join(palace_path, "chroma.sqlite3") + if not os.path.isfile(db_path): + return [] + try: + sqlite_mtime = os.path.getmtime(db_path) + except OSError: + return [] + + moved: list[str] = [] + try: + entries = os.listdir(palace_path) + except OSError: + return [] + + for name in entries: + if "-" not in name or name.startswith(".") or ".drift-" in name: + continue + seg_dir = os.path.join(palace_path, name) + if not os.path.isdir(seg_dir): + continue + hnsw_bin = os.path.join(seg_dir, "data_level0.bin") + if not os.path.isfile(hnsw_bin): + continue + try: + hnsw_mtime = os.path.getmtime(hnsw_bin) + except OSError: + continue + if sqlite_mtime - hnsw_mtime < stale_seconds: + continue + stamp = _dt.datetime.now().strftime("%Y%m%d-%H%M%S") + target = f"{seg_dir}.drift-{stamp}" + try: + os.rename(seg_dir, target) + moved.append(target) + logger.warning( + "Quarantined stale HNSW segment %s " + "(sqlite %.0fs newer than HNSW); renamed to %s", + seg_dir, + sqlite_mtime - hnsw_mtime, + target, + ) + except OSError: + logger.exception("Failed to quarantine stale HNSW segment %s", seg_dir) + return moved + + def _fix_blob_seq_ids(palace_path: str) -> None: """Fix ChromaDB 0.6.x -> 1.5.x migration bug: BLOB seq_ids -> INTEGER. diff --git a/tests/test_backends.py b/tests/test_backends.py index 2ca2d5a..b3f009a 100644 --- a/tests/test_backends.py +++ b/tests/test_backends.py @@ -1,3 +1,4 @@ +import os import sqlite3 import chromadb @@ -11,7 +12,12 @@ from mempalace.backends import ( available_backends, get_backend, ) -from mempalace.backends.chroma import ChromaBackend, ChromaCollection, _fix_blob_seq_ids +from mempalace.backends.chroma import ( + ChromaBackend, + ChromaCollection, + _fix_blob_seq_ids, + quarantine_stale_hnsw, +) class _FakeCollection: @@ -372,3 +378,68 @@ def test_fix_blob_seq_ids_noop_without_blobs(tmp_path): def test_fix_blob_seq_ids_noop_without_database(tmp_path): """No error when palace has no chroma.sqlite3.""" _fix_blob_seq_ids(str(tmp_path)) # should not raise + + +# ── quarantine_stale_hnsw ───────────────────────────────────────────────── + + +def _make_palace_with_segment(tmp_path, hnsw_mtime, sqlite_mtime): + """Helper: build a palace dir with one HNSW segment + sqlite at given mtimes.""" + palace = tmp_path / "palace" + palace.mkdir() + (palace / "chroma.sqlite3").write_text("") + seg = palace / "abcd-1234-5678" + seg.mkdir() + (seg / "data_level0.bin").write_text("") + os.utime(seg / "data_level0.bin", (hnsw_mtime, hnsw_mtime)) + os.utime(palace / "chroma.sqlite3", (sqlite_mtime, sqlite_mtime)) + return palace, seg + + +def test_quarantine_stale_hnsw_renames_drifted_segment(tmp_path): + """Segment whose data_level0.bin is 2h older than sqlite gets renamed.""" + now = 1_700_000_000.0 + palace, seg = _make_palace_with_segment(tmp_path, hnsw_mtime=now - 7200, sqlite_mtime=now) + moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0) + assert len(moved) == 1 + assert ".drift-" in moved[0] + assert not seg.exists() + # the renamed directory still exists and contains the original file + renamed = list(palace.iterdir()) + drift_dirs = [p for p in renamed if ".drift-" in p.name] + assert len(drift_dirs) == 1 + assert (drift_dirs[0] / "data_level0.bin").exists() + + +def test_quarantine_stale_hnsw_leaves_fresh_segment_alone(tmp_path): + """Segment with recent mtime vs sqlite is not touched.""" + now = 1_700_000_000.0 + palace, seg = _make_palace_with_segment(tmp_path, hnsw_mtime=now - 10, sqlite_mtime=now) + moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0) + assert moved == [] + assert seg.exists() + + +def test_quarantine_stale_hnsw_no_palace(tmp_path): + """Missing palace path or chroma.sqlite3: return [] without raising.""" + assert quarantine_stale_hnsw(str(tmp_path / "missing")) == [] + empty = tmp_path / "empty" + empty.mkdir() + assert quarantine_stale_hnsw(str(empty)) == [] + + +def test_quarantine_stale_hnsw_skips_already_quarantined(tmp_path): + """Directories already named with ``.drift-`` suffix are never re-renamed.""" + now = 1_700_000_000.0 + palace = tmp_path / "palace" + palace.mkdir() + (palace / "chroma.sqlite3").write_text("") + os.utime(palace / "chroma.sqlite3", (now, now)) + drift = palace / "abcd-1234.drift-20260101-000000" + drift.mkdir() + (drift / "data_level0.bin").write_text("") + os.utime(drift / "data_level0.bin", (now - 99999, now - 99999)) + + moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0) + assert moved == [] + assert drift.exists()