Merge pull request #1000 from jphein/fix/quarantine-stale-hnsw

feat(backends): quarantine_stale_hnsw — recover from HNSW/sqlite drift (closes #823)
This commit is contained in:
Ben Sigman
2026-04-18 23:28:00 -07:00
committed by GitHub
2 changed files with 155 additions and 1 deletions
+83
View File
@@ -1,5 +1,6 @@
"""ChromaDB-backed MemPalace storage backend (RFC 001 reference implementation)."""
import datetime as _dt
import logging
import os
import sqlite3
@@ -48,6 +49,88 @@ def _validate_where(where: Optional[dict]) -> None:
stack.extend(x for x in v if isinstance(x, dict))
def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 3600.0) -> list[str]:
    """Rename HNSW segment dirs whose files are stale vs. chroma.sqlite3.

    When a ChromaDB 1.5.x PersistentClient opens a palace whose on-disk
    HNSW segment is significantly older than ``chroma.sqlite3``, the Rust
    graph-walk can dereference dangling neighbor pointers for entries that
    exist in the metadata segment but not in the HNSW index, and segfault
    in a background thread on the next ``count()`` or ``query(...)`` call.

    This is the same failure mode reported at #823 (semantic search stale
    after ``add_drawer``), observed at neo-cortex-mcp#2 (SIGSEGV on
    ``count()`` with chromadb 1.5.5), and acknowledged as by-design at
    chroma-core/chroma#2594. On one fork palace (135K drawers), the drift
    caused a 65–85% crash rate on fresh-process opens; fresh-process
    crash rate dropped to 0% after the segment dir was renamed out of the
    way and ChromaDB rebuilt lazily.

    Heuristic: if ``chroma.sqlite3`` is more than ``stale_seconds`` newer
    than the segment's ``data_level0.bin``, the segment is considered
    suspect and renamed to ``<uuid>.drift-<timestamp>``. ChromaDB reopens
    cleanly without it and writes fresh index files on next use. The
    original directory is renamed, not deleted, so recovery remains
    possible if the heuristic misfires.

    The default threshold (1h) is deliberately conservative — ChromaDB's
    HNSW flush cadence means legitimate drift is normally on the order of
    seconds to minutes. A segment that is more than an hour out of date is
    almost certainly in a "crashed mid-write" state.

    The function is best-effort: filesystem errors while inspecting the
    palace cause the affected entry (or the whole scan) to be skipped, and
    a failed rename is logged rather than raised.

    Args:
        palace_path: path to the palace directory containing ``chroma.sqlite3``
        stale_seconds: minimum mtime gap to treat a segment as stale

    Returns:
        List of paths that were quarantined (empty if nothing drifted).
    """
    db_path = os.path.join(palace_path, "chroma.sqlite3")
    if not os.path.isfile(db_path):
        return []
    try:
        sqlite_mtime = os.path.getmtime(db_path)
        entries = os.listdir(palace_path)
    except OSError:
        # The palace vanished or is unreadable; nothing can be quarantined.
        return []

    moved: list[str] = []
    for name in entries:
        # Segment directories are UUID-named (so they contain a dash); skip
        # hidden entries and anything a previous run already quarantined.
        if "-" not in name or name.startswith(".") or ".drift-" in name:
            continue
        seg_dir = os.path.join(palace_path, name)
        if not os.path.isdir(seg_dir):
            continue
        hnsw_bin = os.path.join(seg_dir, "data_level0.bin")
        if not os.path.isfile(hnsw_bin):
            continue
        try:
            hnsw_mtime = os.path.getmtime(hnsw_bin)
        except OSError:
            continue

        drift_seconds = sqlite_mtime - hnsw_mtime
        if drift_seconds < stale_seconds:
            continue

        stamp = _dt.datetime.now().strftime("%Y%m%d-%H%M%S")
        target = f"{seg_dir}.drift-{stamp}"
        # Only the rename itself is guarded: a failure in the bookkeeping
        # below must not be misreported as a failed quarantine.
        try:
            os.rename(seg_dir, target)
        except OSError:
            logger.exception("Failed to quarantine stale HNSW segment %s", seg_dir)
        else:
            moved.append(target)
            logger.warning(
                "Quarantined stale HNSW segment %s "
                "(sqlite %.0fs newer than HNSW); renamed to %s",
                seg_dir,
                drift_seconds,
                target,
            )
    return moved
def _fix_blob_seq_ids(palace_path: str) -> None:
"""Fix ChromaDB 0.6.x -> 1.5.x migration bug: BLOB seq_ids -> INTEGER.
+72 -1
View File
@@ -1,3 +1,4 @@
import os
import sqlite3
import chromadb
@@ -11,7 +12,12 @@ from mempalace.backends import (
available_backends,
get_backend,
)
from mempalace.backends.chroma import ChromaBackend, ChromaCollection, _fix_blob_seq_ids
from mempalace.backends.chroma import (
ChromaBackend,
ChromaCollection,
_fix_blob_seq_ids,
quarantine_stale_hnsw,
)
class _FakeCollection:
@@ -372,3 +378,68 @@ def test_fix_blob_seq_ids_noop_without_blobs(tmp_path):
def test_fix_blob_seq_ids_noop_without_database(tmp_path):
    """A palace directory lacking chroma.sqlite3 is handled without raising."""
    palace_without_db = str(tmp_path)
    # The test passes as long as the call below does not raise.
    _fix_blob_seq_ids(palace_without_db)
# ── quarantine_stale_hnsw ─────────────────────────────────────────────────
def _make_palace_with_segment(tmp_path, hnsw_mtime, sqlite_mtime):
    """Build ``tmp_path/palace`` with an empty sqlite file and one HNSW segment.

    Both ``chroma.sqlite3`` and the segment's ``data_level0.bin`` are created
    empty and then stamped with the requested access/modification times.

    Returns:
        ``(palace_dir, segment_dir)`` as path objects.
    """
    palace_dir = tmp_path / "palace"
    palace_dir.mkdir()
    segment_dir = palace_dir / "abcd-1234-5678"
    segment_dir.mkdir()

    # Create each file empty, then pin its timestamps to the requested value.
    stamped_files = (
        (palace_dir / "chroma.sqlite3", sqlite_mtime),
        (segment_dir / "data_level0.bin", hnsw_mtime),
    )
    for file_path, mtime in stamped_files:
        file_path.write_text("")
        os.utime(file_path, (mtime, mtime))

    return palace_dir, segment_dir
def test_quarantine_stale_hnsw_renames_drifted_segment(tmp_path):
    """A segment two hours behind sqlite is moved aside with its contents intact."""
    sqlite_time = 1_700_000_000.0
    palace, seg = _make_palace_with_segment(
        tmp_path,
        hnsw_mtime=sqlite_time - 7200,
        sqlite_mtime=sqlite_time,
    )

    quarantined = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)

    assert len(quarantined) == 1
    assert ".drift-" in quarantined[0]
    assert not seg.exists()

    # The directory was renamed rather than deleted, so its index file survives.
    leftovers = [entry for entry in palace.iterdir() if ".drift-" in entry.name]
    assert len(leftovers) == 1
    assert (leftovers[0] / "data_level0.bin").exists()
def test_quarantine_stale_hnsw_leaves_fresh_segment_alone(tmp_path):
    """A segment only ten seconds behind sqlite stays where it is."""
    sqlite_time = 1_700_000_000.0
    palace, seg = _make_palace_with_segment(
        tmp_path,
        hnsw_mtime=sqlite_time - 10,
        sqlite_mtime=sqlite_time,
    )

    quarantined = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)

    assert quarantined == []
    assert seg.exists()
def test_quarantine_stale_hnsw_no_palace(tmp_path):
    """An absent palace dir, or one without chroma.sqlite3, yields [] and no error."""
    missing_dir = tmp_path / "missing"
    assert quarantine_stale_hnsw(str(missing_dir)) == []

    # The directory exists but holds no chroma.sqlite3.
    empty_dir = tmp_path / "empty"
    empty_dir.mkdir()
    assert quarantine_stale_hnsw(str(empty_dir)) == []
def test_quarantine_stale_hnsw_skips_already_quarantined(tmp_path):
    """A directory whose name already carries ``.drift-`` is left untouched."""
    sqlite_time = 1_700_000_000.0
    ancient = sqlite_time - 99999

    palace_dir = tmp_path / "palace"
    palace_dir.mkdir()
    db_file = palace_dir / "chroma.sqlite3"
    db_file.write_text("")
    os.utime(db_file, (sqlite_time, sqlite_time))

    # A previously quarantined segment, far older than the staleness threshold.
    quarantined_dir = palace_dir / "abcd-1234.drift-20260101-000000"
    quarantined_dir.mkdir()
    index_file = quarantined_dir / "data_level0.bin"
    index_file.write_text("")
    os.utime(index_file, (ancient, ancient))

    result = quarantine_stale_hnsw(str(palace_dir), stale_seconds=3600.0)

    assert result == []
    assert quarantined_dir.exists()