Merge pull request #1000 from jphein/fix/quarantine-stale-hnsw
feat(backends): quarantine_stale_hnsw — recover from HNSW/sqlite drift (closes #823)
This commit is contained in:
@@ -1,5 +1,6 @@
|
|||||||
"""ChromaDB-backed MemPalace storage backend (RFC 001 reference implementation)."""
|
"""ChromaDB-backed MemPalace storage backend (RFC 001 reference implementation)."""
|
||||||
|
|
||||||
|
import datetime as _dt
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import sqlite3
|
import sqlite3
|
||||||
@@ -48,6 +49,88 @@ def _validate_where(where: Optional[dict]) -> None:
|
|||||||
stack.extend(x for x in v if isinstance(x, dict))
|
stack.extend(x for x in v if isinstance(x, dict))
|
||||||
|
|
||||||
|
|
||||||
|
def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 3600.0) -> list[str]:
|
||||||
|
"""Rename HNSW segment dirs whose files are stale vs. chroma.sqlite3.
|
||||||
|
|
||||||
|
When a ChromaDB 1.5.x PersistentClient opens a palace whose on-disk
|
||||||
|
HNSW segment is significantly older than ``chroma.sqlite3``, the Rust
|
||||||
|
graph-walk can dereference dangling neighbor pointers for entries that
|
||||||
|
exist in the metadata segment but not in the HNSW index, and segfault
|
||||||
|
in a background thread on the next ``count()`` or ``query(...)`` call.
|
||||||
|
|
||||||
|
This is the same failure mode reported at #823 (semantic search stale
|
||||||
|
after ``add_drawer``), observed at neo-cortex-mcp#2 (SIGSEGV on
|
||||||
|
``count()`` with chromadb 1.5.5), and acknowledged as by-design at
|
||||||
|
chroma-core/chroma#2594. On one fork palace (135K drawers), the drift
|
||||||
|
caused a 65–85% crash rate on fresh-process opens; fresh-process
|
||||||
|
crash rate dropped to 0% after the segment dir was renamed out of the
|
||||||
|
way and ChromaDB rebuilt lazily.
|
||||||
|
|
||||||
|
Heuristic: if ``chroma.sqlite3`` is more than ``stale_seconds`` newer
|
||||||
|
than the segment's ``data_level0.bin``, the segment is considered
|
||||||
|
suspect and renamed to ``<uuid>.drift-<timestamp>``. ChromaDB reopens
|
||||||
|
cleanly without it and writes fresh index files on next use. The
|
||||||
|
original directory is renamed, not deleted, so recovery remains
|
||||||
|
possible if the heuristic misfires.
|
||||||
|
|
||||||
|
The default threshold (1h) is deliberately conservative — ChromaDB's
|
||||||
|
HNSW flush cadence means legitimate drift is normally on the order of
|
||||||
|
seconds to minutes. A segment that is more than an hour out of date is
|
||||||
|
almost certainly in a "crashed mid-write" state.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
palace_path: path to the palace directory containing ``chroma.sqlite3``
|
||||||
|
stale_seconds: minimum mtime gap to treat a segment as stale
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of paths that were quarantined (empty if nothing drifted).
|
||||||
|
"""
|
||||||
|
db_path = os.path.join(palace_path, "chroma.sqlite3")
|
||||||
|
if not os.path.isfile(db_path):
|
||||||
|
return []
|
||||||
|
try:
|
||||||
|
sqlite_mtime = os.path.getmtime(db_path)
|
||||||
|
except OSError:
|
||||||
|
return []
|
||||||
|
|
||||||
|
moved: list[str] = []
|
||||||
|
try:
|
||||||
|
entries = os.listdir(palace_path)
|
||||||
|
except OSError:
|
||||||
|
return []
|
||||||
|
|
||||||
|
for name in entries:
|
||||||
|
if "-" not in name or name.startswith(".") or ".drift-" in name:
|
||||||
|
continue
|
||||||
|
seg_dir = os.path.join(palace_path, name)
|
||||||
|
if not os.path.isdir(seg_dir):
|
||||||
|
continue
|
||||||
|
hnsw_bin = os.path.join(seg_dir, "data_level0.bin")
|
||||||
|
if not os.path.isfile(hnsw_bin):
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
hnsw_mtime = os.path.getmtime(hnsw_bin)
|
||||||
|
except OSError:
|
||||||
|
continue
|
||||||
|
if sqlite_mtime - hnsw_mtime < stale_seconds:
|
||||||
|
continue
|
||||||
|
stamp = _dt.datetime.now().strftime("%Y%m%d-%H%M%S")
|
||||||
|
target = f"{seg_dir}.drift-{stamp}"
|
||||||
|
try:
|
||||||
|
os.rename(seg_dir, target)
|
||||||
|
moved.append(target)
|
||||||
|
logger.warning(
|
||||||
|
"Quarantined stale HNSW segment %s "
|
||||||
|
"(sqlite %.0fs newer than HNSW); renamed to %s",
|
||||||
|
seg_dir,
|
||||||
|
sqlite_mtime - hnsw_mtime,
|
||||||
|
target,
|
||||||
|
)
|
||||||
|
except OSError:
|
||||||
|
logger.exception("Failed to quarantine stale HNSW segment %s", seg_dir)
|
||||||
|
return moved
|
||||||
|
|
||||||
|
|
||||||
def _fix_blob_seq_ids(palace_path: str) -> None:
|
def _fix_blob_seq_ids(palace_path: str) -> None:
|
||||||
"""Fix ChromaDB 0.6.x -> 1.5.x migration bug: BLOB seq_ids -> INTEGER.
|
"""Fix ChromaDB 0.6.x -> 1.5.x migration bug: BLOB seq_ids -> INTEGER.
|
||||||
|
|
||||||
|
|||||||
+72
-1
@@ -1,3 +1,4 @@
|
|||||||
|
import os
|
||||||
import sqlite3
|
import sqlite3
|
||||||
|
|
||||||
import chromadb
|
import chromadb
|
||||||
@@ -11,7 +12,12 @@ from mempalace.backends import (
|
|||||||
available_backends,
|
available_backends,
|
||||||
get_backend,
|
get_backend,
|
||||||
)
|
)
|
||||||
from mempalace.backends.chroma import ChromaBackend, ChromaCollection, _fix_blob_seq_ids
|
from mempalace.backends.chroma import (
|
||||||
|
ChromaBackend,
|
||||||
|
ChromaCollection,
|
||||||
|
_fix_blob_seq_ids,
|
||||||
|
quarantine_stale_hnsw,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class _FakeCollection:
|
class _FakeCollection:
|
||||||
@@ -372,3 +378,68 @@ def test_fix_blob_seq_ids_noop_without_blobs(tmp_path):
|
|||||||
def test_fix_blob_seq_ids_noop_without_database(tmp_path):
|
def test_fix_blob_seq_ids_noop_without_database(tmp_path):
|
||||||
"""No error when palace has no chroma.sqlite3."""
|
"""No error when palace has no chroma.sqlite3."""
|
||||||
_fix_blob_seq_ids(str(tmp_path)) # should not raise
|
_fix_blob_seq_ids(str(tmp_path)) # should not raise
|
||||||
|
|
||||||
|
|
||||||
|
# ── quarantine_stale_hnsw ─────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _make_palace_with_segment(tmp_path, hnsw_mtime, sqlite_mtime):
|
||||||
|
"""Helper: build a palace dir with one HNSW segment + sqlite at given mtimes."""
|
||||||
|
palace = tmp_path / "palace"
|
||||||
|
palace.mkdir()
|
||||||
|
(palace / "chroma.sqlite3").write_text("")
|
||||||
|
seg = palace / "abcd-1234-5678"
|
||||||
|
seg.mkdir()
|
||||||
|
(seg / "data_level0.bin").write_text("")
|
||||||
|
os.utime(seg / "data_level0.bin", (hnsw_mtime, hnsw_mtime))
|
||||||
|
os.utime(palace / "chroma.sqlite3", (sqlite_mtime, sqlite_mtime))
|
||||||
|
return palace, seg
|
||||||
|
|
||||||
|
|
||||||
|
def test_quarantine_stale_hnsw_renames_drifted_segment(tmp_path):
|
||||||
|
"""Segment whose data_level0.bin is 2h older than sqlite gets renamed."""
|
||||||
|
now = 1_700_000_000.0
|
||||||
|
palace, seg = _make_palace_with_segment(tmp_path, hnsw_mtime=now - 7200, sqlite_mtime=now)
|
||||||
|
moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
|
||||||
|
assert len(moved) == 1
|
||||||
|
assert ".drift-" in moved[0]
|
||||||
|
assert not seg.exists()
|
||||||
|
# the renamed directory still exists and contains the original file
|
||||||
|
renamed = list(palace.iterdir())
|
||||||
|
drift_dirs = [p for p in renamed if ".drift-" in p.name]
|
||||||
|
assert len(drift_dirs) == 1
|
||||||
|
assert (drift_dirs[0] / "data_level0.bin").exists()
|
||||||
|
|
||||||
|
|
||||||
|
def test_quarantine_stale_hnsw_leaves_fresh_segment_alone(tmp_path):
|
||||||
|
"""Segment with recent mtime vs sqlite is not touched."""
|
||||||
|
now = 1_700_000_000.0
|
||||||
|
palace, seg = _make_palace_with_segment(tmp_path, hnsw_mtime=now - 10, sqlite_mtime=now)
|
||||||
|
moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
|
||||||
|
assert moved == []
|
||||||
|
assert seg.exists()
|
||||||
|
|
||||||
|
|
||||||
|
def test_quarantine_stale_hnsw_no_palace(tmp_path):
|
||||||
|
"""Missing palace path or chroma.sqlite3: return [] without raising."""
|
||||||
|
assert quarantine_stale_hnsw(str(tmp_path / "missing")) == []
|
||||||
|
empty = tmp_path / "empty"
|
||||||
|
empty.mkdir()
|
||||||
|
assert quarantine_stale_hnsw(str(empty)) == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_quarantine_stale_hnsw_skips_already_quarantined(tmp_path):
|
||||||
|
"""Directories already named with ``.drift-`` suffix are never re-renamed."""
|
||||||
|
now = 1_700_000_000.0
|
||||||
|
palace = tmp_path / "palace"
|
||||||
|
palace.mkdir()
|
||||||
|
(palace / "chroma.sqlite3").write_text("")
|
||||||
|
os.utime(palace / "chroma.sqlite3", (now, now))
|
||||||
|
drift = palace / "abcd-1234.drift-20260101-000000"
|
||||||
|
drift.mkdir()
|
||||||
|
(drift / "data_level0.bin").write_text("")
|
||||||
|
os.utime(drift / "data_level0.bin", (now - 99999, now - 99999))
|
||||||
|
|
||||||
|
moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
|
||||||
|
assert moved == []
|
||||||
|
assert drift.exists()
|
||||||
|
|||||||
Reference in New Issue
Block a user