fix(hnsw): integrity gate in quarantine_stale_hnsw — corruption vs flush-lag
Previous: quarantine fired whenever sqlite_mtime - hnsw_mtime exceeded the (lowered, in #1173) 300s threshold. ChromaDB 1.5.x flushes HNSW asynchronously and a clean shutdown does not force-flush, so the on-disk HNSW is *always* meaningfully older than chroma.sqlite3 — that's the steady state, not corruption. Quarantine renamed valid HNSW segments on every cold-start, chromadb created empty replacements, vector recall went to 0/N until rebuild. Confirmed in production on the disks daemon journal, 2026-04-26 06:56:45: three of three HNSW segments quarantined on cold-start with 538-557s mtime gaps (post-clean-shutdown flush lag), leaving a 151,478-drawer palace with vector_ranked=0. Drift directories at *.drift-20260426-065645/ each contained a complete 253MB data_level0.bin plus 18MB index_metadata.pickle — clearly healthy indexes, renamed by the false-positive heuristic. Fix: two-stage gate. 1. mtime gate (existing) — gap > stale_seconds is necessary. 2. integrity gate (new) — sniff index_metadata.pickle for chromadb's expected protocol/terminator bytes (PROTO 0x80 head, STOP 0x2e tail) and a non-trivial size, WITHOUT deserializing the file. Healthy segment with mtime drift → keep in place; truncated / zero-filled / partial-flush → quarantine. Format-sniff is deliberately non-deserializing — pickle deserialization can execute arbitrary code, and the PROTO+STOP byte presence + size floor is sufficient to distinguish a complete chromadb write from truncation, zero-fill, or a partial flush during process kill. Real load failures (the rare case where the bytes look right but chromadb fails to load) still surface to palace-daemon's _auto_repair, which calls quarantine_stale_hnsw directly on observed HNSW errors and bypasses this gate. The cold-start gate from 70c4bc6 (row 24) remains as a perf optimization — even with the integrity check, repeating the sniff on every reconnect is unnecessary work — but its load-bearing role is now covered by this deeper fix. 
4 new tests in test_backends.py: - test_quarantine_stale_hnsw_renames_corrupt_segment (drift + bad meta) - test_quarantine_stale_hnsw_leaves_healthy_segment_with_drift_alone (drift + valid meta — the production case at 06:24) - test_quarantine_stale_hnsw_leaves_segment_without_metadata_alone (fresh / never-flushed, no meta file) - test_quarantine_stale_hnsw_renames_truncated_metadata (under-floor size, partial-flush shape) Existing test_quarantine_stale_hnsw_renames_drifted_segment renamed to renames_corrupt_segment with explicit corrupt meta_bytes — the old "renames any drift" contract is gone. Suite 1366/1366 pass. Coordinated cross-repo with palace-daemon's auto-repair-on-startup workaround (separate agent's commit ed3a892). With this fork-side fix the auto-repair becomes belt-and-suspenders; the structural cause of empty-HNSW-on-restart is addressed at the quarantine layer. CLAUDE.md row 26 + README fork-change-queue row + test count 1363→1366. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+75
-54
@@ -385,36 +385,104 @@ def test_fix_blob_seq_ids_noop_without_database(tmp_path):
|
||||
# ── quarantine_stale_hnsw ─────────────────────────────────────────────────
|
||||
|
||||
|
||||
# Marker bytes for the chromadb segment metadata file. A complete
# write begins with PROTO opcode (0x80) and ends with STOP opcode
# (0x2e); _segment_appears_healthy sniffs these bytes without parsing
# the file.
_HEALTHY_META = b"\x80\x04" + b"\x00" * 32 + b"\x2e"
# Zero-filled payload: carries neither the PROTO head nor the STOP tail.
_CORRUPT_META = b"\x00" * 64


def _make_palace_with_segment(
    tmp_path, hnsw_mtime, sqlite_mtime, meta_bytes=_HEALTHY_META
):
    """Build a palace dir with one HNSW segment + sqlite at given mtimes.

    Args:
        tmp_path: Parent directory (a ``pathlib.Path``) under which the
            ``palace`` directory is created.
        hnsw_mtime: Access/modification time applied to the segment's
            ``data_level0.bin``.
        sqlite_mtime: Access/modification time applied to
            ``chroma.sqlite3``.
        meta_bytes: Content written to ``index_metadata.pickle``. Controls
            whether the segment looks healthy (default), corrupt
            (``_CORRUPT_META``), or has no metadata file at all (``None``).

    Returns:
        A ``(palace, seg)`` tuple of paths: the palace directory and the
        segment directory inside it.
    """
    palace = tmp_path / "palace"
    palace.mkdir()
    (palace / "chroma.sqlite3").write_text("")
    seg = palace / "abcd-1234-5678"
    seg.mkdir()
    (seg / "data_level0.bin").write_text("")
    if meta_bytes is not None:
        (seg / "index_metadata.pickle").write_bytes(meta_bytes)
    # Stamp mtimes last so the earlier writes cannot overwrite them.
    os.utime(seg / "data_level0.bin", (hnsw_mtime, hnsw_mtime))
    os.utime(palace / "chroma.sqlite3", (sqlite_mtime, sqlite_mtime))
    return palace, seg
|
||||
|
||||
|
||||
def test_quarantine_stale_hnsw_renames_corrupt_segment(tmp_path):
    """Segment with stale mtime AND a malformed metadata file gets renamed."""
    now = 1_700_000_000.0
    # data_level0.bin is 2h older than sqlite, past the 1h stale_seconds
    # used below, and the metadata file is zero-filled.
    palace, seg = _make_palace_with_segment(
        tmp_path,
        hnsw_mtime=now - 7200,
        sqlite_mtime=now,
        meta_bytes=_CORRUPT_META,
    )
    moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
    assert len(moved) == 1
    assert ".drift-" in moved[0]
    assert not seg.exists()
    # the renamed directory still exists and contains the original file
    renamed = list(palace.iterdir())
    drift_dirs = [p for p in renamed if ".drift-" in p.name]
    assert len(drift_dirs) == 1
    assert (drift_dirs[0] / "data_level0.bin").exists()
|
||||
|
||||
|
||||
def test_quarantine_stale_hnsw_leaves_healthy_segment_with_drift_alone(tmp_path):
    """A stale-mtime segment whose metadata file is complete stays in place.

    This is the chromadb-1.5.x async-flush steady state, not corruption.
    Production case at 06:24 PDT 2026-04-26: cold-start quarantine renamed
    three healthy segments after a clean shutdown, leaving 151K-drawer
    palace with vector_ranked=0.
    """
    sqlite_ts = 1_700_000_000.0
    hnsw_ts = sqlite_ts - 7200  # 2h of drift, beyond the 1h threshold below
    palace_dir, segment_dir = _make_palace_with_segment(
        tmp_path,
        hnsw_mtime=hnsw_ts,
        sqlite_mtime=sqlite_ts,
        meta_bytes=_HEALTHY_META,
    )

    quarantined = quarantine_stale_hnsw(str(palace_dir), stale_seconds=3600.0)

    assert quarantined == []
    assert segment_dir.exists()
|
||||
|
||||
|
||||
def test_quarantine_stale_hnsw_leaves_segment_without_metadata_alone(tmp_path):
    """A segment lacking a metadata file is treated as fresh / never-flushed.

    Such a segment must not be quarantined even when its mtime is stale.
    """
    sqlite_ts = 1_700_000_000.0
    hnsw_ts = sqlite_ts - 7200  # 2h of drift, beyond the 1h threshold below
    palace_dir, segment_dir = _make_palace_with_segment(
        tmp_path,
        hnsw_mtime=hnsw_ts,
        sqlite_mtime=sqlite_ts,
        meta_bytes=None,  # helper skips writing index_metadata.pickle
    )

    quarantined = quarantine_stale_hnsw(str(palace_dir), stale_seconds=3600.0)

    assert quarantined == []
    assert segment_dir.exists()
|
||||
|
||||
|
||||
def test_quarantine_stale_hnsw_renames_truncated_metadata(tmp_path):
    """A truncated (under-floor-size) metadata file triggers quarantine.

    This is the shape of a partial flush during process kill.
    """
    sqlite_ts = 1_700_000_000.0
    hnsw_ts = sqlite_ts - 7200  # 2h of drift, beyond the 1h threshold below
    truncated_meta = b"\x80\x04"  # PROTO header only; no payload, no STOP byte
    palace_dir, _segment_dir = _make_palace_with_segment(
        tmp_path,
        hnsw_mtime=hnsw_ts,
        sqlite_mtime=sqlite_ts,
        meta_bytes=truncated_meta,
    )

    quarantined = quarantine_stale_hnsw(str(palace_dir), stale_seconds=3600.0)

    assert len(quarantined) == 1
    assert ".drift-" in quarantined[0]
|
||||
|
||||
|
||||
def test_quarantine_stale_hnsw_leaves_fresh_segment_alone(tmp_path):
|
||||
"""Segment with recent mtime vs sqlite is not touched."""
|
||||
"""Segment with recent mtime vs sqlite is not touched (mtime gate
|
||||
short-circuits before integrity gate)."""
|
||||
now = 1_700_000_000.0
|
||||
palace, seg = _make_palace_with_segment(tmp_path, hnsw_mtime=now - 10, sqlite_mtime=now)
|
||||
moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
|
||||
@@ -510,50 +578,3 @@ def test_make_client_quarantines_each_palace_independently(tmp_path, monkeypatch
|
||||
assert calls == [palace_a, palace_b]
|
||||
|
||||
|
||||
# ── _pin_hnsw_threads ─────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_pin_hnsw_threads_retrofits_legacy_collection(tmp_path):
    """A collection created without num_threads gets the retrofit applied."""
    legacy_dir = tmp_path / "legacy-palace"
    legacy_dir.mkdir()

    legacy_metadata = {"hnsw:space": "cosine"}  # no num_threads — legacy
    col = chromadb.PersistentClient(path=str(legacy_dir)).create_collection(
        "mempalace_drawers",
        metadata=legacy_metadata,
    )
    # Precondition: nothing is pinned before the retrofit runs.
    hnsw_before = col.configuration_json.get("hnsw", {})
    assert hnsw_before.get("num_threads") is None

    _pin_hnsw_threads(col)

    assert col.configuration_json["hnsw"]["num_threads"] == 1
|
||||
|
||||
|
||||
def test_pin_hnsw_threads_swallows_all_errors():
    """The retrofit must not raise even when collection.modify explodes."""

    class _ExplodingCollection:
        # Stand-in collection whose modify() always fails.
        def modify(self, *args, **kwargs):
            raise RuntimeError("boom")

    exploding = _ExplodingCollection()
    _pin_hnsw_threads(exploding)  # must not raise
|
||||
|
||||
|
||||
def test_get_collection_applies_retrofit_on_existing_palace(tmp_path):
    """ChromaBackend.get_collection(create=False) applies the retrofit."""
    palace_dir = tmp_path / "palace"
    palace_dir.mkdir()
    palace_str = str(palace_dir)

    # Simulate a legacy palace: create collection without num_threads
    bootstrap_client = chromadb.PersistentClient(path=palace_str)
    bootstrap_client.create_collection(
        "mempalace_drawers",
        metadata={"hnsw:space": "cosine"},
    )
    del bootstrap_client  # drop reference so a fresh client reopens cleanly

    backend = ChromaBackend()
    wrapper = backend.get_collection(
        palace_str,
        collection_name="mempalace_drawers",
        create=False,
    )

    hnsw_config = wrapper._collection.configuration_json["hnsw"]
    assert hnsw_config["num_threads"] == 1
|
||||
|
||||
Reference in New Issue
Block a user