fix(hnsw): gate quarantine_stale_hnsw to cold-start, not every reconnect
Symptom on the canonical disks daemon: drift quarantines firing every 10–30 minutes throughout the day under steady write load. Logs show .drift-* directories accumulating despite the daemon being the only writer (no Syncthing replication of palace data). Root cause is a false-positive thrash in the quarantine heuristic: - chroma.sqlite3 mtime bumps on every write (millisecond cadence). - HNSW segment files (data_level0.bin) only flush to disk on chromadb's internal cadence, which can lag minutes behind sqlite under continuous write load. Once the gap exceeds the 300s threshold, quarantine_stale_hnsw renames a perfectly valid HNSW segment, chromadb rebuilds it from scratch, and the cycle repeats as soon as the next batch of writes lands. The 300s threshold (lowered from 3600s in PR #1173 after a 0.96h-drift production segfault) is correct for the cross-machine-replication failure mode it was designed for, but wrong for a daemon-strict deployment whose only "drift" source is its own benign flush lag. Fix: gate the proactive quarantine check to the first ``make_client()`` invocation per palace per process (``ChromaBackend._quarantined_paths`` set). Real cold-start drift (replication, partial restore, crashed-mid- write) still gets caught — that's exactly when a fresh daemon process opens the palace. Real runtime drift on observed HNSW errors still gets caught via palace-daemon's ``_auto_repair`` which calls ``quarantine_stale_hnsw`` directly, bypassing this gate. Two new tests in test_backends.py verify single-fire-per-palace and per-palace independence. Conftest clears the gate between tests. Suite 1362/1362 pass. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -46,6 +46,14 @@ def _reset_mcp_cache():
|
||||
mcp_server._collection_cache = None
|
||||
except (ImportError, AttributeError):
|
||||
pass
|
||||
try:
|
||||
# Reset the per-process quarantine gate so tests don't leak
|
||||
# state through ChromaBackend._quarantined_paths.
|
||||
from mempalace.backends.chroma import ChromaBackend
|
||||
|
||||
ChromaBackend._quarantined_paths.clear()
|
||||
except (ImportError, AttributeError):
|
||||
pass
|
||||
|
||||
_clear_cache()
|
||||
yield
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import os
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
|
||||
import chromadb
|
||||
import pytest
|
||||
@@ -446,6 +447,69 @@ def test_quarantine_stale_hnsw_skips_already_quarantined(tmp_path):
|
||||
assert drift.exists()
|
||||
|
||||
|
||||
# ── make_client cold-start gate ──────────────────────────────────────────
|
||||
|
||||
|
||||
def test_make_client_quarantines_only_on_first_call_per_palace(tmp_path, monkeypatch):
|
||||
"""Quarantine fires on first ``make_client()`` for a palace, then is
|
||||
skipped on subsequent calls — prevents runtime thrash where a daemon's
|
||||
own steady writes bump ``chroma.sqlite3`` faster than HNSW flushes,
|
||||
making the mtime heuristic falsely trigger every reconnect."""
|
||||
from mempalace.backends.chroma import ChromaBackend
|
||||
|
||||
palace_path = str(tmp_path / "palace")
|
||||
os.makedirs(palace_path, exist_ok=True)
|
||||
(Path(palace_path) / "chroma.sqlite3").write_text("")
|
||||
|
||||
# Reset the per-process cache so this test is independent of others.
|
||||
monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set())
|
||||
|
||||
calls: list[str] = []
|
||||
|
||||
def _spy(path, stale_seconds=300.0):
|
||||
calls.append(path)
|
||||
return []
|
||||
|
||||
monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _spy)
|
||||
|
||||
ChromaBackend.make_client(palace_path)
|
||||
ChromaBackend.make_client(palace_path)
|
||||
ChromaBackend.make_client(palace_path)
|
||||
|
||||
assert calls == [palace_path], (
|
||||
"quarantine_stale_hnsw should fire once per palace per process, not on every reconnect"
|
||||
)
|
||||
|
||||
|
||||
def test_make_client_quarantines_each_palace_independently(tmp_path, monkeypatch):
|
||||
"""Two distinct palaces each get one quarantine attempt — the gate is
|
||||
keyed by palace path, not global."""
|
||||
from mempalace.backends.chroma import ChromaBackend
|
||||
|
||||
palace_a = str(tmp_path / "palace_a")
|
||||
palace_b = str(tmp_path / "palace_b")
|
||||
for p in (palace_a, palace_b):
|
||||
os.makedirs(p, exist_ok=True)
|
||||
(Path(p) / "chroma.sqlite3").write_text("")
|
||||
|
||||
monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set())
|
||||
|
||||
calls: list[str] = []
|
||||
|
||||
def _spy(path, stale_seconds=300.0):
|
||||
calls.append(path)
|
||||
return []
|
||||
|
||||
monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _spy)
|
||||
|
||||
ChromaBackend.make_client(palace_a)
|
||||
ChromaBackend.make_client(palace_b)
|
||||
ChromaBackend.make_client(palace_a) # already gated
|
||||
ChromaBackend.make_client(palace_b) # already gated
|
||||
|
||||
assert calls == [palace_a, palace_b]
|
||||
|
||||
|
||||
# ── _pin_hnsw_threads ─────────────────────────────────────────────────────
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user