Merge pull request #1339 from fatkobra/fix/1218-hnsw-link-payload-health

fix(storage): quarantine bloated HNSW link payloads (#1218)
This commit is contained in:
Igor Lins e Silva
2026-05-07 10:19:47 -03:00
committed by GitHub
2 changed files with 205 additions and 55 deletions
+92 -55
View File
@@ -32,6 +32,51 @@ _REQUIRED_OPERATORS = frozenset({"$eq", "$ne", "$in", "$nin", "$and", "$or", "$c
_OPTIONAL_OPERATORS = frozenset({"$gt", "$gte", "$lt", "$lte"})
_SUPPORTED_OPERATORS = _REQUIRED_OPERATORS | _OPTIONAL_OPERATORS
# A healthy HNSW payload should keep link_lists.bin proportional to
# data_level0.bin. When link_lists.bin grows orders of magnitude larger than
# data_level0.bin, Chroma/HNSW can segfault while opening the segment even if
# index_metadata.pickle is structurally valid.
#
# The report in #1218 showed ratios above 300x, while healthy snapshots were far below 1x.
# Treat only >10x as corruption so normal flush lag or small segments do not get
# quarantined.
_HNSW_LINK_TO_DATA_MAX_RATIO = 10.0
def _hnsw_link_to_data_ratio(seg_dir: str) -> Optional[float]:
"""Return link_lists.bin / data_level0.bin size ratio for a segment.
``None`` means the ratio is not meaningful, usually because one file is
missing or data_level0.bin is empty. ``float("inf")`` means the files were
present but could not be statted safely, which should be treated as
suspicious by callers.
"""
link_path = os.path.join(seg_dir, "link_lists.bin")
data_path = os.path.join(seg_dir, "data_level0.bin")
if not (os.path.isfile(link_path) and os.path.isfile(data_path)):
return None
try:
data_size = os.path.getsize(data_path)
link_size = os.path.getsize(link_path)
except OSError:
return float("inf")
if data_size <= 0:
return None
return link_size / data_size
def _hnsw_payload_appears_sane(seg_dir: str) -> bool:
"""Return False when HNSW payload files are structurally implausible."""
ratio = _hnsw_link_to_data_ratio(seg_dir)
return ratio is None or ratio <= _HNSW_LINK_TO_DATA_MAX_RATIO
# HNSW tuning to prevent link_lists.bin bloat on large mines (#344).
#
# With default params (batch_size=100, sync_threshold=1000, initial capacity
@@ -109,6 +154,9 @@ def _segment_appears_healthy(seg_dir: str) -> bool:
files and quarantine_stale_hnsw would conservatively rename them
out of the way (lazy rebuild on next open recovers).
"""
if not _hnsw_payload_appears_sane(seg_dir):
return False
meta_path = os.path.join(seg_dir, "index_metadata.pickle")
if not os.path.isfile(meta_path):
# No metadata file yet — segment hasn't flushed (fresh / empty).
@@ -130,64 +178,35 @@ def _segment_appears_healthy(seg_dir: str) -> bool:
def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> list[str]:
"""Rename HNSW segment dirs that are both stale-by-mtime AND fail an
integrity sniff-test.
"""Rename HNSW segment dirs that look unsafe to open.
Catches the segfault failure mode from #823 (semantic search stale
after ``add_drawer``), observed at neo-cortex-mcp#2 (SIGSEGV on
``count()`` with chromadb 1.5.5), and acknowledged as by-design at
chroma-core/chroma#2594. Renaming a corrupt segment lets chromadb
rebuild lazily on next open instead of segfaulting.
This catches two classes of HNSW corruption before ChromaDB opens the
native segment reader:
Two-stage check:
1. stale-by-mtime segments whose ``index_metadata.pickle`` fails the
existing format sniff-test;
2. structurally impossible HNSW payloads where ``link_lists.bin`` is much
larger than ``data_level0.bin``.
1. **mtime gate.** If ``chroma.sqlite3`` is less than
``stale_seconds`` newer than the segment's ``data_level0.bin``,
skip — chromadb is in normal write-path territory.
The second check is intentionally not gated by mtime. A segment with a
300x link/data ratio is unsafe regardless of whether its mtime is recent;
letting Chroma open it can SIGSEGV before Python fallback code runs.
2. **Integrity gate** (``_segment_appears_healthy``). Even when the
mtime gap exceeds the threshold, a segment whose
``index_metadata.pickle`` passes a format sniff-test is healthy:
chromadb 1.5.x flushes HNSW state asynchronously and a clean
shutdown does NOT force-flush, so the on-disk HNSW is *always*
somewhat older than ``chroma.sqlite3``. Production observation
(2026-04-26 disks daemon): three of three segments quarantined
on every cold start, with 538-557s gaps, leaving the 151K-drawer
palace with vector_ranked=0 until rebuild. Renaming a healthy
segment based on mtime alone destroys a valid index — chromadb
creates an empty replacement, orphaning every drawer in sqlite
from vector recall until the operator runs ``mempalace repair
--mode rebuild`` (15+ min on a 151K palace).
Only segments that pass stage 1 (suspiciously stale) AND fail stage
2 (metadata file truncated, zero-filled, or absent-with-data) are
renamed to ``<uuid>.drift-<timestamp>``. The original directory is
renamed, not deleted, so recovery remains possible if the heuristic
misfires.
The default threshold (5 min) is advisory under daemon-strict; the
integrity gate is what actually distinguishes corruption from flush
lag. The threshold still matters for the cross-machine replication
case (#823), where it bounds how stale a Syncthing-replicated
segment can be before we look harder at it.
Args:
palace_path: path to the palace directory containing ``chroma.sqlite3``
stale_seconds: minimum mtime gap to *consider* a segment for quarantine
Returns:
List of paths that were quarantined (empty if nothing actually
looked corrupt).
The original directory is renamed, not deleted, so recovery remains
possible if the heuristic ever misfires.
"""
db_path = os.path.join(palace_path, "chroma.sqlite3")
if not os.path.isfile(db_path):
return []
try:
sqlite_mtime = os.path.getmtime(db_path)
except OSError:
return []
moved: list[str] = []
try:
entries = os.listdir(palace_path)
except OSError:
@@ -196,29 +215,34 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> lis
for name in entries:
if "-" not in name or name.startswith(".") or ".drift-" in name:
continue
seg_dir = os.path.join(palace_path, name)
if not os.path.isdir(seg_dir):
continue
hnsw_bin = os.path.join(seg_dir, "data_level0.bin")
if not os.path.isfile(hnsw_bin):
continue
try:
hnsw_mtime = os.path.getmtime(hnsw_bin)
except OSError:
continue
if sqlite_mtime - hnsw_mtime < stale_seconds:
payload_ratio = _hnsw_link_to_data_ratio(seg_dir)
payload_corrupt = payload_ratio is not None and payload_ratio > _HNSW_LINK_TO_DATA_MAX_RATIO
if not payload_corrupt and sqlite_mtime - hnsw_mtime < stale_seconds:
continue
# Stage 2: integrity gate. mtime drift is necessary but not
# sufficient — chromadb's async flush makes drift the steady-
# state condition. A healthy segment metadata file proves
# chromadb can open the segment without segfault; don't
# quarantine a healthy index.
if _segment_appears_healthy(seg_dir):
# Stage 2: integrity gate. Mtime drift alone is not corruption because
# Chroma flushes HNSW asynchronously. A healthy metadata file proves the
# ordinary stale-by-mtime case is just flush lag.
if not payload_corrupt and _segment_appears_healthy(seg_dir):
logger.info(
"HNSW mtime gap %.0fs on %s exceeds threshold but segment "
"metadata file is intact — flush-lag, not corruption. "
"Leaving in place.",
"metadata and payload size are intact — flush-lag, not "
"corruption. Leaving in place.",
sqlite_mtime - hnsw_mtime,
seg_dir,
)
@@ -226,17 +250,30 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> lis
stamp = _dt.datetime.now().strftime("%Y%m%d-%H%M%S")
target = f"{seg_dir}.drift-{stamp}"
if payload_corrupt:
reason = (
f"link_lists.bin/data_level0.bin ratio {payload_ratio:.1f}x "
f"exceeds {_HNSW_LINK_TO_DATA_MAX_RATIO:.1f}x"
)
else:
reason = (
f"sqlite {sqlite_mtime - hnsw_mtime:.0f}s newer than HNSW "
"and integrity check failed"
)
try:
os.rename(seg_dir, target)
moved.append(target)
logger.warning(
"Quarantined corrupt HNSW segment %s (sqlite %.0fs newer than HNSW, integrity check failed); renamed to %s",
"Quarantined corrupt HNSW segment %s (%s); renamed to %s",
seg_dir,
sqlite_mtime - hnsw_mtime,
reason,
target,
)
except OSError:
logger.exception("Failed to quarantine corrupt HNSW segment %s", seg_dir)
return moved
+113
View File
@@ -0,0 +1,113 @@
import os
from pathlib import Path
from mempalace.backends.chroma import (
_HNSW_LINK_TO_DATA_MAX_RATIO,
_hnsw_link_to_data_ratio,
_segment_appears_healthy,
quarantine_stale_hnsw,
)
def _write_segment(
seg_dir: Path,
*,
data_size: int = 100,
link_size: int = 100,
write_metadata: bool = True,
) -> None:
seg_dir.mkdir(parents=True, exist_ok=True)
(seg_dir / "data_level0.bin").write_bytes(b"\0" * data_size)
(seg_dir / "link_lists.bin").write_bytes(b"\0" * link_size)
if write_metadata:
# Enough bytes to pass the existing pickle envelope sniff-test:
# starts with pickle protocol marker 0x80 and ends with STOP 0x2e.
(seg_dir / "index_metadata.pickle").write_bytes(b"\x80" + b"x" * 16 + b"\x2e")
def test_hnsw_link_to_data_ratio_reports_payload_size_ratio(tmp_path):
seg_dir = tmp_path / "11111111-2222-3333-4444-555555555555"
_write_segment(seg_dir, data_size=100, link_size=250)
assert _hnsw_link_to_data_ratio(str(seg_dir)) == 2.5
def test_segment_health_rejects_exploded_link_lists_even_with_valid_pickle(tmp_path):
seg_dir = tmp_path / "11111111-2222-3333-4444-555555555555"
_write_segment(
seg_dir,
data_size=100,
link_size=int(100 * (_HNSW_LINK_TO_DATA_MAX_RATIO + 1)),
write_metadata=True,
)
assert not _segment_appears_healthy(str(seg_dir))
def test_segment_health_keeps_reasonable_payload_with_valid_pickle(tmp_path):
seg_dir = tmp_path / "11111111-2222-3333-4444-555555555555"
_write_segment(
seg_dir,
data_size=100,
link_size=int(100 * _HNSW_LINK_TO_DATA_MAX_RATIO),
write_metadata=True,
)
assert _segment_appears_healthy(str(seg_dir))
def test_quarantine_catches_link_bloat_without_mtime_drift(tmp_path):
palace = tmp_path / "palace"
palace.mkdir()
db_path = palace / "chroma.sqlite3"
db_path.write_text("sqlite placeholder")
seg_dir = palace / "11111111-2222-3333-4444-555555555555"
_write_segment(
seg_dir,
data_size=100,
link_size=int(100 * (_HNSW_LINK_TO_DATA_MAX_RATIO + 1)),
write_metadata=True,
)
# Make sqlite and HNSW mtimes identical. The old mtime-only gate would
# skip this segment even though the payload is structurally corrupt.
same_time = 1_700_000_000
os.utime(db_path, (same_time, same_time))
os.utime(seg_dir / "data_level0.bin", (same_time, same_time))
moved = quarantine_stale_hnsw(str(palace), stale_seconds=999_999)
assert len(moved) == 1
assert not seg_dir.exists()
moved_path = Path(moved[0])
assert moved_path.exists()
assert moved_path.name.startswith("11111111-2222-3333-4444-555555555555.drift-")
def test_quarantine_leaves_reasonable_payload_in_place(tmp_path):
palace = tmp_path / "palace"
palace.mkdir()
db_path = palace / "chroma.sqlite3"
db_path.write_text("sqlite placeholder")
seg_dir = palace / "11111111-2222-3333-4444-555555555555"
_write_segment(
seg_dir,
data_size=100,
link_size=100,
write_metadata=True,
)
same_time = 1_700_000_000
os.utime(db_path, (same_time, same_time))
os.utime(seg_dir / "data_level0.bin", (same_time, same_time))
moved = quarantine_stale_hnsw(str(palace), stale_seconds=999_999)
assert moved == []
assert seg_dir.exists()