From 4a0f330cc14ea0a6ff69936d4bd51664623580df Mon Sep 17 00:00:00 2001 From: Pim Messelink <274674234+messelink@users.noreply.github.com> Date: Fri, 1 May 2026 00:31:47 +0000 Subject: [PATCH] fix(repair): scale HNSW divergence floor with hnsw:sync_threshold MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The capacity probe added in #1227 hardcoded a 2,000-row floor for the "diverged" decision. The comment justifying that number explicitly tied it to chromadb's *default* sync_threshold of 1,000 — "Two synchronization windows worth (2 × sync_threshold = 2000) is a safe steady-state ceiling". #1191 then bumped sync_threshold to 50,000 via _HNSW_BLOAT_GUARD without updating the floor. Result: any palace created with the bloat guard flips between OK and DIVERGED on every flush cycle. Steady-state divergence sits at 0–50K (the natural queue depth), and the 2,000 floor trips the guardrail the moment the queue exceeds 10% of sqlite_count. The MCP server then routes search to BM25-only and disables duplicate detection for ~80% of the write cycle on actively-mined ≥100K palaces, even though chromadb is behaving correctly. This change reads the configured `hnsw:sync_threshold` from `collection_metadata` per palace and scales the floor to 2 × that value. The 10% relative term and the original #1222 detection capability are unchanged — a 91%-missing-of-192K palace (the actual #1222 reproducer) still trips, regardless of whether the collection was created with sync_threshold=1000 or 50000. Behavior summary: | Collection's sync_threshold | New floor | Old floor | |---|---|---| | Missing (legacy palace) | 2000 | 2000 (unchanged) | 1000 (chromadb default) | 2000 | 2000 (unchanged) | 50000 (#1191 bloat guard) | 100000 | 2000 (the bug) Tests: - test_capacity_status_tolerates_lag_under_large_sync_threshold (regression for the #1191/#1227 conflict — 100K sqlite + 50K HNSW + sync=50K → OK) - test_capacity_status_still_flags_real_corruption_under_large_sync (#1222 shape with bloat-guard collection — still detects corruption) - test_capacity_status_default_threshold_when_no_sync_metadata (legacy palaces without the metadata row use the 2000 fallback floor) - test_unflushed_path_also_uses_dynamic_floor (the never-flushed branch scales too — 30K under sync_threshold=50000 is no longer flagged) All 18 pre-existing tests in tests/test_hnsw_capacity.py and 45 tests in tests/test_backends.py still pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- mempalace/backends/chroma.py | 80 ++++++++++++++++++++++---- tests/test_hnsw_capacity.py | 105 +++++++++++++++++++++++++++++++++-- 2 files changed, 169 insertions(+), 16 deletions(-) diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py index 048f760..ed52154 100644 --- a/mempalace/backends/chroma.py +++ b/mempalace/backends/chroma.py @@ -372,20 +372,71 @@ def _hnsw_element_count(palace_path: str, segment_id: str) -> Optional[int]: # Divergence threshold: chromadb's HNSW flushes asynchronously, so HNSW -# typically lags sqlite by up to ``sync_threshold`` (default 1000) records -# under active write load — that's the *brute-force batch* that hasn't -# been compacted into HNSW yet, plus the un-persisted tail beyond the -# last sync. Two synchronization windows worth (2 × sync_threshold = 2000) -# is a safe steady-state ceiling; anything past that is real divergence, -# not flush-lag. +# typically lags sqlite by up to ``sync_threshold`` records under active +# write load — that's the *brute-force batch* that hasn't been compacted +# into HNSW yet, plus the un-persisted tail beyond the last sync. Two +# synchronization windows worth (2 × sync_threshold) is a safe steady- +# state ceiling; anything past that is real divergence, not flush-lag. # -# The #1222 case was 176 613 missing out of 192 997 (91% gone) — orders -# of magnitude past 2000. A typical post-mine palace shows a few hundred -# to ~1000 missing, well under threshold. -_HNSW_DIVERGENCE_ABSOLUTE = 2000 +# The threshold floor scales with whatever ``hnsw:sync_threshold`` the +# collection was created with (read via :func:`_read_sync_threshold`). +# ``_HNSW_DIVERGENCE_FALLBACK_FLOOR`` is the floor used when we can't +# read the collection metadata (older palaces missing the row, sqlite +# unreadable). 2000 = 2 × chromadb's default sync_threshold of 1000. +# +# Why dynamic: PR #1191 set ``hnsw:sync_threshold = 50_000`` to prevent +# index bloat, which means flush-lag can grow up to 50K naturally. A +# fixed 2000 floor would flag every actively-written palace as DIVERGED +# the moment its queue exceeded 10% of sqlite_count, even though chromadb +# is behaving correctly. The floor must scale with sync_threshold to +# distinguish real corruption (#1222 was 176 613 missing of 192 997 — +# orders of magnitude past 2 × any reasonable sync_threshold) from +# expected steady-state lag. +_HNSW_DIVERGENCE_FALLBACK_FLOOR = 2000 _HNSW_DIVERGENCE_FRACTION = 0.10 +def _read_sync_threshold(palace_path: str, collection_name: str) -> int: + """Return the ``hnsw:sync_threshold`` for a collection, or 1000 default. + + The configured sync_threshold drives chromadb's HNSW flush cadence — + larger values mean fewer, bigger flushes (less index-bloat risk per + PR #1191) but also larger steady-state lag between + ``index_metadata.pickle`` and the live sqlite count. The divergence + probe scales its tolerance to ``2 × sync_threshold`` so that lag is + not mistaken for corruption. + + Falls back to 1000 (chromadb's own default) if the collection has no + explicit setting — matches what older mempalace palaces were created + with before PR #1191. + """ + db_path = os.path.join(palace_path, "chroma.sqlite3") + if not os.path.isfile(db_path): + return 1000 + try: + conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True) + try: + cur = conn.cursor() + cur.execute( + """ + SELECT cm.int_value + FROM collection_metadata cm + JOIN collections c ON cm.collection_id = c.id + WHERE c.name = ? AND cm.key = 'hnsw:sync_threshold' + """, + (collection_name,), + ) + row = cur.fetchone() + if row and row[0] is not None: + return int(row[0]) + return 1000 + finally: + conn.close() + except Exception: + logger.debug("_read_sync_threshold failed", exc_info=True) + return 1000 + + def hnsw_capacity_status(palace_path: str, collection_name: str = "mempalace_drawers") -> dict: """Compare sqlite embedding count against HNSW element count. @@ -431,13 +482,18 @@ def hnsw_capacity_status(palace_path: str, collection_name: str = "mempalace_dra hnsw_count = _hnsw_element_count(palace_path, seg_id) out["hnsw_count"] = hnsw_count + sync_threshold = _read_sync_threshold(palace_path, collection_name) + # Two synchronization windows worth — see comment above + # _HNSW_DIVERGENCE_FALLBACK_FLOOR for the rationale. + divergence_floor = max(_HNSW_DIVERGENCE_FALLBACK_FLOOR, 2 * sync_threshold) + if hnsw_count is None: # No pickle yet — segment hasn't persisted metadata. Could be # fresh-but-unflushed (normal) or interrupted-mid-flush (bad). # We can't distinguish without the pickle, so only flag # divergence when sqlite holds clearly more than two flush # windows worth — same threshold as the with-pickle path. - if sqlite_count > _HNSW_DIVERGENCE_ABSOLUTE: + if sqlite_count > divergence_floor: out["status"] = "diverged" out["diverged"] = True out["divergence"] = sqlite_count @@ -452,7 +508,7 @@ def hnsw_capacity_status(palace_path: str, collection_name: str = "mempalace_dra divergence = sqlite_count - hnsw_count out["divergence"] = divergence - threshold = max(_HNSW_DIVERGENCE_ABSOLUTE, int(sqlite_count * _HNSW_DIVERGENCE_FRACTION)) + threshold = max(divergence_floor, int(sqlite_count * _HNSW_DIVERGENCE_FRACTION)) if divergence > threshold: out["status"] = "diverged" out["diverged"] = True diff --git a/tests/test_hnsw_capacity.py b/tests/test_hnsw_capacity.py index 0b811f5..512fc9c 100644 --- a/tests/test_hnsw_capacity.py +++ b/tests/test_hnsw_capacity.py @@ -28,13 +28,23 @@ COLLECTION = "mempalace_drawers" # ── Fixtures ────────────────────────────────────────────────────────── -def _seed_chroma_db(palace: str, sqlite_count: int, segment_id: str) -> None: +def _seed_chroma_db( + palace: str, + sqlite_count: int, + segment_id: str, + sync_threshold: int | None = None, +) -> None: """Create a minimal chroma.sqlite3 with one collection + VECTOR segment. Mirrors the columns the probe queries: ``segments``, ``collections``, - ``embeddings``, ``embedding_metadata``. Schema matches chromadb - 1.5.x; column types are kept loose because we read with COUNT(*) and - SELECT key, *_value rather than driver-specific casts. + ``collection_metadata``, ``embeddings``, ``embedding_metadata``. + Schema matches chromadb 1.5.x; column types are kept loose because + we read with COUNT(*) and SELECT key, *_value rather than driver- + specific casts. + + When ``sync_threshold`` is supplied, an ``hnsw:sync_threshold`` row + is added to ``collection_metadata`` so the divergence floor scales + accordingly. Omit to model an older palace that pre-dates PR #1191. """ db_path = os.path.join(palace, "chroma.sqlite3") conn = sqlite3.connect(db_path) @@ -45,6 +55,15 @@ def _seed_chroma_db(palace: str, sqlite_count: int, segment_id: str) -> None: id TEXT PRIMARY KEY, name TEXT NOT NULL ); + CREATE TABLE collection_metadata ( + collection_id TEXT REFERENCES collections(id) ON DELETE CASCADE, + key TEXT NOT NULL, + str_value TEXT, + int_value INTEGER, + float_value REAL, + bool_value INTEGER, + PRIMARY KEY (collection_id, key) + ); CREATE TABLE segments ( id TEXT PRIMARY KEY, collection TEXT NOT NULL, @@ -73,6 +92,12 @@ def _seed_chroma_db(palace: str, sqlite_count: int, segment_id: str) -> None: col_id = "col-test" meta_seg = "seg-meta" conn.execute("INSERT INTO collections (id, name) VALUES (?, ?)", (col_id, COLLECTION)) + if sync_threshold is not None: + conn.execute( + """INSERT INTO collection_metadata (collection_id, key, int_value) + VALUES (?, 'hnsw:sync_threshold', ?)""", + (col_id, sync_threshold), + ) conn.execute( "INSERT INTO segments (id, collection, scope) VALUES (?, ?, 'VECTOR')", (segment_id, col_id), @@ -229,6 +254,78 @@ def test_capacity_status_quiet_for_empty_palace(tmp_path): assert info["status"] == "unknown" +# ── Divergence threshold scales with hnsw:sync_threshold ─────────────── + + +def test_capacity_status_tolerates_lag_under_large_sync_threshold(tmp_path): + """Regression for the PR #1191 / PR #1227 conflict. + + Palaces created via mempalace's _HNSW_BLOAT_GUARD (sync_threshold= + 50_000) naturally accumulate up to ~50K queued entries between + flushes. The pickle-vs-sqlite probe must scale its tolerance to + ``2 × sync_threshold`` so this expected lag is not flagged as + corruption — otherwise vector search disables for ~80% of the + write cycle on any actively-mined ≥100K palace. + """ + seg = "seg-bloat-guard" + _seed_chroma_db(str(tmp_path), sqlite_count=100_000, segment_id=seg, sync_threshold=50_000) + _write_pickle(str(tmp_path), seg, hnsw_count=50_000) + info = hnsw_capacity_status(str(tmp_path), COLLECTION) + # 50K divergence is exactly one flush window — well within 2× = 100K. + assert info["diverged"] is False, info["message"] + assert info["status"] == "ok" + assert info["divergence"] == 50_000 + + +def test_capacity_status_still_flags_real_corruption_under_large_sync(tmp_path): + """The dynamic floor must still catch genuine #1222-style corruption. + + sqlite at 200K with HNSW frozen at 16K is the original #1222 shape — + any reasonable threshold should flag it, regardless of whether the + collection was created with sync_threshold=1000 or 50_000. + """ + seg = "seg-1222-with-bloat-guard" + _seed_chroma_db(str(tmp_path), sqlite_count=200_000, segment_id=seg, sync_threshold=50_000) + _write_pickle(str(tmp_path), seg, hnsw_count=16_384) + info = hnsw_capacity_status(str(tmp_path), COLLECTION) + # 183,616 missing — far past 2 × 50K = 100K floor and 10% of 200K = 20K. + assert info["diverged"] is True + assert info["status"] == "diverged" + assert info["divergence"] == 183_616 + + +def test_capacity_status_default_threshold_when_no_sync_metadata(tmp_path): + """Older palaces without ``hnsw:sync_threshold`` fall back to 2000 floor. + + Pre-PR-#1191 collections only carry ``hnsw:space``. The probe must + use chromadb's own default sync_threshold of 1000 → floor of 2000, + matching pre-fix behavior. + """ + seg = "seg-legacy" + # No sync_threshold supplied — collection_metadata stays empty. + _seed_chroma_db(str(tmp_path), sqlite_count=10_000, segment_id=seg) + _write_pickle(str(tmp_path), seg, hnsw_count=7_500) + info = hnsw_capacity_status(str(tmp_path), COLLECTION) + # 2,500 divergence > max(2000 floor, 10% of 10K = 1000) → DIVERGED + assert info["diverged"] is True + assert info["divergence"] == 2_500 + + +def test_unflushed_path_also_uses_dynamic_floor(tmp_path): + """The never-flushed branch must scale with sync_threshold too. + + A 30K-drawer collection under sync_threshold=50_000 hasn't reached + its first flush yet — pickle is absent. Pre-fix this would flag + DIVERGED (30K > fixed 2000 floor); post-fix the 30K stays under + the dynamic 100K floor. + """ + seg = "seg-preflush-large" + _seed_chroma_db(str(tmp_path), sqlite_count=30_000, segment_id=seg, sync_threshold=50_000) + info = hnsw_capacity_status(str(tmp_path), COLLECTION) + assert info["hnsw_count"] is None + assert info["diverged"] is False, info["message"] + + # ── BM25-only sqlite fallback ─────────────────────────────────────────