From e334e257bf4bb634cf4f31b4087f84f8b467938b Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Wed, 6 May 2026 04:52:18 -0300 Subject: [PATCH] fix(mcp): retry _get_collection once on transient failure (#1286) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A transient chromadb exception inside `_get_collection` was swallowed by the bare `except Exception: return None`, leaving every subsequent tool call hitting the same poisoned cache silently. The fix wraps the body in a `for attempt in range(2)` loop: on attempt 0 failure, log via `logger.exception(...)` and clear `_client_cache` / `_collection_cache` / `_metadata_cache` so the next iteration forces `_get_client()` to rebuild from scratch — that path now re-runs `quarantine_stale_hnsw` (per #1322), so the second attempt heals the common stale-handle case automatically. If both attempts fail, return `None` (matches the prior contract for permanent failures). Two new tests in `tests/test_mcp_server.py::TestCacheInvalidation`: - `test_get_collection_retries_once_on_exception` — first attempt raises via a monkeypatched `_get_client`, second attempt succeeds; assert the caller gets the collection back, not None. - `test_get_collection_returns_none_after_two_failures` — both attempts fail, assert we exhaust the loop and return None (no infinite retry). Surgical extraction from PR #1286, which carried the same fix idea (plus a fork-sync bundle that couldn't be merged); credit to the original author below. Co-authored-by: Jeffrey Hein --- mempalace/mcp_server.py | 148 +++++++++++++++++++++++---------------- tests/test_mcp_server.py | 65 +++++++++++++++++ 2 files changed, 152 insertions(+), 61 deletions(-) diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py index 58f9ba9..bbb9c93 100644 --- a/mempalace/mcp_server.py +++ b/mempalace/mcp_server.py @@ -326,68 +326,94 @@ def _get_client(): def _get_collection(create=False): - """Return the ChromaDB collection, caching the client between calls.""" - global _collection_cache, _metadata_cache, _metadata_cache_time - try: - client = _get_client() - # ChromaDB 1.x persists the EF *identity* (its ``name()``) with the - # collection but not the EF *instance/configuration*. So a reader or - # writer that omits ``embedding_function=`` silently gets chromadb's - # built-in ``DefaultEmbeddingFunction`` — its ``name()`` matches the - # one we spoof in ``mempalace.embedding`` (both report ``"default"``, - # the identity check passes), but the *provider list* is chromadb's - # default rather than the user's resolved device. On bleeding-edge - # interpreters (#1299: python 3.14 + chromadb 1.5.x on Apple Silicon) - # that default provider selection can SIGSEGV the host process on - # first ``col.add()``. The miner / Stop hook ingest path avoids this - # because it routes through ``ChromaBackend.get_collection``, which - # resolves the EF via ``ChromaBackend._resolve_embedding_function``; - # the MCP server bypassed that abstraction. Resolve the EF inside the - # branches that actually open a collection so warm-cache reads stay - # zero-cost. Reuse the backend helper so the two call sites can't - # drift on logging or fallback semantics. - if create: - ef = ChromaBackend._resolve_embedding_function() - ef_kwargs = {"embedding_function": ef} if ef is not None else {} - # hnsw:num_threads=1 disables ChromaDB's multi-threaded ParallelFor - # HNSW insert path, which has a race in repairConnectionsForUpdate / - # addPoint (see issues #974, #965). Set via metadata on fresh - # collections and re-applied via _pin_hnsw_threads() for legacy - # palaces whose collections were created before this fix (the - # runtime config does not persist cross-process in chromadb 1.5.x, - # so the retrofit runs every time _get_collection opens a cache). - # - # ChromaDB 1.5.x's Rust binding SIGSEGVs when get_or_create_collection - # is called with metadata that differs from what's stored. The split - # below skips the metadata-comparison codepath for existing - # collections, mirroring the backend-layer fix from #1262. - try: + """Return the ChromaDB collection, caching the client between calls. + + On failure, log the exception and retry once after clearing the client + and collection caches. Tools were silently returning ``None`` when a + cached client/collection went stale — typically after the chromadb + rust bindings invalidated a handle following an out-of-band write — + leaving the LLM with no diagnostic and no recovery path. The retry + forces ``_get_client()`` to rebuild from scratch (which re-runs + ``quarantine_stale_hnsw`` per #1322), so the second attempt heals the + common stale-handle / stale-HNSW case automatically. + """ + global _client_cache, _collection_cache, _metadata_cache, _metadata_cache_time + for attempt in range(2): + try: + client = _get_client() + # ChromaDB 1.x persists the EF *identity* (its ``name()``) with the + # collection but not the EF *instance/configuration*. So a reader or + # writer that omits ``embedding_function=`` silently gets chromadb's + # built-in ``DefaultEmbeddingFunction`` — its ``name()`` matches the + # one we spoof in ``mempalace.embedding`` (both report ``"default"``, + # the identity check passes), but the *provider list* is chromadb's + # default rather than the user's resolved device. On bleeding-edge + # interpreters (#1299: python 3.14 + chromadb 1.5.x on Apple Silicon) + # that default provider selection can SIGSEGV the host process on + # first ``col.add()``. The miner / Stop hook ingest path avoids this + # because it routes through ``ChromaBackend.get_collection``, which + # resolves the EF via ``ChromaBackend._resolve_embedding_function``; + # the MCP server bypassed that abstraction. Resolve the EF inside the + # branches that actually open a collection so warm-cache reads stay + # zero-cost. Reuse the backend helper so the two call sites can't + # drift on logging or fallback semantics. + if create: + ef = ChromaBackend._resolve_embedding_function() + ef_kwargs = {"embedding_function": ef} if ef is not None else {} + # hnsw:num_threads=1 disables ChromaDB's multi-threaded ParallelFor + # HNSW insert path, which has a race in repairConnectionsForUpdate / + # addPoint (see issues #974, #965). Set via metadata on fresh + # collections and re-applied via _pin_hnsw_threads() for legacy + # palaces whose collections were created before this fix (the + # runtime config does not persist cross-process in chromadb 1.5.x, + # so the retrofit runs every time _get_collection opens a cache). + # + # ChromaDB 1.5.x's Rust binding SIGSEGVs when get_or_create_collection + # is called with metadata that differs from what's stored. The split + # below skips the metadata-comparison codepath for existing + # collections, mirroring the backend-layer fix from #1262. + try: + raw = client.get_collection(_config.collection_name, **ef_kwargs) + except _ChromaNotFoundError: + raw = client.create_collection( + _config.collection_name, + metadata={ + "hnsw:space": "cosine", + "hnsw:num_threads": 1, + **_HNSW_BLOAT_GUARD, + }, + **ef_kwargs, + ) + _pin_hnsw_threads(raw) + _collection_cache = ChromaCollection(raw, palace_path=_config.palace_path) + _metadata_cache = None + _metadata_cache_time = 0 + elif _collection_cache is None: + ef = ChromaBackend._resolve_embedding_function() + ef_kwargs = {"embedding_function": ef} if ef is not None else {} raw = client.get_collection(_config.collection_name, **ef_kwargs) - except _ChromaNotFoundError: - raw = client.create_collection( - _config.collection_name, - metadata={ - "hnsw:space": "cosine", - "hnsw:num_threads": 1, - **_HNSW_BLOAT_GUARD, - }, - **ef_kwargs, - ) - _pin_hnsw_threads(raw) - _collection_cache = ChromaCollection(raw, palace_path=_config.palace_path) - _metadata_cache = None - _metadata_cache_time = 0 - elif _collection_cache is None: - ef = ChromaBackend._resolve_embedding_function() - ef_kwargs = {"embedding_function": ef} if ef is not None else {} - raw = client.get_collection(_config.collection_name, **ef_kwargs) - _pin_hnsw_threads(raw) - _collection_cache = ChromaCollection(raw, palace_path=_config.palace_path) - _metadata_cache = None - _metadata_cache_time = 0 - return _collection_cache - except Exception: - return None + _pin_hnsw_threads(raw) + _collection_cache = ChromaCollection(raw, palace_path=_config.palace_path) + _metadata_cache = None + _metadata_cache_time = 0 + return _collection_cache + except Exception: + logger.exception( + "_get_collection attempt %d/2 failed (palace=%s, create=%s)", + attempt + 1, + _config.palace_path, + create, + ) + if attempt == 0: + # Reset all caches so the next attempt forces _get_client() + # to rebuild the chromadb client from scratch — that path + # re-runs quarantine_stale_hnsw (#1322) and reopens the + # collection cleanly, healing the common stale-handle case. + _client_cache = None + _collection_cache = None + _metadata_cache = None + _metadata_cache_time = 0 + return None def _no_palace(): diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py index c073830..ae20bf3 100644 --- a/tests/test_mcp_server.py +++ b/tests/test_mcp_server.py @@ -1259,6 +1259,71 @@ class TestCacheInvalidation: assert "embedding_function" in kwargs assert kwargs["embedding_function"] is not None + def test_get_collection_retries_once_on_exception(self, monkeypatch, config, palace_path, kg): + """Regression: a transient failure inside _get_collection must trigger + one retry after clearing the client/collection caches, not silently + return None. + + Before this fix, a stale chromadb handle (e.g. the rust bindings + invalidating after an out-of-band write) would raise inside the + single ``try`` block, get swallowed by ``except Exception: return + None``, and every subsequent tool call would hit the same poisoned + cache returning None. The retry forces ``_get_client()`` to rebuild + the client (which re-runs ``quarantine_stale_hnsw`` per #1322), so + the second attempt heals the common stale-handle case. + """ + _patch_mcp_server(monkeypatch, config, kg) + _client, _col = _get_collection(palace_path, create=True) + del _client + from mempalace import mcp_server + + # Force a cold cache so the first call goes through the open path. + mcp_server._client_cache = None + mcp_server._collection_cache = None + + real_get_client = mcp_server._get_client + attempts = {"count": 0} + + def flaky_get_client(): + attempts["count"] += 1 + if attempts["count"] == 1: + raise RuntimeError("simulated transient chromadb failure") + return real_get_client() + + monkeypatch.setattr(mcp_server, "_get_client", flaky_get_client) + + col = mcp_server._get_collection() + + # Both attempts ran and the second succeeded. + assert attempts["count"] == 2 + assert col is not None + + def test_get_collection_returns_none_after_two_failures( + self, monkeypatch, config, palace_path, kg + ): + """If both attempts fail, return None (matches the prior contract for + permanent failures — only the transient case is now self-healing).""" + _patch_mcp_server(monkeypatch, config, kg) + _client, _col = _get_collection(palace_path, create=True) + del _client + from mempalace import mcp_server + + mcp_server._client_cache = None + mcp_server._collection_cache = None + + attempts = {"count": 0} + + def always_fails(): + attempts["count"] += 1 + raise RuntimeError("permanent chromadb failure") + + monkeypatch.setattr(mcp_server, "_get_client", always_fails) + + col = mcp_server._get_collection() + + assert attempts["count"] == 2 + assert col is None + class TestKGLazyCache: """Lazy per-path KnowledgeGraph cache (issue #1136)."""