fix(mcp): retry _get_collection once on transient failure (#1286)

A transient chromadb exception inside `_get_collection` was swallowed by
the broad `except Exception: return None`, so every subsequent tool call
silently hit the same poisoned cache. The fix wraps the body in a
`for attempt in range(2)` loop: each failed attempt is logged via
`logger.exception(...)`, and after the first failure `_client_cache` /
`_collection_cache` / `_metadata_cache` are cleared so the next iteration
forces `_get_client()` to rebuild from scratch — that path now re-runs
`quarantine_stale_hnsw` (per #1322), so the second attempt heals the
common stale-handle case automatically. If both attempts fail, return
`None` (matches the prior contract for permanent failures).

Two new tests in `tests/test_mcp_server.py::TestCacheInvalidation`:
- `test_get_collection_retries_once_on_exception` — first attempt raises
  via a monkeypatched `_get_client`, second attempt succeeds; assert the
  caller gets the collection back, not None.
- `test_get_collection_returns_none_after_two_failures` — both attempts
  fail, assert we exhaust the loop and return None (no infinite retry).

Surgical extraction from PR #1286, which carried the same fix idea
(plus a fork-sync bundle that couldn't be merged); credit to the
original author below.

Co-authored-by: Jeffrey Hein <jp@jphein.com>
This commit is contained in:
Igor Lins e Silva
2026-05-06 04:52:18 -03:00
parent 6741b6908e
commit e334e257bf
2 changed files with 152 additions and 61 deletions
+87 -61
View File
@@ -326,68 +326,94 @@ def _get_client():
def _get_collection(create=False):
    """Return the ChromaDB collection, caching the client between calls.

    On failure, log the exception and retry once after clearing the client
    and collection caches. Tools were silently returning ``None`` when a
    cached client/collection went stale — typically after the chromadb
    rust bindings invalidated a handle following an out-of-band write —
    leaving the LLM with no diagnostic and no recovery path. The retry
    forces ``_get_client()`` to rebuild from scratch (which re-runs
    ``quarantine_stale_hnsw`` per #1322), so the second attempt heals the
    common stale-handle / stale-HNSW case automatically.

    Args:
        create: When true, (re)open the collection on every call and create
            it if it does not exist. When false, open the collection only
            if nothing is cached yet; otherwise return the cached wrapper.

    Returns:
        The cached ``ChromaCollection`` wrapper, or ``None`` when every
        attempt raised (the prior contract for permanent failures).
    """
    global _client_cache, _collection_cache, _metadata_cache, _metadata_cache_time
    # Single source of truth for the retry bound: drives both the loop and
    # the "attempt N/M" log text so the two cannot drift apart.
    max_attempts = 2
    for attempt in range(max_attempts):
        try:
            client = _get_client()
            # ChromaDB 1.x persists the EF *identity* (its ``name()``) with the
            # collection but not the EF *instance/configuration*. So a reader or
            # writer that omits ``embedding_function=`` silently gets chromadb's
            # built-in ``DefaultEmbeddingFunction`` — its ``name()`` matches the
            # one we spoof in ``mempalace.embedding`` (both report ``"default"``,
            # the identity check passes), but the *provider list* is chromadb's
            # default rather than the user's resolved device. On bleeding-edge
            # interpreters (#1299: python 3.14 + chromadb 1.5.x on Apple Silicon)
            # that default provider selection can SIGSEGV the host process on
            # first ``col.add()``. The miner / Stop hook ingest path avoids this
            # because it routes through ``ChromaBackend.get_collection``, which
            # resolves the EF via ``ChromaBackend._resolve_embedding_function``;
            # the MCP server bypassed that abstraction. Resolve the EF inside the
            # branches that actually open a collection so warm-cache reads stay
            # zero-cost. Reuse the backend helper so the two call sites can't
            # drift on logging or fallback semantics.
            if create:
                ef = ChromaBackend._resolve_embedding_function()
                ef_kwargs = {"embedding_function": ef} if ef is not None else {}
                # hnsw:num_threads=1 disables ChromaDB's multi-threaded ParallelFor
                # HNSW insert path, which has a race in repairConnectionsForUpdate /
                # addPoint (see issues #974, #965). Set via metadata on fresh
                # collections and re-applied via _pin_hnsw_threads() for legacy
                # palaces whose collections were created before this fix (the
                # runtime config does not persist cross-process in chromadb 1.5.x,
                # so the retrofit runs every time _get_collection opens a cache).
                #
                # ChromaDB 1.5.x's Rust binding SIGSEGVs when get_or_create_collection
                # is called with metadata that differs from what's stored. The split
                # below skips the metadata-comparison codepath for existing
                # collections, mirroring the backend-layer fix from #1262.
                try:
                    raw = client.get_collection(_config.collection_name, **ef_kwargs)
                except _ChromaNotFoundError:
                    raw = client.create_collection(
                        _config.collection_name,
                        metadata={
                            "hnsw:space": "cosine",
                            "hnsw:num_threads": 1,
                            **_HNSW_BLOAT_GUARD,
                        },
                        **ef_kwargs,
                    )
                _pin_hnsw_threads(raw)
                _collection_cache = ChromaCollection(raw, palace_path=_config.palace_path)
                # A freshly opened collection invalidates any cached metadata.
                _metadata_cache = None
                _metadata_cache_time = 0
            elif _collection_cache is None:
                # NOTE(review): unlike the create branch, a missing collection is
                # not special-cased here, so it falls into the generic handler
                # below (logged and retried) — confirm that is intended for a
                # palace that has simply not been initialised yet.
                ef = ChromaBackend._resolve_embedding_function()
                ef_kwargs = {"embedding_function": ef} if ef is not None else {}
                raw = client.get_collection(_config.collection_name, **ef_kwargs)
                _pin_hnsw_threads(raw)
                _collection_cache = ChromaCollection(raw, palace_path=_config.palace_path)
                _metadata_cache = None
                _metadata_cache_time = 0
            return _collection_cache
        except Exception:
            # Broad on purpose: callers rely on the "None on failure" contract,
            # but every failed attempt is logged with its traceback.
            logger.exception(
                "_get_collection attempt %d/%d failed (palace=%s, create=%s)",
                attempt + 1,
                max_attempts,
                _config.palace_path,
                create,
            )
            if attempt + 1 < max_attempts:
                # Reset all caches so the next attempt forces _get_client()
                # to rebuild the chromadb client from scratch — that path
                # re-runs quarantine_stale_hnsw (#1322) and reopens the
                # collection cleanly, healing the common stale-handle case.
                _client_cache = None
                _collection_cache = None
                _metadata_cache = None
                _metadata_cache_time = 0
    # Every attempt failed: keep the historical contract of returning None.
    return None
def _no_palace(): def _no_palace():
+65
View File
@@ -1259,6 +1259,71 @@ class TestCacheInvalidation:
assert "embedding_function" in kwargs assert "embedding_function" in kwargs
assert kwargs["embedding_function"] is not None assert kwargs["embedding_function"] is not None
def test_get_collection_retries_once_on_exception(self, monkeypatch, config, palace_path, kg):
    """Regression: one transient failure must be retried, not turned into None.

    Previously an exception raised inside ``_get_collection`` (e.g. a stale
    chromadb handle after an out-of-band write) was swallowed by
    ``except Exception: return None``, and later tool calls kept getting
    ``None`` from the same poisoned cache. The retry clears the
    client/collection caches so ``_get_client()`` rebuilds the client
    (re-running ``quarantine_stale_hnsw`` per #1322) and the second attempt
    can succeed.
    """
    _patch_mcp_server(monkeypatch, config, kg)
    # Seed the palace via the test helper (presumably the test-module
    # ``_get_collection``, not ``mcp_server._get_collection`` — verify), then
    # drop the client reference.
    seed_client, _seed_col = _get_collection(palace_path, create=True)
    del seed_client

    from mempalace import mcp_server

    # Start from a cold cache so the call below takes the open path.
    mcp_server._client_cache = None
    mcp_server._collection_cache = None

    original_get_client = mcp_server._get_client
    calls = []

    def fail_first_then_delegate():
        # Record the call, raise on the first one, delegate afterwards.
        calls.append(None)
        if len(calls) > 1:
            return original_get_client()
        raise RuntimeError("simulated transient chromadb failure")

    monkeypatch.setattr(mcp_server, "_get_client", fail_first_then_delegate)

    result = mcp_server._get_collection()

    # Exactly two attempts ran and the second produced a collection.
    assert len(calls) == 2
    assert result is not None
def test_get_collection_returns_none_after_two_failures(
    self, monkeypatch, config, palace_path, kg
):
    """Two consecutive failures exhaust the retry loop and yield None.

    This keeps the prior contract for permanent failures; only the transient
    case is self-healing, and the loop must not retry indefinitely.
    """
    _patch_mcp_server(monkeypatch, config, kg)
    # Seed the palace via the test helper (presumably the test-module
    # ``_get_collection``, not ``mcp_server._get_collection`` — verify), then
    # drop the client reference.
    seed_client, _seed_col = _get_collection(palace_path, create=True)
    del seed_client

    from mempalace import mcp_server

    # Cold cache so the call below has to go through _get_client().
    mcp_server._client_cache = None
    mcp_server._collection_cache = None

    calls = []

    def broken_get_client():
        # Record the call and fail every time.
        calls.append(None)
        raise RuntimeError("permanent chromadb failure")

    monkeypatch.setattr(mcp_server, "_get_client", broken_get_client)

    result = mcp_server._get_collection()

    assert len(calls) == 2
    assert result is None
class TestKGLazyCache: class TestKGLazyCache:
"""Lazy per-path KnowledgeGraph cache (issue #1136).""" """Lazy per-path KnowledgeGraph cache (issue #1136)."""