Merge pull request #1377 from MemPalace/fix/get-collection-retry-on-exception
fix(mcp): retry _get_collection once on transient failure (#1286)
This commit is contained in:
+87
-61
@@ -326,68 +326,94 @@ def _get_client():
|
||||
|
||||
|
||||
def _get_collection(create=False):
|
||||
"""Return the ChromaDB collection, caching the client between calls."""
|
||||
global _collection_cache, _metadata_cache, _metadata_cache_time
|
||||
try:
|
||||
client = _get_client()
|
||||
# ChromaDB 1.x persists the EF *identity* (its ``name()``) with the
|
||||
# collection but not the EF *instance/configuration*. So a reader or
|
||||
# writer that omits ``embedding_function=`` silently gets chromadb's
|
||||
# built-in ``DefaultEmbeddingFunction`` — its ``name()`` matches the
|
||||
# one we spoof in ``mempalace.embedding`` (both report ``"default"``,
|
||||
# the identity check passes), but the *provider list* is chromadb's
|
||||
# default rather than the user's resolved device. On bleeding-edge
|
||||
# interpreters (#1299: python 3.14 + chromadb 1.5.x on Apple Silicon)
|
||||
# that default provider selection can SIGSEGV the host process on
|
||||
# first ``col.add()``. The miner / Stop hook ingest path avoids this
|
||||
# because it routes through ``ChromaBackend.get_collection``, which
|
||||
# resolves the EF via ``ChromaBackend._resolve_embedding_function``;
|
||||
# the MCP server bypassed that abstraction. Resolve the EF inside the
|
||||
# branches that actually open a collection so warm-cache reads stay
|
||||
# zero-cost. Reuse the backend helper so the two call sites can't
|
||||
# drift on logging or fallback semantics.
|
||||
if create:
|
||||
ef = ChromaBackend._resolve_embedding_function()
|
||||
ef_kwargs = {"embedding_function": ef} if ef is not None else {}
|
||||
# hnsw:num_threads=1 disables ChromaDB's multi-threaded ParallelFor
|
||||
# HNSW insert path, which has a race in repairConnectionsForUpdate /
|
||||
# addPoint (see issues #974, #965). Set via metadata on fresh
|
||||
# collections and re-applied via _pin_hnsw_threads() for legacy
|
||||
# palaces whose collections were created before this fix (the
|
||||
# runtime config does not persist cross-process in chromadb 1.5.x,
|
||||
# so the retrofit runs every time _get_collection opens a cache).
|
||||
#
|
||||
# ChromaDB 1.5.x's Rust binding SIGSEGVs when get_or_create_collection
|
||||
# is called with metadata that differs from what's stored. The split
|
||||
# below skips the metadata-comparison codepath for existing
|
||||
# collections, mirroring the backend-layer fix from #1262.
|
||||
try:
|
||||
"""Return the ChromaDB collection, caching the client between calls.
|
||||
|
||||
On failure, log the exception and retry once after clearing the client
|
||||
and collection caches. Tools were silently returning ``None`` when a
|
||||
cached client/collection went stale — typically after the chromadb
|
||||
rust bindings invalidated a handle following an out-of-band write —
|
||||
leaving the LLM with no diagnostic and no recovery path. The retry
|
||||
forces ``_get_client()`` to rebuild from scratch (which re-runs
|
||||
``quarantine_stale_hnsw`` per #1322), so the second attempt heals the
|
||||
common stale-handle / stale-HNSW case automatically.
|
||||
"""
|
||||
global _client_cache, _collection_cache, _metadata_cache, _metadata_cache_time
|
||||
for attempt in range(2):
|
||||
try:
|
||||
client = _get_client()
|
||||
# ChromaDB 1.x persists the EF *identity* (its ``name()``) with the
|
||||
# collection but not the EF *instance/configuration*. So a reader or
|
||||
# writer that omits ``embedding_function=`` silently gets chromadb's
|
||||
# built-in ``DefaultEmbeddingFunction`` — its ``name()`` matches the
|
||||
# one we spoof in ``mempalace.embedding`` (both report ``"default"``,
|
||||
# the identity check passes), but the *provider list* is chromadb's
|
||||
# default rather than the user's resolved device. On bleeding-edge
|
||||
# interpreters (#1299: python 3.14 + chromadb 1.5.x on Apple Silicon)
|
||||
# that default provider selection can SIGSEGV the host process on
|
||||
# first ``col.add()``. The miner / Stop hook ingest path avoids this
|
||||
# because it routes through ``ChromaBackend.get_collection``, which
|
||||
# resolves the EF via ``ChromaBackend._resolve_embedding_function``;
|
||||
# the MCP server bypassed that abstraction. Resolve the EF inside the
|
||||
# branches that actually open a collection so warm-cache reads stay
|
||||
# zero-cost. Reuse the backend helper so the two call sites can't
|
||||
# drift on logging or fallback semantics.
|
||||
if create:
|
||||
ef = ChromaBackend._resolve_embedding_function()
|
||||
ef_kwargs = {"embedding_function": ef} if ef is not None else {}
|
||||
# hnsw:num_threads=1 disables ChromaDB's multi-threaded ParallelFor
|
||||
# HNSW insert path, which has a race in repairConnectionsForUpdate /
|
||||
# addPoint (see issues #974, #965). Set via metadata on fresh
|
||||
# collections and re-applied via _pin_hnsw_threads() for legacy
|
||||
# palaces whose collections were created before this fix (the
|
||||
# runtime config does not persist cross-process in chromadb 1.5.x,
|
||||
# so the retrofit runs every time _get_collection opens a cache).
|
||||
#
|
||||
# ChromaDB 1.5.x's Rust binding SIGSEGVs when get_or_create_collection
|
||||
# is called with metadata that differs from what's stored. The split
|
||||
# below skips the metadata-comparison codepath for existing
|
||||
# collections, mirroring the backend-layer fix from #1262.
|
||||
try:
|
||||
raw = client.get_collection(_config.collection_name, **ef_kwargs)
|
||||
except _ChromaNotFoundError:
|
||||
raw = client.create_collection(
|
||||
_config.collection_name,
|
||||
metadata={
|
||||
"hnsw:space": "cosine",
|
||||
"hnsw:num_threads": 1,
|
||||
**_HNSW_BLOAT_GUARD,
|
||||
},
|
||||
**ef_kwargs,
|
||||
)
|
||||
_pin_hnsw_threads(raw)
|
||||
_collection_cache = ChromaCollection(raw, palace_path=_config.palace_path)
|
||||
_metadata_cache = None
|
||||
_metadata_cache_time = 0
|
||||
elif _collection_cache is None:
|
||||
ef = ChromaBackend._resolve_embedding_function()
|
||||
ef_kwargs = {"embedding_function": ef} if ef is not None else {}
|
||||
raw = client.get_collection(_config.collection_name, **ef_kwargs)
|
||||
except _ChromaNotFoundError:
|
||||
raw = client.create_collection(
|
||||
_config.collection_name,
|
||||
metadata={
|
||||
"hnsw:space": "cosine",
|
||||
"hnsw:num_threads": 1,
|
||||
**_HNSW_BLOAT_GUARD,
|
||||
},
|
||||
**ef_kwargs,
|
||||
)
|
||||
_pin_hnsw_threads(raw)
|
||||
_collection_cache = ChromaCollection(raw, palace_path=_config.palace_path)
|
||||
_metadata_cache = None
|
||||
_metadata_cache_time = 0
|
||||
elif _collection_cache is None:
|
||||
ef = ChromaBackend._resolve_embedding_function()
|
||||
ef_kwargs = {"embedding_function": ef} if ef is not None else {}
|
||||
raw = client.get_collection(_config.collection_name, **ef_kwargs)
|
||||
_pin_hnsw_threads(raw)
|
||||
_collection_cache = ChromaCollection(raw, palace_path=_config.palace_path)
|
||||
_metadata_cache = None
|
||||
_metadata_cache_time = 0
|
||||
return _collection_cache
|
||||
except Exception:
|
||||
return None
|
||||
_pin_hnsw_threads(raw)
|
||||
_collection_cache = ChromaCollection(raw, palace_path=_config.palace_path)
|
||||
_metadata_cache = None
|
||||
_metadata_cache_time = 0
|
||||
return _collection_cache
|
||||
except Exception:
|
||||
logger.exception(
|
||||
"_get_collection attempt %d/2 failed (palace=%s, create=%s)",
|
||||
attempt + 1,
|
||||
_config.palace_path,
|
||||
create,
|
||||
)
|
||||
if attempt == 0:
|
||||
# Reset all caches so the next attempt forces _get_client()
|
||||
# to rebuild the chromadb client from scratch — that path
|
||||
# re-runs quarantine_stale_hnsw (#1322) and reopens the
|
||||
# collection cleanly, healing the common stale-handle case.
|
||||
_client_cache = None
|
||||
_collection_cache = None
|
||||
_metadata_cache = None
|
||||
_metadata_cache_time = 0
|
||||
return None
|
||||
|
||||
|
||||
def _no_palace():
|
||||
|
||||
@@ -1259,6 +1259,71 @@ class TestCacheInvalidation:
|
||||
assert "embedding_function" in kwargs
|
||||
assert kwargs["embedding_function"] is not None
|
||||
|
||||
def test_get_collection_retries_once_on_exception(self, monkeypatch, config, palace_path, kg):
|
||||
"""Regression: a transient failure inside _get_collection must trigger
|
||||
one retry after clearing the client/collection caches, not silently
|
||||
return None.
|
||||
|
||||
Before this fix, a stale chromadb handle (e.g. the rust bindings
|
||||
invalidating after an out-of-band write) would raise inside the
|
||||
single ``try`` block, get swallowed by ``except Exception: return
|
||||
None``, and every subsequent tool call would hit the same poisoned
|
||||
cache returning None. The retry forces ``_get_client()`` to rebuild
|
||||
the client (which re-runs ``quarantine_stale_hnsw`` per #1322), so
|
||||
the second attempt heals the common stale-handle case.
|
||||
"""
|
||||
_patch_mcp_server(monkeypatch, config, kg)
|
||||
_client, _col = _get_collection(palace_path, create=True)
|
||||
del _client
|
||||
from mempalace import mcp_server
|
||||
|
||||
# Force a cold cache so the first call goes through the open path.
|
||||
mcp_server._client_cache = None
|
||||
mcp_server._collection_cache = None
|
||||
|
||||
real_get_client = mcp_server._get_client
|
||||
attempts = {"count": 0}
|
||||
|
||||
def flaky_get_client():
|
||||
attempts["count"] += 1
|
||||
if attempts["count"] == 1:
|
||||
raise RuntimeError("simulated transient chromadb failure")
|
||||
return real_get_client()
|
||||
|
||||
monkeypatch.setattr(mcp_server, "_get_client", flaky_get_client)
|
||||
|
||||
col = mcp_server._get_collection()
|
||||
|
||||
# Both attempts ran and the second succeeded.
|
||||
assert attempts["count"] == 2
|
||||
assert col is not None
|
||||
|
||||
def test_get_collection_returns_none_after_two_failures(
|
||||
self, monkeypatch, config, palace_path, kg
|
||||
):
|
||||
"""If both attempts fail, return None (matches the prior contract for
|
||||
permanent failures — only the transient case is now self-healing)."""
|
||||
_patch_mcp_server(monkeypatch, config, kg)
|
||||
_client, _col = _get_collection(palace_path, create=True)
|
||||
del _client
|
||||
from mempalace import mcp_server
|
||||
|
||||
mcp_server._client_cache = None
|
||||
mcp_server._collection_cache = None
|
||||
|
||||
attempts = {"count": 0}
|
||||
|
||||
def always_fails():
|
||||
attempts["count"] += 1
|
||||
raise RuntimeError("permanent chromadb failure")
|
||||
|
||||
monkeypatch.setattr(mcp_server, "_get_client", always_fails)
|
||||
|
||||
col = mcp_server._get_collection()
|
||||
|
||||
assert attempts["count"] == 2
|
||||
assert col is None
|
||||
|
||||
|
||||
class TestKGLazyCache:
|
||||
"""Lazy per-path KnowledgeGraph cache (issue #1136)."""
|
||||
|
||||
Reference in New Issue
Block a user