From ac6c2b6af6782668958732323969b78faa8b65be Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Fri, 1 May 2026 19:34:38 -0300 Subject: [PATCH 1/2] fix(mcp_server): pass embedding_function= on collection reopen (#1299) `mcp_server._get_collection` bypassed `ChromaBackend.get_collection` and called `client.get_collection` / `client.create_collection` without `embedding_function=`. ChromaDB 1.x does not persist the EF identity with the collection, so the MCP server's reopen silently bound chromadb's built-in `DefaultEmbeddingFunction` while the miner / Stop hook ingest path bound `mempalace.embedding.get_embedding_function()`. On bleeding-edge interpreters (python 3.14 + chromadb 1.5.x on Apple Silicon, per #1299) the default EF's lazy ONNX provider selection could SIGSEGV the host process on first `col.add()`, killing the MCP stdio server and leaving every subsequent tool call returning `Connection closed` until Claude Code was relaunched. Reads worked because `col.get(ids=...)` and metadata fetches don't invoke the EF; the auto-ingest path worked because mining routes through the backend abstraction. Diary writes were the consistent failure surface. Resolve the EF up front (matching `ChromaBackend._resolve_embedding_function`) and pass it into both reopen branches. Falls back to the chromadb default only if `mempalace.embedding.get_embedding_function` itself raises. Regression test patches the chromadb client class to capture `embedding_function=` on every `get_collection` / `create_collection` call from `_get_collection(create=True)` and `_get_collection()`, and fails if any call omits it. Follow-up to #1262 / #1289 (which fixed the metadata-mismatch SIGSEGV path); this addresses the EF-mismatch SIGSEGV path on the same surface. --- CHANGELOG.md | 1 + mempalace/mcp_server.py | 22 ++++++++++++++-- tests/test_mcp_server.py | 56 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9f51968..337a7a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), ### Bug Fixes +- **MCP server `tool_diary_write` SIGSEGV when EF default differs.** `mcp_server._get_collection` bypassed `ChromaBackend.get_collection` and called `client.get_collection` / `client.create_collection` without `embedding_function=`. ChromaDB 1.x does not persist the EF identity with the collection, so the MCP server's reopen silently bound chromadb's built-in `DefaultEmbeddingFunction` to the collection while the miner / Stop hook ingest path bound `mempalace.embedding.get_embedding_function()`. On bleeding-edge interpreters (python 3.14 + chromadb 1.5.x on Apple Silicon) the default's lazy ONNX provider selection could SIGSEGV the host process on first `col.add()`, killing the MCP stdio server and leaving every subsequent tool call returning `Connection closed` until Claude Code was relaunched. `_get_collection` now resolves the EF via `mempalace.embedding.get_embedding_function` and passes it into both reopen branches, matching the miner/backend path. (#1299, follow-up to #1262 / #1289) - **Cross-wing topic tunnels for hyphenated dir names.** `mempalace init` recorded the `topics_by_wing` registry key under the raw directory name (e.g. `mempalace-public`), while `mempalace.yaml`'s `wing` field used the lower-cased + separator-collapsed slug (`mempalace_public`). At mine time the miner read the slug from the yaml and missed the registry, so `_compute_topic_tunnels_for_wing` returned `0` silently. Real-world: any project whose folder contained a hyphen or space lost every topic tunnel. Now both call sites route through a shared `normalize_wing_name()` in `config.py`. (#1194, follow-up to #1180) - **CLI `mempalace search` retrieval quality.** The CLI was using pure ChromaDB cosine distance with no BM25 rerank, so drawers containing every query term but embedding as noise (directory listings, diff output, shell logs) scored `Match: 0.0` alongside genuinely irrelevant results with no way to tell them apart. Wired the CLI through the same `_hybrid_rank` the `mempalace_search` MCP tool already used, and surfaced both `cosine=` and `bm25=` scores in the output so users see which component of the match is firing. MCP search was unaffected; this fixes the human-facing CLI parity gap. - **Legacy-palace distance-metric warning.** CLI search now detects palaces created before `hnsw:space=cosine` was consistently set and prints a one-line notice pointing at `mempalace repair`. Without the warning such palaces silently used L2 distance, under which the similarity display floored every result to `Match: 0.0`. New palaces mined today already set cosine correctly and now have invariant tests pinning that behavior so future refactors can't silently regress it. (#1179) diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py index cce7a49..faa024c 100644 --- a/mempalace/mcp_server.py +++ b/mempalace/mcp_server.py @@ -66,6 +66,7 @@ from .backends.chroma import ( # noqa: E402 _pin_hnsw_threads, hnsw_capacity_status, ) +from .embedding import get_embedding_function # noqa: E402 from .query_sanitizer import sanitize_query # noqa: E402 from .searcher import search_memories # noqa: E402 from .palace_graph import ( # noqa: E402 @@ -278,6 +279,22 @@ def _get_collection(create=False): global _collection_cache, _metadata_cache, _metadata_cache_time try: client = _get_client() + # ChromaDB 1.x does not persist the embedding function with the + # collection, so a reader/writer that omits ``embedding_function=`` + # silently gets the chromadb-built-in default. On bleeding-edge + # interpreters (#1299: python 3.14 + chromadb 1.5.x on Apple Silicon) + # the default's lazy ONNX provider selection can SIGSEGV the host + # process on first ``col.add()``. The miner / Stop hook ingest path + # avoids this because it routes through ``ChromaBackend.get_collection`` + # which resolves the EF via ``mempalace.embedding.get_embedding_function``. + # The MCP server bypassed that abstraction; mirror its behaviour so + # ``tool_diary_write`` / ``tool_add_drawer`` get the same EF as mining. + try: + ef = get_embedding_function() + except Exception: + logger.exception("Failed to build embedding function; using chromadb default") + ef = None + ef_kwargs = {"embedding_function": ef} if ef is not None else {} if create: # hnsw:num_threads=1 disables ChromaDB's multi-threaded ParallelFor # HNSW insert path, which has a race in repairConnectionsForUpdate / @@ -292,7 +309,7 @@ def _get_collection(create=False): # below skips the metadata-comparison codepath for existing # collections, mirroring the backend-layer fix from #1262. try: - raw = client.get_collection(_config.collection_name) + raw = client.get_collection(_config.collection_name, **ef_kwargs) except _ChromaNotFoundError: raw = client.create_collection( _config.collection_name, @@ -301,13 +318,14 @@ def _get_collection(create=False): "hnsw:num_threads": 1, **_HNSW_BLOAT_GUARD, }, + **ef_kwargs, ) _pin_hnsw_threads(raw) _collection_cache = ChromaCollection(raw) _metadata_cache = None _metadata_cache_time = 0 elif _collection_cache is None: - raw = client.get_collection(_config.collection_name) + raw = client.get_collection(_config.collection_name, **ef_kwargs) _pin_hnsw_threads(raw) _collection_cache = ChromaCollection(raw) _metadata_cache = None diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py index 46e5f4a..f8148af 100644 --- a/tests/test_mcp_server.py +++ b/tests/test_mcp_server.py @@ -919,3 +919,59 @@ class TestCacheInvalidation: col2 = mcp_server._get_collection(create=True) assert col2 is not None assert calls == [], f"get_or_create_collection was called: {calls}" + + def test_get_collection_passes_embedding_function(self, monkeypatch, config, palace_path, kg): + """Regression for #1299. + + ``mcp_server._get_collection`` must pass ``embedding_function=`` into + both ``client.get_collection`` and ``client.create_collection``, + mirroring ``ChromaBackend.get_collection``. Without it, ChromaDB 1.x + falls back to its built-in ``DefaultEmbeddingFunction`` (whose lazy + ONNX provider selection has SIGSEGV'd on python 3.14 + Apple Silicon), + and writers/readers can disagree with the miner about which EF is + bound to the collection. The miner / Stop hook ingest path routes + through ``ChromaBackend.get_collection`` which does this correctly; + the MCP server must match. + """ + _patch_mcp_server(monkeypatch, config, kg) + from mempalace import mcp_server + + client = mcp_server._get_client() + client_cls = type(client) + captured: dict[str, list[dict]] = {"get": [], "create": []} + real_get = client_cls.get_collection + real_create = client_cls.create_collection + + def _spy_get(self, name, **kwargs): + captured["get"].append(dict(kwargs)) + return real_get(self, name, **kwargs) + + def _spy_create(self, name, **kwargs): + captured["create"].append(dict(kwargs)) + return real_create(self, name, **kwargs) + + monkeypatch.setattr(client_cls, "get_collection", _spy_get) + monkeypatch.setattr(client_cls, "create_collection", _spy_create) + mcp_server._collection_cache = None + + col = mcp_server._get_collection(create=True) + assert col is not None + + all_calls = captured["get"] + captured["create"] + assert all_calls, "expected get_collection or create_collection to be called" + for kwargs in all_calls: + assert ( + "embedding_function" in kwargs + ), f"missing embedding_function= in chromadb call: {kwargs}" + assert kwargs["embedding_function"] is not None + + # Same expectation on the create=False (cache-miss) reopen path. + mcp_server._collection_cache = None + captured["get"].clear() + captured["create"].clear() + col2 = mcp_server._get_collection() + assert col2 is not None + assert captured["get"], "expected get_collection on cache-miss reopen" + for kwargs in captured["get"]: + assert "embedding_function" in kwargs + assert kwargs["embedding_function"] is not None From cd98d6674e6cdb841146a17243f7e947bfeebfb8 Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Fri, 1 May 2026 19:46:59 -0300 Subject: [PATCH 2/2] fix(mcp_server): address copilot review on #1303 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Resolve the EF inside the two reopen branches that actually call `client.get_collection` / `client.create_collection`, so warm-cache reads stay zero-cost (no `MempalaceConfig()` / `_resolve_providers` on every tool call). - Reuse `ChromaBackend._resolve_embedding_function()` instead of duplicating its try/except + log message + None-fallback. - Reword the inline + CHANGELOG explanation to clarify that ChromaDB 1.x persists the EF *identity* (its `name()`) but not the *instance/ configuration* — `mempalace.embedding` documents this and spoofs `name()` to `"default"` precisely so the identity check passes; the bug was the *provider list* (lazy ONNX selection) silently differing. --- CHANGELOG.md | 2 +- mempalace/mcp_server.py | 35 +++++++++++++++++++---------------- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 337a7a1..41dfaac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), ### Bug Fixes -- **MCP server `tool_diary_write` SIGSEGV when EF default differs.** `mcp_server._get_collection` bypassed `ChromaBackend.get_collection` and called `client.get_collection` / `client.create_collection` without `embedding_function=`. ChromaDB 1.x does not persist the EF identity with the collection, so the MCP server's reopen silently bound chromadb's built-in `DefaultEmbeddingFunction` to the collection while the miner / Stop hook ingest path bound `mempalace.embedding.get_embedding_function()`. On bleeding-edge interpreters (python 3.14 + chromadb 1.5.x on Apple Silicon) the default's lazy ONNX provider selection could SIGSEGV the host process on first `col.add()`, killing the MCP stdio server and leaving every subsequent tool call returning `Connection closed` until Claude Code was relaunched. `_get_collection` now resolves the EF via `mempalace.embedding.get_embedding_function` and passes it into both reopen branches, matching the miner/backend path. (#1299, follow-up to #1262 / #1289) +- **MCP server `tool_diary_write` SIGSEGV when default EF provider differs.** `mcp_server._get_collection` bypassed `ChromaBackend.get_collection` and called `client.get_collection` / `client.create_collection` without `embedding_function=`. ChromaDB 1.x persists the EF *identity* (its `name()`) with the collection but not the EF *instance/configuration*, so the MCP server's reopen silently bound chromadb's built-in `DefaultEmbeddingFunction` — its `name()` matches `mempalace.embedding`'s spoofed `"default"` so the identity check passes, but its provider list is chromadb's default rather than the user's resolved device. The miner / Stop hook ingest path routes through the backend helper and binds the configured EF instead. On bleeding-edge interpreters (python 3.14 + chromadb 1.5.x on Apple Silicon) the default provider selection could SIGSEGV the host process on first `col.add()`, killing the MCP stdio server and leaving every subsequent tool call returning `Connection closed` until Claude Code was relaunched. `_get_collection` now reuses `ChromaBackend._resolve_embedding_function()` on the reopen branches that actually open a collection (warm-cache reads stay zero-cost), matching the miner/backend path. (#1299, follow-up to #1262 / #1289) - **Cross-wing topic tunnels for hyphenated dir names.** `mempalace init` recorded the `topics_by_wing` registry key under the raw directory name (e.g. `mempalace-public`), while `mempalace.yaml`'s `wing` field used the lower-cased + separator-collapsed slug (`mempalace_public`). At mine time the miner read the slug from the yaml and missed the registry, so `_compute_topic_tunnels_for_wing` returned `0` silently. Real-world: any project whose folder contained a hyphen or space lost every topic tunnel. Now both call sites route through a shared `normalize_wing_name()` in `config.py`. (#1194, follow-up to #1180) - **CLI `mempalace search` retrieval quality.** The CLI was using pure ChromaDB cosine distance with no BM25 rerank, so drawers containing every query term but embedding as noise (directory listings, diff output, shell logs) scored `Match: 0.0` alongside genuinely irrelevant results with no way to tell them apart. Wired the CLI through the same `_hybrid_rank` the `mempalace_search` MCP tool already used, and surfaced both `cosine=` and `bm25=` scores in the output so users see which component of the match is firing. MCP search was unaffected; this fixes the human-facing CLI parity gap. - **Legacy-palace distance-metric warning.** CLI search now detects palaces created before `hnsw:space=cosine` was consistently set and prints a one-line notice pointing at `mempalace repair`. Without the warning such palaces silently used L2 distance, under which the similarity display floored every result to `Match: 0.0`. New palaces mined today already set cosine correctly and now have invariant tests pinning that behavior so future refactors can't silently regress it. (#1179) diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py index faa024c..13654f6 100644 --- a/mempalace/mcp_server.py +++ b/mempalace/mcp_server.py @@ -66,7 +66,6 @@ from .backends.chroma import ( # noqa: E402 _pin_hnsw_threads, hnsw_capacity_status, ) -from .embedding import get_embedding_function # noqa: E402 from .query_sanitizer import sanitize_query # noqa: E402 from .searcher import search_memories # noqa: E402 from .palace_graph import ( # noqa: E402 @@ -279,23 +278,25 @@ def _get_collection(create=False): global _collection_cache, _metadata_cache, _metadata_cache_time try: client = _get_client() - # ChromaDB 1.x does not persist the embedding function with the - # collection, so a reader/writer that omits ``embedding_function=`` - # silently gets the chromadb-built-in default. On bleeding-edge + # ChromaDB 1.x persists the EF *identity* (its ``name()``) with the + # collection but not the EF *instance/configuration*. So a reader or + # writer that omits ``embedding_function=`` silently gets chromadb's + # built-in ``DefaultEmbeddingFunction`` — its ``name()`` matches the + # one we spoof in ``mempalace.embedding`` (both report ``"default"``, + # the identity check passes), but the *provider list* is chromadb's + # default rather than the user's resolved device. On bleeding-edge # interpreters (#1299: python 3.14 + chromadb 1.5.x on Apple Silicon) - # the default's lazy ONNX provider selection can SIGSEGV the host - # process on first ``col.add()``. The miner / Stop hook ingest path - # avoids this because it routes through ``ChromaBackend.get_collection`` - # which resolves the EF via ``mempalace.embedding.get_embedding_function``. - # The MCP server bypassed that abstraction; mirror its behaviour so - # ``tool_diary_write`` / ``tool_add_drawer`` get the same EF as mining. - try: - ef = get_embedding_function() - except Exception: - logger.exception("Failed to build embedding function; using chromadb default") - ef = None - ef_kwargs = {"embedding_function": ef} if ef is not None else {} + # that default provider selection can SIGSEGV the host process on + # first ``col.add()``. The miner / Stop hook ingest path avoids this + # because it routes through ``ChromaBackend.get_collection``, which + # resolves the EF via ``ChromaBackend._resolve_embedding_function``; + # the MCP server bypassed that abstraction. Resolve the EF inside the + # branches that actually open a collection so warm-cache reads stay + # zero-cost. Reuse the backend helper so the two call sites can't + # drift on logging or fallback semantics. if create: + ef = ChromaBackend._resolve_embedding_function() + ef_kwargs = {"embedding_function": ef} if ef is not None else {} # hnsw:num_threads=1 disables ChromaDB's multi-threaded ParallelFor # HNSW insert path, which has a race in repairConnectionsForUpdate / # addPoint (see issues #974, #965). Set via metadata on fresh @@ -325,6 +326,8 @@ def _get_collection(create=False): _metadata_cache = None _metadata_cache_time = 0 elif _collection_cache is None: + ef = ChromaBackend._resolve_embedding_function() + ef_kwargs = {"embedding_function": ef} if ef is not None else {} raw = client.get_collection(_config.collection_name, **ef_kwargs) _pin_hnsw_threads(raw) _collection_cache = ChromaCollection(raw)