fix(mcp): retry KG handlers once on concurrent close race

Race scenario: a KG tool handler calls _get_kg() and gets a live
KnowledgeGraph; another thread fires tool_reconnect() between that
return and the handler's kg.add_triple()/kg.query_entity()/etc call.
tool_reconnect drains _kg_by_path and closes the underlying
sqlite3.Connection; the handler then raises sqlite3.ProgrammingError:
'Cannot operate on a closed database', which surfaces as a -32000
to the MCP client even though the user just asked for a reconnect.

New _call_kg(op) helper wraps each handler's kg call in a one-shot
retry: catch exactly sqlite3.ProgrammingError, evict the stale entry
(only if the cache slot still points at the closed instance — another
thread may have already replaced it), and rerun op against a fresh
_get_kg(). Beyond one retry give up so a sustained close-stream
surfaces clearly instead of looping.

All five KG handlers (tool_kg_query, tool_kg_add, tool_kg_invalidate,
tool_kg_timeline, tool_kg_stats) now route through _call_kg.

Tests pin the contract:
  * retries with a fresh KG and returns the second result
  * non-ProgrammingError exceptions propagate without retry
  * gives up after exactly one retry on sustained close
This commit is contained in:
mvalentsev
2026-05-03 21:43:51 +05:00
parent 0a62658051
commit b8816e0fe2
2 changed files with 125 additions and 13 deletions
+77
View File
@@ -1295,3 +1295,80 @@ class TestKGLazyCache:
mcp_server.tool_reconnect()
assert mcp_server._kg_by_path == {}
def test_call_kg_retries_after_concurrent_close(self, monkeypatch):
"""A KG closed mid-handler must trigger a one-shot retry with a fresh
instance — not surface a -32000 to the MCP client."""
import sqlite3 as _sqlite3
from mempalace import mcp_server
path = "/fake/palace/knowledge_graph.sqlite3"
monkeypatch.setattr(mcp_server, "_resolve_kg_path", lambda: path)
class _ClosedKG:
def query_entity(self, entity, **kwargs):
raise _sqlite3.ProgrammingError("Cannot operate on a closed database")
class _FreshKG:
def query_entity(self, entity, **kwargs):
return [{"entity": entity}]
cache = {os.path.abspath(path): _ClosedKG()}
monkeypatch.setattr(mcp_server, "_kg_by_path", cache)
# Second _get_kg() call (after the cache eviction) constructs a new
# KG. Patch the constructor so we don't open a real sqlite file.
monkeypatch.setattr(mcp_server, "KnowledgeGraph", lambda **_: _FreshKG())
result = mcp_server._call_kg(lambda kg: kg.query_entity("Alice"))
assert result == [{"entity": "Alice"}]
# The closed instance must be evicted; the fresh one must be cached.
assert isinstance(cache[os.path.abspath(path)], _FreshKG)
def test_call_kg_does_not_retry_on_other_errors(self, monkeypatch):
"""Non-ProgrammingError exceptions must propagate without retry —
we don't want the retry guard masking real bugs."""
from mempalace import mcp_server
path = "/fake/palace/knowledge_graph.sqlite3"
monkeypatch.setattr(mcp_server, "_resolve_kg_path", lambda: path)
calls = {"count": 0}
class _FailingKG:
def query_entity(self, entity, **kwargs):
calls["count"] += 1
raise ValueError("bad input")
monkeypatch.setattr(mcp_server, "_kg_by_path", {os.path.abspath(path): _FailingKG()})
monkeypatch.setattr(mcp_server, "KnowledgeGraph", lambda **_: _FailingKG())
with pytest.raises(ValueError, match="bad input"):
mcp_server._call_kg(lambda kg: kg.query_entity("Alice"))
assert calls["count"] == 1, "non-ProgrammingError must not trigger retry"
def test_call_kg_gives_up_after_one_retry(self, monkeypatch):
"""If the second attempt also hits a closed DB, give up rather than
loop forever — a sustained close-stream is a different bug."""
import sqlite3 as _sqlite3
from mempalace import mcp_server
path = "/fake/palace/knowledge_graph.sqlite3"
monkeypatch.setattr(mcp_server, "_resolve_kg_path", lambda: path)
calls = {"count": 0}
class _AlwaysClosedKG:
def query_entity(self, entity, **kwargs):
calls["count"] += 1
raise _sqlite3.ProgrammingError("closed again")
cache = {}
monkeypatch.setattr(mcp_server, "_kg_by_path", cache)
monkeypatch.setattr(mcp_server, "KnowledgeGraph", lambda **_: _AlwaysClosedKG())
with pytest.raises(_sqlite3.ProgrammingError):
mcp_server._call_kg(lambda kg: kg.query_entity("Alice"))
assert calls["count"] == 2, "expected exactly one retry beyond the initial attempt"