refactor(backends): typed QueryResult/GetResult, PalaceRef, BaseBackend registry (RFC 001 §10)

Advances RFC 001 §10 cleanup so backend-author PRs (#574 LanceDB, #665 Postgres,
#700 Qdrant, #697 hosted, #643 PalaceStore, #381 Qdrant) have a stable target
to align against.

Scope (this PR):

- Typed QueryResult / GetResult dataclasses replace Chroma's dict shape at
  the BaseCollection boundary (§1.3). A transitional _DictCompatMixin keeps
  existing callers working while the attribute-access migration proceeds.
- BaseCollection is now kwargs-only across add/upsert/query/get/delete/update
  with ABC defaults for estimated_count/close/health and a non-atomic default
  update() (§1.1–1.2).
- PalaceRef replaces raw path strings at the backend boundary (§2.2).
- BaseBackend ABC with get_collection/close_palace/close/health/detect (§2.3).
- mempalace.backends entry-point group + in-tree registry with
  resolve_backend_for_palace priority order matching §3.2–3.3.
- ChromaCollection normalizes chroma returns into typed results; unknown
  where-clause operators raise UnsupportedFilterError (no silent drop, §1.4).
- ChromaBackend absorbs the inode/mtime client-cache freshness check
  previously duplicated in mcp_server._get_client() (§10 + PR #757).
- searcher.py migrated to typed-attribute access as the reference call
  site; remaining callers land in a follow-up.
- pyproject: chroma registered via [project.entry-points."mempalace.backends"].

Out of scope (explicit follow-ups):

- Full caller migration off the dict-compat shim across palace.py,
  mcp_server.py, miner.py, convo_miner.py, dedup.py, repair.py, exporter.py,
  palace_graph.py, cli.py, closet_llm.py.
- Embedder injection + three-state EmbedderIdentityMismatchError check (§1.5).
- maintenance_state() / run_maintenance() benchmark hooks (§7.3).
- AbstractBackendContractSuite full coverage (§7.1–7.2).
- mempalace migrate / mempalace verify CLI rewrites through BaseCollection (§8).

Tests: 970 passed (up from 967 on develop); new coverage for typed results,
empty-result outer-shape preservation, $regex rejection, registry lookup,
priority resolver, and PalaceRef-kwarg ChromaBackend.get_collection.

Refs: #743 (RFC 001), #989 (RFC 002 tracking issue).
This commit is contained in:
Igor Lins e Silva
2026-04-18 12:45:16 -03:00
parent e4a2cd48a2
commit a17a8b734a
7 changed files with 1143 additions and 94 deletions
+408 -37
View File
@@ -1,17 +1,54 @@
"""ChromaDB-backed MemPalace collection adapter."""
"""ChromaDB-backed MemPalace storage backend (RFC 001 reference implementation)."""
import logging
import os
import sqlite3
from typing import Any, Optional
import chromadb
from .base import BaseCollection
from .base import (
BaseBackend,
BaseCollection,
GetResult,
HealthStatus,
PalaceNotFoundError,
PalaceRef,
QueryResult,
UnsupportedFilterError,
_IncludeSpec,
)
logger = logging.getLogger(__name__)
def _fix_blob_seq_ids(palace_path: str):
_REQUIRED_OPERATORS = frozenset({"$eq", "$ne", "$in", "$nin", "$and", "$or", "$contains"})
_OPTIONAL_OPERATORS = frozenset({"$gt", "$gte", "$lt", "$lte"})
_SUPPORTED_OPERATORS = _REQUIRED_OPERATORS | _OPTIONAL_OPERATORS
def _validate_where(where: Optional[dict]) -> None:
    """Reject a where-clause that uses an operator outside ``_SUPPORTED_OPERATORS``.

    The clause is walked with an explicit stack: dict values are descended
    into, and so are the dict items of list values. Anything that is not a
    dict (including a non-dict root) is ignored. RFC 001 §1.4 forbids silently
    dropping unknown operators, so the first one encountered is fatal.

    Args:
        where: Filter mapping to check. ``None`` or an empty mapping passes.

    Raises:
        UnsupportedFilterError: A key starting with ``$`` is not a member of
            ``_SUPPORTED_OPERATORS``.
    """
    if not where:
        return

    pending = [where]
    while pending:
        clause = pending.pop()
        if isinstance(clause, dict):
            for key, value in clause.items():
                # Only ``$``-prefixed keys are operators; plain keys are field names.
                if key.startswith("$") and key not in _SUPPORTED_OPERATORS:
                    raise UnsupportedFilterError(
                        f"operator {key!r} not supported by chroma backend"
                    )
                if isinstance(value, dict):
                    pending.append(value)
                elif isinstance(value, list):
                    pending += [item for item in value if isinstance(item, dict)]
def _fix_blob_seq_ids(palace_path: str) -> None:
"""Fix ChromaDB 0.6.x -> 1.5.x migration bug: BLOB seq_ids -> INTEGER.
ChromaDB 0.6.x stored seq_id as big-endian 8-byte BLOBs. ChromaDB 1.5.x
@@ -43,62 +80,293 @@ def _fix_blob_seq_ids(palace_path: str):
logger.exception("Could not fix BLOB seq_ids in %s", db_path)
# ---------------------------------------------------------------------------
# Collection adapter
# ---------------------------------------------------------------------------
def _as_list(v: Any) -> list:
    """Return *v* as a list.

    A list is returned unchanged (same object), ``None`` becomes an empty
    list, and any other value is wrapped in a one-element list.
    """
    if isinstance(v, list):
        return v
    return [] if v is None else [v]
class ChromaCollection(BaseCollection):
"""Thin adapter over a ChromaDB collection."""
"""Thin adapter translating ChromaDB dict returns into typed results."""
def __init__(self, collection):
self._collection = collection
def add(self, *, documents, ids, metadatas=None):
self._collection.add(documents=documents, ids=ids, metadatas=metadatas)
# ------------------------------------------------------------------
# Writes
# ------------------------------------------------------------------
def upsert(self, *, documents, ids, metadatas=None):
self._collection.upsert(documents=documents, ids=ids, metadatas=metadatas)
def add(self, *, documents, ids, metadatas=None, embeddings=None):
    """Forward an ``add`` call to the wrapped collection.

    ``documents`` and ``ids`` are always passed; ``metadatas`` and
    ``embeddings`` are passed only when they are not ``None``.
    """
    payload = {"documents": documents, "ids": ids}
    optional = {"metadatas": metadatas, "embeddings": embeddings}
    payload.update((name, value) for name, value in optional.items() if value is not None)
    self._collection.add(**payload)
def update(self, **kwargs):
def upsert(self, *, documents, ids, metadatas=None, embeddings=None):
    """Forward an ``upsert`` call to the wrapped collection.

    ``documents`` and ``ids`` are always passed; ``metadatas`` and
    ``embeddings`` are passed only when they are not ``None``.
    """
    payload = {"documents": documents, "ids": ids}
    for name, value in (("metadatas", metadatas), ("embeddings", embeddings)):
        if value is not None:
            payload[name] = value
    self._collection.upsert(**payload)
def update(self, *, ids, documents=None, metadatas=None, embeddings=None):
    """Forward an ``update`` call to the wrapped collection.

    Only the fields that are not ``None`` are passed along with ``ids``.

    Raises:
        ValueError: ``documents``, ``metadatas`` and ``embeddings`` are all ``None``.
    """
    candidates = {
        "documents": documents,
        "metadatas": metadatas,
        "embeddings": embeddings,
    }
    changes = {name: value for name, value in candidates.items() if value is not None}
    if not changes:
        raise ValueError("update requires at least one of documents, metadatas, embeddings")
    self._collection.update(ids=ids, **changes)
def query(self, **kwargs):
return self._collection.query(**kwargs)
# ------------------------------------------------------------------
# Reads
# ------------------------------------------------------------------
def get(self, **kwargs):
return self._collection.get(**kwargs)
def query(
self,
*,
query_texts=None,
query_embeddings=None,
n_results=10,
where=None,
where_document=None,
include=None,
) -> QueryResult:
_validate_where(where)
_validate_where(where_document)
def delete(self, **kwargs):
spec = _IncludeSpec.resolve(include, default_distances=True)
chroma_include: list[str] = []
if spec.documents:
chroma_include.append("documents")
if spec.metadatas:
chroma_include.append("metadatas")
if spec.distances:
chroma_include.append("distances")
if spec.embeddings:
chroma_include.append("embeddings")
kwargs: dict[str, Any] = {
"n_results": n_results,
"include": chroma_include,
}
if query_texts is not None:
kwargs["query_texts"] = query_texts
if query_embeddings is not None:
kwargs["query_embeddings"] = query_embeddings
if where is not None:
kwargs["where"] = where
if where_document is not None:
kwargs["where_document"] = where_document
raw = self._collection.query(**kwargs)
num_queries = (
len(query_texts)
if query_texts is not None
else (len(query_embeddings) if query_embeddings is not None else 1)
)
ids = raw.get("ids") or []
if not ids:
return QueryResult.empty(num_queries=num_queries)
documents = raw.get("documents") or [[] for _ in ids]
metadatas = raw.get("metadatas") or [[] for _ in ids]
distances = raw.get("distances") or [[] for _ in ids]
embeddings_raw = raw.get("embeddings") if spec.embeddings else None
def _none_list_to_empty(outer):
return [(inner or []) for inner in outer]
return QueryResult(
ids=_none_list_to_empty(ids),
documents=_none_list_to_empty(documents),
metadatas=_none_list_to_empty(metadatas),
distances=_none_list_to_empty(distances),
embeddings=(
[list(inner) for inner in embeddings_raw]
if spec.embeddings and embeddings_raw is not None
else None
),
)
def get(
    self,
    *,
    ids=None,
    where=None,
    where_document=None,
    limit=None,
    offset=None,
    include=None,
) -> GetResult:
    """Fetch records from the wrapped collection as a ``GetResult``.

    Both filter clauses are checked with ``_validate_where`` first. The
    ``include`` argument is resolved through ``_IncludeSpec.resolve`` and
    decides which of documents / metadatas / embeddings are requested.
    Selector arguments left as ``None`` are not forwarded.

    Returns:
        A ``GetResult`` whose ``documents`` and ``metadatas`` are empty lists
        when not requested, and otherwise padded (with ``""`` / ``{}``) up to
        the number of ids. ``embeddings`` is ``None`` unless requested and
        present in the raw response.
    """
    for clause in (where, where_document):
        _validate_where(clause)

    spec = _IncludeSpec.resolve(include, default_distances=False)
    requested = (
        ("documents", spec.documents),
        ("metadatas", spec.metadatas),
        ("embeddings", spec.embeddings),
    )
    call_args: dict[str, Any] = {"include": [field for field, wanted in requested if wanted]}
    selectors = {
        "ids": ids,
        "where": where,
        "where_document": where_document,
        "limit": limit,
        "offset": offset,
    }
    call_args.update((key, value) for key, value in selectors.items() if value is not None)

    raw = self._collection.get(**call_args)

    out_ids = list(raw.get("ids") or [])
    total = len(out_ids)

    # Pad doc/meta lists up to the id count so downstream zipping is safe.
    # A non-positive shortfall multiplies to an empty list, i.e. no padding.
    out_docs: list = []
    if spec.documents:
        out_docs = list(raw.get("documents") or [])
        out_docs.extend([""] * (total - len(out_docs)))

    out_metas: list = []
    if spec.metadatas:
        out_metas = list(raw.get("metadatas") or [])
        out_metas.extend([{}] * (total - len(out_metas)))

    out_embeds = None
    if spec.embeddings:
        fetched = raw.get("embeddings")
        if fetched is not None:
            out_embeds = [list(vector) for vector in fetched]

    return GetResult(
        ids=out_ids,
        documents=out_docs,
        metadatas=out_metas,
        embeddings=out_embeds,
    )
def delete(self, *, ids=None, where=None):
    """Forward a ``delete`` call to the wrapped collection.

    ``where`` is checked with ``_validate_where`` first. ``ids`` and ``where``
    are forwarded only when they are not ``None``.
    """
    _validate_where(where)
    selectors = {"ids": ids, "where": where}
    self._collection.delete(
        **{name: value for name, value in selectors.items() if value is not None}
    )
def count(self):
    """Return whatever the wrapped collection's ``count()`` reports."""
    total = self._collection.count()
    return total
class ChromaBackend:
"""Factory for MemPalace's default ChromaDB backend."""
# ---------------------------------------------------------------------------
# Backend
# ---------------------------------------------------------------------------
class ChromaBackend(BaseBackend):
"""MemPalace's default ChromaDB backend.
Maintains two caches:
* ``self._clients`` — ``palace_path -> PersistentClient`` for callers
using the ``PalaceRef`` / :meth:`get_collection` path.
* An inode+mtime freshness check absorbed from ``mcp_server._get_client``
(merged via #757) ensuring a palace rebuild on disk is detected on the
next :meth:`get_collection` call.
"""
name = "chroma"
capabilities = frozenset(
{
"supports_embeddings_in",
"supports_embeddings_passthrough",
"supports_embeddings_out",
"supports_metadata_filters",
"supports_contains_fast",
"local_mode",
}
)
def __init__(self):
# Per-instance client cache: palace_path -> chromadb.PersistentClient
self._clients: dict = {}
# palace_path -> PersistentClient
self._clients: dict[str, Any] = {}
# palace_path -> (inode, mtime) of chroma.sqlite3 at cache time.
self._freshness: dict[str, tuple[int, float]] = {}
self._closed = False
# ------------------------------------------------------------------
# Internal helpers
# ------------------------------------------------------------------
@staticmethod
def _db_stat(palace_path: str) -> tuple[int, float]:
    """Stat ``chroma.sqlite3`` inside *palace_path*.

    Returns:
        ``(st_ino, st_mtime)`` of the file, or ``(0, 0.0)`` when ``os.stat``
        raises ``OSError`` (e.g. the file does not exist).
    """
    try:
        info = os.stat(os.path.join(palace_path, "chroma.sqlite3"))
    except OSError:
        return (0, 0.0)
    return (info.st_ino, info.st_mtime)
def _client(self, palace_path: str):
"""Return a cached PersistentClient for *palace_path*, creating one if needed."""
if palace_path not in self._clients:
"""Return a cached ``PersistentClient``, rebuilding on inode/mtime change.
Handles the palace-rebuild case (repair/nuke/purge) by invalidating the
cache when ``chroma.sqlite3`` changes on disk. FAT/exFAT return inode 0,
so inode comparisons only fire when non-zero (matches #757 semantics).
"""
if self._closed:
from .base import BackendClosedError # late import avoids cycles at module load
raise BackendClosedError("ChromaBackend has been closed")
cached = self._clients.get(palace_path)
cached_inode, cached_mtime = self._freshness.get(palace_path, (0, 0.0))
current_inode, current_mtime = self._db_stat(palace_path)
inode_changed = current_inode != 0 and cached_inode != 0 and current_inode != cached_inode
mtime_changed = (
current_mtime != 0.0 and cached_mtime != 0.0 and current_mtime > cached_mtime
)
if cached is None or inode_changed or mtime_changed:
_fix_blob_seq_ids(palace_path)
self._clients[palace_path] = chromadb.PersistentClient(path=palace_path)
return self._clients[palace_path]
cached = chromadb.PersistentClient(path=palace_path)
self._clients[palace_path] = cached
self._freshness[palace_path] = (current_inode, current_mtime)
return cached
# ------------------------------------------------------------------
# Public static helpers (for callers that manage their own caching)
# Public static helpers (legacy; prefer :meth:`get_collection`)
# ------------------------------------------------------------------
@staticmethod
def make_client(palace_path: str):
"""Create and return a fresh PersistentClient (fix BLOB seq_ids first).
"""Create a fresh ``PersistentClient`` (fixes BLOB seq_ids first).
Intended for long-lived callers (e.g. mcp_server) that keep their own
inode/mtime-based client cache.
Deprecated-ish: exposed for legacy long-lived callers that manage their
own client cache. New code should obtain a collection through
:meth:`get_collection` which manages caching internally.
"""
_fix_blob_seq_ids(palace_path)
return chromadb.PersistentClient(path=palace_path)
@@ -109,12 +377,31 @@ class ChromaBackend:
return chromadb.__version__
# ------------------------------------------------------------------
# Collection lifecycle
# BaseBackend surface
# ------------------------------------------------------------------
def get_collection(self, palace_path: str, collection_name: str, create: bool = False):
def get_collection(
self,
*args,
**kwargs,
) -> ChromaCollection:
"""Obtain a collection for a palace.
Supports two calling conventions during the RFC 001 transition:
* New (preferred): ``get_collection(palace=PalaceRef, collection_name=...,
create=False, options=None)``.
* Legacy: ``get_collection(palace_path, collection_name, create=False)``
— still used by callers not yet migrated.
"""
palace_ref, collection_name, create, options = _normalize_get_collection_args(args, kwargs)
palace_path = palace_ref.local_path
if palace_path is None:
raise PalaceNotFoundError("ChromaBackend requires PalaceRef.local_path")
if not create and not os.path.isdir(palace_path):
raise FileNotFoundError(palace_path)
raise PalaceNotFoundError(palace_path)
if create:
os.makedirs(palace_path, exist_ok=True)
@@ -124,29 +411,113 @@ class ChromaBackend:
pass
client = self._client(palace_path)
hnsw_space = "cosine"
if options and isinstance(options, dict):
hnsw_space = options.get("hnsw_space", hnsw_space)
if create:
collection = client.get_or_create_collection(
collection_name, metadata={"hnsw:space": "cosine"}
collection_name, metadata={"hnsw:space": hnsw_space}
)
else:
collection = client.get_collection(collection_name)
return ChromaCollection(collection)
def get_or_create_collection(
self, palace_path: str, collection_name: str
) -> "ChromaCollection":
"""Shorthand for get_collection(..., create=True)."""
def close_palace(self, palace) -> None:
    """Forget the cached client and freshness entry for one palace.

    Args:
        palace: A ``PalaceRef`` (its ``local_path`` is used as the cache key)
            or a legacy path string used as the key directly. A ``None`` key
            is a no-op, as is a key that is not cached.
    """
    key = palace
    if isinstance(palace, PalaceRef):
        key = palace.local_path
    if key is not None:
        for cache in (self._clients, self._freshness):
            cache.pop(key, None)
def close(self) -> None:
    """Empty both per-palace caches and mark the backend as closed."""
    for cache in (self._clients, self._freshness):
        cache.clear()
    self._closed = True
def health(self, palace: Optional[PalaceRef] = None) -> HealthStatus:
    """Report backend health based solely on the closed flag.

    Returns ``HealthStatus.unhealthy("backend closed")`` once the backend is
    closed and ``HealthStatus.healthy()`` otherwise. The ``palace`` argument
    is accepted but not consulted here.
    """
    if not self._closed:
        return HealthStatus.healthy()
    return HealthStatus.unhealthy("backend closed")
@classmethod
def detect(cls, path: str) -> bool:
    """Return True when *path* contains a regular file named ``chroma.sqlite3``."""
    marker = os.path.join(path, "chroma.sqlite3")
    return os.path.isfile(marker)
# ------------------------------------------------------------------
# Legacy (pre-RFC 001) surface — retained while callers migrate.
# ------------------------------------------------------------------
def get_or_create_collection(self, palace_path: str, collection_name: str) -> ChromaCollection:
    """Legacy path-string shim: delegate to ``get_collection`` with ``create=True``."""
    collection = self.get_collection(palace_path, collection_name, create=True)
    return collection
def delete_collection(self, palace_path: str, collection_name: str) -> None:
"""Delete *collection_name* from the palace at *palace_path*."""
"""Delete ``collection_name`` from the palace at ``palace_path``."""
self._client(palace_path).delete_collection(collection_name)
def create_collection(
self, palace_path: str, collection_name: str, hnsw_space: str = "cosine"
) -> "ChromaCollection":
"""Create (not get-or-create) *collection_name* with cosine HNSW space."""
) -> ChromaCollection:
"""Create (not get-or-create) ``collection_name`` with the given HNSW space."""
collection = self._client(palace_path).create_collection(
collection_name, metadata={"hnsw:space": hnsw_space}
)
return ChromaCollection(collection)
def _normalize_get_collection_args(args, kwargs):
    """Unify the calling conventions accepted by ``get_collection``.

    Three shapes are recognised:

    * New style (keywords only): ``palace=PalaceRef, collection_name=...,
      create=False, options=None``.
    * Legacy positional: ``(palace_path, collection_name[, create])``, where
      ``collection_name`` and ``create`` may be given by keyword instead.
    * Legacy keywords: ``palace_path=..., collection_name=..., create=False``.

    Args:
        args: Positional arguments as received by ``get_collection``.
        kwargs: Keyword arguments as received by ``get_collection``. Recognised
            keys are popped from this dict.

    Returns:
        ``(PalaceRef, collection_name, create, options)``. For both legacy
        shapes ``create`` is coerced to ``bool`` and ``options`` is ``None``.

    Raises:
        TypeError: For any malformed call — ``palace=`` that is not a
            ``PalaceRef``, positional arguments combined with ``palace=``, a
            missing ``collection_name``, too many positional arguments, an
            argument supplied both positionally and by keyword, unknown
            keywords, or no palace at all.
    """
    # New style: palace= kwarg with a PalaceRef (spec path).
    if "palace" in kwargs:
        if args:
            raise TypeError("positional args not allowed with palace= kwarg")
        palace_ref = kwargs.pop("palace")
        if not isinstance(palace_ref, PalaceRef):
            raise TypeError("palace= must be a PalaceRef instance")
        # A missing name is a caller error like any other bad argument, so it
        # surfaces as TypeError rather than a bare KeyError from dict.pop().
        collection_name = kwargs.pop("collection_name", None)
        if collection_name is None:
            raise TypeError("collection_name is required")
        create = kwargs.pop("create", False)
        options = kwargs.pop("options", None)
        if kwargs:
            raise TypeError(f"unexpected kwargs: {sorted(kwargs)}")
        return palace_ref, collection_name, create, options

    # Legacy: first positional is a path string.
    if args:
        if len(args) > 3:
            raise TypeError(
                f"get_collection takes at most 3 positional arguments ({len(args)} given)"
            )
        palace_path = args[0]

        if len(args) >= 2:
            # Mirror normal call semantics: the same argument may not arrive
            # both positionally and by keyword.
            if "collection_name" in kwargs:
                raise TypeError("got multiple values for argument 'collection_name'")
            collection_name = args[1]
        else:
            collection_name = kwargs.pop("collection_name", None)
        if collection_name is None:
            raise TypeError("collection_name is required")

        if len(args) == 3:
            if "create" in kwargs:
                raise TypeError("got multiple values for argument 'create'")
            create = args[2]
        else:
            create = kwargs.pop("create", False)

        if kwargs:
            raise TypeError(f"unexpected kwargs: {sorted(kwargs)}")
        return (
            PalaceRef(id=palace_path, local_path=palace_path),
            collection_name,
            bool(create),
            None,
        )

    # Legacy kwargs-only (palace_path=..., collection_name=..., create=...)
    if "palace_path" in kwargs:
        palace_path = kwargs.pop("palace_path")
        collection_name = kwargs.pop("collection_name", None)
        if collection_name is None:
            raise TypeError("collection_name is required")
        create = kwargs.pop("create", False)
        if kwargs:
            raise TypeError(f"unexpected kwargs: {sorted(kwargs)}")
        return (
            PalaceRef(id=palace_path, local_path=palace_path),
            collection_name,
            bool(create),
            None,
        )

    raise TypeError("get_collection requires palace= or a positional palace_path")