diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py index ad7748f..e541f3b 100644 --- a/mempalace/backends/chroma.py +++ b/mempalace/backends/chroma.py @@ -28,6 +28,31 @@ _REQUIRED_OPERATORS = frozenset({"$eq", "$ne", "$in", "$nin", "$and", "$or", "$c _OPTIONAL_OPERATORS = frozenset({"$gt", "$gte", "$lt", "$lte"}) _SUPPORTED_OPERATORS = _REQUIRED_OPERATORS | _OPTIONAL_OPERATORS +# HNSW tuning to prevent link_lists.bin bloat on large mines (#344). +# +# With default params (batch_size=100, sync_threshold=1000, initial capacity +# 1000), inserting tens of thousands of drawers triggers ~30 index resizes +# and hundreds of persistDirty() calls. persistDirty uses relative seek +# positioning in link_lists.bin; accumulated seek drift across resize cycles +# causes the OS to extend the sparse file with zero-filled regions, each +# cycle compounding the next. Result: link_lists.bin grows into hundreds of +# GB sparse, after which `status`/`search`/`repair` segfault. +# +# Setting large batch and sync thresholds at collection creation defers +# persistence until a single large batch completes, breaking the resize+ +# persist feedback loop. Empirically validated on a 39,792-drawer rebuild +# (palace 376 MB, link_lists.bin 0 bytes, no segfault) in 2026-04. +# +# Note: chromadb 1.5.x exposes a `collection.modify(configuration={"hnsw": +# {"batch_size": ..., "sync_threshold": ...}})` retrofit path for already- +# created collections (`UpdateHNSWConfiguration` in chromadb's API), but +# we don't pursue that here — once link_lists.bin has bloated, the index +# is already corrupt and the only known recovery is a fresh mine. +_HNSW_BLOAT_GUARD = { + "hnsw:batch_size": 50_000, + "hnsw:sync_threshold": 50_000, +} + def _validate_where(where: Optional[dict]) -> None: """Scan a where-clause for unknown operators and raise ``UnsupportedFilterError``. 
@@ -992,7 +1017,11 @@ class ChromaBackend(BaseBackend): if create: collection = client.get_or_create_collection( collection_name, - metadata={"hnsw:space": hnsw_space, "hnsw:num_threads": 1}, + metadata={ + "hnsw:space": hnsw_space, + "hnsw:num_threads": 1, + **_HNSW_BLOAT_GUARD, + }, **ef_kwargs, ) else: @@ -1042,7 +1071,11 @@ class ChromaBackend(BaseBackend): ef_kwargs = {"embedding_function": ef} if ef is not None else {} collection = self._client(palace_path).create_collection( collection_name, - metadata={"hnsw:space": hnsw_space, "hnsw:num_threads": 1}, + metadata={ + "hnsw:space": hnsw_space, + "hnsw:num_threads": 1, + **_HNSW_BLOAT_GUARD, + }, **ef_kwargs, ) return ChromaCollection(collection) diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py index 9cc454e..43897c8 100644 --- a/mempalace/mcp_server.py +++ b/mempalace/mcp_server.py @@ -60,6 +60,7 @@ from .version import __version__ # noqa: E402 from .backends.chroma import ( # noqa: E402 ChromaBackend, ChromaCollection, + _HNSW_BLOAT_GUARD, _pin_hnsw_threads, hnsw_capacity_status, ) @@ -285,7 +286,11 @@ def _get_collection(create=False): # so the retrofit runs every time _get_collection opens a cache). raw = client.get_or_create_collection( _config.collection_name, - metadata={"hnsw:space": "cosine", "hnsw:num_threads": 1}, + metadata={ + "hnsw:space": "cosine", + "hnsw:num_threads": 1, + **_HNSW_BLOAT_GUARD, + }, ) _pin_hnsw_threads(raw) _collection_cache = ChromaCollection(raw) diff --git a/tests/test_backends.py b/tests/test_backends.py index 9fe5ca1..1b4e069 100644 --- a/tests/test_backends.py +++ b/tests/test_backends.py @@ -336,6 +336,42 @@ def test_chroma_backend_creates_collection_with_cosine_distance(tmp_path): assert col.metadata.get("hnsw:space") == "cosine" +def test_chroma_backend_sets_hnsw_bloat_guard_on_creation(tmp_path): + """The HNSW guard from #344 must land on freshly-created collection metadata. 
+ + Without batch_size + sync_threshold, mining ~10K+ drawers triggers the + resize+persist drift that bloats link_lists.bin into hundreds of GB sparse + and segfaults `status` / `search` / `repair`. The guard belongs at + collection-creation time so every fresh palace gets it without needing + a runtime retrofit. Asserting both keys land on the persisted metadata + also covers the #1161 "config silently dropped" concern at CI time. + """ + palace_path = tmp_path / "palace" + + ChromaBackend().get_collection( + str(palace_path), + collection_name="mempalace_drawers", + create=True, + ) + + client = chromadb.PersistentClient(path=str(palace_path)) + col = client.get_collection("mempalace_drawers") + assert col.metadata.get("hnsw:batch_size") == 50_000 + assert col.metadata.get("hnsw:sync_threshold") == 50_000 + + +def test_chroma_backend_create_collection_sets_hnsw_bloat_guard(tmp_path): + """The #344 HNSW guard must also apply via the legacy create_collection() path.""" + palace_path = tmp_path / "palace" + + ChromaBackend().create_collection(str(palace_path), "mempalace_drawers") + + client = chromadb.PersistentClient(path=str(palace_path)) + col = client.get_collection("mempalace_drawers") + assert col.metadata.get("hnsw:batch_size") == 50_000 + assert col.metadata.get("hnsw:sync_threshold") == 50_000 + + def test_fix_blob_seq_ids_converts_blobs_to_integers(tmp_path): """Simulate a ChromaDB 0.6.x database with BLOB seq_ids and verify repair.""" db_path = tmp_path / "chroma.sqlite3"