Merge pull request #1191 from funguf/fix/hnsw-index-bloat-rebased
fix: prevent HNSW index bloat from resize+persist cycles
This commit is contained in:
@@ -28,6 +28,31 @@ _REQUIRED_OPERATORS = frozenset({"$eq", "$ne", "$in", "$nin", "$and", "$or", "$c
|
||||
_OPTIONAL_OPERATORS = frozenset({"$gt", "$gte", "$lt", "$lte"})
|
||||
_SUPPORTED_OPERATORS = _REQUIRED_OPERATORS | _OPTIONAL_OPERATORS
|
||||
|
||||
# HNSW tuning to prevent link_lists.bin bloat on large mines (#344).
|
||||
#
|
||||
# With default params (batch_size=100, sync_threshold=1000, initial capacity
|
||||
# 1000), inserting tens of thousands of drawers triggers ~30 index resizes
|
||||
# and hundreds of persistDirty() calls. persistDirty uses relative seek
|
||||
# positioning in link_lists.bin; accumulated seek drift across resize cycles
|
||||
# causes the OS to extend the sparse file with zero-filled regions, each
|
||||
# cycle compounding the next. Result: link_lists.bin grows into hundreds of
|
||||
# GB sparse, after which `status`/`search`/`repair` segfault.
|
||||
#
|
||||
# Setting large batch and sync thresholds at collection creation defers
|
||||
# persistence until a single large batch completes, breaking the resize+
|
||||
# persist feedback loop. Empirically validated on a 39,792-drawer rebuild
|
||||
# (palace 376 MB, link_lists.bin 0 bytes, no segfault) in 2026-04.
|
||||
#
|
||||
# Note: chromadb 1.5.x exposes a `collection.modify(configuration={"hnsw":
|
||||
# {"batch_size": ..., "sync_threshold": ...}})` retrofit path for already-
|
||||
# created collections (`UpdateHNSWConfiguration` in chromadb's API), but
|
||||
# this module doesn't pursue that — once link_lists.bin has bloated, the index
|
||||
# is already corrupt and the only known recovery is a fresh mine.
|
||||
# Metadata entries splatted into the ``metadata=`` dict of the
# collection-creation calls. Both thresholds carry the same value, so the
# mapping is built from the key pair plus that single shared integer.
_HNSW_BLOAT_GUARD = dict.fromkeys(
    ("hnsw:batch_size", "hnsw:sync_threshold"),
    50_000,
)
|
||||
|
||||
|
||||
def _validate_where(where: Optional[dict]) -> None:
|
||||
"""Scan a where-clause for unknown operators and raise ``UnsupportedFilterError``.
|
||||
@@ -1014,7 +1039,11 @@ class ChromaBackend(BaseBackend):
|
||||
if create:
|
||||
collection = client.get_or_create_collection(
|
||||
collection_name,
|
||||
metadata={"hnsw:space": hnsw_space, "hnsw:num_threads": 1},
|
||||
metadata={
|
||||
"hnsw:space": hnsw_space,
|
||||
"hnsw:num_threads": 1,
|
||||
**_HNSW_BLOAT_GUARD,
|
||||
},
|
||||
**ef_kwargs,
|
||||
)
|
||||
else:
|
||||
@@ -1064,7 +1093,11 @@ class ChromaBackend(BaseBackend):
|
||||
ef_kwargs = {"embedding_function": ef} if ef is not None else {}
|
||||
collection = self._client(palace_path).create_collection(
|
||||
collection_name,
|
||||
metadata={"hnsw:space": hnsw_space, "hnsw:num_threads": 1},
|
||||
metadata={
|
||||
"hnsw:space": hnsw_space,
|
||||
"hnsw:num_threads": 1,
|
||||
**_HNSW_BLOAT_GUARD,
|
||||
},
|
||||
**ef_kwargs,
|
||||
)
|
||||
return ChromaCollection(collection)
|
||||
|
||||
@@ -60,6 +60,7 @@ from .version import __version__ # noqa: E402
|
||||
from .backends.chroma import ( # noqa: E402
|
||||
ChromaBackend,
|
||||
ChromaCollection,
|
||||
_HNSW_BLOAT_GUARD,
|
||||
_pin_hnsw_threads,
|
||||
hnsw_capacity_status,
|
||||
)
|
||||
@@ -285,7 +286,11 @@ def _get_collection(create=False):
|
||||
# so the retrofit runs every time _get_collection opens a cache).
|
||||
raw = client.get_or_create_collection(
|
||||
_config.collection_name,
|
||||
metadata={"hnsw:space": "cosine", "hnsw:num_threads": 1},
|
||||
metadata={
|
||||
"hnsw:space": "cosine",
|
||||
"hnsw:num_threads": 1,
|
||||
**_HNSW_BLOAT_GUARD,
|
||||
},
|
||||
)
|
||||
_pin_hnsw_threads(raw)
|
||||
_collection_cache = ChromaCollection(raw)
|
||||
|
||||
@@ -336,6 +336,42 @@ def test_chroma_backend_creates_collection_with_cosine_distance(tmp_path):
|
||||
assert col.metadata.get("hnsw:space") == "cosine"
|
||||
|
||||
|
||||
def test_chroma_backend_sets_hnsw_bloat_guard_on_creation(tmp_path):
    """Freshly-created collections must carry the HNSW guard from #344.

    When batch_size and sync_threshold are missing, mining ~10K+ drawers
    sets off the resize+persist drift that bloats link_lists.bin into
    hundreds of GB sparse and makes `status` / `search` / `repair`
    segfault. The guard is applied at collection-creation time so that
    every new palace has it and no runtime retrofit is required. Checking
    that both keys appear in the stored metadata also covers the #1161
    "config silently dropped" concern at CI time.
    """
    palace_dir = str(tmp_path / "palace")
    drawer_collection = "mempalace_drawers"

    ChromaBackend().get_collection(
        palace_dir,
        collection_name=drawer_collection,
        create=True,
    )

    # Read the metadata back through a second PersistentClient handle opened
    # on the same path rather than through the backend wrapper.
    reopened = chromadb.PersistentClient(path=palace_dir)
    stored = reopened.get_collection(drawer_collection)
    for guard_key in ("hnsw:batch_size", "hnsw:sync_threshold"):
        assert stored.metadata.get(guard_key) == 50_000
|
||||
|
||||
|
||||
def test_chroma_backend_create_collection_sets_hnsw_bloat_guard(tmp_path):
    """The legacy create_collection() path must apply the same HNSW guard."""
    palace_dir = str(tmp_path / "palace")
    drawer_collection = "mempalace_drawers"

    ChromaBackend().create_collection(palace_dir, drawer_collection)

    # Look the collection up again via a separate PersistentClient handle on
    # the same path and inspect the metadata it reports.
    reopened = chromadb.PersistentClient(path=palace_dir)
    stored = reopened.get_collection(drawer_collection)
    for guard_key in ("hnsw:batch_size", "hnsw:sync_threshold"):
        assert stored.metadata.get(guard_key) == 50_000
|
||||
|
||||
|
||||
def test_fix_blob_seq_ids_converts_blobs_to_integers(tmp_path):
|
||||
"""Simulate a ChromaDB 0.6.x database with BLOB seq_ids and verify repair."""
|
||||
db_path = tmp_path / "chroma.sqlite3"
|
||||
|
||||
Reference in New Issue
Block a user