fix: auto-repair BLOB seq_ids from chromadb 0.6→1.5 migration (#664)
Note from code review: (1) silent exception swallow on migration failure means caller proceeds with potentially corrupt DB — consider returning a boolean or re-raising in a follow-up. (2) No blob length validation before int.from_bytes — malformed rows could produce wrong seq_id values. Both are edge cases; the fix is still valuable for the common chromadb 0.6→1.5 migration path.
This commit is contained in:
@@ -1,11 +1,47 @@
|
|||||||
"""ChromaDB-backed MemPalace collection adapter."""
|
"""ChromaDB-backed MemPalace collection adapter."""
|
||||||
|
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import sqlite3
|
||||||
|
|
||||||
import chromadb
|
import chromadb
|
||||||
|
|
||||||
from .base import BaseCollection
|
from .base import BaseCollection
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _fix_blob_seq_ids(palace_path: str):
|
||||||
|
"""Fix ChromaDB 0.6.x -> 1.5.x migration bug: BLOB seq_ids -> INTEGER.
|
||||||
|
|
||||||
|
ChromaDB 0.6.x stored seq_id as big-endian 8-byte BLOBs. ChromaDB 1.5.x
|
||||||
|
expects INTEGER. The auto-migration doesn't convert existing rows, causing
|
||||||
|
the Rust compactor to crash with "mismatched types; Rust type u64 (as SQL
|
||||||
|
type INTEGER) is not compatible with SQL type BLOB".
|
||||||
|
|
||||||
|
Must run BEFORE PersistentClient is created (the compactor fires on init).
|
||||||
|
"""
|
||||||
|
db_path = os.path.join(palace_path, "chroma.sqlite3")
|
||||||
|
if not os.path.isfile(db_path):
|
||||||
|
return
|
||||||
|
try:
|
||||||
|
with sqlite3.connect(db_path) as conn:
|
||||||
|
for table in ("embeddings", "max_seq_id"):
|
||||||
|
try:
|
||||||
|
rows = conn.execute(
|
||||||
|
f"SELECT rowid, seq_id FROM {table} WHERE typeof(seq_id) = 'blob'"
|
||||||
|
).fetchall()
|
||||||
|
except sqlite3.OperationalError:
|
||||||
|
continue
|
||||||
|
if not rows:
|
||||||
|
continue
|
||||||
|
updates = [(int.from_bytes(blob, byteorder="big"), rowid) for rowid, blob in rows]
|
||||||
|
conn.executemany(f"UPDATE {table} SET seq_id = ? WHERE rowid = ?", updates)
|
||||||
|
logger.info("Fixed %d BLOB seq_ids in %s", len(updates), table)
|
||||||
|
conn.commit()
|
||||||
|
except Exception:
|
||||||
|
logger.exception("Could not fix BLOB seq_ids in %s", db_path)
|
||||||
|
|
||||||
|
|
||||||
class ChromaCollection(BaseCollection):
|
class ChromaCollection(BaseCollection):
|
||||||
"""Thin adapter over a ChromaDB collection."""
|
"""Thin adapter over a ChromaDB collection."""
|
||||||
@@ -46,6 +82,7 @@ class ChromaBackend:
|
|||||||
except (OSError, NotImplementedError):
|
except (OSError, NotImplementedError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
_fix_blob_seq_ids(palace_path)
|
||||||
client = chromadb.PersistentClient(path=palace_path)
|
client = chromadb.PersistentClient(path=palace_path)
|
||||||
if create:
|
if create:
|
||||||
collection = client.get_or_create_collection(collection_name)
|
collection = client.get_or_create_collection(collection_name)
|
||||||
|
|||||||
+49
-1
@@ -1,7 +1,9 @@
|
|||||||
|
import sqlite3
|
||||||
|
|
||||||
import chromadb
|
import chromadb
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from mempalace.backends.chroma import ChromaBackend, ChromaCollection
|
from mempalace.backends.chroma import ChromaBackend, ChromaCollection, _fix_blob_seq_ids
|
||||||
|
|
||||||
|
|
||||||
class _FakeCollection:
|
class _FakeCollection:
|
||||||
@@ -78,3 +80,49 @@ def test_chroma_backend_create_true_creates_directory_and_collection(tmp_path):
|
|||||||
|
|
||||||
client = chromadb.PersistentClient(path=str(palace_path))
|
client = chromadb.PersistentClient(path=str(palace_path))
|
||||||
client.get_collection("mempalace_drawers")
|
client.get_collection("mempalace_drawers")
|
||||||
|
|
||||||
|
|
||||||
|
def test_fix_blob_seq_ids_converts_blobs_to_integers(tmp_path):
|
||||||
|
"""Simulate a ChromaDB 0.6.x database with BLOB seq_ids and verify repair."""
|
||||||
|
db_path = tmp_path / "chroma.sqlite3"
|
||||||
|
conn = sqlite3.connect(str(db_path))
|
||||||
|
conn.execute("CREATE TABLE embeddings (rowid INTEGER PRIMARY KEY, seq_id)")
|
||||||
|
conn.execute("CREATE TABLE max_seq_id (rowid INTEGER PRIMARY KEY, seq_id)")
|
||||||
|
# Insert BLOB seq_ids like ChromaDB 0.6.x would
|
||||||
|
blob_42 = (42).to_bytes(8, byteorder="big")
|
||||||
|
blob_99 = (99).to_bytes(8, byteorder="big")
|
||||||
|
conn.execute("INSERT INTO embeddings (seq_id) VALUES (?)", (blob_42,))
|
||||||
|
conn.execute("INSERT INTO max_seq_id (seq_id) VALUES (?)", (blob_99,))
|
||||||
|
conn.commit()
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
_fix_blob_seq_ids(str(tmp_path))
|
||||||
|
|
||||||
|
conn = sqlite3.connect(str(db_path))
|
||||||
|
row = conn.execute("SELECT seq_id, typeof(seq_id) FROM embeddings").fetchone()
|
||||||
|
assert row == (42, "integer")
|
||||||
|
row = conn.execute("SELECT seq_id, typeof(seq_id) FROM max_seq_id").fetchone()
|
||||||
|
assert row == (99, "integer")
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def test_fix_blob_seq_ids_noop_without_blobs(tmp_path):
|
||||||
|
"""No error when seq_ids are already integers."""
|
||||||
|
db_path = tmp_path / "chroma.sqlite3"
|
||||||
|
conn = sqlite3.connect(str(db_path))
|
||||||
|
conn.execute("CREATE TABLE embeddings (rowid INTEGER PRIMARY KEY, seq_id INTEGER)")
|
||||||
|
conn.execute("INSERT INTO embeddings (seq_id) VALUES (42)")
|
||||||
|
conn.commit()
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
_fix_blob_seq_ids(str(tmp_path))
|
||||||
|
|
||||||
|
conn = sqlite3.connect(str(db_path))
|
||||||
|
row = conn.execute("SELECT seq_id, typeof(seq_id) FROM embeddings").fetchone()
|
||||||
|
assert row == (42, "integer")
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def test_fix_blob_seq_ids_noop_without_database(tmp_path):
|
||||||
|
"""No error when palace has no chroma.sqlite3."""
|
||||||
|
_fix_blob_seq_ids(str(tmp_path)) # should not raise
|
||||||
|
|||||||
Reference in New Issue
Block a user