fix(repair): preflight SQLite integrity before rebuild
This commit is contained in:
@@ -330,6 +330,71 @@ def sqlite_drawer_count(palace_path: str) -> "int | None":
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def sqlite_integrity_errors(palace_path: str) -> list[str]:
|
||||||
|
"""Return SQLite quick_check errors for chroma.sqlite3.
|
||||||
|
|
||||||
|
The repair rebuild path eventually calls Chroma's delete_collection().
|
||||||
|
If the SQLite layer has corrupt secondary indexes or FTS5 shadow pages,
|
||||||
|
Chroma can raise an opaque SQLITE_CORRUPT_INDEX / code 779 error before
|
||||||
|
repair reaches the HNSW rebuild.
|
||||||
|
|
||||||
|
Run a direct SQLite quick_check first so repair can fail with a clear,
|
||||||
|
actionable message before invoking Chroma's destructive collection-delete
|
||||||
|
path.
|
||||||
|
"""
|
||||||
|
|
||||||
|
sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
|
||||||
|
if not os.path.exists(sqlite_path):
|
||||||
|
return []
|
||||||
|
|
||||||
|
try:
|
||||||
|
with sqlite3.connect(f"file:{sqlite_path}?mode=ro", uri=True) as conn:
|
||||||
|
rows = conn.execute("PRAGMA quick_check").fetchall()
|
||||||
|
except sqlite3.Error as e:
|
||||||
|
return [f"PRAGMA quick_check failed: {e}"]
|
||||||
|
|
||||||
|
errors: list[str] = []
|
||||||
|
for row in rows:
|
||||||
|
if not row:
|
||||||
|
continue
|
||||||
|
message = str(row[0])
|
||||||
|
if message.lower() != "ok":
|
||||||
|
errors.append(message)
|
||||||
|
|
||||||
|
return errors
|
||||||
|
|
||||||
|
|
||||||
|
def print_sqlite_integrity_abort(palace_path: str, errors: list[str]) -> None:
|
||||||
|
"""Print a clear repair abort message for SQLite-layer corruption."""
|
||||||
|
|
||||||
|
sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
|
||||||
|
preview = errors[:5]
|
||||||
|
|
||||||
|
print("\n ABORT: SQLite-layer corruption detected before repair rebuild.")
|
||||||
|
print(" `mempalace repair` will not call Chroma delete_collection() because")
|
||||||
|
print(" the SQLite database failed `PRAGMA quick_check`.")
|
||||||
|
print()
|
||||||
|
print(f" Database: {sqlite_path}")
|
||||||
|
print()
|
||||||
|
print(" quick_check output:")
|
||||||
|
for message in preview:
|
||||||
|
print(f" - {message}")
|
||||||
|
if len(errors) > len(preview):
|
||||||
|
print(f" ... and {len(errors) - len(preview)} more issue(s)")
|
||||||
|
print()
|
||||||
|
print(" This often means derived SQLite structures, such as secondary indexes")
|
||||||
|
print(" or FTS5 shadow tables, are corrupt while the underlying rows may still")
|
||||||
|
print(" be recoverable.")
|
||||||
|
print()
|
||||||
|
print(" Suggested recovery:")
|
||||||
|
print(" 1. Stop all MemPalace writers / MCP clients.")
|
||||||
|
print(" 2. Back up the entire palace directory.")
|
||||||
|
print(" 3. Recover chroma.sqlite3 offline with sqlite3 `.recover` or `.dump`.")
|
||||||
|
print(" 4. Recreate the FTS5 virtual table from intact embedding_metadata rows.")
|
||||||
|
print(" 5. Verify `PRAGMA integrity_check` returns `ok`.")
|
||||||
|
print(" 6. Re-run `mempalace repair --yes`.")
|
||||||
|
|
||||||
|
|
||||||
def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
|
def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
|
||||||
"""Rebuild the HNSW index from scratch.
|
"""Rebuild the HNSW index from scratch.
|
||||||
|
|
||||||
@@ -397,6 +462,11 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
|
|||||||
print(e.message)
|
print(e.message)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
sqlite_errors = sqlite_integrity_errors(palace_path)
|
||||||
|
if sqlite_errors:
|
||||||
|
print_sqlite_integrity_abort(palace_path, sqlite_errors)
|
||||||
|
return
|
||||||
|
|
||||||
# Back up ONLY the SQLite database, not the bloated HNSW files
|
# Back up ONLY the SQLite database, not the bloated HNSW files
|
||||||
sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
|
sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
|
||||||
backup_path = sqlite_path + ".backup"
|
backup_path = sqlite_path + ".backup"
|
||||||
|
|||||||
+77
-5
@@ -216,9 +216,11 @@ def test_rebuild_index_empty_palace(mock_backend_cls, mock_shutil, tmp_path):
|
|||||||
@patch("mempalace.repair.shutil")
|
@patch("mempalace.repair.shutil")
|
||||||
@patch("mempalace.repair.ChromaBackend")
|
@patch("mempalace.repair.ChromaBackend")
|
||||||
def test_rebuild_index_success(mock_backend_cls, mock_shutil, tmp_path):
|
def test_rebuild_index_success(mock_backend_cls, mock_shutil, tmp_path):
|
||||||
# Create a fake sqlite file
|
# Create a valid sqlite file so the repair preflight can run quick_check.
|
||||||
sqlite_path = tmp_path / "chroma.sqlite3"
|
sqlite_path = tmp_path / "chroma.sqlite3"
|
||||||
sqlite_path.write_text("fake")
|
with sqlite3.connect(sqlite_path) as conn:
|
||||||
|
conn.execute("CREATE TABLE dummy(id INTEGER PRIMARY KEY)")
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
mock_col = MagicMock()
|
mock_col = MagicMock()
|
||||||
mock_col.count.return_value = 2
|
mock_col.count.return_value = 2
|
||||||
@@ -234,15 +236,15 @@ def test_rebuild_index_success(mock_backend_cls, mock_shutil, tmp_path):
|
|||||||
|
|
||||||
repair.rebuild_index(palace_path=str(tmp_path))
|
repair.rebuild_index(palace_path=str(tmp_path))
|
||||||
|
|
||||||
# Verify: backed up sqlite only (not copytree)
|
# Verify: backed up sqlite only, not copytree.
|
||||||
mock_shutil.copy2.assert_called_once()
|
mock_shutil.copy2.assert_called_once()
|
||||||
assert "chroma.sqlite3" in str(mock_shutil.copy2.call_args)
|
assert "chroma.sqlite3" in str(mock_shutil.copy2.call_args)
|
||||||
|
|
||||||
# Verify: deleted and recreated (cosine is the backend default)
|
# Verify: deleted and recreated.
|
||||||
mock_backend.delete_collection.assert_called_once_with(str(tmp_path), "mempalace_drawers")
|
mock_backend.delete_collection.assert_called_once_with(str(tmp_path), "mempalace_drawers")
|
||||||
mock_backend.create_collection.assert_called_once_with(str(tmp_path), "mempalace_drawers")
|
mock_backend.create_collection.assert_called_once_with(str(tmp_path), "mempalace_drawers")
|
||||||
|
|
||||||
# Verify: used upsert not add
|
# Verify: used upsert, not add.
|
||||||
mock_new_col.upsert.assert_called_once()
|
mock_new_col.upsert.assert_called_once()
|
||||||
mock_new_col.add.assert_not_called()
|
mock_new_col.add.assert_not_called()
|
||||||
|
|
||||||
@@ -682,3 +684,73 @@ def test_max_seq_id_rollback_on_verification_failure(tmp_path, monkeypatch):
|
|||||||
# A backup file is still present — caller can roll back from it.
|
# A backup file is still present — caller can roll back from it.
|
||||||
leftover = [fn for fn in os.listdir(palace) if "max-seq-id-backup-" in fn]
|
leftover = [fn for fn in os.listdir(palace) if "max-seq-id-backup-" in fn]
|
||||||
assert leftover
|
assert leftover
|
||||||
|
|
||||||
|
|
||||||
|
def test_sqlite_integrity_errors_returns_empty_for_healthy_db(tmp_path):
|
||||||
|
palace = tmp_path / "palace"
|
||||||
|
palace.mkdir()
|
||||||
|
db_path = palace / "chroma.sqlite3"
|
||||||
|
|
||||||
|
with sqlite3.connect(db_path) as conn:
|
||||||
|
conn.execute("CREATE TABLE dummy(id INTEGER PRIMARY KEY)")
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
|
assert repair.sqlite_integrity_errors(str(palace)) == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_sqlite_integrity_errors_reports_unreadable_sqlite_file(tmp_path):
|
||||||
|
palace = tmp_path / "palace"
|
||||||
|
palace.mkdir()
|
||||||
|
db_path = palace / "chroma.sqlite3"
|
||||||
|
db_path.write_bytes(b"not a sqlite database")
|
||||||
|
|
||||||
|
errors = repair.sqlite_integrity_errors(str(palace))
|
||||||
|
|
||||||
|
assert errors
|
||||||
|
assert "quick_check failed" in errors[0]
|
||||||
|
|
||||||
|
|
||||||
|
@patch("mempalace.repair.shutil")
|
||||||
|
@patch("mempalace.repair.ChromaBackend")
|
||||||
|
def test_rebuild_index_aborts_on_sqlite_integrity_errors_before_delete_collection(
|
||||||
|
mock_backend_cls,
|
||||||
|
mock_shutil,
|
||||||
|
tmp_path,
|
||||||
|
capsys,
|
||||||
|
):
|
||||||
|
"""Regression for #1362: fail before Chroma delete_collection on sqlite corruption."""
|
||||||
|
|
||||||
|
sqlite_path = tmp_path / "chroma.sqlite3"
|
||||||
|
with sqlite3.connect(sqlite_path) as conn:
|
||||||
|
conn.execute("CREATE TABLE dummy(id INTEGER PRIMARY KEY)")
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
|
mock_col = MagicMock()
|
||||||
|
mock_col.count.return_value = 2
|
||||||
|
mock_col.get.return_value = {
|
||||||
|
"ids": ["id1", "id2"],
|
||||||
|
"documents": ["doc1", "doc2"],
|
||||||
|
"metadatas": [{"wing": "a"}, {"wing": "b"}],
|
||||||
|
}
|
||||||
|
|
||||||
|
mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
|
||||||
|
|
||||||
|
with patch(
|
||||||
|
"mempalace.repair.sqlite_integrity_errors",
|
||||||
|
return_value=[
|
||||||
|
"Page 4 of B-tree 12345: database disk image is malformed",
|
||||||
|
"Page 8 of B-tree 67890: database disk image is malformed",
|
||||||
|
],
|
||||||
|
):
|
||||||
|
repair.rebuild_index(palace_path=str(tmp_path))
|
||||||
|
|
||||||
|
out = capsys.readouterr().out
|
||||||
|
|
||||||
|
assert "SQLite-layer corruption detected before repair rebuild" in out
|
||||||
|
assert "PRAGMA quick_check" in out
|
||||||
|
assert "delete_collection" in out
|
||||||
|
assert "Page 4 of B-tree" in out
|
||||||
|
|
||||||
|
mock_backend.delete_collection.assert_not_called()
|
||||||
|
mock_backend.create_collection.assert_not_called()
|
||||||
|
mock_shutil.copy2.assert_not_called()
|
||||||
|
|||||||
Reference in New Issue
Block a user