fix(repair): run SQLite integrity preflight before chromadb open

#1364 added the SQLite quick_check preflight to rebuild_index, but
placed it AFTER backend.get_collection(...). On a SQLite-corrupt
palace, chromadb's rust binding raises pyo3_runtime.PanicException —
which is not a regular Exception subclass — so it propagates past the
existing `except Exception` handlers and the user sees a 30-line stack
trace instead of the friendly abort message #1364 was designed to
deliver. Reproduced with `mempalace repair --yes` against a palace
whose chroma.sqlite3 has 4 mangled pages: pre-fix, panic; post-fix,
the clean abort message and exit code 1.

Two changes:

- mempalace/cli.py cmd_repair: run sqlite_integrity_errors() right
  after the basic palace-existence check, BEFORE the max_seq_id
  preflight (which itself opens sqlite3) and BEFORE backend =
  ChromaBackend(). Exit non-zero so unattended scripts and CI gates
  see the failure.

- mempalace/repair.py rebuild_index: same move at the function level
  for direct callers (tests, MCP) that bypass cmd_repair.

The new test test_rebuild_index_runs_sqlite_preflight_before_chromadb_open
uses a real chromadb-built palace (no ChromaBackend mock) plus a
real corrupt SQLite (16 KB of mangled pages) so the ordering is
exercised end-to-end. The previously-shipping test for the abort path
mocked both the backend and sqlite_integrity_errors, which is why the
ordering bug shipped CI-green.

Six existing test_cli.py cmd_repair tests used `(palace_dir /
"chroma.sqlite3").write_text("db")` to fake the SQLite file. The new
preflight correctly fails quick_check on those 2-byte stubs, so the
tests now create empty real SQLite DBs the same way the test_repair.py
fixtures already do.
This commit is contained in:
Igor Lins e Silva
2026-05-07 11:52:58 -03:00
parent f38d9eb109
commit 5134a635ed
4 changed files with 83 additions and 11 deletions
+14
View File
@@ -662,6 +662,8 @@ def cmd_repair(args):
_rebuild_collection_via_temp,
check_extraction_safety,
maybe_repair_poisoned_max_seq_id_before_rebuild,
print_sqlite_integrity_abort,
sqlite_integrity_errors,
)
config = MempalaceConfig()
@@ -743,6 +745,18 @@ def cmd_repair(args):
print(f"\n No palace database found at {db_path}")
return
# Run the SQLite integrity preflight before any chromadb client open.
# ChromaDB's rust binding raises pyo3_runtime.PanicException on a
# malformed page, which is not a regular Exception subclass and
# propagates past the try/except below — the user gets a 30-line
# stack trace instead of the friendly abort message. Run quick_check
# here so we can surface the clear recovery instructions and exit
# cleanly before chromadb's compactor touches the disk.
sqlite_errors = sqlite_integrity_errors(palace_path)
if sqlite_errors:
print_sqlite_integrity_abort(palace_path, sqlite_errors)
sys.exit(1)
preflight = maybe_repair_poisoned_max_seq_id_before_rebuild(
palace_path,
backup=getattr(args, "backup", True),
+11 -5
View File
@@ -633,6 +633,17 @@ def rebuild_index(
print(f"{'=' * 55}\n")
print(f" Palace: {palace_path}")
# Run the SQLite integrity preflight before any chromadb client open.
# ChromaDB's rust binding raises pyo3_runtime.PanicException (which is
# not a regular Exception subclass) on a malformed page, propagating
# past the try/except around get_collection below. Catching the
# corruption here lets us surface the clear recovery instructions and
# exit cleanly before chromadb's compactor touches the disk.
sqlite_errors = sqlite_integrity_errors(palace_path)
if sqlite_errors:
print_sqlite_integrity_abort(palace_path, sqlite_errors)
return
preflight = maybe_repair_poisoned_max_seq_id_before_rebuild(
palace_path,
assume_yes=True,
@@ -676,11 +687,6 @@ def rebuild_index(
print(e.message)
return
sqlite_errors = sqlite_integrity_errors(palace_path)
if sqlite_errors:
print_sqlite_integrity_abort(palace_path, sqlite_errors)
return
# Back up ONLY the SQLite database, not the bloated HNSW files
sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
backup_path = sqlite_path + ".backup"
+7 -6
View File
@@ -2,6 +2,7 @@
import argparse
import shlex
import sqlite3
import sys
from pathlib import Path
from unittest.mock import MagicMock, call, patch
@@ -774,7 +775,7 @@ def test_cmd_repair_requires_palace_database(mock_config_cls, tmp_path, capsys):
def test_cmd_repair_error_reading(mock_config_cls, tmp_path, capsys):
palace_dir = tmp_path / "palace"
palace_dir.mkdir()
(palace_dir / "chroma.sqlite3").write_text("db")
sqlite3.connect(str(palace_dir / "chroma.sqlite3")).close()
mock_config_cls.return_value.palace_path = str(palace_dir)
mock_config_cls.return_value.collection_name = "mempalace_drawers"
args = argparse.Namespace(palace=None)
@@ -790,7 +791,7 @@ def test_cmd_repair_error_reading(mock_config_cls, tmp_path, capsys):
def test_cmd_repair_zero_drawers(mock_config_cls, tmp_path, capsys):
palace_dir = tmp_path / "palace"
palace_dir.mkdir()
(palace_dir / "chroma.sqlite3").write_text("db")
sqlite3.connect(str(palace_dir / "chroma.sqlite3")).close()
mock_config_cls.return_value.palace_path = str(palace_dir)
mock_config_cls.return_value.collection_name = "mempalace_drawers"
args = argparse.Namespace(palace=None)
@@ -807,7 +808,7 @@ def test_cmd_repair_zero_drawers(mock_config_cls, tmp_path, capsys):
def test_cmd_repair_success(mock_config_cls, tmp_path, capsys):
palace_dir = tmp_path / "palace"
palace_dir.mkdir()
(palace_dir / "chroma.sqlite3").write_text("db")
sqlite3.connect(str(palace_dir / "chroma.sqlite3")).close()
mock_config_cls.return_value.palace_path = str(palace_dir)
mock_config_cls.return_value.collection_name = "mempalace_drawers"
args = argparse.Namespace(palace=None, yes=True)
@@ -843,7 +844,7 @@ def test_cmd_repair_success(mock_config_cls, tmp_path, capsys):
def test_cmd_repair_uses_configured_collection(mock_config_cls, tmp_path, capsys):
palace_dir = tmp_path / "palace"
palace_dir.mkdir()
(palace_dir / "chroma.sqlite3").write_text("db")
sqlite3.connect(str(palace_dir / "chroma.sqlite3")).close()
mock_config_cls.return_value.palace_path = str(palace_dir)
mock_config_cls.return_value.collection_name = "custom_drawers"
args = argparse.Namespace(palace=None, yes=True)
@@ -882,7 +883,7 @@ def test_cmd_repair_uses_configured_collection(mock_config_cls, tmp_path, capsys
def test_cmd_repair_restores_backup_on_live_rebuild_failure(mock_config_cls, tmp_path, capsys):
palace_dir = tmp_path / "palace"
palace_dir.mkdir()
(palace_dir / "chroma.sqlite3").write_text("db")
sqlite3.connect(str(palace_dir / "chroma.sqlite3")).close()
mock_config_cls.return_value.palace_path = str(palace_dir)
mock_config_cls.return_value.collection_name = "mempalace_drawers"
args = argparse.Namespace(palace=None, yes=True)
@@ -916,7 +917,7 @@ def test_cmd_repair_restores_backup_on_live_rebuild_failure(mock_config_cls, tmp
def test_cmd_repair_aborts_without_confirmation(mock_config_cls, tmp_path, capsys):
palace_dir = tmp_path / "palace"
palace_dir.mkdir()
(palace_dir / "chroma.sqlite3").write_text("db")
sqlite3.connect(str(palace_dir / "chroma.sqlite3")).close()
mock_config_cls.return_value.palace_path = str(palace_dir)
mock_config_cls.return_value.collection_name = "mempalace_drawers"
args = argparse.Namespace(palace=None)
+51
View File
@@ -1153,6 +1153,57 @@ def test_rebuild_index_aborts_on_sqlite_integrity_errors_before_delete_collectio
mock_shutil.copy2.assert_not_called()
def test_rebuild_index_runs_sqlite_preflight_before_chromadb_open(tmp_path, capsys):
"""The SQLite integrity preflight must run BEFORE backend.get_collection.
chromadb's rust binding raises pyo3_runtime.PanicException (which is not
a regular Exception subclass) on a malformed page, so any get_collection
call against a corrupt SQLite propagates past `except Exception` handlers
and produces a 30-line stack trace instead of the friendly abort message.
Regression test for the ordering bug where the preflight was placed after
the chromadb client open and therefore never reached on the cases it was
designed to catch (#1364 follow-up).
"""
palace = tmp_path / "palace"
palace.mkdir()
# Build a real chromadb palace with one drawer so chroma.sqlite3 exists
# at full schema size, then mangle several middle pages so PRAGMA
# quick_check fails with "disk image is malformed". This matches the
# production failure mode users hit in #1362 / #1364.
from mempalace.backends.chroma import ChromaBackend
backend = ChromaBackend()
try:
col = backend.create_collection(str(palace), "mempalace_drawers")
col.upsert(
ids=["d1"],
documents=["doc"],
metadatas=[{"wing": "w", "room": "r"}],
)
finally:
backend.close()
sqlite_path = palace / "chroma.sqlite3"
pre_size = sqlite_path.stat().st_size
assert pre_size > 16384, "need a multi-page sqlite db to mangle"
with open(sqlite_path, "r+b") as f:
f.seek(40960) # page 10
f.write(b"\xde\xad\xbe\xef" * 4096) # 16 KB of garbage
# No chromadb mocks: rebuild_index must reach sqlite_integrity_errors
# before any code path that opens a chromadb client. If the preflight
# comes too late, the test fails with pyo3_runtime.PanicException
# instead of returning cleanly.
repair.rebuild_index(palace_path=str(palace))
out = capsys.readouterr().out
assert "SQLite-layer corruption detected before repair rebuild" in out
assert "PRAGMA quick_check" in out
assert "disk image is malformed" in out
def test_max_seq_id_preflight_preserves_embeddings_queue(tmp_path):
"""#1295: default repair preflight must not drop queued writes."""