merge: develop into fix/1362-repair-sqlite-integrity-preflight (round 2)
#1357 (max_seq_id preflight) merged into develop while this branch was in CI, opening a fresh conflict between the two preflight helpers. mempalace/repair.py: - Kept both: this branch's sqlite_integrity_errors() / print_sqlite_integrity_abort() AND develop's maybe_repair_poisoned_max_seq_id_before_rebuild() from #1357. They check for distinct corruption classes and run as separate preflights. tests/test_repair.py: - Kept both this branch's sqlite_integrity_errors test group and develop's max_seq_id preflight test group; non-overlapping coverage. Local: 1623 tests pass, ruff lint+format clean against 0.4.x CI pin.
This commit is contained in:
+51
-3
@@ -18,8 +18,10 @@ from mempalace.backends import (
|
||||
from mempalace.backends.chroma import (
|
||||
ChromaBackend,
|
||||
ChromaCollection,
|
||||
_HNSW_MISSING_METADATA_DATA_FLOOR,
|
||||
_fix_blob_seq_ids,
|
||||
_pin_hnsw_threads,
|
||||
_segment_appears_healthy,
|
||||
quarantine_invalid_hnsw_metadata,
|
||||
quarantine_stale_hnsw,
|
||||
)
|
||||
@@ -685,9 +687,9 @@ def test_quarantine_stale_hnsw_leaves_healthy_segment_with_drift_alone(tmp_path)
|
||||
assert seg.exists()
|
||||
|
||||
|
||||
def test_quarantine_stale_hnsw_leaves_segment_without_metadata_alone(tmp_path):
|
||||
"""Segment with no metadata file is treated as fresh / never-flushed
|
||||
and not quarantined — renaming an empty dir orphans nothing."""
|
||||
def test_quarantine_stale_hnsw_leaves_empty_segment_without_metadata_alone(tmp_path):
|
||||
"""Missing metadata is okay only when the segment has no meaningful data yet."""
|
||||
|
||||
now = 1_700_000_000.0
|
||||
palace, seg = _make_palace_with_segment(
|
||||
tmp_path,
|
||||
@@ -695,11 +697,57 @@ def test_quarantine_stale_hnsw_leaves_segment_without_metadata_alone(tmp_path):
|
||||
sqlite_mtime=now,
|
||||
meta_bytes=None,
|
||||
)
|
||||
|
||||
moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
|
||||
|
||||
assert moved == []
|
||||
assert seg.exists()
|
||||
|
||||
|
||||
def test_segment_without_metadata_but_with_nontrivial_data_is_unhealthy(tmp_path):
    """A metadata-less segment whose data exceeds the floor is a partial flush, not fresh.

    The segment directory contains only ``data_level0.bin`` (no
    ``index_metadata.pickle``), sized one byte above
    ``_HNSW_MISSING_METADATA_DATA_FLOOR``.
    """
    segment_dir = tmp_path / "abcd-1234-5678"
    segment_dir.mkdir()

    # One byte over the floor: just enough to count as non-trivial data.
    payload = b"\0" * (_HNSW_MISSING_METADATA_DATA_FLOOR + 1)
    (segment_dir / "data_level0.bin").write_bytes(payload)

    healthy = _segment_appears_healthy(str(segment_dir))
    assert not healthy
|
||||
|
||||
|
||||
def test_segment_without_metadata_and_tiny_data_is_still_treated_as_fresh(tmp_path):
    """A metadata-less segment whose data is exactly at the floor is left alone.

    Tiny data payloads can exist before the metadata file has been flushed,
    so a ``data_level0.bin`` of exactly ``_HNSW_MISSING_METADATA_DATA_FLOOR``
    bytes must still be reported as healthy.
    """
    segment_dir = tmp_path / "abcd-1234-5678"
    segment_dir.mkdir()

    # Exactly at the floor (not above it): the boundary case that stays healthy.
    payload = b"\0" * _HNSW_MISSING_METADATA_DATA_FLOOR
    (segment_dir / "data_level0.bin").write_bytes(payload)

    healthy = _segment_appears_healthy(str(segment_dir))
    assert healthy
|
||||
|
||||
|
||||
def test_quarantine_stale_hnsw_renames_missing_metadata_with_nontrivial_data(tmp_path):
    """Regression for #1274: missing pickle + non-trivial data must quarantine.

    The segment is built without metadata bytes, with HNSW timestamps two
    hours behind the sqlite timestamp, and is then given an above-floor
    ``data_level0.bin``. With ``stale_seconds=3600.0`` the segment must be
    moved to a ``.drift-`` directory that still contains the data file.
    """
    now = 1_700_000_000.0
    two_hours_ago = now - 7200

    palace, seg = _make_palace_with_segment(
        tmp_path,
        hnsw_mtime=two_hours_ago,
        sqlite_mtime=now,
        meta_bytes=None,
    )

    data_file = seg / "data_level0.bin"
    data_file.write_bytes(b"\0" * (_HNSW_MISSING_METADATA_DATA_FLOOR + 1))
    # Writing the file refreshes its mtime; push it back to the stale timestamp.
    os.utime(data_file, (two_hours_ago, two_hours_ago))

    moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)

    # Exactly one segment was renamed, and the original path is gone.
    assert len(moved) == 1
    assert ".drift-" in moved[0]
    assert not seg.exists()

    # The quarantined directory lives inside the palace and kept the data file.
    drift_dirs = [entry for entry in palace.iterdir() if ".drift-" in entry.name]
    assert len(drift_dirs) == 1
    quarantined_data = drift_dirs[0] / "data_level0.bin"
    assert quarantined_data.exists()
|
||||
|
||||
|
||||
def test_quarantine_stale_hnsw_renames_truncated_metadata(tmp_path):
|
||||
"""Segment with a truncated (under-floor-size) metadata file is
|
||||
quarantined — shape of a partial-flush during process kill."""
|
||||
|
||||
@@ -1153,6 +1153,72 @@ def test_rebuild_index_aborts_on_sqlite_integrity_errors_before_delete_collectio
|
||||
mock_shutil.copy2.assert_not_called()
|
||||
|
||||
|
||||
def test_max_seq_id_preflight_preserves_embeddings_queue(tmp_path):
    """#1295: default repair preflight must not drop queued writes.

    Seeds a palace with poisoned ``max_seq_id`` rows, inserts 20 rows into
    ``embeddings_queue``, runs the preflight repair with ``assume_yes=True``,
    and then checks both that the bookmarks match the seeded metadata maxima
    and that every queued row is still present.
    """
    palace = str(tmp_path / "palace")
    seg = _seed_poisoned_max_seq_id(
        palace,
        drawers_meta_max=102,
        closets_meta_max=11,
    )
    db_path = os.path.join(palace, "chroma.sqlite3")

    # Queue entries with seq_ids just past the seeded drawers maximum (102).
    topic = "persistent://default/default/mempalace_drawers"
    queued_rows = [(seq_id, topic, f"queued-{seq_id}") for seq_id in range(103, 123)]
    with sqlite3.connect(db_path) as conn:
        conn.executemany(
            "INSERT INTO embeddings_queue(seq_id, topic, id) VALUES (?, ?, ?)",
            queued_rows,
        )
        conn.commit()

    result = repair.maybe_repair_poisoned_max_seq_id_before_rebuild(
        palace,
        assume_yes=True,
    )

    assert result is not None
    assert result["segment_repaired"]

    with sqlite3.connect(db_path) as conn:
        max_seq_rows = dict(conn.execute("SELECT segment_id, seq_id FROM max_seq_id"))
        queue_count = conn.execute("SELECT COUNT(*) FROM embeddings_queue").fetchone()[0]

    # Each segment's bookmark must equal the metadata maximum of its collection.
    expected_bookmarks = (
        ("drawers_vec", "drawers_meta_max"),
        ("drawers_meta", "drawers_meta_max"),
        ("closets_vec", "closets_meta_max"),
        ("closets_meta", "closets_meta_max"),
    )
    for segment_key, max_key in expected_bookmarks:
        assert max_seq_rows[seg[segment_key]] == seg[max_key]

    # The old legacy rebuild path can discard queued writes. The preflight
    # repair must leave them on disk for Chroma to drain after the bookmark is
    # unpoisoned.
    assert queue_count == len(queued_rows)
|
||||
|
||||
|
||||
def test_rebuild_index_repairs_poisoned_max_seq_id_before_collection_rebuild(tmp_path, capsys):
    """A poisoned bookmark should short-circuit before the legacy rebuild path.

    Runs ``repair.rebuild_index`` against a palace seeded with poisoned
    ``max_seq_id`` rows while ``ChromaBackend`` is patched, then verifies the
    collection was never fetched and that the preflight messages were printed.
    """
    palace = str(tmp_path / "palace")
    _seed_poisoned_max_seq_id(palace)

    with patch("mempalace.repair.ChromaBackend") as mock_backend:
        repair.rebuild_index(palace)

    # repair_max_seq_id may instantiate ChromaBackend to close cached clients
    # after editing sqlite directly. That is safe. The important thing is that
    # rebuild_index must not continue into the legacy Chroma collection read /
    # count / rebuild path after the max_seq_id preflight handles the issue.
    mock_backend.return_value.get_collection.assert_not_called()

    printed = capsys.readouterr().out
    expected_messages = (
        "Detected poisoned max_seq_id rows",
        "non-destructive max_seq_id repair",
    )
    for message in expected_messages:
        assert message in printed
|
||||
|
||||
|
||||
# ── extract_via_sqlite + rebuild_from_sqlite (#1308) ──────────────────
|
||||
#
|
||||
# These tests build real chromadb palaces in tmp_path rather than mocking
|
||||
|
||||
Reference in New Issue
Block a user