fix(repair): preflight poisoned max_seq_id

This commit is contained in:
fatkobra
2026-05-05 07:22:10 +00:00
parent 1888b671e2
commit 37e7d394b8
3 changed files with 140 additions and 4 deletions
+16 -3
View File
@@ -654,7 +654,11 @@ def cmd_repair(args):
import shutil import shutil
from .backends.chroma import ChromaBackend from .backends.chroma import ChromaBackend
from .migrate import confirm_destructive_action, contains_palace_database from .migrate import confirm_destructive_action, contains_palace_database
from .repair import TruncationDetected, check_extraction_safety from .repair import (
TruncationDetected,
check_extraction_safety,
maybe_repair_poisoned_max_seq_id_before_rebuild,
)
palace_path = os.path.abspath( palace_path = os.path.abspath(
os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path
@@ -679,11 +683,20 @@ def cmd_repair(args):
print(f"\n No palace found at {palace_path}") print(f"\n No palace found at {palace_path}")
return return
if not contains_palace_database(palace_path): if not contains_palace_database(palace_path):
print(f"\n No palace database found at {db_path}") print(f"\n No palace database found at {db_path}")
return
preflight = maybe_repair_poisoned_max_seq_id_before_rebuild(
palace_path,
backup=getattr(args, "backup", True),
dry_run=getattr(args, "dry_run", False),
assume_yes=getattr(args, "yes", False),
)
if preflight is not None:
return return
print(f"\n{'=' * 55}") print(f"\n{'=' * 55}")
print(" MemPalace Repair") print(" MemPalace Repair")
print(f"{'=' * 55}\n") print(f"{'=' * 55}\n")
print(f" Palace: {palace_path}") print(f" Palace: {palace_path}")
+58 -1
View File
@@ -330,6 +330,56 @@ def sqlite_drawer_count(palace_path: str) -> "int | None":
return None return None
def maybe_repair_poisoned_max_seq_id_before_rebuild(
palace_path: str,
*,
backup: bool = True,
dry_run: bool = False,
assume_yes: bool = False,
) -> "dict | None":
"""Run non-destructive max_seq_id repair before a rebuild if needed.
A poisoned ``max_seq_id`` row can make Chroma believe it has already
consumed every row in ``embeddings_queue``. Writes then report success
because they land in the queue, but they never become visible in
``embeddings``.
If this precise corruption is present, do the narrow bookmark repair and
stop instead of continuing into the legacy rebuild path. The rebuild path
extracts only already-visible embeddings and can discard queued writes.
"""
db_path = os.path.join(palace_path, "chroma.sqlite3")
if not os.path.isfile(db_path):
return None
try:
poisoned = _detect_poisoned_max_seq_ids(db_path)
except Exception:
return None
if not poisoned:
return None
print("\n Detected poisoned max_seq_id rows before repair rebuild.")
print(
" This can make writes report success while embeddings_queue grows "
"and embeddings stay static."
)
print(" Running the non-destructive max_seq_id repair instead of rebuilding " "the collection.")
print(
" Queued writes remain in chroma.sqlite3 for Chroma to drain after "
"the bookmark is unpoisoned."
)
return repair_max_seq_id(
palace_path,
backup=backup,
dry_run=dry_run,
assume_yes=assume_yes,
)
def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False): def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
"""Rebuild the HNSW index from scratch. """Rebuild the HNSW index from scratch.
@@ -353,7 +403,14 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
print(f"\n{'=' * 55}") print(f"\n{'=' * 55}")
print(" MemPalace Repair — Index Rebuild") print(" MemPalace Repair — Index Rebuild")
print(f"{'=' * 55}\n") print(f"{'=' * 55}\n")
print(f" Palace: {palace_path}") print(f" Palace: {palace_path}")
preflight = maybe_repair_poisoned_max_seq_id_before_rebuild(
palace_path,
assume_yes=True,
)
if preflight is not None:
return
backend = ChromaBackend() backend = ChromaBackend()
try: try:
+66
View File
@@ -682,3 +682,69 @@ def test_max_seq_id_rollback_on_verification_failure(tmp_path, monkeypatch):
# A backup file is still present — caller can roll back from it. # A backup file is still present — caller can roll back from it.
leftover = [fn for fn in os.listdir(palace) if "max-seq-id-backup-" in fn] leftover = [fn for fn in os.listdir(palace) if "max-seq-id-backup-" in fn]
assert leftover assert leftover
def test_max_seq_id_preflight_preserves_embeddings_queue(tmp_path):
"""#1295: default repair preflight must not drop queued writes."""
palace = str(tmp_path / "palace")
seg = _seed_poisoned_max_seq_id(
palace,
drawers_meta_max=102,
closets_meta_max=11,
)
db_path = os.path.join(palace, "chroma.sqlite3")
with sqlite3.connect(db_path) as conn:
conn.executemany(
"INSERT INTO embeddings_queue(seq_id, topic, id) VALUES (?, ?, ?)",
[
(seq_id, "persistent://default/default/mempalace_drawers", f"queued-{seq_id}")
for seq_id in range(103, 123)
],
)
conn.commit()
result = repair.maybe_repair_poisoned_max_seq_id_before_rebuild(
palace,
assume_yes=True,
)
assert result is not None
assert result["segment_repaired"]
with sqlite3.connect(db_path) as conn:
max_seq_rows = dict(conn.execute("SELECT segment_id, seq_id FROM max_seq_id"))
queue_count = conn.execute("SELECT COUNT(*) FROM embeddings_queue").fetchone()[0]
assert max_seq_rows[seg["drawers_vec"]] == seg["drawers_meta_max"]
assert max_seq_rows[seg["drawers_meta"]] == seg["drawers_meta_max"]
assert max_seq_rows[seg["closets_vec"]] == seg["closets_meta_max"]
assert max_seq_rows[seg["closets_meta"]] == seg["closets_meta_max"]
# The old legacy rebuild path can discard queued writes. The preflight
# repair must leave them on disk for Chroma to drain after the bookmark is
# unpoisoned.
assert queue_count == 20
def test_rebuild_index_repairs_poisoned_max_seq_id_before_collection_rebuild(tmp_path, capsys):
"""A poisoned bookmark should short-circuit before the legacy rebuild path."""
palace = str(tmp_path / "palace")
_seed_poisoned_max_seq_id(palace)
with patch("mempalace.repair.ChromaBackend") as mock_backend:
repair.rebuild_index(palace)
out = capsys.readouterr().out
backend = mock_backend.return_value
# repair_max_seq_id may instantiate ChromaBackend to close cached clients
# after editing sqlite directly. That is safe. The important thing is that
# rebuild_index must not continue into the legacy Chroma collection read /
# count / rebuild path after the max_seq_id preflight handles the issue.
backend.get_collection.assert_not_called()
assert "Detected poisoned max_seq_id rows" in out
assert "non-destructive max_seq_id repair" in out