From 7fa27bd23101f0c7dc4fa33f84be0b1d9710dbe2 Mon Sep 17 00:00:00 2001 From: Mika Cohen Date: Thu, 30 Apr 2026 09:31:32 -0600 Subject: [PATCH] fix(repair): rebuild collections through temp staging --- mempalace/repair.py | 159 +++++++++++++++++----- tests/test_repair.py | 314 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 436 insertions(+), 37 deletions(-) diff --git a/mempalace/repair.py b/mempalace/repair.py index 1cd1556..49d6abe 100644 --- a/mempalace/repair.py +++ b/mempalace/repair.py @@ -37,10 +37,13 @@ import time from datetime import datetime from typing import Optional +from chromadb.errors import NotFoundError as ChromaNotFoundError + from .backends.chroma import ChromaBackend, hnsw_capacity_status COLLECTION_NAME = "mempalace_drawers" +REPAIR_TEMP_COLLECTION = f"{COLLECTION_NAME}__repair_tmp" def _get_palace_path(): @@ -83,6 +86,108 @@ def _paginate_ids(col, where=None): return ids +def _extract_drawers(col, total: int, batch_size: int): + all_ids = [] + all_docs = [] + all_metas = [] + offset = 0 + while offset < total: + batch = col.get(limit=batch_size, offset=offset, include=["documents", "metadatas"]) + if not batch["ids"]: + break + all_ids.extend(batch["ids"]) + all_docs.extend(batch["documents"]) + all_metas.extend(batch["metadatas"]) + offset += len(batch["ids"]) + return all_ids, all_docs, all_metas + + +def _verify_collection_count(col, expected: int, label: str) -> None: + actual = col.count() + if actual != expected: + raise RuntimeError(f"{label} count mismatch: expected {expected}, got {actual}") + + +def _is_missing_collection_value_error(exc: ValueError) -> bool: + message = str(exc).lower() + return "does not exist" in message or "not found" in message + + +def _delete_collection_if_exists(backend, palace_path: str, collection_name: str) -> None: + try: + backend.delete_collection(palace_path, collection_name) + except ValueError as exc: + if _is_missing_collection_value_error(exc): + return + raise + except (FileNotFoundError, ChromaNotFoundError): + return + + +class RebuildCollectionError(RuntimeError): + """Raised when temp rebuild fails, carrying whether the live swap happened.""" + + def __init__(self, message: str, *, live_replaced: bool): + super().__init__(message) + self.live_replaced = live_replaced + + +def _rebuild_collection_via_temp( + backend, + palace_path: str, + all_ids, + all_docs, + all_metas, + batch_size: int, + progress=print, +) -> int: + expected = len(all_ids) + temp_name = REPAIR_TEMP_COLLECTION + live_replaced = False + + try: + _delete_collection_if_exists(backend, palace_path, temp_name) + + progress(f" Building temporary collection: {temp_name}") + temp_col = backend.create_collection(palace_path, temp_name) + staged = 0 + for i in range(0, expected, batch_size): + batch_ids = all_ids[i : i + batch_size] + batch_docs = all_docs[i : i + batch_size] + batch_metas = all_metas[i : i + batch_size] + temp_col.upsert(documents=batch_docs, ids=batch_ids, metadatas=batch_metas) + staged += len(batch_ids) + progress(f" Staged {staged}/{expected} drawers...") + _verify_collection_count(temp_col, expected, "temporary rebuild") + + progress(" Rebuilding live collection...") + backend.delete_collection(palace_path, COLLECTION_NAME) + live_replaced = True + new_col = backend.create_collection(palace_path, COLLECTION_NAME) + + rebuilt = 0 + for i in range(0, expected, batch_size): + batch_ids = all_ids[i : i + batch_size] + batch_docs = all_docs[i : i + batch_size] + batch_metas = all_metas[i : i + batch_size] + new_col.upsert(documents=batch_docs, ids=batch_ids, metadatas=batch_metas) + rebuilt += len(batch_ids) + progress(f" Re-filed {rebuilt}/{expected} drawers...") + _verify_collection_count(new_col, expected, "rebuilt live collection") + + try: + _delete_collection_if_exists(backend, palace_path, temp_name) + except Exception: + pass + return rebuilt + except Exception as exc: + try: + _delete_collection_if_exists(backend, palace_path, temp_name) + except Exception: + pass + raise RebuildCollectionError(str(exc), live_replaced=live_replaced) from exc + + def scan_palace(palace_path=None, only_wing=None): """Scan the palace for corrupt/unfetchable IDs. @@ -373,18 +478,7 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False): # Extract all drawers in batches print("\n Extracting drawers...") batch_size = 5000 - all_ids = [] - all_docs = [] - all_metas = [] - offset = 0 - while offset < total: - batch = col.get(limit=batch_size, offset=offset, include=["documents", "metadatas"]) - if not batch["ids"]: - break - all_ids.extend(batch["ids"]) - all_docs.extend(batch["documents"]) - all_metas.extend(batch["metadatas"]) - offset += len(batch["ids"]) + all_ids, all_docs, all_metas = _extract_drawers(col, total, batch_size) print(f" Extracted {len(all_ids)} drawers") # ── #1208 guard ────────────────────────────────────────────────── @@ -407,28 +501,33 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False): # Rebuild with correct HNSW settings print(" Rebuilding collection with hnsw:space=cosine...") - backend.delete_collection(palace_path, COLLECTION_NAME) - new_col = backend.create_collection(palace_path, COLLECTION_NAME) - - filed = 0 try: - for i in range(0, len(all_ids), batch_size): - batch_ids = all_ids[i : i + batch_size] - batch_docs = all_docs[i : i + batch_size] - batch_metas = all_metas[i : i + batch_size] - new_col.upsert(documents=batch_docs, ids=batch_ids, metadatas=batch_metas) - filed += len(batch_ids) - print(f" Re-filed {filed}/{len(all_ids)} drawers...") - except Exception as e: + filed = _rebuild_collection_via_temp( + backend, + palace_path, + all_ids, + all_docs, + all_metas, + batch_size, + progress=print, + ) + except RebuildCollectionError as e: print(f"\n ERROR during rebuild: {e}") - print(f" Only {filed}/{len(all_ids)} drawers were re-filed.") - if os.path.exists(backup_path): + print(" Rebuild aborted before completion.") + if e.live_replaced and os.path.exists(backup_path): print(f" Restoring from backup: {backup_path}") - backend.delete_collection(palace_path, COLLECTION_NAME) - shutil.copy2(backup_path, sqlite_path) - print(" Backup restored. Palace is back to pre-repair state.") - else: + try: + _close_chroma_handles(palace_path) + _delete_collection_if_exists(backend, palace_path, COLLECTION_NAME) + shutil.copy2(backup_path, sqlite_path) + print(" Backup restored. Palace is back to pre-repair state.") + except Exception as restore_error: + print(f" Backup restore failed: {restore_error}") + print(f" Manual restore required from: {backup_path}") + elif e.live_replaced: print(" No backup available. Re-mine from source files to recover.") + else: + print(" Live collection was not replaced; leaving the original palace untouched.") raise print(f"\n Repair complete. {filed} drawers rebuilt.") diff --git a/tests/test_repair.py b/tests/test_repair.py index bc770dd..33daad9 100644 --- a/tests/test_repair.py +++ b/tests/test_repair.py @@ -2,7 +2,7 @@ import os import sqlite3 -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock, call, patch import pytest @@ -229,8 +229,11 @@ def test_rebuild_index_success(mock_backend_cls, mock_shutil, tmp_path): } mock_new_col = MagicMock() + mock_new_col.count.return_value = 2 + mock_temp_col = MagicMock() + mock_temp_col.count.return_value = 2 mock_backend = _install_mock_backend(mock_backend_cls, mock_col) - mock_backend.create_collection.return_value = mock_new_col + mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col] repair.rebuild_index(palace_path=str(tmp_path)) @@ -239,14 +242,74 @@ def test_rebuild_index_success(mock_backend_cls, mock_shutil, tmp_path): assert "chroma.sqlite3" in str(mock_shutil.copy2.call_args) # Verify: deleted and recreated (cosine is the backend default) - mock_backend.delete_collection.assert_called_once_with(str(tmp_path), "mempalace_drawers") - mock_backend.create_collection.assert_called_once_with(str(tmp_path), "mempalace_drawers") + assert mock_backend.create_collection.call_args_list == [ + call(str(tmp_path), "mempalace_drawers__repair_tmp"), + call(str(tmp_path), "mempalace_drawers"), + ] + assert mock_backend.delete_collection.call_args_list == [ + call(str(tmp_path), "mempalace_drawers__repair_tmp"), + call(str(tmp_path), "mempalace_drawers"), + call(str(tmp_path), "mempalace_drawers__repair_tmp"), + ] # Verify: used upsert not add + mock_temp_col.upsert.assert_called_once() mock_new_col.upsert.assert_called_once() mock_new_col.add.assert_not_called() +@patch("mempalace.repair.shutil") +@patch("mempalace.repair.ChromaBackend") +def test_rebuild_index_ignores_missing_temp_collection_at_start( + mock_backend_cls, mock_shutil, tmp_path +): + sqlite_path = tmp_path / "chroma.sqlite3" + sqlite_path.write_text("fake") + + def _fake_copy2(src, dst): + with open(dst, "w") as handle: + handle.write("backup") + + mock_shutil.copy2.side_effect = _fake_copy2 + + mock_col = MagicMock() + mock_col.count.return_value = 2 + mock_col.get.return_value = { + "ids": ["id1", "id2"], + "documents": ["doc1", "doc2"], + "metadatas": [{"wing": "a"}, {"wing": "b"}], + } + + mock_new_col = MagicMock() + mock_new_col.count.return_value = 2 + mock_temp_col = MagicMock() + mock_temp_col.count.return_value = 2 + mock_backend = _install_mock_backend(mock_backend_cls, mock_col) + mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col] + mock_backend.delete_collection.side_effect = [ + ValueError("Collection [mempalace_drawers__repair_tmp] does not exist"), + None, + None, + ] + + repair.rebuild_index(palace_path=str(tmp_path)) + + assert mock_shutil.copy2.call_count == 1 + assert mock_backend.delete_collection.call_args_list == [ + call(str(tmp_path), "mempalace_drawers__repair_tmp"), + call(str(tmp_path), "mempalace_drawers"), + call(str(tmp_path), "mempalace_drawers__repair_tmp"), + ] + + +def test_delete_collection_if_exists_reraises_unexpected_value_error(): + mock_backend = MagicMock() + mock_backend.delete_collection.side_effect = ValueError("invalid collection name") + + with pytest.raises(ValueError, match="invalid collection name"): + repair._delete_collection_if_exists(mock_backend, "/palace", "bad/name") + + @patch("mempalace.repair.shutil") @patch("mempalace.repair.ChromaBackend") def test_rebuild_index_error_reading(mock_backend_cls, mock_shutil, tmp_path): @@ -365,19 +428,256 @@ def test_rebuild_index_proceeds_with_override(mock_backend_cls, mock_shutil, tmp }, {"ids": [], "documents": [], "metadatas": []}, ] + mock_temp_col = MagicMock() + mock_temp_col.count.return_value = 10_000 mock_new_col = MagicMock() + mock_new_col.count.return_value = 10_000 mock_backend.get_collection.return_value = mock_col - mock_backend.create_collection.return_value = mock_new_col + mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col] mock_backend_cls.return_value = mock_backend with patch("mempalace.repair.sqlite_drawer_count", return_value=67_580): repair.rebuild_index(palace_path=str(tmp_path), confirm_truncation_ok=True) - mock_backend.delete_collection.assert_called_once() - mock_backend.create_collection.assert_called_once() + assert mock_backend.delete_collection.call_count == 3 + assert mock_backend.create_collection.call_count == 2 + mock_temp_col.upsert.assert_called() mock_new_col.upsert.assert_called() +@patch("mempalace.repair.shutil") +@patch("mempalace.repair.ChromaBackend") +def test_rebuild_index_stage_failure_leaves_live_collection_untouched( + mock_backend_cls, mock_shutil, tmp_path +): + sqlite_path = tmp_path / "chroma.sqlite3" + sqlite_path.write_text("fake") + + mock_col = MagicMock() + mock_col.count.return_value = 2 + mock_col.get.return_value = { + "ids": ["id1", "id2"], + "documents": ["doc1", "doc2"], + "metadatas": [{"wing": "a"}, {"wing": "b"}], + } + mock_temp_col = MagicMock() + mock_temp_col.count.return_value = 1 + mock_backend = _install_mock_backend(mock_backend_cls, mock_col) + mock_backend.create_collection.return_value = mock_temp_col + + with pytest.raises(repair.RebuildCollectionError) as excinfo: + repair.rebuild_index(palace_path=str(tmp_path)) + + assert excinfo.value.live_replaced is False + assert mock_shutil.copy2.call_count == 1 + assert mock_backend.delete_collection.call_args_list == [ + call(str(tmp_path), "mempalace_drawers__repair_tmp"), + call(str(tmp_path), "mempalace_drawers__repair_tmp"), + ] + + +@patch("mempalace.repair.shutil") +@patch("mempalace.repair.ChromaBackend") +def test_rebuild_index_live_failure_restores_backup(mock_backend_cls, mock_shutil, tmp_path): + sqlite_path = tmp_path / "chroma.sqlite3" + sqlite_path.write_text("fake") + + def _fake_copy2(src, dst): + with open(dst, "w") as handle: + handle.write("backup") + + mock_shutil.copy2.side_effect = _fake_copy2 + + mock_col = MagicMock() + mock_col.count.return_value = 2 + mock_col.get.return_value = { + "ids": ["id1", "id2"], + "documents": ["doc1", "doc2"], + "metadatas": [{"wing": "a"}, {"wing": "b"}], + } + mock_temp_col = MagicMock() + mock_temp_col.count.return_value = 2 + mock_new_col = MagicMock() + mock_new_col.upsert.side_effect = RuntimeError("live upsert failed") + mock_backend = _install_mock_backend(mock_backend_cls, mock_col) + mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col] + + with pytest.raises(repair.RebuildCollectionError) as excinfo: + repair.rebuild_index(palace_path=str(tmp_path)) + + assert excinfo.value.live_replaced is True + assert mock_shutil.copy2.call_count == 2 + assert mock_backend.delete_collection.call_args_list == [ + call(str(tmp_path), "mempalace_drawers__repair_tmp"), + call(str(tmp_path), "mempalace_drawers"), + call(str(tmp_path), "mempalace_drawers__repair_tmp"), + call(str(tmp_path), "mempalace_drawers"), + ] + + +@patch("mempalace.repair.shutil") +@patch("mempalace.repair.ChromaBackend") +def test_rebuild_index_live_delete_missing_still_restores_backup( + mock_backend_cls, mock_shutil, tmp_path +): + sqlite_path = tmp_path / "chroma.sqlite3" + sqlite_path.write_text("fake") + + def _fake_copy2(src, dst): + with open(dst, "w") as handle: + handle.write("backup") + + mock_shutil.copy2.side_effect = _fake_copy2 + + mock_col = MagicMock() + mock_col.count.return_value = 2 + mock_col.get.return_value = { + "ids": ["id1", "id2"], + "documents": ["doc1", "doc2"], + "metadatas": [{"wing": "a"}, {"wing": "b"}], + } + mock_temp_col = MagicMock() + mock_temp_col.count.return_value = 2 + mock_backend = _install_mock_backend(mock_backend_cls, mock_col) + mock_backend.create_collection.side_effect = [mock_temp_col, RuntimeError("create failed")] + mock_backend.delete_collection.side_effect = [ + None, + None, + None, + repair.ChromaNotFoundError("missing"), + ] + + with pytest.raises(repair.RebuildCollectionError) as excinfo: + repair.rebuild_index(palace_path=str(tmp_path)) + + assert excinfo.value.live_replaced is True + assert mock_shutil.copy2.call_count == 2 + assert mock_backend.delete_collection.call_args_list == [ + call(str(tmp_path), "mempalace_drawers__repair_tmp"), + call(str(tmp_path), "mempalace_drawers"), + call(str(tmp_path), "mempalace_drawers__repair_tmp"), + call(str(tmp_path), "mempalace_drawers"), + ] + + +@patch("mempalace.repair.shutil") +@patch("mempalace.repair.ChromaBackend") +def test_rebuild_index_restore_failure_preserves_original_error( + mock_backend_cls, mock_shutil, tmp_path, capsys +): + sqlite_path = tmp_path / "chroma.sqlite3" + sqlite_path.write_text("fake") + + def _copy2_side_effect(src, dst): + if str(src).endswith(".backup"): + raise PermissionError("locked sqlite") + with open(dst, "w") as handle: + handle.write("backup") + + mock_shutil.copy2.side_effect = _copy2_side_effect + + mock_col = MagicMock() + mock_col.count.return_value = 2 + mock_col.get.return_value = { + "ids": ["id1", "id2"], + "documents": ["doc1", "doc2"], + "metadatas": [{"wing": "a"}, {"wing": "b"}], + } + mock_temp_col = MagicMock() + mock_temp_col.count.return_value = 2 + mock_new_col = MagicMock() + mock_new_col.upsert.side_effect = RuntimeError("live upsert failed") + mock_backend = _install_mock_backend(mock_backend_cls, mock_col) + mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col] + + with pytest.raises(repair.RebuildCollectionError) as excinfo: + repair.rebuild_index(palace_path=str(tmp_path)) + + out = capsys.readouterr().out + assert "locked sqlite" in out + assert "Manual restore required" in out + assert "live upsert failed" in str(excinfo.value) + + +@patch("mempalace.repair.ChromaBackend") +def test_rebuild_collection_via_temp_keeps_original_error_when_cleanup_fails( + mock_backend_cls, +): + mock_col = MagicMock() + mock_col.count.return_value = 2 + mock_temp_col = MagicMock() + mock_temp_col.count.return_value = 2 + mock_backend = _install_mock_backend(mock_backend_cls, mock_col) + mock_backend.create_collection.side_effect = [mock_temp_col, RuntimeError("live build failed")] + mock_backend.delete_collection.side_effect = [ + None, + None, + RuntimeError("cleanup failed"), + ] + + with pytest.raises(repair.RebuildCollectionError) as excinfo: + repair._rebuild_collection_via_temp( + mock_backend, + "/palace", + ["id1", "id2"], + ["doc1", "doc2"], + [{"wing": "a"}, {"wing": "b"}], + batch_size=5000, + progress=lambda *args, **kwargs: None, + ) + + assert "live build failed" in str(excinfo.value) + assert excinfo.value.live_replaced is True + assert mock_backend.delete_collection.call_args_list == [ + call("/palace", "mempalace_drawers__repair_tmp"), + call("/palace", "mempalace_drawers"), + call("/palace", "mempalace_drawers__repair_tmp"), + ] + + +@patch("mempalace.repair.shutil") +@patch("mempalace.repair.ChromaBackend") +def test_rebuild_index_ignores_temp_cleanup_failure_after_success( + mock_backend_cls, mock_shutil, tmp_path +): + sqlite_path = tmp_path / "chroma.sqlite3" + sqlite_path.write_text("fake") + + def _fake_copy2(src, dst): + with open(dst, "w") as handle: + handle.write("backup") + + mock_shutil.copy2.side_effect = _fake_copy2 + + mock_col = MagicMock() + mock_col.count.return_value = 2 + mock_col.get.return_value = { + "ids": ["id1", "id2"], + "documents": ["doc1", "doc2"], + "metadatas": [{"wing": "a"}, {"wing": "b"}], + } + mock_temp_col = MagicMock() + mock_temp_col.count.return_value = 2 + mock_new_col = MagicMock() + mock_new_col.count.return_value = 2 + mock_backend = _install_mock_backend(mock_backend_cls, mock_col) + mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col] + mock_backend.delete_collection.side_effect = [ + None, + None, + RuntimeError("cleanup failed"), + ] + + repair.rebuild_index(palace_path=str(tmp_path)) + + assert mock_shutil.copy2.call_count == 1 + assert mock_backend.delete_collection.call_args_list == [ + call(str(tmp_path), "mempalace_drawers__repair_tmp"), + call(str(tmp_path), "mempalace_drawers"), + call(str(tmp_path), "mempalace_drawers__repair_tmp"), + ] + + # ── repair_max_seq_id ─────────────────────────────────────────────────