fix(migrate): harden swap rollback against partial cross-device copy

shutil.move() can partially create palace_path before raising, which would
trip a bare os.replace(stale_path, palace_path) rollback (dest exists).

- Switch the primary swap to os.replace so same-filesystem moves stay atomic
- Branch on errno.EXDEV before falling back to shutil.move, so real errors
  (permissions, EIO) surface instead of silently attempting a slow copy
- Extract rollback into _restore_stale_palace which clears any partial
  destination and, if the restore itself fails, logs both stale_path and
  palace_path so the operator can recover by hand

Adds three regression tests covering clean rollback, partial-copy cleanup,
and logged failure on rollback-failure.

Flagged by the Qodo reviewer on #935.
This commit is contained in:
shaun0927
2026-04-24 13:12:10 +09:00
parent fb1cf53919
commit 659cb815ea
2 changed files with 86 additions and 5 deletions
+30 -4
View File
@@ -16,6 +16,7 @@ Usage:
mempalace migrate --dry-run # show what would be migrated mempalace migrate --dry-run # show what would be migrated
""" """
import errno
import os import os
import shutil import shutil
import sqlite3 import sqlite3
@@ -23,6 +24,26 @@ from collections import defaultdict
from datetime import datetime from datetime import datetime
def _restore_stale_palace(palace_path: str, stale_path: str) -> None:
"""Roll back a failed swap.
shutil.move() can partially create palace_path before raising, which
would make a bare os.replace(stale_path, palace_path) fail (dest exists).
Clear any partial destination first, then restore. Best-effort: if the
restore itself fails, log both paths so the operator can recover by hand.
"""
try:
if os.path.lexists(palace_path):
shutil.rmtree(palace_path, ignore_errors=True)
os.replace(stale_path, palace_path)
except Exception as err:
print(
f" CRITICAL: rollback failed — original palace at {stale_path}, "
f"partial migration data at {palace_path}. Restore manually. "
f"({err})"
)
def extract_drawers_from_sqlite(db_path: str) -> list: def extract_drawers_from_sqlite(db_path: str) -> list:
"""Read all drawers directly from ChromaDB's SQLite, bypassing the API. """Read all drawers directly from ChromaDB's SQLite, bypassing the API.
@@ -235,14 +256,19 @@ def migrate(palace_path: str, dry_run: bool = False, confirm: bool = False):
stale_path = palace_path + ".old" stale_path = palace_path + ".old"
if os.path.exists(stale_path): if os.path.exists(stale_path):
shutil.rmtree(stale_path) shutil.rmtree(stale_path)
os.rename(palace_path, stale_path) os.replace(palace_path, stale_path)
try: try:
os.rename(temp_palace, palace_path) os.replace(temp_palace, palace_path)
except OSError: except OSError as e:
# EXDEV = temp lives on a different filesystem; fall back to copy+delete.
# Anything else is a real error — don't mask it with shutil.move.
if getattr(e, "errno", None) != errno.EXDEV:
_restore_stale_palace(palace_path, stale_path)
raise
try: try:
shutil.move(temp_palace, palace_path) shutil.move(temp_palace, palace_path)
except Exception: except Exception:
os.rename(stale_path, palace_path) _restore_stale_palace(palace_path, stale_path)
raise raise
shutil.rmtree(stale_path, ignore_errors=True) shutil.rmtree(stale_path, ignore_errors=True)
+56 -1
View File
@@ -1,9 +1,10 @@
"""Tests for destructive-operation safety in mempalace.migrate.""" """Tests for destructive-operation safety in mempalace.migrate."""
import os
from types import SimpleNamespace from types import SimpleNamespace
from unittest.mock import MagicMock, patch from unittest.mock import MagicMock, patch
from mempalace.migrate import migrate from mempalace.migrate import _restore_stale_palace, migrate
def test_migrate_requires_palace_database(tmp_path, capsys): def test_migrate_requires_palace_database(tmp_path, capsys):
@@ -46,3 +47,57 @@ def test_migrate_aborts_without_confirmation(tmp_path, capsys):
assert "Aborted." in out assert "Aborted." in out
mock_copytree.assert_not_called() mock_copytree.assert_not_called()
mock_rmtree.assert_not_called() mock_rmtree.assert_not_called()
def test_restore_stale_palace_with_clean_destination(tmp_path):
"""Rollback when no partial copy exists at palace_path."""
palace_path = tmp_path / "palace"
stale_path = tmp_path / "palace.old"
stale_path.mkdir()
(stale_path / "chroma.sqlite3").write_bytes(b"original")
_restore_stale_palace(str(palace_path), str(stale_path))
assert palace_path.is_dir()
assert (palace_path / "chroma.sqlite3").read_bytes() == b"original"
assert not stale_path.exists()
def test_restore_stale_palace_clears_partial_copy(tmp_path):
"""Rollback must remove a partially-copied palace_path before restoring.
Simulates the Qodo-reported hazard: shutil.move() began creating
palace_path, then failed. A bare os.replace(stale, palace_path) would
trip on the existing destination; _restore_stale_palace must clear it.
"""
palace_path = tmp_path / "palace"
stale_path = tmp_path / "palace.old"
stale_path.mkdir()
(stale_path / "chroma.sqlite3").write_bytes(b"original")
palace_path.mkdir()
(palace_path / "half-copied.bin").write_bytes(b"garbage")
_restore_stale_palace(str(palace_path), str(stale_path))
assert palace_path.is_dir()
assert (palace_path / "chroma.sqlite3").read_bytes() == b"original"
assert not (palace_path / "half-copied.bin").exists()
assert not stale_path.exists()
def test_restore_stale_palace_logs_and_swallows_on_failure(tmp_path, capsys):
"""If restore itself fails, log both paths — don't raise from rollback."""
palace_path = tmp_path / "palace"
stale_path = tmp_path / "palace.old"
stale_path.mkdir()
# Force os.replace to fail deterministically.
with patch("mempalace.migrate.os.replace", side_effect=OSError("boom")):
_restore_stale_palace(str(palace_path), str(stale_path))
out = capsys.readouterr().out
assert "CRITICAL" in out
assert os.fspath(palace_path) in out
assert os.fspath(stale_path) in out