fix(migrate): harden swap rollback against partial cross-device copy

shutil.move() can partially create palace_path before raising, which would
trip a bare os.replace(stale_path, palace_path) rollback (dest exists).

- Switch the primary swap to os.replace so same-filesystem moves stay atomic
- Branch on errno.EXDEV before falling back to shutil.move, so real errors
  (permissions, EIO) surface instead of silently attempting a slow copy
- Extract rollback into _restore_stale_palace which clears any partial
  destination and, if the restore itself fails, logs both stale_path and
  palace_path so the operator can recover by hand

Adds three regression tests covering a clean rollback, cleanup of a partial
copy, and the logged (non-raising) path when the rollback itself fails.

Flagged by the Qodo reviewer on #935.
This commit is contained in:
shaun0927
2026-04-24 13:12:10 +09:00
parent fb1cf53919
commit 659cb815ea
2 changed files with 86 additions and 5 deletions
+56 -1
View File
@@ -1,9 +1,10 @@
"""Tests for destructive-operation safety in mempalace.migrate."""
import os
from types import SimpleNamespace
from unittest.mock import MagicMock, patch
from mempalace.migrate import migrate
from mempalace.migrate import _restore_stale_palace, migrate
def test_migrate_requires_palace_database(tmp_path, capsys):
@@ -46,3 +47,57 @@ def test_migrate_aborts_without_confirmation(tmp_path, capsys):
assert "Aborted." in out
mock_copytree.assert_not_called()
mock_rmtree.assert_not_called()
def test_restore_stale_palace_with_clean_destination(tmp_path):
    """Restore works when nothing exists at the palace location yet.

    Only the stale directory is present beforehand. Afterwards its contents
    must be found at the palace location and the stale directory must be gone.
    """
    target = tmp_path / "palace"
    backup = tmp_path / "palace.old"
    backup.mkdir()
    backup.joinpath("chroma.sqlite3").write_bytes(b"original")

    _restore_stale_palace(str(target), str(backup))

    restored_db = target / "chroma.sqlite3"
    assert target.is_dir()
    assert restored_db.read_bytes() == b"original"
    assert not backup.exists()
def test_restore_stale_palace_clears_partial_copy(tmp_path):
    """Restore must discard leftovers already sitting at the palace location.

    The setup mimics a move that started populating the destination and then
    failed: the palace directory exists and holds a stray file. After the
    restore, only the stale directory's original contents may remain there.
    """
    target = tmp_path / "palace"
    backup = tmp_path / "palace.old"

    # The known-good data that the rollback is expected to bring back.
    backup.mkdir()
    backup.joinpath("chroma.sqlite3").write_bytes(b"original")

    # Residue of an interrupted copy occupying the destination.
    target.mkdir()
    leftover = target / "half-copied.bin"
    leftover.write_bytes(b"garbage")

    _restore_stale_palace(str(target), str(backup))

    restored_db = target / "chroma.sqlite3"
    assert target.is_dir()
    assert restored_db.read_bytes() == b"original"
    assert not leftover.exists()
    assert not backup.exists()
def test_restore_stale_palace_logs_and_swallows_on_failure(tmp_path, capsys):
    """A failing restore must report both paths on stdout and not raise.

    ``os.replace`` is patched to raise ``OSError``. The call is expected to
    return normally, and the captured stdout must contain "CRITICAL" plus
    both the palace path and the stale path.
    """
    target = tmp_path / "palace"
    backup = tmp_path / "palace.old"
    backup.mkdir()

    # Make every os.replace call raise so the failure is deterministic.
    failing_replace = patch(
        "mempalace.migrate.os.replace", side_effect=OSError("boom")
    )
    with failing_replace:
        _restore_stale_palace(str(target), str(backup))

    captured = capsys.readouterr().out
    assert "CRITICAL" in captured
    for path in (target, backup):
        assert os.fspath(path) in captured