From ef8d83cc8ac295ceae70982b40842b2727413236 Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Fri, 8 May 2026 01:00:00 -0300 Subject: [PATCH 1/3] fix(mine): identify lock holder + exit non-zero on contention MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a `mempalace mine` collided with another writer (live mcp_server, another mine, anything taking mine_palace_lock), the operator saw a generic "another `mempalace mine` is already running" message and the CLI exited 0 — making the contention invisible to nohup or scripts checking $?. The reporter ran a `nohup mempalace mine ... & disown` and got a 200-byte log with only the auto-defaults warning, no clue that an MCP server was holding the store. palace.py: the lock file now records the holder's PID + first three argv tokens on acquire. A failed acquire reads the file and surfaces "palace is held by PID N (mempalace mcp_server); wait for it to finish or stop the holder before retrying" in the MineAlreadyRunning message. Open mode changes from "w" to "a+" so the prior holder's identity survives long enough to be read. miner.mine() now lets MineAlreadyRunning propagate. cmd_mine catches it, prints the holder-aware message to stderr, and exits non-zero so shell wrappers detect the contention. Note: this is a behavior change for in-process callers that depended on miner.mine() silently swallowing MineAlreadyRunning. The silent swallow was the bug. Closes #1264 --- mempalace/cli.py | 56 ++++++++++++++++----------- mempalace/miner.py | 34 +++++++--------- mempalace/palace.py | 51 ++++++++++++++++++++++-- tests/test_cli.py | 39 +++++++++++++++++++ tests/test_palace_locks.py | 79 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 213 insertions(+), 46 deletions(-) diff --git a/mempalace/cli.py b/mempalace/cli.py index 6a531e7..964fa84 100644 --- a/mempalace/cli.py +++ b/mempalace/cli.py @@ -500,31 +500,41 @@ def cmd_mine(args): llm_provider=None, ) - if args.mode == "convos": - from .convo_miner import mine_convos + from .palace import MineAlreadyRunning - mine_convos( - convo_dir=args.dir, - palace_path=palace_path, - wing=args.wing, - agent=args.agent, - limit=args.limit, - dry_run=args.dry_run, - extract_mode=args.extract, - ) - else: - from .miner import mine + try: + if args.mode == "convos": + from .convo_miner import mine_convos - mine( - project_dir=args.dir, - palace_path=palace_path, - wing_override=args.wing, - agent=args.agent, - limit=args.limit, - dry_run=args.dry_run, - respect_gitignore=not args.no_gitignore, - include_ignored=include_ignored, - ) + mine_convos( + convo_dir=args.dir, + palace_path=palace_path, + wing=args.wing, + agent=args.agent, + limit=args.limit, + dry_run=args.dry_run, + extract_mode=args.extract, + ) + else: + from .miner import mine + + mine( + project_dir=args.dir, + palace_path=palace_path, + wing_override=args.wing, + agent=args.agent, + limit=args.limit, + dry_run=args.dry_run, + respect_gitignore=not args.no_gitignore, + include_ignored=include_ignored, + ) + except MineAlreadyRunning as exc: + # A live MCP server or another mine is already writing to this + # palace. Surface the holder identity so the operator knows what + # to wait for (or stop), and exit non-zero so wrappers like + # nohup / scripts can detect the contention. + print(f"mempalace: {exc}", file=sys.stderr) + sys.exit(1) def cmd_sweep(args): diff --git a/mempalace/miner.py b/mempalace/miner.py index 6aeddd4..09cc517 100644 --- a/mempalace/miner.py +++ b/mempalace/miner.py @@ -21,7 +21,6 @@ from typing import Optional from .palace import ( NORMALIZE_VERSION, SKIP_DIRS, - MineAlreadyRunning, build_closet_lines, file_already_mined, get_closets_collection, @@ -1035,26 +1034,21 @@ def mine( files=files, ) - try: - with mine_palace_lock(palace_path): - return _mine_impl( - project_dir, - palace_path, - wing_override=wing_override, - agent=agent, - limit=limit, - dry_run=dry_run, - respect_gitignore=respect_gitignore, - include_ignored=include_ignored, - files=files, - ) - except MineAlreadyRunning: - print( - f"mempalace: another `mine` is already running against " - f"{palace_path} — exiting cleanly.", - file=sys.stderr, + # MineAlreadyRunning propagates so the CLI can render a clear holder-aware + # message and exit non-zero. In-process callers (tests, library users) that + # expect to coexist with another writer should handle the exception. + with mine_palace_lock(palace_path): + return _mine_impl( + project_dir, + palace_path, + wing_override=wing_override, + agent=agent, + limit=limit, + dry_run=dry_run, + respect_gitignore=respect_gitignore, + include_ignored=include_ignored, + files=files, ) - return def _mine_impl( diff --git a/mempalace/palace.py b/mempalace/palace.py index dee5c8f..375b5e1 100644 --- a/mempalace/palace.py +++ b/mempalace/palace.py @@ -9,6 +9,7 @@ import hashlib import logging import os import re +import sys import threading from typing import Optional @@ -364,6 +365,41 @@ def _mark_released(lock_key: str) -> None: _holder_state().discard(lock_key) +def _format_lock_holder(content: str) -> str: + """Render a lock-file body as 'PID N (cmdline)' for diagnostic messages.""" + parts = content.split(maxsplit=1) + if not parts or not parts[0].isdigit(): + return "another writer (identity not recorded)" + pid = parts[0] + if len(parts) > 1 and parts[1].strip(): + return f"PID {pid} ({parts[1].strip()})" + return f"PID {pid}" + + +def _read_lock_holder(lock_file) -> str: + """Read the prior holder's identity from the lock-file body, best-effort.""" + try: + lock_file.seek(0) + content = lock_file.read().strip() + except OSError: + return "another writer (identity not recorded)" + if not content: + return "another writer (identity not recorded)" + return _format_lock_holder(content) + + +def _write_lock_holder(lock_file) -> None: + """Record this process's identity in the lock-file body. Best-effort.""" + try: + ident = f"{os.getpid()} {' '.join(sys.argv[:3])}".strip() + lock_file.seek(0) + lock_file.truncate() + lock_file.write(ident) + lock_file.flush() + except OSError: + pass + + @contextlib.contextmanager def mine_palace_lock(palace_path: str): """Per-palace non-blocking lock around the full `mine` pipeline. @@ -407,7 +443,10 @@ def mine_palace_lock(palace_path: str): yield return - lf = open(lock_path, "w") + # "a+" preserves the prior holder's identity recorded inside the file so + # a failed acquire can name who is holding the lock (#1264). "w" mode + # would have truncated the file before we could read it. + lf = open(lock_path, "a+") acquired = False try: if os.name == "nt": @@ -417,8 +456,10 @@ def mine_palace_lock(palace_path: str): msvcrt.locking(lf.fileno(), msvcrt.LK_NBLCK, 1) acquired = True except OSError as exc: + holder = _read_lock_holder(lf) raise MineAlreadyRunning( - f"another `mempalace mine` is already running against {resolved}" + f"palace {resolved} is held by {holder}; " + "wait for it to finish or stop the holder before retrying" ) from exc else: import fcntl @@ -427,9 +468,13 @@ def mine_palace_lock(palace_path: str): fcntl.flock(lf, fcntl.LOCK_EX | fcntl.LOCK_NB) acquired = True except BlockingIOError as exc: + holder = _read_lock_holder(lf) raise MineAlreadyRunning( - f"another `mempalace mine` is already running against {resolved}" + f"palace {resolved} is held by {holder}; " + "wait for it to finish or stop the holder before retrying" ) from exc + # Record our own identity for any later contender's diagnostic message. + _write_lock_holder(lf) _mark_held(palace_key) try: yield diff --git a/tests/test_cli.py b/tests/test_cli.py index fa5680d..547286d 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -555,6 +555,45 @@ def test_cmd_mine_include_ignored_comma_split(mock_config_cls): assert call_kwargs["include_ignored"] == ["a.txt", "b.txt", "c.txt"] +@patch("mempalace.cli.MempalaceConfig") +def test_cmd_mine_exits_nonzero_on_lock_holder(mock_config_cls, capsys): + """Regression #1264: lock contention must exit non-zero with a clear message. + + Before this fix the CLI silently returned 0 when another writer held + the palace lock — operators using nohup/scripts had no way to detect + the contention. The new behavior raises MineAlreadyRunning out of + miner.mine() and cmd_mine catches it, printing the holder identity + to stderr and exiting non-zero. + """ + from mempalace.palace import MineAlreadyRunning + + mock_config_cls.return_value.palace_path = "/fake/palace" + args = argparse.Namespace( + dir="/src", + palace=None, + mode="projects", + wing=None, + agent="mempalace", + limit=0, + dry_run=False, + no_gitignore=False, + include_ignored=[], + extract="exchange", + ) + with patch( + "mempalace.miner.mine", + side_effect=MineAlreadyRunning( + "palace /fake/palace is held by PID 12345 (mempalace mcp_server); wait for it to finish" + ), + ): + with pytest.raises(SystemExit) as excinfo: + cmd_mine(args) + assert excinfo.value.code == 1 + captured = capsys.readouterr() + assert "PID 12345" in captured.err + assert "mcp_server" in captured.err + + # ── cmd_wakeup ───────────────────────────────────────────────────────── diff --git a/tests/test_palace_locks.py b/tests/test_palace_locks.py index d239757..27235dd 100644 --- a/tests/test_palace_locks.py +++ b/tests/test_palace_locks.py @@ -208,6 +208,85 @@ def _try_acquire_expect_busy(palace_path, result_q): result_q.put("busy") +def _hold_lock_send_pid(palace_path: str, ready_flag: str, release_flag: str, pid_q) -> None: + """Acquire the lock, push our PID + cmdline through the queue, then wait.""" + import sys as _sys + + try: + with mine_palace_lock(palace_path): + pid_q.put((os.getpid(), list(_sys.argv[:3]))) + open(ready_flag, "w").close() + for _ in range(500): + if os.path.exists(release_flag): + return + time.sleep(0.01) + except MineAlreadyRunning: + pid_q.put(("error", "raised")) + + +def test_lock_failure_message_names_holder(tmp_path, monkeypatch): + """Regression #1264: failed acquire must identify the holder by PID. + + Before this fix, a `mempalace mine` colliding with another writer + (mine, MCP server, anything taking mine_palace_lock) saw a generic + "another `mempalace mine` is already running" message and exited + silently. The operator had no signal of which process to wait for + or stop. The new message includes ``PID N`` so the holder can be + identified directly. + """ + monkeypatch.setenv("HOME", str(tmp_path)) + palace = str(tmp_path / "palace") + ready = str(tmp_path / "ready") + release = str(tmp_path / "release") + + ctx = _get_mp_context() + pid_q = ctx.Queue() + holder = ctx.Process(target=_hold_lock_send_pid, args=(palace, ready, release, pid_q)) + holder.start() + try: + for _ in range(500): + if os.path.exists(ready): + break + time.sleep(0.01) + assert os.path.exists(ready), "holder failed to acquire lock in time" + holder_pid, _holder_argv = pid_q.get(timeout=2) + + with pytest.raises(MineAlreadyRunning) as excinfo: + with mine_palace_lock(palace): + pytest.fail("second acquire of same palace should have raised") + + msg = str(excinfo.value) + assert ( + f"PID {holder_pid}" in msg + ), f"lock-failure message must name the holder PID; got: {msg!r}" + finally: + open(release, "w").close() + holder.join(timeout=5) + + +def test_lock_holder_identity_persists_across_release(tmp_path, monkeypatch): + """The holder line is overwritten by each new acquirer, not appended. + + Without explicit truncate the lock file would accumulate lines across + runs and grow without bound. Verify that re-acquire keeps the body + bounded. + """ + monkeypatch.setenv("HOME", str(tmp_path)) + palace = str(tmp_path / "palace") + for _ in range(5): + with mine_palace_lock(palace): + pass + + # Locate the lock file. The key derivation is internal but we can find + # it by scanning the mempalace locks dir for mine_palace_*.lock entries. + lock_dir = tmp_path / ".mempalace" / "locks" + lock_files = list(lock_dir.glob("mine_palace_*.lock")) + assert lock_files, "expected the palace lock file to exist after acquire/release" + body = lock_files[0].read_text() + # One identity line, no accumulation. + assert body.count("\n") <= 1, f"lock body must not grow across re-acquires; got {body!r}" + + def test_mine_global_lock_is_alias_for_back_compat(tmp_path, monkeypatch): """Old callers of `mine_global_lock` should still work.""" monkeypatch.setenv("HOME", str(tmp_path)) From d5ce97c7afe0c533a3a62ec3929c14563ef5a01d Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Fri, 8 May 2026 01:28:42 -0300 Subject: [PATCH 2/3] fix(palace): reserve byte 0 as lock sentinel for Windows portability Windows CI surfaced two bugs introduced by the holder-identity write: 1. msvcrt.locking(LK_NBLCK, 1) locks 1 byte at the *current* file position. Switching to "a+" mode put the position at end-of-file, so two contenders locked different bytes and silently both acquired (the test asserts saw [(ok, 1), (ok, 2)] instead of ok+busy). 2. With the byte-range lock active on Windows, the locked byte is read-blocked for other processes. A contender trying to read the holder identity from byte 0 would hit PermissionError. Switch to "r+" mode (after touch-create) and explicitly seek(0) before both lock and unlock. Then reserve byte 0 as a pure lock sentinel and write the holder identity from byte 1 onward. _read_lock_holder reads from byte 1+, so it never touches the locked byte. Also bound file growth across re-acquires: truncate to sentinel + len(ident) before writing so the file body stays the size of the current holder, never accumulating across runs. Linux fcntl.flock locks the whole file independent of byte position, so the seek(0) is harmless on POSIX. The shape works on both. --- mempalace/palace.py | 45 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/mempalace/palace.py b/mempalace/palace.py index 375b5e1..7ed315c 100644 --- a/mempalace/palace.py +++ b/mempalace/palace.py @@ -376,10 +376,17 @@ def _format_lock_holder(content: str) -> str: return f"PID {pid}" +# Byte 0 of the lock file is reserved as the OS lock sentinel. +# Holder identity is written from byte 1 onward so contenders can read +# the identity without colliding with byte 0 (Windows msvcrt.locking +# blocks both reads and writes on the locked byte). +_LOCK_SENTINEL_BYTES = 1 + + def _read_lock_holder(lock_file) -> str: """Read the prior holder's identity from the lock-file body, best-effort.""" try: - lock_file.seek(0) + lock_file.seek(_LOCK_SENTINEL_BYTES) content = lock_file.read().strip() except OSError: return "another writer (identity not recorded)" @@ -389,11 +396,16 @@ def _read_lock_holder(lock_file) -> str: def _write_lock_holder(lock_file) -> None: - """Record this process's identity in the lock-file body. Best-effort.""" + """Record this process's identity in the lock-file body. Best-effort. + + Writes from byte 1 onward; byte 0 is the lock sentinel and must not + be touched after acquire (truncating it on Windows can interact + badly with the active byte-range lock). + """ try: ident = f"{os.getpid()} {' '.join(sys.argv[:3])}".strip() - lock_file.seek(0) - lock_file.truncate() + lock_file.seek(_LOCK_SENTINEL_BYTES) + lock_file.truncate(_LOCK_SENTINEL_BYTES + len(ident.encode("utf-8"))) lock_file.write(ident) lock_file.flush() except OSError: @@ -443,12 +455,27 @@ def mine_palace_lock(palace_path: str): yield return - # "a+" preserves the prior holder's identity recorded inside the file so - # a failed acquire can name who is holding the lock (#1264). "w" mode - # would have truncated the file before we could read it. - lf = open(lock_path, "a+") + # Ensure the file exists, then open r+ so we can both read the prior + # holder's identity (for failure diagnostics) and write our own. "w" + # truncates and erases the prior holder. "a+" puts the position at EOF, + # which on Windows breaks ``msvcrt.locking`` (it locks 1 byte at the + # *current* position, so two contenders end up locking different bytes + # and silently both acquire — observed as Windows-CI lock test + # failures during #1264 development). + if not os.path.exists(lock_path): + # Touch atomically: O_CREAT|O_EXCL would fail if a concurrent + # contender just created it, which is fine — we proceed to open. + try: + fd = os.open(lock_path, os.O_CREAT | os.O_WRONLY, 0o600) + os.close(fd) + except FileExistsError: + pass + lf = open(lock_path, "r+") acquired = False try: + # Lock byte 0 explicitly. msvcrt.locking is byte-position dependent; + # fcntl.flock is whole-file but the seek is harmless there. + lf.seek(0) if os.name == "nt": import msvcrt @@ -486,6 +513,8 @@ def mine_palace_lock(palace_path: str): if os.name == "nt": import msvcrt + # Match the lock region: byte 0. + lf.seek(0) msvcrt.locking(lf.fileno(), msvcrt.LK_UNLCK, 1) else: import fcntl From 11a35de5ac4758474fbae59b1f348d4f8762130f Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Fri, 8 May 2026 01:34:46 -0300 Subject: [PATCH 3/3] test(palace): set USERPROFILE too so the lock-path test works on Windows os.path.expanduser("~") reads HOME on POSIX but USERPROFILE on Windows; the lock-body bound test was monkeypatching HOME only, so on test-windows the lock file landed in the runner's real ~/.mempalace and the tmp_path glob found nothing. Patch USERPROFILE in addition to HOME, and read the body as bytes so the byte-0 sentinel doesn't trip a UTF-8 decode warning. Assertion shifts from line-count to size-bound (still detects unbounded growth across re-acquires). --- tests/test_palace_locks.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/test_palace_locks.py b/tests/test_palace_locks.py index 27235dd..2e9f82f 100644 --- a/tests/test_palace_locks.py +++ b/tests/test_palace_locks.py @@ -271,7 +271,11 @@ def test_lock_holder_identity_persists_across_release(tmp_path, monkeypatch): runs and grow without bound. Verify that re-acquire keeps the body bounded. """ + # ``os.path.expanduser("~")`` reads HOME on POSIX but USERPROFILE on + # Windows; setting both makes the ``~/.mempalace/locks`` lookup land + # under ``tmp_path`` regardless of platform. monkeypatch.setenv("HOME", str(tmp_path)) + monkeypatch.setenv("USERPROFILE", str(tmp_path)) palace = str(tmp_path / "palace") for _ in range(5): with mine_palace_lock(palace): @@ -282,9 +286,13 @@ def test_lock_holder_identity_persists_across_release(tmp_path, monkeypatch): lock_dir = tmp_path / ".mempalace" / "locks" lock_files = list(lock_dir.glob("mine_palace_*.lock")) assert lock_files, "expected the palace lock file to exist after acquire/release" - body = lock_files[0].read_text() - # One identity line, no accumulation. - assert body.count("\n") <= 1, f"lock body must not grow across re-acquires; got {body!r}" + # Read as bytes so the byte-0 sentinel (\x00) is preserved without + # decode quirks; the bound is on the file size, not its line count. + body = lock_files[0].read_bytes() + # Body is byte-0 sentinel + identity (no trailing accumulation). + # Identity is ``f"{pid} {sys.argv[:3]}"``; cap at a generous bound that + # still rules out unbounded growth across the 5 re-acquires. + assert len(body) < 1024, f"lock body must not grow across re-acquires; got {len(body)} bytes" def test_mine_global_lock_is_alias_for_back_compat(tmp_path, monkeypatch):