From 7cee74c8c8c6c31acb8b363788443612e36dac77 Mon Sep 17 00:00:00 2001 From: mvalentsev Date: Thu, 30 Apr 2026 14:49:02 +0500 Subject: [PATCH 1/4] fix(fact-checker): reconfigure stdio to UTF-8 on Windows The `python -m mempalace.fact_checker --stdin` entry point reads non-ASCII text through the system ANSI codepage (cp1252/cp1251/cp950) on Windows, which mojibakes characters before claim-extraction sees them. Reconfigure stdin/stdout/stderr to UTF-8 with `errors="strict"`, wrapped in try/except so a replaced stream (Jupyter, test harness) logs a warning rather than crashing the CLI. Mirrors the same fix shipped for `mcp_server.py:main()` (#400) and `hooks_cli.py:run_hook()` (#1280) -- this is the third and last stdin-reading entry point in the package. --- mempalace/fact_checker.py | 27 +++++++++++++++++ tests/test_fact_checker.py | 60 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) diff --git a/mempalace/fact_checker.py b/mempalace/fact_checker.py index 50e8842..c894859 100644 --- a/mempalace/fact_checker.py +++ b/mempalace/fact_checker.py @@ -303,11 +303,38 @@ def _edit_distance(s1: str, s2: str) -> int: return prev[-1] +def _reconfigure_stdio_utf8_on_windows(): + """Decode --stdin payload as UTF-8 on Windows. + + Without this, Python defaults stdio to the system ANSI codepage + (cp1252/cp1251/cp950 depending on locale), which mojibakes + non-ASCII fact text before pattern parsing sees it. 
+ """ + import sys + + if sys.platform != "win32": + return + for name in ("stdin", "stdout", "stderr"): + stream = getattr(sys, name, None) + reconfigure = getattr(stream, "reconfigure", None) + if reconfigure is None: + continue + try: + reconfigure(encoding="utf-8", errors="strict") + except Exception as exc: + print( + f"WARNING: Could not reconfigure {name} to UTF-8: {exc}", + file=sys.stderr, + ) + + if __name__ == "__main__": import argparse import json import sys + _reconfigure_stdio_utf8_on_windows() + parser = argparse.ArgumentParser( description="Check text against known facts in the MemPalace palace.", epilog="Exits 0 when no issues found, 1 when one or more issues detected.", diff --git a/tests/test_fact_checker.py b/tests/test_fact_checker.py index 5b34a40..9db370e 100644 --- a/tests/test_fact_checker.py +++ b/tests/test_fact_checker.py @@ -286,3 +286,63 @@ class TestCLI: assert "similar_name" in out # Silence unused import warning. _ = (MagicMock, patch, fact_checker) + + def test_reconfigures_stdio_to_utf8_on_windows(self): + """Windows fact_checker --stdin must decode payload as UTF-8. + + Without this, Python defaults stdio to the system ANSI codepage + (cp1252/cp1251/cp950), which mojibakes non-ASCII text before + pattern parsing sees it. 
+ """ + import io + import sys + + from mempalace.fact_checker import _reconfigure_stdio_utf8_on_windows + + class _ReconfigurableStringIO(io.StringIO): + def __init__(self, initial_value=""): + super().__init__(initial_value) + self.reconfigure_calls = [] + + def reconfigure(self, **kwargs): + self.reconfigure_calls.append(kwargs) + + stdin = _ReconfigurableStringIO() + stdout = _ReconfigurableStringIO() + stderr = _ReconfigurableStringIO() + with ( + patch.object(sys, "platform", "win32"), + patch.object(sys, "stdin", stdin), + patch.object(sys, "stdout", stdout), + patch.object(sys, "stderr", stderr), + ): + _reconfigure_stdio_utf8_on_windows() + + expected = {"encoding": "utf-8", "errors": "strict"} + assert stdin.reconfigure_calls == [expected] + assert stdout.reconfigure_calls == [expected] + assert stderr.reconfigure_calls == [expected] + + def test_reconfigure_stdio_is_noop_off_windows(self): + """Linux/macOS already default to UTF-8 stdio -- helper must not touch streams.""" + import io + import sys + + from mempalace.fact_checker import _reconfigure_stdio_utf8_on_windows + + class _ReconfigurableStringIO(io.StringIO): + def __init__(self): + super().__init__() + self.reconfigure_calls = [] + + def reconfigure(self, **kwargs): + self.reconfigure_calls.append(kwargs) + + stdin = _ReconfigurableStringIO() + with ( + patch.object(sys, "platform", "linux"), + patch.object(sys, "stdin", stdin), + ): + _reconfigure_stdio_utf8_on_windows() + + assert stdin.reconfigure_calls == [] From 32f4dfa26d25b8ff243bfd2e636f5e96d8947a83 Mon Sep 17 00:00:00 2001 From: mvalentsev Date: Thu, 30 Apr 2026 15:00:37 +0500 Subject: [PATCH 2/4] fix(cli): reconfigure stdio to UTF-8 on Windows The primary `mempalace` console_script (`cli.py:main()`) reads non-ASCII arguments via piped stdin and writes verbatim drawer text / wing names through `print()`. On Windows, Python defaults stdio to the system ANSI codepage (cp1252/cp1251/cp950), so: - `mempalace search "..." 
> out.txt` mojibakes any drawer text containing non-Latin characters - `mempalace ... < input.txt` mojibakes piped non-ASCII input Reconfigure stdin/stdout/stderr to UTF-8 (`errors="strict"`) at the top of `main()`, mirroring the helper added in this PR for fact_checker's `__main__` block. Wrapped in try/except so a replaced stream (Jupyter, test harness) logs a warning and continues rather than crashing the CLI. The reconfigure cascades through every `mempalace` subcommand (`init`/`mine`/`search`/`status`/`hook`/etc.) and through the interactive flows that read non-ASCII names via `input()` (onboarding, entity detector, room detector). With this commit the package's three user-facing entry points (`mempalace`, `mempalace-mcp`, and `python -m mempalace.fact_checker`) all reconfigure stdio identically on Windows. --- mempalace/cli.py | 27 ++++++++++++++++++++++++ tests/test_cli.py | 52 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) diff --git a/mempalace/cli.py b/mempalace/cli.py index f2606a4..7372cd7 100644 --- a/mempalace/cli.py +++ b/mempalace/cli.py @@ -935,7 +935,34 @@ def cmd_compress(args): print(" (dry run -- nothing stored)") +def _reconfigure_stdio_utf8_on_windows(): + """Decode stdio as UTF-8 on Windows for the primary `mempalace` CLI. + + Without this, Python defaults stdio to the system ANSI codepage + (cp1252/cp1251/cp950 depending on locale). That mojibakes non-ASCII + content piped in (`mempalace search ... < query.txt`) or piped out + (`mempalace search "..." > out.txt`) when verbatim drawer text or + wing/room names contain non-Latin characters. 
+ """ + if sys.platform != "win32": + return + for name in ("stdin", "stdout", "stderr"): + stream = getattr(sys, name, None) + reconfigure = getattr(stream, "reconfigure", None) + if reconfigure is None: + continue + try: + reconfigure(encoding="utf-8", errors="strict") + except Exception as exc: + print( + f"WARNING: Could not reconfigure {name} to UTF-8: {exc}", + file=sys.stderr, + ) + + def main(): + _reconfigure_stdio_utf8_on_windows() + version_label = f"MemPalace {__version__}" parser = argparse.ArgumentParser( description="MemPalace — Give your AI a memory. No API key required.", diff --git a/tests/test_cli.py b/tests/test_cli.py index 328b90c..4836d69 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1042,3 +1042,55 @@ def test_cmd_repair_trailing_slash_does_not_recurse(): palace_path = os.path.expanduser(args.palace).rstrip(os.sep) backup_path = palace_path + ".backup" assert not backup_path.startswith(palace_path + os.sep) + + +# ── stdio reconfigure on Windows ───────────────────────────────────── + + +class _ReconfigurableStringIO: + def __init__(self): + self.reconfigure_calls = [] + + def reconfigure(self, **kwargs): + self.reconfigure_calls.append(kwargs) + + +def test_reconfigures_stdio_to_utf8_on_windows(): + """Windows `mempalace` CLI must decode/encode stdio as UTF-8. + + Without this, piped non-ASCII input (`mempalace search ... < q.txt`) + or piped non-ASCII output (`mempalace search "..." > out.txt`) is + mojibaked through the system ANSI codepage on non-Latin Windows + locales (cp1252/cp1251/cp950). 
+ """ + from mempalace.cli import _reconfigure_stdio_utf8_on_windows + + stdin = _ReconfigurableStringIO() + stdout = _ReconfigurableStringIO() + stderr = _ReconfigurableStringIO() + with ( + patch.object(sys, "platform", "win32"), + patch.object(sys, "stdin", stdin), + patch.object(sys, "stdout", stdout), + patch.object(sys, "stderr", stderr), + ): + _reconfigure_stdio_utf8_on_windows() + + expected = {"encoding": "utf-8", "errors": "strict"} + assert stdin.reconfigure_calls == [expected] + assert stdout.reconfigure_calls == [expected] + assert stderr.reconfigure_calls == [expected] + + +def test_reconfigure_stdio_is_noop_off_windows(): + """Linux/macOS already default to UTF-8 stdio -- helper must not touch streams.""" + from mempalace.cli import _reconfigure_stdio_utf8_on_windows + + stdin = _ReconfigurableStringIO() + with ( + patch.object(sys, "platform", "linux"), + patch.object(sys, "stdin", stdin), + ): + _reconfigure_stdio_utf8_on_windows() + + assert stdin.reconfigure_calls == [] From 03643eb507e4ba81c65d50b519fcfb4dfb3c769f Mon Sep 17 00:00:00 2001 From: mvalentsev Date: Sun, 3 May 2026 21:37:12 +0500 Subject: [PATCH 3/4] fix(cli, fact-checker): per-stream stdio errors policy on Windows Previously all three streams were reconfigured to UTF-8 with errors='strict'. That kills 'mempalace search' the moment a drawer carrying a surrogate half (round-tripped from a filename via surrogateescape) hits print(), losing the rest of the result block. Same hazard for warning lines on stderr. Split the policy: stdin -> surrogateescape (malformed bytes from a redirected file survive as lone surrogates instead of crashing the read) stdout -> replace (drawer text with a stray surrogate is written as '?' instead of raising UnicodeEncodeError mid-print) stderr -> replace (same protection for logger / warning paths) Applied identically in the cli.py and fact_checker.py helpers; the DRY extraction into a shared module is a separate cleanup ask, kept out of this fix to keep the diff narrow. 
Tests updated for the new per-stream assertion. --- mempalace/cli.py | 20 ++++++++++++++++++-- mempalace/fact_checker.py | 18 ++++++++++++++++-- tests/test_cli.py | 11 +++++++---- tests/test_fact_checker.py | 11 +++++++---- 4 files changed, 48 insertions(+), 12 deletions(-) diff --git a/mempalace/cli.py b/mempalace/cli.py index 7372cd7..7052e1f 100644 --- a/mempalace/cli.py +++ b/mempalace/cli.py @@ -943,16 +943,32 @@ def _reconfigure_stdio_utf8_on_windows(): content piped in (`mempalace search ... < query.txt`) or piped out (`mempalace search "..." > out.txt`) when verbatim drawer text or wing/room names contain non-Latin characters. + + Per-stream errors policy: + stdin -- surrogateescape: malformed bytes from a redirected file + survive as lone surrogates instead of crashing the read. + stdout -- replace: ``mempalace search`` prints verbatim drawer + text. A drawer that round-tripped a filename through + surrogateescape can hold a lone surrogate, which would + otherwise raise ``UnicodeEncodeError`` mid-print and + lose the rest of the search result block. + stderr -- replace: same hazard for logger output that quotes + user-supplied path or content. 
""" if sys.platform != "win32": return - for name in ("stdin", "stdout", "stderr"): + policies = ( + ("stdin", "surrogateescape"), + ("stdout", "replace"), + ("stderr", "replace"), + ) + for name, errors in policies: stream = getattr(sys, name, None) reconfigure = getattr(stream, "reconfigure", None) if reconfigure is None: continue try: - reconfigure(encoding="utf-8", errors="strict") + reconfigure(encoding="utf-8", errors=errors) except Exception as exc: print( f"WARNING: Could not reconfigure {name} to UTF-8: {exc}", diff --git a/mempalace/fact_checker.py b/mempalace/fact_checker.py index c894859..1844c45 100644 --- a/mempalace/fact_checker.py +++ b/mempalace/fact_checker.py @@ -309,18 +309,32 @@ def _reconfigure_stdio_utf8_on_windows(): Without this, Python defaults stdio to the system ANSI codepage (cp1252/cp1251/cp950 depending on locale), which mojibakes non-ASCII fact text before pattern parsing sees it. + + Per-stream errors policy mirrors the primary CLI helper in + ``mempalace/cli.py``: + stdin -- surrogateescape: malformed input bytes survive as lone + surrogates instead of crashing the read. + stdout -- replace: extracted fact text can include surrogate + halves round-tripped from filenames; replace prevents + a UnicodeEncodeError mid-print. + stderr -- replace: same protection for warning lines. 
""" import sys if sys.platform != "win32": return - for name in ("stdin", "stdout", "stderr"): + policies = ( + ("stdin", "surrogateescape"), + ("stdout", "replace"), + ("stderr", "replace"), + ) + for name, errors in policies: stream = getattr(sys, name, None) reconfigure = getattr(stream, "reconfigure", None) if reconfigure is None: continue try: - reconfigure(encoding="utf-8", errors="strict") + reconfigure(encoding="utf-8", errors=errors) except Exception as exc: print( f"WARNING: Could not reconfigure {name} to UTF-8: {exc}", diff --git a/tests/test_cli.py b/tests/test_cli.py index 4836d69..6b4b7b3 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1076,10 +1076,13 @@ def test_reconfigures_stdio_to_utf8_on_windows(): ): _reconfigure_stdio_utf8_on_windows() - expected = {"encoding": "utf-8", "errors": "strict"} - assert stdin.reconfigure_calls == [expected] - assert stdout.reconfigure_calls == [expected] - assert stderr.reconfigure_calls == [expected] + # Per-stream errors policy: stdin survives bad bytes via + # surrogateescape so a redirected non-UTF-8 file does not crash + # the read; stdout/stderr use replace so a drawer carrying a + # round-tripped surrogate half does not crash mid-print. 
+ assert stdin.reconfigure_calls == [{"encoding": "utf-8", "errors": "surrogateescape"}] + assert stdout.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}] + assert stderr.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}] def test_reconfigure_stdio_is_noop_off_windows(): diff --git a/tests/test_fact_checker.py b/tests/test_fact_checker.py index 9db370e..89d8366 100644 --- a/tests/test_fact_checker.py +++ b/tests/test_fact_checker.py @@ -318,10 +318,13 @@ class TestCLI: ): _reconfigure_stdio_utf8_on_windows() - expected = {"encoding": "utf-8", "errors": "strict"} - assert stdin.reconfigure_calls == [expected] - assert stdout.reconfigure_calls == [expected] - assert stderr.reconfigure_calls == [expected] + # Per-stream errors policy: stdin uses surrogateescape so a stray + # malformed byte from a redirected file does not crash the read, + # stdout/stderr use replace so an extracted fact carrying a + # surrogate half does not crash mid-print. + assert stdin.reconfigure_calls == [{"encoding": "utf-8", "errors": "surrogateescape"}] + assert stdout.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}] + assert stderr.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}] def test_reconfigure_stdio_is_noop_off_windows(self): """Linux/macOS already default to UTF-8 stdio -- helper must not touch streams.""" From 285b3b4f2e387c1e8eda865569a2edc400f5c1f1 Mon Sep 17 00:00:00 2001 From: mvalentsev Date: Sun, 3 May 2026 22:25:31 +0500 Subject: [PATCH 4/4] refactor(stdio): extract Windows UTF-8 reconfigure into shared helper Both cli.py and fact_checker.py carried identical 28-line Windows stdio reconfigure helpers; pull the loop into mempalace/_stdio.py so the same machinery can drive the CLI, the fact_checker --stdin entry point, and the MCP server. The thin per-call-site wrappers stay so existing tests keep importing _reconfigure_stdio_utf8_on_windows from the same module they always have. 
CLI / fact_checker policy unchanged: stdin=surrogateescape (don't crash on a malformed redirected file), stdout/stderr=replace (don't crash mid-print on a surrogate half round-tripped from a filename). --- mempalace/_stdio.py | 71 +++++++++++++++++++++++++++++++++++++++ mempalace/cli.py | 45 ++++++------------------- mempalace/fact_checker.py | 39 ++++----------------- 3 files changed, 88 insertions(+), 67 deletions(-) create mode 100644 mempalace/_stdio.py diff --git a/mempalace/_stdio.py b/mempalace/_stdio.py new file mode 100644 index 0000000..13e9509 --- /dev/null +++ b/mempalace/_stdio.py @@ -0,0 +1,71 @@ +"""Stdio UTF-8 reconfiguration helper for Windows entry points. + +Python on Windows defaults stdio to the system ANSI codepage +(cp1252/cp1251/cp950 depending on locale), which mojibakes UTF-8 input +or output the moment a non-Latin character shows up. Every console +entry point that touches stdio needs to fix this on Windows -- the MCP +server, the CLI, the fact_checker `--stdin` mode -- so the +reconfigure code lives here in one place to keep the per-stream +errors policies aligned across them. + +Per-stream errors policy is caller-chosen: + +* MCP server uses ``strict`` on stdout/stderr because everything written + there is server-controlled JSON-RPC; any encode failure is a real bug + the operator wants loud. +* CLI / fact_checker use ``replace`` on stdout/stderr because they print + verbatim drawer text that may contain surrogate halves round-tripped + from filenames -- ``strict`` would crash mid-print. +* All callers use ``surrogateescape`` on stdin so a malformed byte from + a redirected file or a misbehaving client survives as a lone surrogate + the consumer's parser surfaces, instead of ``UnicodeDecodeError`` + killing the read loop on the first bad byte. 
+""" + +from __future__ import annotations + +import sys +from typing import Callable, Optional + + +def reconfigure_stdio_utf8_on_windows( + *, + stdin_errors: str = "surrogateescape", + stdout_errors: str = "strict", + stderr_errors: str = "strict", + on_failure: Optional[Callable[[str, BaseException], None]] = None, +) -> None: + """Reconfigure stdio to UTF-8 on Windows. No-op elsewhere. + + Args: + stdin_errors: errors= policy for stdin.reconfigure(). + stdout_errors: errors= policy for stdout.reconfigure(). + stderr_errors: errors= policy for stderr.reconfigure(). + on_failure: optional ``(stream_name, exc) -> None`` callback for + streams whose ``reconfigure`` raises (e.g. Jupyter-replaced + streams that lack the method-shape we expect). Defaults to a + ``WARNING:`` line on the original sys.stderr. + """ + if sys.platform != "win32": + return + + policies = ( + ("stdin", stdin_errors), + ("stdout", stdout_errors), + ("stderr", stderr_errors), + ) + for name, errors in policies: + stream = getattr(sys, name, None) + reconfigure = getattr(stream, "reconfigure", None) + if reconfigure is None: + continue + try: + reconfigure(encoding="utf-8", errors=errors) + except Exception as exc: # noqa: BLE001 -- last-resort guard + if on_failure is not None: + on_failure(name, exc) + else: + print( + f"WARNING: Could not reconfigure {name} to UTF-8: {exc}", + file=sys.stderr, + ) diff --git a/mempalace/cli.py b/mempalace/cli.py index 7052e1f..0ab3d0f 100644 --- a/mempalace/cli.py +++ b/mempalace/cli.py @@ -938,42 +938,17 @@ def cmd_compress(args): def _reconfigure_stdio_utf8_on_windows(): """Decode stdio as UTF-8 on Windows for the primary `mempalace` CLI. - Without this, Python defaults stdio to the system ANSI codepage - (cp1252/cp1251/cp950 depending on locale). That mojibakes non-ASCII - content piped in (`mempalace search ... < query.txt`) or piped out - (`mempalace search "..." > out.txt`) when verbatim drawer text or - wing/room names contain non-Latin characters. 
- - Per-stream errors policy: - stdin -- surrogateescape: malformed bytes from a redirected file - survive as lone surrogates instead of crashing the read. - stdout -- replace: ``mempalace search`` prints verbatim drawer - text. A drawer that round-tripped a filename through - surrogateescape can hold a lone surrogate, which would - otherwise raise ``UnicodeEncodeError`` mid-print and - lose the rest of the search result block. - stderr -- replace: same hazard for logger output that quotes - user-supplied path or content. + Thin wrapper around the shared helper in ``mempalace._stdio``. The CLI + overrides stdout/stderr to ``replace`` because ``mempalace search`` + prints verbatim drawer text that may carry surrogate halves + round-tripped from filenames -- ``strict`` would crash mid-print and + lose the rest of the search result block. stdin keeps the default + ``surrogateescape`` so a redirected non-UTF-8 file does not kill the + read on the first bad byte. """ - if sys.platform != "win32": - return - policies = ( - ("stdin", "surrogateescape"), - ("stdout", "replace"), - ("stderr", "replace"), - ) - for name, errors in policies: - stream = getattr(sys, name, None) - reconfigure = getattr(stream, "reconfigure", None) - if reconfigure is None: - continue - try: - reconfigure(encoding="utf-8", errors=errors) - except Exception as exc: - print( - f"WARNING: Could not reconfigure {name} to UTF-8: {exc}", - file=sys.stderr, - ) + from ._stdio import reconfigure_stdio_utf8_on_windows + + reconfigure_stdio_utf8_on_windows(stdout_errors="replace", stderr_errors="replace") def main(): diff --git a/mempalace/fact_checker.py b/mempalace/fact_checker.py index 1844c45..403d913 100644 --- a/mempalace/fact_checker.py +++ b/mempalace/fact_checker.py @@ -306,40 +306,15 @@ def _edit_distance(s1: str, s2: str) -> int: def _reconfigure_stdio_utf8_on_windows(): """Decode --stdin payload as UTF-8 on Windows. 
- Without this, Python defaults stdio to the system ANSI codepage - (cp1252/cp1251/cp950 depending on locale), which mojibakes - non-ASCII fact text before pattern parsing sees it. - - Per-stream errors policy mirrors the primary CLI helper in - ``mempalace/cli.py``: - stdin -- surrogateescape: malformed input bytes survive as lone - surrogates instead of crashing the read. - stdout -- replace: extracted fact text can include surrogate - halves round-tripped from filenames; replace prevents - a UnicodeEncodeError mid-print. - stderr -- replace: same protection for warning lines. + Thin wrapper around the shared helper in ``mempalace._stdio``. Mirrors + the primary CLI policy: stdout/stderr use ``replace`` because + extracted fact text can include surrogate halves round-tripped from + filenames -- ``strict`` would raise UnicodeEncodeError mid-print. + stdin keeps the default ``surrogateescape``. """ - import sys + from ._stdio import reconfigure_stdio_utf8_on_windows - if sys.platform != "win32": - return - policies = ( - ("stdin", "surrogateescape"), - ("stdout", "replace"), - ("stderr", "replace"), - ) - for name, errors in policies: - stream = getattr(sys, name, None) - reconfigure = getattr(stream, "reconfigure", None) - if reconfigure is None: - continue - try: - reconfigure(encoding="utf-8", errors=errors) - except Exception as exc: - print( - f"WARNING: Could not reconfigure {name} to UTF-8: {exc}", - file=sys.stderr, - ) + reconfigure_stdio_utf8_on_windows(stdout_errors="replace", stderr_errors="replace") if __name__ == "__main__":