From 285b3b4f2e387c1e8eda865569a2edc400f5c1f1 Mon Sep 17 00:00:00 2001 From: mvalentsev Date: Sun, 3 May 2026 22:25:31 +0500 Subject: [PATCH] refactor(stdio): extract Windows UTF-8 reconfigure into shared helper Both cli.py and fact_checker.py carried identical 28-line Windows stdio reconfigure helpers; pull the loop into mempalace/_stdio.py so the same machinery drives the CLI, the fact_checker --stdin entry point, and the MCP server. The thin per-call-site wrappers stay so existing tests keep importing _reconfigure_stdio_utf8_on_windows from the same module they always have. CLI / fact_checker policy unchanged: stdin=surrogateescape (don't crash on a malformed redirected file), stdout/stderr=replace (don't crash mid-print on a surrogate half round-tripped from a filename). --- mempalace/_stdio.py | 71 +++++++++++++++++++++++++++++++++++++++ mempalace/cli.py | 45 ++++++------------------- mempalace/fact_checker.py | 39 ++++----------------- 3 files changed, 88 insertions(+), 67 deletions(-) create mode 100644 mempalace/_stdio.py diff --git a/mempalace/_stdio.py b/mempalace/_stdio.py new file mode 100644 index 0000000..13e9509 --- /dev/null +++ b/mempalace/_stdio.py @@ -0,0 +1,71 @@ +"""Stdio UTF-8 reconfiguration helper for Windows entry points. + +Python on Windows defaults stdio to the system ANSI codepage +(cp1252/cp1251/cp950 depending on locale), which mojibakes UTF-8 input +or output the moment a non-Latin character shows up. Every console +entry point that touches stdio needs to fix this on Windows -- the MCP +server, the CLI, the fact_checker `--stdin` mode -- so the +reconfigure code lives here in one place to keep the per-stream +errors policies aligned across them. + +Per-stream errors policy is caller-chosen: + +* MCP server uses ``strict`` on stdout/stderr because everything written + there is server-controlled JSON-RPC; any encode failure is a real bug + the operator wants loud.
+* CLI / fact_checker use ``replace`` on stdout/stderr because they print + verbatim drawer text that may contain surrogate halves round-tripped + from filenames -- ``strict`` would crash mid-print. +* All callers use ``surrogateescape`` on stdin so a malformed byte from + a redirected file or a misbehaving client survives as a lone surrogate + the consumer's parser surfaces, instead of ``UnicodeDecodeError`` + killing the read loop on the first bad byte. +""" + +from __future__ import annotations + +import sys +from typing import Callable, Optional + + +def reconfigure_stdio_utf8_on_windows( + *, + stdin_errors: str = "surrogateescape", + stdout_errors: str = "strict", + stderr_errors: str = "strict", + on_failure: Optional[Callable[[str, BaseException], None]] = None, +) -> None: + """Reconfigure stdio to UTF-8 on Windows. No-op elsewhere. + + Args: + stdin_errors: errors= policy for stdin.reconfigure(). + stdout_errors: errors= policy for stdout.reconfigure(). + stderr_errors: errors= policy for stderr.reconfigure(). + on_failure: optional ``(stream_name, exc) -> None`` callback for + streams whose ``reconfigure`` raises (e.g. Jupyter-replaced + streams that lack the method-shape we expect). Defaults to a + ``WARNING:`` line printed to the current ``sys.stderr``.
+ """ + if sys.platform != "win32": + return + + policies = ( + ("stdin", stdin_errors), + ("stdout", stdout_errors), + ("stderr", stderr_errors), + ) + for name, errors in policies: + stream = getattr(sys, name, None) + reconfigure = getattr(stream, "reconfigure", None) + if reconfigure is None: + continue + try: + reconfigure(encoding="utf-8", errors=errors) + except Exception as exc: # noqa: BLE001 -- last-resort guard + if on_failure is not None: + on_failure(name, exc) + else: + print( + f"WARNING: Could not reconfigure {name} to UTF-8: {exc}", + file=sys.stderr, + ) diff --git a/mempalace/cli.py b/mempalace/cli.py index 7052e1f..0ab3d0f 100644 --- a/mempalace/cli.py +++ b/mempalace/cli.py @@ -938,42 +938,17 @@ def cmd_compress(args): def _reconfigure_stdio_utf8_on_windows(): """Decode stdio as UTF-8 on Windows for the primary `mempalace` CLI. - Without this, Python defaults stdio to the system ANSI codepage - (cp1252/cp1251/cp950 depending on locale). That mojibakes non-ASCII - content piped in (`mempalace search ... < query.txt`) or piped out - (`mempalace search "..." > out.txt`) when verbatim drawer text or - wing/room names contain non-Latin characters. - - Per-stream errors policy: - stdin -- surrogateescape: malformed bytes from a redirected file - survive as lone surrogates instead of crashing the read. - stdout -- replace: ``mempalace search`` prints verbatim drawer - text. A drawer that round-tripped a filename through - surrogateescape can hold a lone surrogate, which would - otherwise raise ``UnicodeEncodeError`` mid-print and - lose the rest of the search result block. - stderr -- replace: same hazard for logger output that quotes - user-supplied path or content. + Thin wrapper around the shared helper in ``mempalace._stdio``. 
The CLI + overrides stdout/stderr to ``replace`` because ``mempalace search`` + prints verbatim drawer text that may carry surrogate halves + round-tripped from filenames -- ``strict`` would crash mid-print and + lose the rest of the search result block. stdin keeps the default + ``surrogateescape`` so a redirected non-UTF-8 file does not kill the + read on the first bad byte. """ - if sys.platform != "win32": - return - policies = ( - ("stdin", "surrogateescape"), - ("stdout", "replace"), - ("stderr", "replace"), - ) - for name, errors in policies: - stream = getattr(sys, name, None) - reconfigure = getattr(stream, "reconfigure", None) - if reconfigure is None: - continue - try: - reconfigure(encoding="utf-8", errors=errors) - except Exception as exc: - print( - f"WARNING: Could not reconfigure {name} to UTF-8: {exc}", - file=sys.stderr, - ) + from ._stdio import reconfigure_stdio_utf8_on_windows + + reconfigure_stdio_utf8_on_windows(stdout_errors="replace", stderr_errors="replace") def main(): diff --git a/mempalace/fact_checker.py b/mempalace/fact_checker.py index 1844c45..403d913 100644 --- a/mempalace/fact_checker.py +++ b/mempalace/fact_checker.py @@ -306,40 +306,15 @@ def _edit_distance(s1: str, s2: str) -> int: def _reconfigure_stdio_utf8_on_windows(): """Decode --stdin payload as UTF-8 on Windows. - Without this, Python defaults stdio to the system ANSI codepage - (cp1252/cp1251/cp950 depending on locale), which mojibakes - non-ASCII fact text before pattern parsing sees it. - - Per-stream errors policy mirrors the primary CLI helper in - ``mempalace/cli.py``: - stdin -- surrogateescape: malformed input bytes survive as lone - surrogates instead of crashing the read. - stdout -- replace: extracted fact text can include surrogate - halves round-tripped from filenames; replace prevents - a UnicodeEncodeError mid-print. - stderr -- replace: same protection for warning lines. + Thin wrapper around the shared helper in ``mempalace._stdio``. 
Mirrors + the primary CLI policy: stdout/stderr use ``replace`` because + extracted fact text can include surrogate halves round-tripped from + filenames -- ``strict`` would raise UnicodeEncodeError mid-print. + stdin keeps the default ``surrogateescape``. """ - import sys + from ._stdio import reconfigure_stdio_utf8_on_windows - if sys.platform != "win32": - return - policies = ( - ("stdin", "surrogateescape"), - ("stdout", "replace"), - ("stderr", "replace"), - ) - for name, errors in policies: - stream = getattr(sys, name, None) - reconfigure = getattr(stream, "reconfigure", None) - if reconfigure is None: - continue - try: - reconfigure(encoding="utf-8", errors=errors) - except Exception as exc: - print( - f"WARNING: Could not reconfigure {name} to UTF-8: {exc}", - file=sys.stderr, - ) + reconfigure_stdio_utf8_on_windows(stdout_errors="replace", stderr_errors="replace") if __name__ == "__main__":