Merge pull request #1282 from mvalentsev/fix/fact-checker-stdio-utf8

fix(cli, fact-checker): reconfigure stdio to UTF-8 on Windows
This commit is contained in:
Igor Lins e Silva
2026-05-06 01:33:15 -03:00
committed by GitHub
5 changed files with 223 additions and 0 deletions
+55
View File
@@ -1042,3 +1042,58 @@ def test_cmd_repair_trailing_slash_does_not_recurse():
palace_path = os.path.expanduser(args.palace).rstrip(os.sep)
backup_path = palace_path + ".backup"
assert not backup_path.startswith(palace_path + os.sep)
# ── stdio reconfigure on Windows ─────────────────────────────────────
class _ReconfigurableStringIO:
def __init__(self):
self.reconfigure_calls = []
def reconfigure(self, **kwargs):
self.reconfigure_calls.append(kwargs)
def test_reconfigures_stdio_to_utf8_on_windows():
    """On Windows the `mempalace` CLI must switch stdio to UTF-8.

    Otherwise piped non-ASCII input (`mempalace search ... < q.txt`) or
    piped non-ASCII output (`mempalace search "..." > out.txt`) is
    mojibaked through the system ANSI codepage (for example
    cp1252/cp1251/cp950).
    """
    from mempalace.cli import _reconfigure_stdio_utf8_on_windows

    fake_in, fake_out, fake_err = (_ReconfigurableStringIO() for _ in range(3))

    # Pretend to be on Windows and swap all three process streams for fakes
    # that only record what the helper asks of them.
    with (
        patch.object(sys, "platform", "win32"),
        patch.object(sys, "stdin", fake_in),
        patch.object(sys, "stdout", fake_out),
        patch.object(sys, "stderr", fake_err),
    ):
        _reconfigure_stdio_utf8_on_windows()

    # The errors policy differs per direction:
    #   * input uses surrogateescape, so a redirected non-UTF-8 file does
    #     not crash the read;
    #   * output uses replace, so a drawer carrying a round-tripped
    #     surrogate half does not crash mid-print.
    input_policy = {"encoding": "utf-8", "errors": "surrogateescape"}
    output_policy = {"encoding": "utf-8", "errors": "replace"}

    assert fake_in.reconfigure_calls == [input_policy]
    assert fake_out.reconfigure_calls == [output_policy]
    assert fake_err.reconfigure_calls == [output_policy]
def test_reconfigure_stdio_is_noop_off_windows():
    """Linux/macOS already default to UTF-8 stdio -- helper must not touch streams.

    All three streams are replaced with recording fakes, not just stdin, so
    that an unexpected reconfigure of stdout or stderr is detected too and
    can never reach the real process streams while the test runs.
    """
    from mempalace.cli import _reconfigure_stdio_utf8_on_windows

    stdin = _ReconfigurableStringIO()
    stdout = _ReconfigurableStringIO()
    stderr = _ReconfigurableStringIO()
    with (
        patch.object(sys, "platform", "linux"),
        patch.object(sys, "stdin", stdin),
        patch.object(sys, "stdout", stdout),
        patch.object(sys, "stderr", stderr),
    ):
        _reconfigure_stdio_utf8_on_windows()

    # No stream may have been asked to change its encoding or errors policy.
    assert stdin.reconfigure_calls == []
    assert stdout.reconfigure_calls == []
    assert stderr.reconfigure_calls == []
+63
View File
@@ -286,3 +286,66 @@ class TestCLI:
assert "similar_name" in out
# Silence unused import warning.
_ = (MagicMock, patch, fact_checker)
def test_reconfigures_stdio_to_utf8_on_windows(self):
    """On Windows, fact_checker --stdin must read its payload as UTF-8.

    Python otherwise defaults stdio to the system ANSI codepage
    (cp1252/cp1251/cp950), which mojibakes non-ASCII text before
    pattern parsing sees it.
    """
    import io
    import sys

    from mempalace.fact_checker import _reconfigure_stdio_utf8_on_windows

    class _RecordingStream(io.StringIO):
        """In-memory text stream that logs ``reconfigure()`` keyword args."""

        def __init__(self, initial_value=""):
            super().__init__(initial_value)
            # One dict of keyword arguments per reconfigure() call.
            self.reconfigure_calls = []

        def reconfigure(self, **options):
            self.reconfigure_calls += [options]

    fakes = {name: _RecordingStream() for name in ("stdin", "stdout", "stderr")}

    # Pretend to be on Windows and swap every process stream for its fake.
    with (
        patch.object(sys, "platform", "win32"),
        patch.object(sys, "stdin", fakes["stdin"]),
        patch.object(sys, "stdout", fakes["stdout"]),
        patch.object(sys, "stderr", fakes["stderr"]),
    ):
        _reconfigure_stdio_utf8_on_windows()

    # Errors policy is per stream: stdin uses surrogateescape so a stray
    # malformed byte from a redirected file does not crash the read, while
    # stdout/stderr use replace so an extracted fact carrying a surrogate
    # half does not crash mid-print.
    recorded = {name: stream.reconfigure_calls for name, stream in fakes.items()}
    assert recorded == {
        "stdin": [{"encoding": "utf-8", "errors": "surrogateescape"}],
        "stdout": [{"encoding": "utf-8", "errors": "replace"}],
        "stderr": [{"encoding": "utf-8", "errors": "replace"}],
    }
def test_reconfigure_stdio_is_noop_off_windows(self):
    """Linux/macOS already default to UTF-8 stdio -- helper must not touch streams.

    All three streams are replaced with recording fakes, not just stdin, so
    that an unexpected reconfigure of stdout or stderr is detected too and
    can never reach the real process streams while the test runs.
    """
    import io
    import sys

    from mempalace.fact_checker import _reconfigure_stdio_utf8_on_windows

    class _ReconfigurableStringIO(io.StringIO):
        """In-memory text stream that logs ``reconfigure()`` keyword args."""

        def __init__(self):
            super().__init__()
            # One dict of keyword arguments per reconfigure() call.
            self.reconfigure_calls = []

        def reconfigure(self, **kwargs):
            self.reconfigure_calls.append(kwargs)

    stdin = _ReconfigurableStringIO()
    stdout = _ReconfigurableStringIO()
    stderr = _ReconfigurableStringIO()
    with (
        patch.object(sys, "platform", "linux"),
        patch.object(sys, "stdin", stdin),
        patch.object(sys, "stdout", stdout),
        patch.object(sys, "stderr", stderr),
    ):
        _reconfigure_stdio_utf8_on_windows()

    # No stream may have been asked to change its encoding or errors policy.
    assert stdin.reconfigure_calls == []
    assert stdout.reconfigure_calls == []
    assert stderr.reconfigure_calls == []