Skip to content

Commit 3824ea6

Browse files
authored
Merge pull request #1282 from mvalentsev/fix/fact-checker-stdio-utf8
fix(cli, fact-checker): reconfigure stdio to UTF-8 on Windows
2 parents 778f830 + 285b3b4 commit 3824ea6

5 files changed

Lines changed: 223 additions & 0 deletions

File tree

mempalace/_stdio.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
"""Stdio UTF-8 reconfiguration helper for Windows entry points.
2+
3+
Python on Windows defaults stdio to the system ANSI codepage
4+
(cp1252/cp1251/cp950 depending on locale), which mojibakes UTF-8 input
5+
or output the moment a non-ASCII character shows up. Every console
6+
entry point that touches stdio needs to fix this on Windows -- the MCP
7+
server, the CLI, the fact_checker `--stdin` mode -- so the
8+
reconfigure code lives here in one place to keep the per-stream
9+
errors policies aligned across them.
10+
11+
Per-stream errors policy is caller-chosen:
12+
13+
* MCP server uses ``strict`` on stdout/stderr because everything written
14+
there is server-controlled JSON-RPC; any encode failure is a real bug
15+
the operator wants loud.
16+
* CLI / fact_checker use ``replace`` on stdout/stderr because they print
17+
verbatim drawer text that may contain surrogate halves round-tripped
18+
from filenames -- ``strict`` would crash mid-print.
19+
* All callers use ``surrogateescape`` on stdin so a malformed byte from
20+
a redirected file or a misbehaving client survives as a lone surrogate
21+
the consumer's parser surfaces, instead of ``UnicodeDecodeError``
22+
killing the read loop on the first bad byte.
23+
"""
24+
25+
from __future__ import annotations
26+
27+
import sys
28+
from typing import Callable, Optional
29+
30+
31+
def reconfigure_stdio_utf8_on_windows(
    *,
    stdin_errors: str = "surrogateescape",
    stdout_errors: str = "strict",
    stderr_errors: str = "strict",
    on_failure: Optional[Callable[[str, BaseException], None]] = None,
) -> None:
    """Reconfigure stdio to UTF-8 on Windows. No-op elsewhere.

    Each of ``sys.stdin``, ``sys.stdout`` and ``sys.stderr`` that exposes a
    ``reconfigure`` method is switched to ``encoding="utf-8"`` with the
    requested ``errors`` policy. Streams that are missing, ``None`` or have
    no ``reconfigure`` attribute are skipped silently.

    Args:
        stdin_errors: errors= policy for stdin.reconfigure().
        stdout_errors: errors= policy for stdout.reconfigure().
        stderr_errors: errors= policy for stderr.reconfigure().
        on_failure: optional ``(stream_name, exc) -> None`` callback for
            streams whose ``reconfigure`` raises (e.g. replacement streams
            that lack the method-shape we expect). Defaults to a
            ``WARNING:`` line on ``sys.stderr``. When ``sys.stderr`` is
            ``None`` the default warning is dropped rather than printed,
            because ``print(file=None)`` falls back to ``sys.stdout`` and
            would inject the warning into the program's real output.
    """
    if sys.platform != "win32":
        return

    policies = (
        ("stdin", stdin_errors),
        ("stdout", stdout_errors),
        ("stderr", stderr_errors),
    )
    for name, errors in policies:
        stream = getattr(sys, name, None)
        reconfigure = getattr(stream, "reconfigure", None)
        if reconfigure is None:
            continue
        try:
            reconfigure(encoding="utf-8", errors=errors)
        except Exception as exc:  # noqa: BLE001 -- last-resort guard
            if on_failure is not None:
                on_failure(name, exc)
                continue
            # Resolve stderr at warning time and refuse to fall back to
            # stdout: a detached stderr (None) must not redirect the
            # warning into the data stream.
            warning_stream = getattr(sys, "stderr", None)
            if warning_stream is None:
                continue
            print(
                f"WARNING: Could not reconfigure {name} to UTF-8: {exc}",
                file=warning_stream,
            )

mempalace/cli.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -935,7 +935,25 @@ def cmd_compress(args):
935935
print(" (dry run -- nothing stored)")
936936

937937

938+
def _reconfigure_stdio_utf8_on_windows():
    """Switch the `mempalace` CLI's stdio to UTF-8 when running on Windows.

    Delegates to the shared helper in ``mempalace._stdio``. Output streams
    are configured with ``replace`` rather than the helper's default:
    ``mempalace search`` prints drawer text verbatim, and that text may
    carry surrogate halves round-tripped from filenames, which ``strict``
    would turn into a crash partway through a result block. stdin is left
    on the helper's default ``surrogateescape`` so a redirected non-UTF-8
    file does not abort the read at its first bad byte.
    """
    from ._stdio import reconfigure_stdio_utf8_on_windows as _apply_utf8_stdio

    # One policy for both output streams.
    output_policy = "replace"
    _apply_utf8_stdio(stdout_errors=output_policy, stderr_errors=output_policy)
952+
953+
938954
def main():
955+
_reconfigure_stdio_utf8_on_windows()
956+
939957
version_label = f"MemPalace {__version__}"
940958
parser = argparse.ArgumentParser(
941959
description="MemPalace — Give your AI a memory. No API key required.",

mempalace/fact_checker.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,11 +303,27 @@ def _edit_distance(s1: str, s2: str) -> int:
303303
return prev[-1]
304304

305305

306+
def _reconfigure_stdio_utf8_on_windows():
    """Make the fact checker read its ``--stdin`` payload as UTF-8 on Windows.

    Delegates to the shared helper in ``mempalace._stdio`` with the same
    policy as the primary CLI: stdout and stderr use ``replace`` because
    extracted fact text can include surrogate halves round-tripped from
    filenames, and ``strict`` would raise UnicodeEncodeError mid-print.
    stdin stays on the helper's default ``surrogateescape``.
    """
    from ._stdio import reconfigure_stdio_utf8_on_windows

    # Only the output streams deviate from the helper's defaults.
    output_overrides = {"stdout_errors": "replace", "stderr_errors": "replace"}
    reconfigure_stdio_utf8_on_windows(**output_overrides)
318+
319+
306320
if __name__ == "__main__":
307321
import argparse
308322
import json
309323
import sys
310324

325+
_reconfigure_stdio_utf8_on_windows()
326+
311327
parser = argparse.ArgumentParser(
312328
description="Check text against known facts in the MemPalace palace.",
313329
epilog="Exits 0 when no issues found, 1 when one or more issues detected.",

tests/test_cli.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1042,3 +1042,58 @@ def test_cmd_repair_trailing_slash_does_not_recurse():
10421042
palace_path = os.path.expanduser(args.palace).rstrip(os.sep)
10431043
backup_path = palace_path + ".backup"
10441044
assert not backup_path.startswith(palace_path + os.sep)
1045+
1046+
1047+
# ── stdio reconfigure on Windows ─────────────────────────────────────
1048+
1049+
1050+
class _ReconfigurableStringIO:
    """Stand-in stream that records every ``reconfigure`` call it receives."""

    def __init__(self):
        # One dict of keyword arguments per reconfigure() call, in call order.
        self.reconfigure_calls = []

    def reconfigure(self, **options):
        """Remember the keyword arguments instead of changing any encoding."""
        self.reconfigure_calls += [options]
1056+
1057+
1058+
def test_reconfigures_stdio_to_utf8_on_windows():
    """On Windows the `mempalace` CLI must reconfigure all three stdio streams.

    Otherwise piped non-ASCII input (`mempalace search ... < q.txt`) or
    piped non-ASCII output (`mempalace search "..." > out.txt`) is
    mojibaked through the system ANSI codepage (cp1252/cp1251/cp950,
    depending on locale).
    """
    from mempalace.cli import _reconfigure_stdio_utf8_on_windows

    fake_in, fake_out, fake_err = (_ReconfigurableStringIO() for _ in range(3))
    with (
        patch.object(sys, "platform", "win32"),
        patch.object(sys, "stdin", fake_in),
        patch.object(sys, "stdout", fake_out),
        patch.object(sys, "stderr", fake_err),
    ):
        _reconfigure_stdio_utf8_on_windows()

    # Expected errors= policy per stream: surrogateescape lets stdin
    # survive bad bytes from a redirected non-UTF-8 file; replace keeps
    # stdout/stderr from crashing mid-print on a drawer that carries a
    # round-tripped surrogate half.
    expectations = (
        (fake_in, "surrogateescape"),
        (fake_out, "replace"),
        (fake_err, "replace"),
    )
    for fake_stream, policy in expectations:
        assert fake_stream.reconfigure_calls == [{"encoding": "utf-8", "errors": policy}]
1086+
1087+
1088+
def test_reconfigure_stdio_is_noop_off_windows():
    """Off Windows the helper must leave the streams untouched."""
    from mempalace.cli import _reconfigure_stdio_utf8_on_windows

    fake_stdin = _ReconfigurableStringIO()
    platform_patch = patch.object(sys, "platform", "linux")
    stdin_patch = patch.object(sys, "stdin", fake_stdin)
    with platform_patch, stdin_patch:
        _reconfigure_stdio_utf8_on_windows()

    # No reconfigure() call may have reached the fake stream.
    assert fake_stdin.reconfigure_calls == []

tests/test_fact_checker.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,3 +286,66 @@ def test_exits_nonzero_when_issues_found(self, tmp_path, monkeypatch, capsys):
286286
assert "similar_name" in out
287287
# Silence unused import warning.
288288
_ = (MagicMock, patch, fact_checker)
289+
290+
def test_reconfigures_stdio_to_utf8_on_windows(self):
    """On Windows, fact_checker --stdin must reconfigure stdio to UTF-8.

    Otherwise Python defaults stdio to the system ANSI codepage
    (cp1252/cp1251/cp950), which mojibakes non-ASCII text before
    pattern parsing sees it.
    """
    import io
    import sys

    from mempalace.fact_checker import _reconfigure_stdio_utf8_on_windows

    class _RecordingStream(io.StringIO):
        """StringIO that logs reconfigure() keyword arguments."""

        def __init__(self, initial_value=""):
            super().__init__(initial_value)
            self.reconfigure_calls = []

        def reconfigure(self, **options):
            self.reconfigure_calls.append(options)

    fake_in, fake_out, fake_err = _RecordingStream(), _RecordingStream(), _RecordingStream()
    with (
        patch.object(sys, "platform", "win32"),
        patch.object(sys, "stdin", fake_in),
        patch.object(sys, "stdout", fake_out),
        patch.object(sys, "stderr", fake_err),
    ):
        _reconfigure_stdio_utf8_on_windows()

    # Expected errors= policy per stream: surrogateescape so a stray
    # malformed byte from a redirected file does not crash the read;
    # replace so an extracted fact carrying a surrogate half does not
    # crash mid-print.
    expectations = (
        (fake_in, "surrogateescape"),
        (fake_out, "replace"),
        (fake_err, "replace"),
    )
    for fake_stream, policy in expectations:
        assert fake_stream.reconfigure_calls == [{"encoding": "utf-8", "errors": policy}]
328+
329+
def test_reconfigure_stdio_is_noop_off_windows(self):
    """Off Windows the helper must leave the streams untouched."""
    import io
    import sys

    from mempalace.fact_checker import _reconfigure_stdio_utf8_on_windows

    class _RecordingStream(io.StringIO):
        """StringIO that logs reconfigure() keyword arguments."""

        def __init__(self):
            super().__init__()
            self.reconfigure_calls = []

        def reconfigure(self, **options):
            self.reconfigure_calls.append(options)

    fake_stdin = _RecordingStream()
    platform_patch = patch.object(sys, "platform", "linux")
    stdin_patch = patch.object(sys, "stdin", fake_stdin)
    with platform_patch, stdin_patch:
        _reconfigure_stdio_utf8_on_windows()

    # No reconfigure() call may have reached the fake stream.
    assert fake_stdin.reconfigure_calls == []

0 commit comments

Comments
 (0)