From d76df23354345242d4338660527ead66056d3e4d Mon Sep 17 00:00:00 2001
From: felipefl142 <felipefrl1@hotmail.com>
Date: Sun, 19 Apr 2026 17:34:40 -0300
Subject: [PATCH 1/2] feat: send screenshots in prompt, add vision toggle,
 LLM-driven blind select

- Embed a pre-call screenshot (base64 image_url) in the LLM prompt so
  vision-capable models see the current board. Screenshot still lands in
  `screenshots/{custom_id}.png` via Collector.peek_next_custom_id.
- Add `vision` config (default true) + `--no-vision` CLI flag +
  `BALATROLLM_VISION` env var for text-only models (e.g. Ollama).
- LLMClient auto-detects 404 "image input" errors, strips image blocks,
  and retries once; subsequent calls strip image blocks from the prompt
  for the rest of the session (screenshots are still captured to disk).
- Hand BLIND_SELECT to the LLM instead of auto-selecting, so strategies
  can choose to skip blinds via the existing `skip` tool.
- Pass `tool_choice="required"` so models must emit a tool call rather
  than prose.
- Add `_to_wine_path` helper so the `screenshot` RPC receives a
  Windows-style path when the game runs under Wine/Proton.
---
 src/balatrollm/bot.py       | 78 +++++++++++++++++++++++++------------
 src/balatrollm/cli.py       |  4 ++
 src/balatrollm/collector.py |  4 ++
 src/balatrollm/config.py    |  6 ++-
 src/balatrollm/llm.py       | 46 +++++++++++++++++++++-
 5 files changed, 112 insertions(+), 26 deletions(-)

diff --git a/src/balatrollm/bot.py b/src/balatrollm/bot.py
index 1e4a2c4..1939fdc 100644
--- a/src/balatrollm/bot.py
+++ b/src/balatrollm/bot.py
@@ -1,6 +1,7 @@
 """Core LLM-powered Balatro bot implementation."""
 
 import asyncio
+import base64
 import json
 import logging
 import time
@@ -25,6 +26,23 @@
 logger = logging.getLogger(__name__)
 
 
+def _to_wine_path(path: Path) -> str:
+    """Convert a Linux path under Wine's drive_c to a Windows-style path.
+
+    Used for the `screenshot` RPC path: when Balatro (via the BalatroBot Lua
+    mod) runs inside Wine/Proton it needs a Windows-style path, e.g.:
+        /home/user/.../drive_c/users/foo/bar.png
+        -> C:\\users\\foo\\bar.png
+    Only the first "drive_c" component counts; without one, str(path) is returned.
+    """
+    parts = path.parts
+    try:
+        idx = next(i for i, p in enumerate(parts) if p == "drive_c")
+        return "C:\\" + "\\".join(parts[idx + 1 :])
+    except StopIteration:
+        return str(path)
+
+
 class BotError(Exception):
     """Base exception for bot errors."""
 
@@ -66,6 +84,7 @@ async def __aenter__(self) -> "Bot":
         self._llm = LLMClient(
             base_url=self.config.base_url,
             api_key=self.config.api_key or "",
+            vision=self.config.vision,
         )
         await self._llm.__aenter__()
 
@@ -198,14 +217,11 @@ async def _run_game_loop(self, gamestate: dict[str, Any]) -> None:
             await self._balatro.call("gamestate")
 
             match current_state:
-                case "SELECTING_HAND" | "SHOP" | "SMODS_BOOSTER_OPENED":
+                case "SELECTING_HAND" | "SHOP" | "SMODS_BOOSTER_OPENED" | "BLIND_SELECT":
                     response = await self._get_llm_response(gamestate)
                     gamestate = await self._execute_tool_call(response)
                 case "ROUND_EVAL":
                     gamestate = await self._balatro.call("cash_out")
-                case "BLIND_SELECT":
-                    # NOTE: This bot always selects and never skips blinds
-                    gamestate = await self._balatro.call("select")
                 case "GAME_OVER":
                     self._finish_reason = "lost"
                     logger.info("Game over!")
@@ -220,6 +236,20 @@ async def _get_llm_response(self, gamestate: dict[str, Any]) -> ChatCompletion:
         assert self._llm is not None
         assert self._collector is not None
 
+        # Take screenshot BEFORE building the request so it can be included in the prompt
+        next_custom_id = self._collector.peek_next_custom_id()
+        screenshot_path = self._collector.screenshot_dir / f"{next_custom_id}.png"
+        screenshot_b64: str | None = None
+        try:
+            await self._balatro.call(
+                "screenshot", {"path": _to_wine_path(screenshot_path)}
+            )
+            screenshot_b64 = base64.b64encode(screenshot_path.read_bytes()).decode()
+        except BalatroError as e:
+            logger.warning(f"Screenshot failed: {e}")
+        except Exception as e:
+            logger.warning(f"Screenshot read failed: {e}")
+
         strategy_content = self.strategy.render_strategy(gamestate)
         gamestate_content = self.strategy.render_gamestate(gamestate)
         memory_content = self.strategy.render_memory(
@@ -228,20 +258,28 @@ async def _get_llm_response(self, gamestate: dict[str, Any]) -> ChatCompletion:
             last_failure=self._last_failed_msg,
         )
 
-        messages = [
+        content: list[dict[str, Any]] = [
             {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": strategy_content,
-                        "cache_control": {"type": "ephemeral"},
-                    },
-                    {"type": "text", "text": gamestate_content},
-                    {"type": "text", "text": memory_content},
-                ],
-            }
+                "type": "text",
+                "text": strategy_content,
+                "cache_control": {"type": "ephemeral"},
+            },
         ]
+        if screenshot_b64:
+            content.append(
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"},
+                }
+            )
+        content.extend(
+            [
+                {"type": "text", "text": gamestate_content},
+                {"type": "text", "text": memory_content},
+            ]
+        )
+
+        messages = [{"role": "user", "content": content}]
 
         tools = self.strategy.get_tools(gamestate["state"])
 
@@ -263,14 +301,6 @@ async def _get_llm_response(self, gamestate: dict[str, Any]) -> ChatCompletion:
                 model_config=self.model_config,
             )
 
-            try:
-                await self._balatro.call(
-                    "screenshot",
-                    {"path": str(self._collector.screenshot_dir / f"{custom_id}.png")},
-                )
-            except BalatroError as e:
-                logger.warning(f"Screenshot failed: {e}")
-
             self._collector.write_response(
                 id=str(time.time_ns() // 1_000_000),
                 custom_id=custom_id,
diff --git a/src/balatrollm/cli.py b/src/balatrollm/cli.py
index 6557e3b..c5fae9f 100644
--- a/src/balatrollm/cli.py
+++ b/src/balatrollm/cli.py
@@ -48,6 +48,10 @@ def create_parser() -> argparse.ArgumentParser:
     parser.add_argument(
         "--views", action="store_true", help="Start HTTP server on port 12345 for views"
     )
+    parser.add_argument(
+        "--no-vision", dest="vision", action="store_false", default=None,
+        help="Disable screenshots (required for non-vision models via Ollama)",
+    )
 
     return parser
 
diff --git a/src/balatrollm/collector.py b/src/balatrollm/collector.py
index 90a98f8..9d87f2d 100644
--- a/src/balatrollm/collector.py
+++ b/src/balatrollm/collector.py
@@ -247,6 +247,10 @@ def _write_latest_json(self) -> None:
                 f,
             )
 
+    def peek_next_custom_id(self) -> str:
+        """Return the custom_id expected for the next write_request call, without advancing the counter."""
+        return f"request-{self._request_count + 1:05}"  # NOTE(review): keep in sync with write_request's id format
+
     def write_request(self, body: dict[str, Any]) -> str:
         """Write request to requests.jsonl. Returns custom_id."""
         self._request_count += 1
diff --git a/src/balatrollm/config.py b/src/balatrollm/config.py
index 8e04f7e..4f3d84f 100644
--- a/src/balatrollm/config.py
+++ b/src/balatrollm/config.py
@@ -41,13 +41,14 @@
     "base_url": "BALATROLLM_BASE_URL",
     "api_key": "BALATROLLM_API_KEY",
     "views": "BALATROLLM_VIEWS",
+    "vision": "BALATROLLM_VISION",
 }
 
 ################################################################################
 # Types for config conversion
 ################################################################################
 
-BOOL_FIELDS: frozenset[str] = frozenset({"views"})
+BOOL_FIELDS: frozenset[str] = frozenset({"views", "vision"})
 LIST_FIELDS: frozenset[str] = frozenset({"model", "seed", "deck", "stake", "strategy"})
 STRING_FIELDS: frozenset[str] = frozenset({"host", "base_url", "api_key"})
 INT_FIELDS: frozenset[str] = frozenset({"parallel", "port"})
@@ -149,6 +150,8 @@ def _load_from_args(args: Namespace) -> dict[str, Any]:
     for field_name in BOOL_FIELDS:
         if getattr(args, field_name, False):
             result[field_name] = True
+    if getattr(args, "vision", None) is False:
+        result["vision"] = False
     return result
 
 
@@ -183,6 +186,7 @@ class Config:
     # Execution
     parallel: int = 1
     views: bool = False
+    vision: bool = True
 
     # Connection
     host: str = "127.0.0.1"
diff --git a/src/balatrollm/llm.py b/src/balatrollm/llm.py
index d307eaf..ef40ca3 100644
--- a/src/balatrollm/llm.py
+++ b/src/balatrollm/llm.py
@@ -30,6 +30,19 @@ class LLMRetryExhaustedError(LLMClientError):
     pass
 
 
+def _strip_image_content(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Return a new message list with image_url blocks dropped from list-valued content; inputs are not mutated."""
+    result = []
+    for msg in messages:
+        content = msg.get("content")
+        if isinstance(content, list):
+            filtered = [b for b in content if b.get("type") != "image_url"]  # assumes each block is a dict
+            result.append({**msg, "content": filtered})
+        else:
+            result.append(msg)  # non-list content is passed through untouched
+    return result
+
+
 @dataclass
 class LLMClient:
     """Async OpenAI client wrapper with retry logic."""
@@ -38,9 +51,11 @@ class LLMClient:
     api_key: str
     timeout: float = 240.0  # We assume that LLMs respond in 240s
     max_retries: int = 3
+    vision: bool = True
 
     _client: openai.AsyncOpenAI | None = field(default=None, init=False, repr=False)
     _consecutive_timeouts: int = field(default=0, init=False, repr=False)
+    _vision_supported: bool = field(default=True, init=False, repr=False)
 
     async def __aenter__(self) -> "LLMClient":
         """Create the async OpenAI client."""
@@ -50,6 +65,7 @@ async def __aenter__(self) -> "LLMClient":
             timeout=self.timeout,
         )
         self._consecutive_timeouts = 0
+        self._vision_supported = self.vision
         return self
 
     async def __aexit__(self, *_: Any) -> None:
@@ -71,10 +87,15 @@ async def call(
                 "Client not connected. Use 'async with LLMClient() as client:'"
             )
 
+        effective_messages = (
+            _strip_image_content(messages) if not self._vision_supported else messages
+        )
+
         request_data: dict[str, Any] = {
             "model": model,
-            "messages": messages,
+            "messages": effective_messages,
             "tools": tools,
+            "tool_choice": "required",
         }
 
         if model_config:
@@ -83,6 +104,7 @@ async def call(
 
         retry_delay = 1.0
         last_exception: Exception | None = None
+        vision_stripped = False
 
         for attempt in range(self.max_retries):
             try:
@@ -108,6 +130,23 @@ async def call(
                 last_exception = e
 
             except openai.APIStatusError as e:
+                if e.status_code == 404 and "image input" in str(e).lower() and not vision_stripped:
+                    logger.warning("Model does not support vision — disabling screenshots for this session")
+                    self._vision_supported = False
+                    vision_stripped = True
+                    request_data["messages"] = _strip_image_content(
+                        request_data["messages"]
+                    )
+                    # Retry immediately without consuming a retry slot
+                    try:
+                        response = await self._client.chat.completions.create(**request_data)
+                        self._consecutive_timeouts = 0
+                        if not response.choices:
+                            raise LLMClientError("API returned empty response (no choices)")
+                        return response
+                    except Exception as inner_e:
+                        last_exception = inner_e
+                    continue
                 logger.error(f"LLM status error ({e.status_code}): {e}")
                 last_exception = e
 
@@ -134,6 +173,11 @@ async def call(
             f"All {self.max_retries} retry attempts exhausted"
         ) from last_exception
 
+    @property
+    def vision_supported(self) -> bool:
+        """Whether image blocks are still sent; False if `vision` was False on entry or after a 404 'image input' error."""
+        return self._vision_supported
+
     @property
     def consecutive_timeouts(self) -> int:
         """Get current consecutive timeout count."""

From 1bec84d0574f6fbe0cd84231be60668a6ec9acd8 Mon Sep 17 00:00:00 2001
From: felipefl142 <felipefrl1@hotmail.com>
Date: Sun, 19 Apr 2026 17:34:46 -0300
Subject: [PATCH 2/2] docs(strategies): clarify next_round must be a tool call

Some models emit raw JSON for `next_round` instead of invoking the
function, which stalls the shop loop. Reinforce in the description
that it must be called as a tool.
---
 src/balatrollm/strategies/aggressive/TOOLS.json   | 2 +-
 src/balatrollm/strategies/conservative/TOOLS.json | 2 +-
 src/balatrollm/strategies/default/TOOLS.json      | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/balatrollm/strategies/aggressive/TOOLS.json b/src/balatrollm/strategies/aggressive/TOOLS.json
index bed389c..cb31768 100644
--- a/src/balatrollm/strategies/aggressive/TOOLS.json
+++ b/src/balatrollm/strategies/aggressive/TOOLS.json
@@ -187,7 +187,7 @@
       "function": {
         "name": "next_round",
         "strict": false,
-        "description": "Leave the shop and advance to blind selection.",
+        "description": "Exit the shop and advance to blind selection. You MUST invoke this as a function tool call — never output raw JSON. This is the only valid way to leave the shop; omitting the call or describing it in text will be treated as an error.",
         "parameters": {
           "type": "object",
           "properties": {
diff --git a/src/balatrollm/strategies/conservative/TOOLS.json b/src/balatrollm/strategies/conservative/TOOLS.json
index 52f2c2e..ef65c3e 100644
--- a/src/balatrollm/strategies/conservative/TOOLS.json
+++ b/src/balatrollm/strategies/conservative/TOOLS.json
@@ -187,7 +187,7 @@
       "function": {
         "name": "next_round",
         "strict": false,
-        "description": "Leave the shop and advance to blind selection.",
+        "description": "Exit the shop and advance to blind selection. You MUST invoke this as a function tool call — never output raw JSON. This is the only valid way to leave the shop; omitting the call or describing it in text will be treated as an error.",
         "parameters": {
           "type": "object",
           "properties": {
diff --git a/src/balatrollm/strategies/default/TOOLS.json b/src/balatrollm/strategies/default/TOOLS.json
index bed389c..cb31768 100644
--- a/src/balatrollm/strategies/default/TOOLS.json
+++ b/src/balatrollm/strategies/default/TOOLS.json
@@ -187,7 +187,7 @@
       "function": {
         "name": "next_round",
         "strict": false,
-        "description": "Leave the shop and advance to blind selection.",
+        "description": "Exit the shop and advance to blind selection. You MUST invoke this as a function tool call — never output raw JSON. This is the only valid way to leave the shop; omitting the call or describing it in text will be treated as an error.",
         "parameters": {
           "type": "object",
           "properties": {