From d76df23354345242d4338660527ead66056d3e4d Mon Sep 17 00:00:00 2001 From: felipefl142 Date: Sun, 19 Apr 2026 17:34:40 -0300 Subject: [PATCH 1/2] feat: send screenshots in prompt, add vision toggle, LLM-driven blind select - Embed a pre-call screenshot (base64 image_url) in the LLM prompt so vision-capable models see the current board. Screenshot still lands in `screenshots/{custom_id}.png` via Collector.peek_next_custom_id. - Add `vision` config (default true) + `--no-vision` CLI flag + `BALATROLLM_VISION` env var for text-only models (e.g. Ollama). - LLMClient auto-detects 404 "image input" errors, strips image blocks, and retries once; subsequent calls skip screenshots for the session. - Hand BLIND_SELECT to the LLM instead of auto-selecting, so strategies can choose to skip blinds via the existing `skip` tool. - Pass `tool_choice="required"` so models must emit a tool call rather than prose. - Add `_to_wine_path` helper so the `screenshot` RPC receives a Windows-style path when the game runs under Wine/Proton. --- src/balatrollm/bot.py | 78 +++++++++++++++++++++++++------------ src/balatrollm/cli.py | 4 ++ src/balatrollm/collector.py | 4 ++ src/balatrollm/config.py | 6 ++- src/balatrollm/llm.py | 46 +++++++++++++++++++++- 5 files changed, 112 insertions(+), 26 deletions(-) diff --git a/src/balatrollm/bot.py b/src/balatrollm/bot.py index 1e4a2c4..1939fdc 100644 --- a/src/balatrollm/bot.py +++ b/src/balatrollm/bot.py @@ -1,6 +1,7 @@ """Core LLM-powered Balatro bot implementation.""" import asyncio +import base64 import json import logging import time @@ -25,6 +26,23 @@ logger = logging.getLogger(__name__) +def _to_wine_path(path: Path) -> str: + """Convert a Linux path under Wine's drive_c to a Windows-style path. + + Balatro (via BalatroBot Lua mod) runs inside Wine/Proton and can only write + to Windows-style paths. This converts e.g.: + /home/user/.../drive_c/users/foo/bar.png + -> C:\\users\\foo\\bar.png + Falls back to the original string if drive_c is not in the path. + """ + parts = path.parts + try: + idx = next(i for i, p in enumerate(parts) if p == "drive_c") + return "C:\\" + "\\".join(parts[idx + 1 :]) + except StopIteration: + return str(path) + + class BotError(Exception): """Base exception for bot errors.""" @@ -66,6 +84,7 @@ async def __aenter__(self) -> "Bot": self._llm = LLMClient( base_url=self.config.base_url, api_key=self.config.api_key or "", + vision=self.config.vision, ) await self._llm.__aenter__() @@ -198,14 +217,11 @@ async def _run_game_loop(self, gamestate: dict[str, Any]) -> None: await self._balatro.call("gamestate") match current_state: - case "SELECTING_HAND" | "SHOP" | "SMODS_BOOSTER_OPENED": + case "SELECTING_HAND" | "SHOP" | "SMODS_BOOSTER_OPENED" | "BLIND_SELECT": response = await self._get_llm_response(gamestate) gamestate = await self._execute_tool_call(response) case "ROUND_EVAL": gamestate = await self._balatro.call("cash_out") - case "BLIND_SELECT": - # NOTE: This bot always selects and never skips blinds - gamestate = await self._balatro.call("select") case "GAME_OVER": self._finish_reason = "lost" logger.info("Game over!") @@ -220,6 +236,20 @@ async def _get_llm_response(self, gamestate: dict[str, Any]) -> ChatCompletion: assert self._llm is not None assert self._collector is not None + # Take screenshot BEFORE building the request so it can be included in the prompt + next_custom_id = self._collector.peek_next_custom_id() + screenshot_path = self._collector.screenshot_dir / f"{next_custom_id}.png" + screenshot_b64: str | None = None + try: + await self._balatro.call( + "screenshot", {"path": _to_wine_path(screenshot_path)} + ) + screenshot_b64 = base64.b64encode(screenshot_path.read_bytes()).decode() + except BalatroError as e: + logger.warning(f"Screenshot failed: {e}") + except Exception as e: + logger.warning(f"Screenshot read failed: {e}") + strategy_content = self.strategy.render_strategy(gamestate) gamestate_content = self.strategy.render_gamestate(gamestate) memory_content = self.strategy.render_memory( @@ -228,20 +258,28 @@ async def _get_llm_response(self, gamestate: dict[str, Any]) -> ChatCompletion: last_failure=self._last_failed_msg, ) - messages = [ + content: list[dict[str, Any]] = [ { - "role": "user", - "content": [ - { - "type": "text", - "text": strategy_content, - "cache_control": {"type": "ephemeral"}, - }, - {"type": "text", "text": gamestate_content}, - {"type": "text", "text": memory_content}, - ], - } + "type": "text", + "text": strategy_content, + "cache_control": {"type": "ephemeral"}, + }, ] + if screenshot_b64: + content.append( + { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"}, + } + ) + content.extend( + [ + {"type": "text", "text": gamestate_content}, + {"type": "text", "text": memory_content}, + ] + ) + + messages = [{"role": "user", "content": content}] tools = self.strategy.get_tools(gamestate["state"]) @@ -263,14 +301,6 @@ async def _get_llm_response(self, gamestate: dict[str, Any]) -> ChatCompletion: model_config=self.model_config, ) - try: - await self._balatro.call( - "screenshot", - {"path": str(self._collector.screenshot_dir / f"{custom_id}.png")}, - ) - except BalatroError as e: - logger.warning(f"Screenshot failed: {e}") - self._collector.write_response( id=str(time.time_ns() // 1_000_000), custom_id=custom_id, diff --git a/src/balatrollm/cli.py b/src/balatrollm/cli.py index 6557e3b..c5fae9f 100644 --- a/src/balatrollm/cli.py +++ b/src/balatrollm/cli.py @@ -48,6 +48,10 @@ def create_parser() -> argparse.ArgumentParser: parser.add_argument( "--views", action="store_true", help="Start HTTP server on port 12345 for views" ) + parser.add_argument( + "--no-vision", dest="vision", action="store_false", default=None, + help="Disable screenshots (required for non-vision models via Ollama)", + ) return parser diff --git a/src/balatrollm/collector.py b/src/balatrollm/collector.py index 90a98f8..9d87f2d 100644 --- a/src/balatrollm/collector.py +++ b/src/balatrollm/collector.py @@ -247,6 +247,10 @@ def _write_latest_json(self) -> None: f, ) + def peek_next_custom_id(self) -> str: + """Return the custom_id that the next write_request call will use, without advancing the counter.""" + return f"request-{self._request_count + 1:05}" + def write_request(self, body: dict[str, Any]) -> str: """Write request to requests.jsonl. Returns custom_id.""" self._request_count += 1 diff --git a/src/balatrollm/config.py b/src/balatrollm/config.py index 8e04f7e..4f3d84f 100644 --- a/src/balatrollm/config.py +++ b/src/balatrollm/config.py @@ -41,13 +41,14 @@ "base_url": "BALATROLLM_BASE_URL", "api_key": "BALATROLLM_API_KEY", "views": "BALATROLLM_VIEWS", + "vision": "BALATROLLM_VISION", } ################################################################################ # Types for config conversion ################################################################################ -BOOL_FIELDS: frozenset[str] = frozenset({"views"}) +BOOL_FIELDS: frozenset[str] = frozenset({"views", "vision"}) LIST_FIELDS: frozenset[str] = frozenset({"model", "seed", "deck", "stake", "strategy"}) STRING_FIELDS: frozenset[str] = frozenset({"host", "base_url", "api_key"}) INT_FIELDS: frozenset[str] = frozenset({"parallel", "port"}) @@ -149,6 +150,8 @@ def _load_from_args(args: Namespace) -> dict[str, Any]: for field_name in BOOL_FIELDS: if getattr(args, field_name, False): result[field_name] = True + if getattr(args, "vision", None) is False: + result["vision"] = False return result @@ -183,6 +186,7 @@ class Config: # Execution parallel: int = 1 views: bool = False + vision: bool = True # Connection host: str = "127.0.0.1" diff --git a/src/balatrollm/llm.py b/src/balatrollm/llm.py index d307eaf..ef40ca3 100644 --- a/src/balatrollm/llm.py +++ b/src/balatrollm/llm.py @@ -30,6 +30,19 @@ class LLMRetryExhaustedError(LLMClientError): pass +def _strip_image_content(messages: list[dict[str, Any]]) -> list[dict[str, Any]]: + """Remove image_url blocks from message content lists.""" + result = [] + for msg in messages: + content = msg.get("content") + if isinstance(content, list): + filtered = [b for b in content if b.get("type") != "image_url"] + result.append({**msg, "content": filtered}) + else: + result.append(msg) + return result + + @dataclass class LLMClient: """Async OpenAI client wrapper with retry logic.""" @@ -38,9 +51,11 @@ class LLMClient: api_key: str timeout: float = 240.0 # We assume that LLMs respond in 240s max_retries: int = 3 + vision: bool = True _client: openai.AsyncOpenAI | None = field(default=None, init=False, repr=False) _consecutive_timeouts: int = field(default=0, init=False, repr=False) + _vision_supported: bool = field(default=True, init=False, repr=False) async def __aenter__(self) -> "LLMClient": """Create the async OpenAI client.""" @@ -50,6 +65,7 @@ async def __aenter__(self) -> "LLMClient": timeout=self.timeout, ) self._consecutive_timeouts = 0 + self._vision_supported = self.vision return self async def __aexit__(self, *_: Any) -> None: @@ -71,10 +87,15 @@ async def call( "Client not connected. Use 'async with LLMClient() as client:'" ) + effective_messages = ( + _strip_image_content(messages) if not self._vision_supported else messages + ) + request_data: dict[str, Any] = { "model": model, - "messages": messages, + "messages": effective_messages, "tools": tools, + "tool_choice": "required", } if model_config: @@ -83,6 +104,7 @@ async def call( retry_delay = 1.0 last_exception: Exception | None = None + vision_stripped = False for attempt in range(self.max_retries): try: @@ -108,6 +130,23 @@ async def call( last_exception = e except openai.APIStatusError as e: + if e.status_code == 404 and "image input" in str(e).lower() and not vision_stripped: + logger.warning("Model does not support vision — disabling screenshots for this session") + self._vision_supported = False + vision_stripped = True + request_data["messages"] = _strip_image_content( + request_data["messages"] + ) + # Retry immediately without consuming a retry slot + try: + response = await self._client.chat.completions.create(**request_data) + self._consecutive_timeouts = 0 + if not response.choices: + raise LLMClientError("API returned empty response (no choices)") + return response + except Exception as inner_e: + last_exception = inner_e + continue logger.error(f"LLM status error ({e.status_code}): {e}") last_exception = e @@ -134,6 +173,11 @@ async def call( f"All {self.max_retries} retry attempts exhausted" ) from last_exception + @property + def vision_supported(self) -> bool: + """False after first vision-unsupported 404; screenshots skipped for session.""" + return self._vision_supported + @property def consecutive_timeouts(self) -> int: """Get current consecutive timeout count.""" From 1bec84d0574f6fbe0cd84231be60668a6ec9acd8 Mon Sep 17 00:00:00 2001 From: felipefl142 Date: Sun, 19 Apr 2026 17:34:46 -0300 Subject: [PATCH 2/2] docs(strategies): clarify next_round must be a tool call Some models emit raw JSON for `next_round` instead of invoking the function, which stalls the shop loop. Reinforce in the description that it must be called as a tool. --- src/balatrollm/strategies/aggressive/TOOLS.json | 2 +- src/balatrollm/strategies/conservative/TOOLS.json | 2 +- src/balatrollm/strategies/default/TOOLS.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/balatrollm/strategies/aggressive/TOOLS.json b/src/balatrollm/strategies/aggressive/TOOLS.json index bed389c..cb31768 100644 --- a/src/balatrollm/strategies/aggressive/TOOLS.json +++ b/src/balatrollm/strategies/aggressive/TOOLS.json @@ -187,7 +187,7 @@ "function": { "name": "next_round", "strict": false, - "description": "Leave the shop and advance to blind selection.", + "description": "Exit the shop and advance to blind selection. You MUST invoke this as a function tool call — never output raw JSON. This is the only valid way to leave the shop; omitting the call or describing it in text will be treated as an error.", "parameters": { "type": "object", "properties": { diff --git a/src/balatrollm/strategies/conservative/TOOLS.json b/src/balatrollm/strategies/conservative/TOOLS.json index 52f2c2e..ef65c3e 100644 --- a/src/balatrollm/strategies/conservative/TOOLS.json +++ b/src/balatrollm/strategies/conservative/TOOLS.json @@ -187,7 +187,7 @@ "function": { "name": "next_round", "strict": false, - "description": "Leave the shop and advance to blind selection.", + "description": "Exit the shop and advance to blind selection. You MUST invoke this as a function tool call — never output raw JSON. This is the only valid way to leave the shop; omitting the call or describing it in text will be treated as an error.", "parameters": { "type": "object", "properties": { diff --git a/src/balatrollm/strategies/default/TOOLS.json b/src/balatrollm/strategies/default/TOOLS.json index bed389c..cb31768 100644 --- a/src/balatrollm/strategies/default/TOOLS.json +++ b/src/balatrollm/strategies/default/TOOLS.json @@ -187,7 +187,7 @@ "function": { "name": "next_round", "strict": false, - "description": "Leave the shop and advance to blind selection.", + "description": "Exit the shop and advance to blind selection. You MUST invoke this as a function tool call — never output raw JSON. This is the only valid way to leave the shop; omitting the call or describing it in text will be treated as an error.", "parameters": { "type": "object", "properties": {