Skip to content

Commit a486e9c

Browse files
JihaoXin and claude committed
Add real-time token & cost tracking with live dashboard chart
Parses claude --output-format json on every agent call to capture the 4 token types and total_cost_usd, aggregates them live into state/cost_report.yaml after each agent (atomic write), and surfaces them through the existing SSE status stream so the project detail page shows a USD/Tokens chart that updates within ~2s. Closes the latent telegram_daemon read of total_cost_usd that nothing wrote. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 310caab commit a486e9c

File tree

5 files changed

+504
-20
lines changed

5 files changed

+504
-20
lines changed

ark/agents.py

Lines changed: 131 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""AgentMixin: agent execution, output parsing, rate limit handling."""
22
from __future__ import annotations
33

4+
import json
45
import os
56
import re
67
import signal
@@ -11,6 +12,58 @@
1112
from datetime import datetime, timedelta
1213
from pathlib import Path
1314

15+
16+
def _parse_claude_json(stdout: str) -> dict | None:
17+
"""Parse output of `claude --output-format json`. Returns None on any failure.
18+
19+
Tolerates trailing whitespace and the rare case where stdout has leading
20+
non-JSON debug output by scanning for the final result-shaped object.
21+
Never raises — callers fall back to treating stdout as plain text.
22+
"""
23+
text = (stdout or "").strip()
24+
if not text:
25+
return None
26+
try:
27+
return json.loads(text)
28+
except json.JSONDecodeError:
29+
# Last-resort: locate the final result envelope
30+
marker = '{"type":"result"'
31+
start = text.rfind(marker)
32+
if start == -1:
33+
return None
34+
try:
35+
return json.loads(text[start:])
36+
except json.JSONDecodeError:
37+
return None
38+
39+
40+
def _extract_usage(parsed: dict) -> dict:
41+
"""Pull token/cost fields out of parsed claude JSON. Zero-default so callers
42+
don't need null checks. Always returns a complete dict shape."""
43+
parsed = parsed or {}
44+
u = parsed.get("usage") or {}
45+
model_usage = parsed.get("modelUsage") or {}
46+
model = next(iter(model_usage), "")
47+
return {
48+
"model": model,
49+
"input_tokens": int(u.get("input_tokens") or 0),
50+
"output_tokens": int(u.get("output_tokens") or 0),
51+
"cache_read_tokens": int(u.get("cache_read_input_tokens") or 0),
52+
"cache_creation_tokens": int(u.get("cache_creation_input_tokens") or 0),
53+
"cost_usd": float(parsed.get("total_cost_usd") or 0.0),
54+
"duration_api_ms": int(parsed.get("duration_api_ms") or 0),
55+
}
56+
57+
58+
def _fmt_tok(n: int) -> str:
59+
"""Format a token count as compact human-readable (e.g. 12.3k, 1.2M)."""
60+
n = int(n or 0)
61+
if n >= 1_000_000:
62+
return f"{n / 1_000_000:.1f}M"
63+
if n >= 1_000:
64+
return f"{n / 1_000:.1f}k"
65+
return str(n)
66+
1467
from ark.paths import get_config_dir
1568
from ark.ui import (
1669
ElapsedTimer, RateLimitCountdown, agent_styled, styled, Style, Icons,
@@ -434,7 +487,7 @@ def run_agent(self, agent_type: str, task: str, timeout: int = 1800,
434487
"claude", "-p", full_prompt,
435488
"--permission-mode", "bypassPermissions",
436489
"--no-session-persistence",
437-
"--output-format", "text",
490+
"--output-format", "json",
438491
"--append-system-prompt", self._build_path_boundary(),
439492
]
440493
ark_model = self._get_ark_model()
@@ -469,10 +522,23 @@ def run_agent(self, agent_type: str, task: str, timeout: int = 1800,
469522

470523
timer.start()
471524
result = ""
525+
usage_record = None # populated when claude returns parseable JSON
472526

473527
try:
474528
stdout, stderr = process.communicate(timeout=timeout)
475-
result = stdout
529+
# claude --output-format json: parse the envelope, extract `result`
530+
# field for downstream and `usage` for cost tracking. Fall back to
531+
# raw stdout on parse failure so the existing empty-run / failure
532+
# paths still trigger normally.
533+
if self.model == "claude":
534+
parsed = _parse_claude_json(stdout)
535+
if parsed is not None:
536+
result = parsed.get("result", "") or ""
537+
usage_record = _extract_usage(parsed)
538+
else:
539+
result = stdout
540+
else:
541+
result = stdout
476542

477543
if stderr:
478544
stderr_lower = stderr.lower()
@@ -517,7 +583,17 @@ def run_agent(self, agent_type: str, task: str, timeout: int = 1800,
517583
timer.stop()
518584
self.log(f"Agent {agent_type} timed out ({timeout}s)", "WARN")
519585
stdout, _ = process.communicate()
520-
result = stdout
586+
# JSON envelope is usually missing on timeout (truncated mid-stream).
587+
# Try once; on failure fall back to raw text and let empty-run handle it.
588+
if self.model == "claude":
589+
parsed = _parse_claude_json(stdout)
590+
if parsed is not None:
591+
result = parsed.get("result", "") or ""
592+
usage_record = _extract_usage(parsed)
593+
else:
594+
result = stdout
595+
else:
596+
result = stdout
521597

522598
watchdog.stop()
523599
timer.stop()
@@ -589,31 +665,78 @@ def run_agent(self, agent_type: str, task: str, timeout: int = 1800,
589665
start_time = time.time()
590666
continue
591667
self.send_notification("Agent Error Failed", f"{agent_type}: {e}", priority="critical")
592-
self._agent_stats.append({
668+
err_stat = {
593669
"agent_type": agent_type,
594670
"elapsed_seconds": elapsed,
595671
"prompt_len": 0,
596672
"output_len": 0,
597673
"timestamp": datetime.now().isoformat(),
598674
"error": str(e),
599-
})
675+
# Zero-default cost fields so aggregation never sees missing keys
676+
"model": "",
677+
"input_tokens": 0,
678+
"output_tokens": 0,
679+
"cache_read_tokens": 0,
680+
"cache_creation_tokens": 0,
681+
"cost_usd": 0.0,
682+
"duration_api_ms": 0,
683+
}
684+
self._agent_stats.append(err_stat)
685+
try:
686+
self._write_cost_report()
687+
except Exception:
688+
pass
600689
return ""
601690

602691
timer.stop()
603692
self.log_step(f"{Icons.for_agent(agent_type)} {agent_styled(agent_type, f'[{agent_type}]')} completed ({elapsed}s)", "success")
604693

694+
# One-line cost summary (only when claude returned parseable usage)
695+
if usage_record:
696+
in_tok = usage_record["input_tokens"]
697+
out_tok = usage_record["output_tokens"]
698+
cr = usage_record["cache_read_tokens"]
699+
cc = usage_record["cache_creation_tokens"]
700+
cached_in = cr + cc
701+
total_in = in_tok + cached_in
702+
hit_pct = int(100 * cr / total_in) if total_in else 0
703+
self.log_step(
704+
f" 💰 ${usage_record['cost_usd']:.4f} "
705+
f"in:{_fmt_tok(in_tok)} out:{_fmt_tok(out_tok)} "
706+
f"cache:{_fmt_tok(cached_in)}({hit_pct}% hit)",
707+
"info"
708+
)
709+
605710
# Agent summary
606711
summary_items = self._summarize_agent_output(agent_type, result)
607712
if summary_items:
608713
self.log_summary_box(f"{agent_type.upper()} Summary", summary_items)
609714

610-
# Cost tracking
611-
self._agent_stats.append({
715+
# Cost tracking — extend with real token/cost when claude JSON was parsed
716+
stat = {
612717
"agent_type": agent_type,
613718
"elapsed_seconds": elapsed,
614719
"prompt_len": len(full_prompt),
615720
"output_len": len(result) if result else 0,
616721
"timestamp": datetime.now().isoformat(),
617-
})
722+
# Zero-defaults so cost_report aggregation never sees missing keys
723+
"model": "",
724+
"input_tokens": 0,
725+
"output_tokens": 0,
726+
"cache_read_tokens": 0,
727+
"cache_creation_tokens": 0,
728+
"cost_usd": 0.0,
729+
"duration_api_ms": 0,
730+
}
731+
if usage_record:
732+
stat.update(usage_record)
733+
self._agent_stats.append(stat)
734+
735+
# Live cost report — written after every agent so the webapp SSE stream
736+
# can pick up updates within ~2s. Failures here must never break the run.
737+
try:
738+
self._write_cost_report()
739+
except Exception as exc:
740+
self.log(f" cost report write failed: {exc}", "WARN")
618741

619742
return result

ark/pipeline.py

Lines changed: 53 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1755,36 +1755,79 @@ def _send_dev_phase_telegram(self, event: str, current: int, total: int):
17551755
pass
17561756

17571757
def _write_cost_report(self):
1758-
"""Write per-agent and total cost/stats to cost_report.yaml."""
1758+
"""Write per-agent and total cost/stats to cost_report.yaml.
1759+
1760+
Called after every agent invocation so the webapp SSE stream can pick
1761+
up live updates within ~2s. Writes atomically (.tmp + os.replace) so
1762+
readers never see a partial file. Aggregates real token & USD fields
1763+
when the claude JSON envelope was parsed; falls back to character
1764+
counts otherwise.
1765+
"""
17591766
if not self._agent_stats:
17601767
return
17611768

1762-
# Aggregate per agent type
1769+
# Aggregate per agent type. Each bucket carries both legacy char-count
1770+
# fields (for backwards compat with telegram_daemon / older tests) and
1771+
# the new token + cost fields populated from claude JSON output.
17631772
by_type = {}
17641773
for stat in self._agent_stats:
17651774
atype = stat["agent_type"]
17661775
if atype not in by_type:
1767-
by_type[atype] = {"calls": 0, "total_seconds": 0, "total_prompt_len": 0, "total_output_len": 0}
1768-
by_type[atype]["calls"] += 1
1769-
by_type[atype]["total_seconds"] += stat.get("elapsed_seconds", 0)
1770-
by_type[atype]["total_prompt_len"] += stat.get("prompt_len", 0)
1771-
by_type[atype]["total_output_len"] += stat.get("output_len", 0)
1776+
by_type[atype] = {
1777+
"calls": 0,
1778+
"total_seconds": 0,
1779+
"total_prompt_len": 0,
1780+
"total_output_len": 0,
1781+
"total_input_tokens": 0,
1782+
"total_output_tokens": 0,
1783+
"total_cache_read_tokens": 0,
1784+
"total_cache_creation_tokens": 0,
1785+
"total_cost_usd": 0.0,
1786+
}
1787+
b = by_type[atype]
1788+
b["calls"] += 1
1789+
b["total_seconds"] += stat.get("elapsed_seconds", 0)
1790+
b["total_prompt_len"] += stat.get("prompt_len", 0)
1791+
b["total_output_len"] += stat.get("output_len", 0)
1792+
b["total_input_tokens"] += stat.get("input_tokens", 0)
1793+
b["total_output_tokens"] += stat.get("output_tokens", 0)
1794+
b["total_cache_read_tokens"] += stat.get("cache_read_tokens", 0)
1795+
b["total_cache_creation_tokens"] += stat.get("cache_creation_tokens", 0)
1796+
b["total_cost_usd"] += float(stat.get("cost_usd", 0.0) or 0.0)
17721797

17731798
total_calls = sum(d["calls"] for d in by_type.values())
17741799
total_time = sum(d["total_seconds"] for d in by_type.values())
1800+
total_cost_usd = sum(d["total_cost_usd"] for d in by_type.values())
1801+
total_input_tokens = sum(d["total_input_tokens"] for d in by_type.values())
1802+
total_output_tokens = sum(d["total_output_tokens"] for d in by_type.values())
1803+
total_cache_read_tokens = sum(d["total_cache_read_tokens"] for d in by_type.values())
1804+
total_cache_creation_tokens = sum(d["total_cache_creation_tokens"] for d in by_type.values())
17751805

17761806
report = {
17771807
"generated_at": datetime.now().isoformat(),
17781808
"total_agent_calls": total_calls,
17791809
"total_agent_seconds": total_time,
1810+
"total_cost_usd": round(total_cost_usd, 6),
1811+
"total_input_tokens": total_input_tokens,
1812+
"total_output_tokens": total_output_tokens,
1813+
"total_cache_read_tokens": total_cache_read_tokens,
1814+
"total_cache_creation_tokens": total_cache_creation_tokens,
17801815
"per_agent": by_type,
17811816
"raw_stats": self._agent_stats[-100:], # Keep last 100 entries
17821817
}
17831818

17841819
report_path = self.state_dir / "cost_report.yaml"
1785-
with open(report_path, "w") as f:
1786-
yaml.dump(report, f, default_flow_style=False, allow_unicode=True)
1787-
self.log(f"Cost report written: {report_path} ({total_calls} calls, {total_time}s total)", "INFO")
1820+
tmp_path = report_path.with_suffix(".yaml.tmp")
1821+
try:
1822+
with open(tmp_path, "w") as f:
1823+
yaml.dump(report, f, default_flow_style=False, allow_unicode=True)
1824+
os.replace(tmp_path, report_path)
1825+
except Exception:
1826+
try:
1827+
tmp_path.unlink(missing_ok=True)
1828+
except Exception:
1829+
pass
1830+
raise
17881831

17891832
def run(self):
17901833
"""Main loop."""

ark/webapp/routes.py

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -483,6 +483,33 @@ def _read_current_iteration(project_dir: Path) -> int:
483483
return 0
484484

485485

486+
def _read_cost_report(project_dir: Path) -> dict:
487+
"""Read cost_report.yaml and return a slim summary suitable for the UI.
488+
489+
Drops `raw_stats` (last-100 entries) to keep the payload small for SSE.
490+
Returns an empty dict if the file is missing or unreadable — the UI
491+
treats that as "no cost data yet".
492+
"""
493+
p = project_dir / "auto_research" / "state" / "cost_report.yaml"
494+
if not p.exists():
495+
return {}
496+
try:
497+
d = yaml.safe_load(p.read_text()) or {}
498+
except Exception:
499+
return {}
500+
return {
501+
"total_cost_usd": d.get("total_cost_usd", 0),
502+
"total_input_tokens": d.get("total_input_tokens", 0),
503+
"total_output_tokens": d.get("total_output_tokens", 0),
504+
"total_cache_read_tokens": d.get("total_cache_read_tokens", 0),
505+
"total_cache_creation_tokens": d.get("total_cache_creation_tokens", 0),
506+
"total_agent_calls": d.get("total_agent_calls", 0),
507+
"total_agent_seconds": d.get("total_agent_seconds", 0),
508+
"per_agent": d.get("per_agent", {}),
509+
"generated_at": d.get("generated_at"),
510+
}
511+
512+
486513
_TEMPLATE_TITLES = {"Paper Title", "Title Text", "Insert Title Here", ""}
487514

488515
def _read_paper_title(project_dir: Path) -> str:
@@ -1146,6 +1173,7 @@ async def api_get_project(project_id: str, request: Request):
11461173
"environment": "ROCS Testbed" if project.slurm_job_id and not project.slurm_job_id.startswith("local") else "Local",
11471174
"created_at": project.created_at.isoformat(),
11481175
"updated_at": project.updated_at.isoformat(),
1176+
"cost_report": _read_cost_report(pdir),
11491177
})
11501178

11511179

@@ -1524,12 +1552,17 @@ async def event_generator():
15241552
except Exception:
15251553
pass
15261554

1527-
# Also emit status
1555+
# Also emit status (includes live cost report so the dashboard
1556+
# cost panel updates within ~2s of every agent completion)
15281557
with get_session(settings.db_path) as session:
15291558
p = get_project(session, project_id)
15301559
if p:
15311560
score = _read_project_score(pdir)
1532-
payload = json.dumps({"status": p.status, "score": score})
1561+
payload = json.dumps({
1562+
"status": p.status,
1563+
"score": score,
1564+
"cost_report": _read_cost_report(pdir),
1565+
})
15331566
yield f"event: status\ndata: {payload}\n\n"
15341567

15351568
await asyncio.sleep(2)

0 commit comments

Comments
 (0)