razroo
diff --git a/‎README.md‎
Lines changed: 19 additions & 0 deletions b/‎README.md‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎state_trace/iso_trace_adapter.py‎
Lines changed: 242 additions & 0 deletions b/‎state_trace/iso_trace_adapter.py‎
Lines changed: 242 additions & 0 deletions
diff --git a/‎state_trace/retrieval.py‎
Lines changed: 60 additions & 2 deletions b/‎state_trace/retrieval.py‎
Lines changed: 60 additions & 2 deletions
@@ -257,6 +257,25 @@ engine.store_agent_log_file("examples/data/agent_logs/marshmallow__marshmallow-1
 
 Supported inputs: normalized `agent_log` JSON, raw SWE-agent `.traj` files, raw OpenHands event JSON logs.
 
+### From iso-trace (Claude Code / Cursor / Codex / opencode sessions)
+
+If you've accumulated session history with [`@razroo/iso-trace`](https://www.npmjs.com/package/@razroo/iso-trace), you can ingest it into state-trace directly:
+
+```bash
+# Export a session via iso-trace's CLI
+npx @razroo/iso-trace export <session-id> --json --out session.json
+```
+
+```python
+from state_trace import MemoryEngine
+from state_trace.iso_trace_adapter import ingest_iso_trace_session
+
+engine = MemoryEngine(capacity_limit=256.0, namespace="my-repo")
+ingest_iso_trace_session(engine, "session.json")
+```
+
+The adapter reads iso-trace's documented Session → Turn → Event[] JSON and converts it to state-trace's `agent_log` format — typed nodes for files, edits, tests, errors. Months of accumulated harness history become queryable working memory without re-running the agent.
+
 ## Live solve-rate (next credibility step)
 
 `examples/swebench_verified_solve_rate.py` scaffolds end-to-end solve-rate measurement: state-trace brief → LLM patch proposal → SWE-bench-Verified prediction JSONL. It does not run the swebench docker harness; that step is documented in the script's header.
 
@@ -0,0 +1,242 @@
+"""Adapter for ingesting iso-trace session JSON into state-trace.
+
+`iso-trace` (https://www.npmjs.com/package/@razroo/iso-trace) parses Claude
+Code, Cursor, Codex, and OpenCode transcripts into a normalized
+Session → Turn → Event[] JSON shape. This adapter converts that JSON into
+state-trace's `agent_log` ingestion format and feeds it through the
+existing typed-node + edge pipeline, so users with months of accumulated
+session history can reuse it as state-trace working memory without
+re-running the agent.
+
+Schema reference (excerpt from @razroo/iso-trace's types.d.ts):
+
+    Session: { id, source, cwd, model, durationMs, turns, tokenUsage }
+    Turn:    { index, role, at, events }
+    Event:   message | tool_call | tool_result | file_op | token_usage
+
+Usage:
+
+    from state_trace import MemoryEngine
+    from state_trace.iso_trace_adapter import ingest_iso_trace_session
+
+    engine = MemoryEngine(capacity_limit=256.0, namespace="my-repo")
+    ingest_iso_trace_session(engine, "/path/to/exported-session.json")
+
+The adapter does not import anything from iso-trace itself — it operates
+on the documented JSON shape, so it works against any export iso-trace
+produces (and any compatible third-party emitter).
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+from state_trace.utils import classify_observation_status, infer_action_kind
+
+
+def ingest_iso_trace_session(
+    engine: Any,
+    path: str | Path,
+    context: dict[str, Any] | None = None,
+) -> dict[str, Any]:
+    """Load an iso-trace session export from disk and store it in `engine`.
+
+    The file at `path` is read as UTF-8 JSON, converted with
+    `iso_trace_session_to_agent_log` (using `context` as the default
+    context), and passed to `engine.store_agent_log(...)` together with the
+    same `context`. The converted agent_log dict is returned so callers can
+    inspect exactly what was ingested.
+    """
+    raw_json = Path(path).read_text(encoding="utf-8")
+    agent_log = iso_trace_session_to_agent_log(
+        json.loads(raw_json),
+        default_context=context,
+    )
+    engine.store_agent_log(agent_log, context=context)
+    return agent_log
+
+
+def iso_trace_session_to_agent_log(
+    session: dict[str, Any],
+    default_context: dict[str, Any] | None = None,
+) -> dict[str, Any]:
+    """Build a state-trace agent_log dict from an iso-trace Session dict.
+
+    This only reshapes data: it reads `session` (and the optional
+    `default_context`) and returns a new dict. Field sources:
+
+      - session_id: `session["id"]`, else `default_context["session"]`,
+        else the literal "iso-trace-session"
+      - goal: `default_context["goal"]`, else "resume from <session_id>"
+      - issue_title / issue_text: result of `_extract_issue` on the turns
+      - repo: result of `_extract_repo` on the session
+      - source_url: result of `_extract_source_url` on the session
+      - steps: one entry per turn for which `_turn_to_step` returns a
+        step, indexed by the turn's position in `session["turns"]`
+
+    `submission_diff` and `submission_files` are always emitted empty.
+    """
+    context = default_context if default_context else {}
+    turns = session.get("turns", []) or []
+
+    issue_title, issue_text = _extract_issue(turns)
+    repo = _extract_repo(session)
+
+    raw_session_id = session.get("id") or context.get("session") or "iso-trace-session"
+    session_id = str(raw_session_id)
+    goal = str(context.get("goal") or f"resume from {session_id}")
+
+    # Turns that produce no step (None) are dropped; the step index is the
+    # turn's position in the original list, not its position in `steps`.
+    converted = (_turn_to_step(turn, position) for position, turn in enumerate(turns))
+    steps: list[dict[str, Any]] = [step for step in converted if step is not None]
+
+    return {
+        "format": "agent_log",
+        "source": "iso_trace",
+        "source_url": _extract_source_url(session),
+        "session_id": session_id,
+        "issue_title": issue_title,
+        "issue_text": issue_text,
+        "goal": goal,
+        "repo": repo,
+        "steps": steps,
+        "submission_diff": "",
+        "submission_files": [],
+    }
+
+
+def _extract_issue(turns: list[dict[str, Any]]) -> tuple[str, str]:
+    """Return `(title, text)` taken from the first non-empty user message.
+
+    Events are scanned in turn order. The first `message` event with role
+    `user` whose text is non-empty after stripping supplies both values:
+    `text` is the stripped message and `title` is its first line, trimmed
+    and capped at 160 characters. When no such message exists the result
+    is `("(no user prompt)", "")`.
+    """
+    for turn in turns:
+        for event in turn.get("events", []) or []:
+            if event.get("kind") != "message" or event.get("role") != "user":
+                continue
+            raw_text = event.get("text")
+            # A JSON `null` text must count as empty; `str(None)` would
+            # otherwise surface the literal "None" as the issue title.
+            text = "" if raw_text is None else str(raw_text).strip()
+            if not text:
+                continue
+            # `.strip()` drops the "\r" that CRLF line endings leave on the
+            # first line after splitting on "\n".
+            title = text.split("\n", 1)[0].strip()[:160]
+            return title, text
+    return "(no user prompt)", ""
+
+
+def _extract_repo(session: dict[str, Any]) -> str | None:
+    """Guess a repo identifier: the last path component of `session["cwd"]`.
+
+    Returns None when `cwd` is missing/empty or has no final component.
+    """
+    cwd = session.get("cwd")
+    basename = Path(str(cwd)).name if cwd else ""
+    return basename or None
+
+
+def _extract_source_url(session: dict[str, Any]) -> str | None:
+    """Return `session["source"]["path"]`, or None if it is not available.
+
+    None is returned when `source` is missing, is not a dict, or has no
+    `path` key.
+    """
+    source = session.get("source")
+    return source.get("path") if isinstance(source, dict) else None
+
+
+def _as_clean_text(value: Any) -> str:
+    """Render an optional JSON value as stripped text (`None` -> "")."""
+    return "" if value is None else str(value).strip()
+
+
+def _turn_to_step(turn: dict[str, Any], index: int) -> dict[str, Any] | None:
+    """Squash one Turn's events into one agent_log step.
+
+    Mapping (lossy but functional):
+      - assistant `message`         -> step.thought (joined with newlines)
+      - first `tool_call`           -> step.action + action_kind
+      - first `tool_result`         -> step.observation + status
+      - first tool_result's `error` -> status "error" + error signature
+      - all `file_op`               -> step.files (de-duplicated, in order)
+
+    Only the first `tool_result` is consumed, mirroring the `tool_call`
+    rule, so `observation`, `status` and `error_signatures` always describe
+    the same result.
+
+    Args:
+        turn: an iso-trace Turn dict; only its `events` list is read.
+        index: value stored as the step's `index`.
+
+    Returns:
+        The step dict, or None when the turn has no events or none of them
+        yields an action, observation, thought or file. Turns with only
+        assistant messages still produce a (thought-only) step; missing
+        `action_kind` / `status` default to "message" / "info".
+    """
+    events = turn.get("events", []) or []
+    if not events:
+        return None
+
+    thought_parts: list[str] = []
+    action: str = ""
+    action_kind: str | None = None
+    observation: str = ""
+    status: str | None = None
+    files: list[str] = []
+    error_signatures: list[str] = []
+    seen_tool_call = False
+    seen_tool_result = False
+
+    for event in events:
+        kind = event.get("kind")
+        if kind == "message":
+            # JSON `null` text is treated as empty rather than "None".
+            text = _as_clean_text(event.get("text"))
+            if text and event.get("role") == "assistant":
+                thought_parts.append(text)
+        elif kind == "tool_call":
+            if seen_tool_call:
+                continue
+            seen_tool_call = True
+            tool_name = _as_clean_text(event.get("name"))
+            tool_input = event.get("input")
+            action = _format_tool_call(tool_name, tool_input)
+            action_kind = _infer_action_kind(tool_name, tool_input, action)
+        elif kind == "tool_result":
+            if seen_tool_result:
+                continue
+            seen_tool_result = True
+            output = _as_clean_text(event.get("output"))
+            err = event.get("error")
+            if err:
+                # Fall back to the error text when the tool printed nothing.
+                observation = output or str(err)
+                status = "error"
+                error_signatures.append(str(err)[:240])
+            else:
+                observation = output
+                status = classify_observation_status(observation)
+        elif kind == "file_op":
+            path = event.get("path")
+            if path:
+                # Compare the stringified form, since that is what is stored.
+                path_text = str(path)
+                if path_text not in files:
+                    files.append(path_text)
+
+    # Skip turns with no actionable content (e.g. pure system messages).
+    if not (action or observation or thought_parts or files):
+        return None
+
+    step: dict[str, Any] = {
+        "index": index,
+        "action": action,
+        "thought": "\n".join(thought_parts).strip(),
+        "observation": observation,
+        "files": files,
+        "action_kind": action_kind or "message",
+        "status": status or "info",
+        "error_signatures": error_signatures,
+    }
+    return step
+
+
+def _format_tool_call(name: str, input_value: Any) -> str:
+    """Render a tool_call into the action-string shape state-trace expects.
+
+    Resolution order for dict inputs: the first non-null of `file_path`,
+    `path`, `filename`; then `command`; then `query`. Anything else (or a
+    non-dict input) is rendered as `<tool>: <json of input>` with the JSON
+    truncated to 200 characters. Keys whose value is JSON `null` are
+    ignored so they fall through to the next field instead of rendering
+    the literal "None". An empty tool name yields "".
+
+    Examples:
+      ("Edit", {"file_path": "src/auth.ts"})  -> 'edit "src/auth.ts"'
+      ("Bash", {"command": "pytest tests/"})  -> 'pytest tests/'
+      ("Read", {"file_path": "src/x.py"})     -> 'open "src/x.py"'
+    """
+    if not name:
+        return ""
+    lowered = name.lower()
+    if isinstance(input_value, dict):
+        # Common file-touching tool fields
+        for key in ("file_path", "path", "filename"):
+            value = input_value.get(key)
+            if value is None:
+                continue
+            file_arg = str(value)
+            if lowered in {"edit", "write", "multiedit"}:
+                return f'edit "{file_arg}"'
+            if lowered in {"read", "view"}:
+                return f'open "{file_arg}"'
+            if lowered == "glob":
+                return f"find_file {file_arg}"
+            return f'{lowered} "{file_arg}"'
+        command = input_value.get("command")
+        if command is not None:
+            return str(command)
+        query = input_value.get("query")
+        if query is not None:
+            return f"{lowered}: {query}"
+    # Generic fallback; the trailing strip removes the dangling ": " left
+    # when there is no input to show.
+    return f"{lowered}: {json.dumps(input_value)[:200] if input_value else ''}".strip(": ")
+
+
+def _infer_action_kind(tool_name: str, input_value: Any, rendered_action: str) -> str:
+    """Translate an iso-trace tool name into a state-trace action_kind.
+
+    Known file/search tools map to fixed kinds. `bash` defers to the shared
+    `infer_action_kind` heuristic applied to the rendered command. Any
+    other tool is reported under its lower-cased name, and an empty name
+    becomes "message". `input_value` is accepted but not consulted.
+    """
+    fixed_kinds = {
+        "edit": "edit",
+        "write": "edit",
+        "multiedit": "edit",
+        "read": "open",
+        "view": "open",
+        "glob": "find_file",
+        "grep": "search",
+    }
+    tool = tool_name.lower()
+    if tool in fixed_kinds:
+        return fixed_kinds[tool]
+    if tool == "bash":
+        # The command text decides the kind (pytest/python/etc.).
+        return infer_action_kind(rendered_action)
+    return tool if tool else "message"
@@ -1314,16 +1314,69 @@ def _brief_file_priority(node: dict[str, Any], profile: dict[str, Any]) -> float
     r"https?://(?:www\.)?gitlab\.com/[^/\s]+/[^/\s]+/-/blob/[^/\s]+/([^\s)#?]+)",
     re.IGNORECASE,
 )
+# Python module references like `astropy.modeling.separable` or
+# `django.utils.html.escape` mentioned in issue text. The trailing segment
+# could be a function, class, or submodule — we generate path candidates
+# for both interpretations.
+#
+# The `(?<!\.)` lookbehind stops a match from starting in the middle of a
+# longer dotted expression: without it `Foo.bar.baz.qux` or
+# `obj().attr.sub.name` would yield the rootless tails `bar.baz.qux` /
+# `attr.sub.name`, which are not module paths.
+_PYTHON_MODULE_RE = re.compile(
+    r"(?<!\.)\b([a-z][a-z0-9_]*(?:\.[a-z_][a-z0-9_]*){2,})\b"
+)
+
+
+def _module_to_path_candidates(text: str) -> list[str]:
+    """Translate Python module references in `text` into file path candidates.
+
+    Every dotted name matched by `_PYTHON_MODULE_RE` (three or more
+    segments, each starting with a lowercase letter or underscore) yields:
+
+      - the module-as-file path:
+        `astropy.modeling.separable` -> `astropy/modeling/separable.py`
+      - for names with four or more segments, also the parent path, in case
+        the last segment is a function or attribute of that module:
+        `astropy.modeling.separable.separability_matrix`
+          -> `astropy/modeling/separable.py`
+
+    Two-segment names such as `os.path` never match. Names that begin with
+    an uppercase segment (class-rooted references) are not matched at all,
+    while a trailing capitalized segment is simply left out of the match.
+    Abbreviation-like matches (`e.g...`, `i.e...`, `vs....`, `a.b.c`) are
+    dropped.
+
+    Returns:
+        Candidate paths in first-seen order, without duplicates. They are
+        guesses; nothing here checks that the files exist.
+    """
+    seen: set[str] = set()
+    out: list[str] = []
+    for match in _PYTHON_MODULE_RE.findall(text):
+        # Skip well-known non-module dotted names that are technical jargon.
+        # The regex only matches lowercase, so no case folding is needed.
+        if match.startswith(("e.g", "i.e", "vs.")) or match == "a.b.c":
+            continue
+        segments = match.split(".")
+        dotted_forms = [segments]
+        if len(segments) >= 4:
+            # Hedge: the final segment may be a function on the parent module.
+            dotted_forms.append(segments[:-1])
+        for parts in dotted_forms:
+            candidate = "/".join(parts) + ".py"
+            if candidate not in seen:
+                seen.add(candidate)
+                out.append(candidate)
+    return out
 
 
 def _extract_lexical_file_candidates(text: str) -> list[str]:
     """Mine repo-relative file paths from text, including ones embedded in
-    code-host URLs.
+    code-host URLs and inferred from Python module references.
 
     Normal trajectory ingestion uses `extract_file_paths`, which rejects URLs
     (they're rarely meaningful during edit/test loops). At cold start from an
     issue text, the golden file is far more often embedded in a github.com
-    blob URL than bare, so the fallback path mines both.
+    blob URL than bare, AND issue authors describe the bug location using
+    dotted Python module names (`astropy.modeling.separable_matrix`) instead
+    of repo-relative paths. The fallback mines all three forms.
     """
     candidates = list(extract_file_paths(text))
     seen = {candidate for candidate in candidates}
@@ -1334,6 +1387,11 @@ def _extract_lexical_file_candidates(text: str) -> list[str]:
                 continue
             candidates.append(path)
             seen.add(path)
+    for module_path in _module_to_path_candidates(text):
+        if module_path in seen:
+            continue
+        candidates.append(module_path)
+        seen.add(module_path)
     return candidates