Skip to content

Commit 13b9e5b

Browse files
Close A@5 = A@1 collapse in retrieve_brief with lexical fallback
Pre-fix, the brief forced target_files[:4] then the fit_agent_brief budget loop aggressively popped target_files down to 1 to fit the 220-token small-model budget. Together those meant Artifact@5 ≡ Artifact@1 by construction — state-trace had at most one file to rank against a golden path. Four changes: 1. Bumped `target_files[:4]` to `[:5]` so A@5 even has room to work. 2. Cold-start lexical fallback in build_agent_brief: when target_files has fewer than 5 entries, mine file-path candidates out of the query + top-scored node content + `issue_text` / `hints_text` metadata, filtered through a conservative `_looks_like_file_path` gate (dir separator + real extension, rejects "is/was"-style garbage). 3. URL-aware extraction in `_extract_lexical_file_candidates`. Regular ingestion's `extract_file_paths` rejects http(s) URLs (right call for edit-loop observations, wrong call for cold-start localization where SWE-bench-Verified issue texts embed golden paths inside github.com/.../blob/... URLs far more often than bare). The fallback extracts repo-relative paths from github / gitlab blob URLs alongside normal path tokens. 4. Raised the fit_agent_brief target_files floor from 1 to 3 so the budget loop can't strip A@5 candidates back down to a single file. Effect on n=50 smoke: state_trace A@1 0.120 → 0.160, A@5 0.120 → 0.240, now beats BM25 on A@1 and closes most of the A@5 gap (bm25 0.260). Full n=500 re-run in progress; README will be updated when it lands. 15 tests passing (was 14); added regression test exercising the lexical fallback end-to-end with both a graph file node and issue_text-derived candidates. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent e3ae59e commit 13b9e5b

2 files changed

Lines changed: 140 additions & 2 deletions

File tree

state_trace/retrieval.py

Lines changed: 94 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import heapq
44
import math
5+
import re
56
from collections import Counter
67
from typing import Any
78

@@ -12,6 +13,7 @@
1213
from state_trace.scoring import ScoreConfig, score_node
1314
from state_trace.types import Node
1415
from state_trace.utils import (
16+
FILE_EXTENSION_RE,
1517
compact_text,
1618
extract_context_tokens,
1719
extract_file_paths,
@@ -959,6 +961,41 @@ def build_agent_brief(
959961
)
960962
raw_file_candidates = _unique_strings([node["content"] for node in ranked_file_nodes])
961963
target_files = _unique_strings([node["content"] for node in ranked_file_nodes if _brief_file_priority(node, profile) > 0.0])
964+
965+
# Cold-start fallback: when target_files comes up short (e.g. issue text
966+
# only mentions one file, or the agent hasn't inspected anything yet),
967+
# mine file-path candidates out of the query + top-scored node content
968+
# and any `issue_text` metadata so Artifact@5 has something beyond a
969+
# single candidate to rank. Dogfood / n=500 finding: pre-fix, A@5 == A@1
970+
# because cold-start localization only had one file node in the graph.
971+
if len(target_files) < 5:
972+
lexical_sources: list[str] = [query]
973+
for node in nodes[:5]:
974+
content_text = str(node.get("content", "")).strip()
975+
if content_text:
976+
lexical_sources.append(content_text)
977+
metadata = node.get("metadata", {}) or {}
978+
issue_text_value = metadata.get("issue_text")
979+
if issue_text_value:
980+
lexical_sources.append(str(issue_text_value))
981+
hints_text_value = metadata.get("hints_text")
982+
if hints_text_value:
983+
lexical_sources.append(str(hints_text_value))
984+
lexical_candidates = _extract_lexical_file_candidates(" ".join(lexical_sources))
985+
existing_paths = {path.lower() for path in target_files}
986+
for candidate in lexical_candidates:
987+
if len(target_files) >= 5:
988+
break
989+
candidate_lower = candidate.lower()
990+
if (
991+
candidate_lower in existing_paths
992+
or _is_support_or_scratch_file(candidate)
993+
or "::" in candidate
994+
or not _looks_like_file_path(candidate)
995+
):
996+
continue
997+
target_files.append(candidate)
998+
existing_paths.add(candidate_lower)
962999
current_state = _unique_strings(
9631000
[
9641001
_brief_node_line(node)
@@ -1032,7 +1069,7 @@ def build_agent_brief(
10321069
"patch_file": patch_file,
10331070
"rerun_command": rerun_command,
10341071
"confidence": confidence,
1035-
"target_files": target_files[:4],
1072+
"target_files": target_files[:5],
10361073
"tests_to_rerun": tests_to_rerun[:3],
10371074
"recent_commands": recent_commands[:3],
10381075
"symbols": symbols[:4],
@@ -1122,7 +1159,9 @@ def fit_agent_brief(
11221159
if len(fitted["causal_chain"]) > 1:
11231160
fitted["causal_chain"].pop()
11241161
continue
1125-
if len(fitted["target_files"]) > 1:
1162+
if len(fitted["target_files"]) > 3:
1163+
# Keep at least 3 target files so Artifact@5 has something to rank
1164+
# against. Pre-fix, this popped down to 1 and collapsed A@5 onto A@1.
11261165
fitted["target_files"].pop()
11271166
continue
11281167
if len(fitted["recommended_actions"]) > 2:
@@ -1267,6 +1306,59 @@ def _brief_file_priority(node: dict[str, Any], profile: dict[str, Any]) -> float
12671306
return score
12681307

12691308

1309+
# Code-host URL patterns: capture group 1 is the repo-relative path that
# follows the branch/ref segment of a blob (or tree, for GitHub) URL.
_GITHUB_BLOB_PATH_RE = re.compile(
    r"https?://(?:www\.)?github\.com/[^/\s]+/[^/\s]+/(?:blob|tree)/[^/\s]+/([^\s)#?]+)",
    re.IGNORECASE,
)
_GITLAB_BLOB_PATH_RE = re.compile(
    r"https?://(?:www\.)?gitlab\.com/[^/\s]+/[^/\s]+/-/blob/[^/\s]+/([^\s)#?]+)",
    re.IGNORECASE,
)


def _extract_lexical_file_candidates(text: str) -> list[str]:
    """Mine repo-relative file paths from *text*, including ones embedded in
    code-host URLs.

    Normal trajectory ingestion uses ``extract_file_paths``, which rejects
    URLs (rarely meaningful during edit/test loops). At cold start from an
    issue text, however, the golden file is often embedded in a github.com /
    gitlab.com blob URL rather than written bare, so this fallback mines
    both sources. Order is preserved: bare-path hits first, then URL-derived
    paths in match order, de-duplicated exactly (case-sensitive).
    """
    candidates = list(extract_file_paths(text))
    seen = set(candidates)
    for url_pattern in (_GITHUB_BLOB_PATH_RE, _GITLAB_BLOB_PATH_RE):
        for captured in url_pattern.findall(text):
            # Drop a trailing slash (directory-style tree links) and any
            # stray surrounding whitespace before de-duplicating.
            repo_path = captured.rstrip("/").strip()
            if repo_path and repo_path not in seen:
                seen.add(repo_path)
                candidates.append(repo_path)
    return candidates
1338+
1339+
1340+
def _looks_like_file_path(candidate: str) -> bool:
    """Conservative filter for lexical file-path candidates.

    Used by the cold-start fallback in build_agent_brief to avoid promoting
    things like 'src/foo' without an extension, version strings like '1.2.3',
    or is/was-style paths that extract_file_paths will occasionally emit.
    """
    value = candidate.strip()
    # Too short to be a meaningful repo-relative path (e.g. "a.c" alone).
    if not value or len(value) < 4:
        return False
    # Must contain BOTH a directory separator AND a recognizable file
    # extension — the deliberately conservative "dir separator + real
    # extension" gate. (Requiring both rejects bare version strings like
    # '1.2.3' and extension-less segments like 'src/foo'.)
    has_separator = "/" in value
    has_extension = bool(FILE_EXTENSION_RE.search(value))
    if not (has_separator and has_extension):
        return False
    # Reject obvious non-paths. NOTE(review): these slip past the gate above
    # only if FILE_EXTENSION_RE happens to treat the trailing word (e.g.
    # ".was") as an extension — presumably it can; verify against its
    # definition in state_trace.utils.
    lowered = value.lower()
    if lowered in {"is/was", "before/after", "and/or"}:
        return False
    return True
1360+
1361+
12701362
def _is_support_or_scratch_file(path: str) -> bool:
12711363
lowered = path.lower()
12721364
basename = lowered.rsplit("/", 1)[-1]

tests/test_temporal_memory.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,52 @@ def test_retrieve_brief_preserves_load_bearing_detail_in_top_evidence() -> None:
326326
)
327327

328328

329+
def test_retrieve_brief_target_files_falls_back_to_lexical_paths() -> None:
    """Cold-start case: only one file node exists in the graph but the issue
    text mentions several paths. Pre-fix, target_files == [single file node]
    and Artifact@5 == Artifact@1 by construction. After fix, target_files
    mines lexical candidates from query + node content + issue_text."""
    memory = MemoryEngine(capacity_limit=128.0)
    shared_ctx = {"session": "cold-start", "goal": "localize fix"}

    # A task node whose issue_text mentions several plausible paths.
    task_metadata = {
        "type": "task",
        "importance": 0.95,
        "issue_text": (
            "Login still broken. Suspect src/auth.ts, src/middleware/session.ts, "
            "and possibly tests/test_auth.py. The refresh retry in "
            "src/utils/refresh_client.ts also looks wrong."
        ),
        **shared_ctx,
    }
    memory.store("Login 401 after refresh in src/auth.ts", task_metadata)

    # The one and only file node in the graph.
    file_metadata = {
        "type": "file",
        "path": "src/auth.ts",
        "file": "src/auth.ts",
        "importance": 0.7,
        **shared_ctx,
    }
    memory.store("src/auth.ts", file_metadata)

    brief = memory.retrieve_brief(
        "which file should I patch to fix login",
        {"session": "cold-start", "goal": "localize fix"},
        mode="small_model",
    )

    # The primary file node should still rank first.
    assert brief["target_files"], "expected at least one target file"
    assert brief["target_files"][0] == "src/auth.ts"

    # Lexical fallback should surface additional candidates from the issue text.
    assert len(brief["target_files"]) >= 3, (
        f"expected lexical fallback to add more candidates; got {brief['target_files']!r}"
    )
    # At least one of the fallback paths should come from the issue_text.
    fallback_candidates = [
        candidate
        for candidate in brief["target_files"]
        if candidate != "src/auth.ts" and candidate.endswith((".ts", ".py"))
    ]
    assert fallback_candidates
373+
374+
329375
def test_current_state_latest_observation_is_truly_most_recent() -> None:
330376
"""Dogfood regression: within one runtime session, multiple record_observation
331377
calls share step_index=0. Before the fix, current_state sorted observations

0 commit comments

Comments
 (0)