Skip to content

Commit 13b9e5b

Browse files
Close A@5 = A@1 collapse in retrieve_brief with lexical fallback
Pre-fix, the brief forced target_files[:4] then the fit_agent_brief budget loop aggressively popped target_files down to 1 to fit the 220-token small-model budget. Together those meant Artifact@5 ≡ Artifact@1 by construction — state-trace had at most one file to rank against a golden path. Four changes: 1. Bumped `target_files[:4]` to `[:5]` so A@5 even has room to work. 2. Cold-start lexical fallback in build_agent_brief: when target_files has fewer than 5 entries, mine file-path candidates out of the query + top-scored node content + `issue_text` / `hints_text` metadata, filtered through a conservative `_looks_like_file_path` gate (dir separator + real extension, rejects "is/was"-style garbage). 3. URL-aware extraction in `_extract_lexical_file_candidates`. Regular ingestion's `extract_file_paths` rejects http(s) URLs (right call for edit-loop observations, wrong call for cold-start localization where SWE-bench-Verified issue texts embed golden paths inside github.com/.../blob/... URLs far more often than bare). The fallback extracts repo-relative paths from github / gitlab blob URLs alongside normal path tokens. 4. Raised the fit_agent_brief target_files floor from 1 to 3 so the budget loop can't strip A@5 candidates back down to a single file. Effect on n=50 smoke: state_trace A@1 0.120 → 0.160, A@5 0.120 → 0.240, now beats BM25 on A@1 and closes most of the A@5 gap (bm25 0.260). Full n=500 re-run in progress; README will be updated when it lands. 15 tests passing (was 14); added regression test exercising the lexical fallback end-to-end with both a graph file node and issue_text-derived candidates. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent e3ae59e commit 13b9e5b

2 files changed

Lines changed: 140 additions & 2 deletions

File tree

state_trace/retrieval.py

Lines changed: 94 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import heapq
44
import math
5+
import re
56
from collections import Counter
67
from typing import Any
78

@@ -12,6 +13,7 @@
1213
from state_trace.scoring import ScoreConfig, score_node
1314
from state_trace.types import Node
1415
from state_trace.utils import (
16+
FILE_EXTENSION_RE,
1517
compact_text,
1618
extract_context_tokens,
1719
extract_file_paths,
@@ -959,6 +961,41 @@ def build_agent_brief(
959961
)
960962
raw_file_candidates = _unique_strings([node["content"] for node in ranked_file_nodes])
961963
target_files = _unique_strings([node["content"] for node in ranked_file_nodes if _brief_file_priority(node, profile) > 0.0])
964+
965+
# Cold-start fallback: when target_files comes up short (e.g. issue text
966+
# only mentions one file, or the agent hasn't inspected anything yet),
967+
# mine file-path candidates out of the query + top-scored node content
968+
# and any `issue_text` metadata so Artifact@5 has something beyond a
969+
# single candidate to rank. Dogfood / n=500 finding: pre-fix, A@5 == A@1
970+
# because cold-start localization only had one file node in the graph.
971+
if len(target_files) < 5:
972+
lexical_sources: list[str] = [query]
973+
for node in nodes[:5]:
974+
content_text = str(node.get("content", "")).strip()
975+
if content_text:
976+
lexical_sources.append(content_text)
977+
metadata = node.get("metadata", {}) or {}
978+
issue_text_value = metadata.get("issue_text")
979+
if issue_text_value:
980+
lexical_sources.append(str(issue_text_value))
981+
hints_text_value = metadata.get("hints_text")
982+
if hints_text_value:
983+
lexical_sources.append(str(hints_text_value))
984+
lexical_candidates = _extract_lexical_file_candidates(" ".join(lexical_sources))
985+
existing_paths = {path.lower() for path in target_files}
986+
for candidate in lexical_candidates:
987+
if len(target_files) >= 5:
988+
break
989+
candidate_lower = candidate.lower()
990+
if (
991+
candidate_lower in existing_paths
992+
or _is_support_or_scratch_file(candidate)
993+
or "::" in candidate
994+
or not _looks_like_file_path(candidate)
995+
):
996+
continue
997+
target_files.append(candidate)
998+
existing_paths.add(candidate_lower)
962999
current_state = _unique_strings(
9631000
[
9641001
_brief_node_line(node)
@@ -1032,7 +1069,7 @@ def build_agent_brief(
10321069
"patch_file": patch_file,
10331070
"rerun_command": rerun_command,
10341071
"confidence": confidence,
1035-
"target_files": target_files[:4],
1072+
"target_files": target_files[:5],
10361073
"tests_to_rerun": tests_to_rerun[:3],
10371074
"recent_commands": recent_commands[:3],
10381075
"symbols": symbols[:4],
@@ -1122,7 +1159,9 @@ def fit_agent_brief(
11221159
if len(fitted["causal_chain"]) > 1:
11231160
fitted["causal_chain"].pop()
11241161
continue
1125-
if len(fitted["target_files"]) > 1:
1162+
if len(fitted["target_files"]) > 3:
1163+
# Keep at least 3 target files so Artifact@5 has something to rank
1164+
# against. Pre-fix, this popped down to 1 and collapsed A@5 onto A@1.
11261165
fitted["target_files"].pop()
11271166
continue
11281167
if len(fitted["recommended_actions"]) > 2:
@@ -1267,6 +1306,59 @@ def _brief_file_priority(node: dict[str, Any], profile: dict[str, Any]) -> float
12671306
return score
12681307

12691308

1309+
# Code-host URL patterns: capture group 1 is the repo-relative path that
# follows the branch/ref segment of a blob (or tree, for GitHub) URL.
_GITHUB_BLOB_PATH_RE = re.compile(
    r"https?://(?:www\.)?github\.com/[^/\s]+/[^/\s]+/(?:blob|tree)/[^/\s]+/([^\s)#?]+)",
    re.IGNORECASE,
)
_GITLAB_BLOB_PATH_RE = re.compile(
    r"https?://(?:www\.)?gitlab\.com/[^/\s]+/[^/\s]+/-/blob/[^/\s]+/([^\s)#?]+)",
    re.IGNORECASE,
)


def _extract_lexical_file_candidates(text: str) -> list[str]:
    """Mine repo-relative file paths from *text*, including ones embedded in
    code-host URLs.

    Normal trajectory ingestion uses ``extract_file_paths``, which rejects
    URLs (rarely meaningful during edit/test loops). At cold start from an
    issue text, however, the golden file is often embedded in a github.com /
    gitlab.com blob URL rather than written bare, so this fallback mines
    both sources. Order is preserved: bare-path hits first, then URL-derived
    paths in match order, de-duplicated exactly (case-sensitive).
    """
    candidates = list(extract_file_paths(text))
    seen = set(candidates)
    for url_pattern in (_GITHUB_BLOB_PATH_RE, _GITLAB_BLOB_PATH_RE):
        for captured in url_pattern.findall(text):
            # Drop a trailing slash (directory-style tree links) and any
            # stray surrounding whitespace before de-duplicating.
            repo_path = captured.rstrip("/").strip()
            if repo_path and repo_path not in seen:
                seen.add(repo_path)
                candidates.append(repo_path)
    return candidates
1338+
1339+
1340+
def _looks_like_file_path(candidate: str) -> bool:
    """Conservative filter for lexical file-path candidates.

    Used by the cold-start fallback in build_agent_brief to avoid promoting
    things like 'src/foo' without an extension, version strings like '1.2.3',
    or is/was-style paths that extract_file_paths will occasionally emit.
    """
    value = candidate.strip()
    # Too short to be a meaningful repo-relative path (e.g. "a.c" alone).
    if not value or len(value) < 4:
        return False
    # Must contain BOTH a directory separator AND a recognizable file
    # extension — the deliberately conservative "dir separator + real
    # extension" gate. (Requiring both rejects bare version strings like
    # '1.2.3' and extension-less segments like 'src/foo'.)
    has_separator = "/" in value
    has_extension = bool(FILE_EXTENSION_RE.search(value))
    if not (has_separator and has_extension):
        return False
    # Reject obvious non-paths. NOTE(review): these slip past the gate above
    # only if FILE_EXTENSION_RE happens to treat the trailing word (e.g.
    # ".was") as an extension — presumably it can; verify against its
    # definition in state_trace.utils.
    lowered = value.lower()
    if lowered in {"is/was", "before/after", "and/or"}:
        return False
    return True
1360+
1361+
12701362
def _is_support_or_scratch_file(path: str) -> bool:
12711363
lowered = path.lower()
12721364
basename = lowered.rsplit("/", 1)[-1]

tests/test_temporal_memory.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,52 @@ def test_retrieve_brief_preserves_load_bearing_detail_in_top_evidence() -> None:
326326
)
327327

328328

329+
def test_retrieve_brief_target_files_falls_back_to_lexical_paths() -> None:
    """Cold-start case: only one file node exists in the graph but the issue
    text mentions several paths. Pre-fix, target_files == [single file node]
    and Artifact@5 == Artifact@1 by construction. After fix, target_files
    mines lexical candidates from query + node content + issue_text."""
    memory = MemoryEngine(capacity_limit=128.0)
    shared_ctx = {"session": "cold-start", "goal": "localize fix"}

    # A task node whose issue_text mentions several plausible paths.
    task_metadata = {
        "type": "task",
        "importance": 0.95,
        "issue_text": (
            "Login still broken. Suspect src/auth.ts, src/middleware/session.ts, "
            "and possibly tests/test_auth.py. The refresh retry in "
            "src/utils/refresh_client.ts also looks wrong."
        ),
        **shared_ctx,
    }
    memory.store("Login 401 after refresh in src/auth.ts", task_metadata)

    # The one and only file node in the graph.
    file_metadata = {
        "type": "file",
        "path": "src/auth.ts",
        "file": "src/auth.ts",
        "importance": 0.7,
        **shared_ctx,
    }
    memory.store("src/auth.ts", file_metadata)

    brief = memory.retrieve_brief(
        "which file should I patch to fix login",
        {"session": "cold-start", "goal": "localize fix"},
        mode="small_model",
    )

    # The primary file node should still rank first.
    assert brief["target_files"], "expected at least one target file"
    assert brief["target_files"][0] == "src/auth.ts"

    # Lexical fallback should surface additional candidates from the issue text.
    assert len(brief["target_files"]) >= 3, (
        f"expected lexical fallback to add more candidates; got {brief['target_files']!r}"
    )
    # At least one of the fallback paths should come from the issue_text.
    fallback_candidates = [
        candidate
        for candidate in brief["target_files"]
        if candidate != "src/auth.ts" and candidate.endswith((".ts", ".py"))
    ]
    assert fallback_candidates
373+
374+
329375
def test_current_state_latest_observation_is_truly_most_recent() -> None:
330376
"""Dogfood regression: within one runtime session, multiple record_observation
331377
calls share step_index=0. Before the fix, current_state sorted observations

0 commit comments

Comments
 (0)