Add Deep Research context and importance level to references

lyzx2001 · lyzx2001 · commit 1b23b0c53e1f · 2026-04-03T23:00:34.000Z
diff --git a/ark/citation.py b/ark/citation.py
@@ -1177,7 +1177,8 @@ def parse_agent_selection(agent_output: str, candidates: list[Paper]) -> list[Pa
 # ═══════════════════════════════════════════════════════════
 
 def update_literature_yaml(literature_path: str, papers: list[Paper],
-                           cite_keys: list[str], agent_output: str) -> None:
+                           cite_keys: list[str], agent_output: str,
+                           contexts: list[str] = None) -> None:
     """Update literature.yaml with newly added citations for the writer agent."""
     import yaml
     from datetime import datetime
@@ -1197,9 +1198,10 @@ def update_literature_yaml(literature_path: str, papers: list[Paper],
 
     existing_titles = {r.get("title", "").lower() for r in data["references"] if isinstance(r, dict)}
 
-    for paper, key in zip(papers, cite_keys):
+    for idx, (paper, key) in enumerate(zip(papers, cite_keys)):
         if paper.title.lower() in existing_titles:
             continue
+        ctx = (contexts[idx] if contexts and idx < len(contexts) else "") or ""
         entry = {
             "title": paper.title,
             "authors": ", ".join(paper.authors[:3]) + (" et al." if len(paper.authors) > 3 else ""),
@@ -1211,13 +1213,20 @@ def update_literature_yaml(literature_path: str, papers: list[Paper],
         }
         if paper.abstract:
             entry["abstract"] = paper.abstract[:500]
+        # Mark importance based on Deep Research context
+        if ctx:
+            entry["context"] = ctx
+            ctx_lower = ctx.lower()
+            if "must cite" in ctx_lower or "essential" in ctx_lower or "critical" in ctx_lower or "closest prior work" in ctx_lower:
+                entry["importance"] = "critical"
         data["references"].append(entry)
 
     lit_file.write_text(yaml.dump(data, allow_unicode=True, default_flow_style=False))
 
 
 def _write_needs_check_to_literature(literature_path: str, titles: list[str],
-                                     cite_keys: list[str] = None) -> None:
+                                     cite_keys: list[str] = None,
+                                     contexts: list[str] = None) -> None:
     """Append [NEEDS-CHECK] entries to literature.yaml for papers not found in any API."""
     import yaml
     from datetime import datetime
@@ -1247,6 +1256,12 @@ def _write_needs_check_to_literature(literature_path: str, titles: list[str],
             }
             if cite_keys and i < len(cite_keys):
                 entry["bibtex_key"] = cite_keys[i]
+            ctx = (contexts[i] if contexts and i < len(contexts) else "") or ""
+            if ctx:
+                entry["context"] = ctx
+                ctx_lower = ctx.lower()
+                if "must cite" in ctx_lower or "essential" in ctx_lower or "critical" in ctx_lower or "closest prior work" in ctx_lower:
+                    entry["importance"] = "critical"
             data["needs_check"].append(entry)
 
     lit_file.write_text(yaml.dump(data, allow_unicode=True, default_flow_style=False))
@@ -1271,6 +1286,7 @@ def bootstrap_citations(
     search_queries: list[str] = None,
     authors: list[str] = None,
     years: list = None,
+    contexts: list[str] = None,
 ) -> BootstrapResult:
     """Given a list of paper titles, search APIs, fetch BibTeX, write to bib.
 
@@ -1290,9 +1306,11 @@ def bootstrap_citations(
     """
     found_papers = []
     found_keys = []
+    found_contexts = []
     needs_check_titles = []
     needs_check_authors = []
     needs_check_years = []
+    needs_check_contexts = []
 
     for i, title in enumerate(titles):
         title = title.strip()
@@ -1315,13 +1333,20 @@ def bootstrap_citations(
                 paper = _search_by_title(keywords)
 
         if paper:
+            # Use Deep Research context as fallback abstract if API didn't provide one
+            ctx = (contexts[i] if contexts and i < len(contexts) else "") or ""
+            if not paper.abstract and ctx:
+                paper.abstract = ctx
             found_papers.append(paper)
+            found_contexts.append(ctx)
         else:
             needs_check_titles.append(title)
             nc_author = (authors[i] if authors and i < len(authors) else "") or ""
             nc_year = (years[i] if years and i < len(years) else 0) or 0
+            nc_ctx = (contexts[i] if contexts and i < len(contexts) else "") or ""
             needs_check_authors.append(nc_author)
             needs_check_years.append(nc_year)
+            needs_check_contexts.append(nc_ctx)
 
     # Fetch BibTeX and write to references.bib
     if found_papers:
@@ -1336,9 +1361,11 @@ def bootstrap_citations(
 
     # Update literature.yaml (found papers + needs-check)
     if found_papers and found_keys:
-        update_literature_yaml(literature_path, found_papers, found_keys, "")
+        update_literature_yaml(literature_path, found_papers, found_keys, "",
+                               contexts=found_contexts)
     if needs_check_titles:
-        _write_needs_check_to_literature(literature_path, needs_check_titles, needs_check_keys)
+        _write_needs_check_to_literature(literature_path, needs_check_titles, needs_check_keys,
+                                         needs_check_contexts)
 
     return BootstrapResult(
         found=found_papers,
@@ -1358,21 +1385,28 @@ def _search_by_title(title: str) -> Paper | None:
     def _matches(paper: Paper) -> bool:
         return title_similarity(title, paper.title) >= _SIMILARITY_THRESHOLD
 
-    # Try DBLP first
+    # Try DBLP first — prefer published over preprint if both match
     dblp_results = _search_dblp(title, max_results=5)
     time.sleep(_DBLP_DELAY)
-    for p in dblp_results:
-        if _matches(p):
-            return p
+    dblp_matches = [p for p in dblp_results if _matches(p)]
+    if dblp_matches:
+        # Prefer published version
+        published = [p for p in dblp_matches if _is_published(p)]
+        best = published[0] if published else dblp_matches[0]
+        if not best.abstract:
+            _supplement_abstract(best)
+        return best
 
     # Try CrossRef
     cr_results = _search_crossref(title, max_results=3)
     time.sleep(_CROSSREF_DELAY)
     for p in cr_results:
         if _matches(p):
+            if not p.abstract:
+                _supplement_abstract(p)
             return p
 
-    # Try Semantic Scholar (has good title matching)
+    # Try Semantic Scholar (has good title matching + abstract)
     s2_results = _search_semantic_scholar(title, max_results=3)
     for p in s2_results:
         if _matches(p):
@@ -1381,6 +1415,16 @@ def _matches(paper: Paper) -> bool:
     return None
 
 
+def _supplement_abstract(paper: Paper) -> None:
+    """Fill in a missing abstract (and missing citation count) in place from a title-matched Semantic Scholar result."""
+    s2_results = _search_semantic_scholar(paper.title, max_results=1)
+    if s2_results and title_similarity(paper.title, s2_results[0].title) >= _SIMILARITY_THRESHOLD:
+        if s2_results[0].abstract:
+            paper.abstract = s2_results[0].abstract
+        if s2_results[0].citation_count and not paper.citation_count:
+            paper.citation_count = s2_results[0].citation_count
+
+
 def _extract_keywords_from_title(title: str) -> str:
     """Extract meaningful keywords from a paper title for broader search.
 
@@ -1428,18 +1472,14 @@ def mark_needs_check_in_tex(bib_path: str, tex_dir: str) -> int:
 
         for key in needs_check_keys:
             # Match \cite{key}, \citep{key}, \citet{key} not already followed by marker
-            for pattern in [
-                rf"(\\cite\{{{key}\}})(?!\s*\\textcolor)",
-                rf"(\\citep\{{{key}\}})(?!\s*\\textcolor)",
-                rf"(\\citet\{{{key}\}})(?!\s*\\textcolor)",
-            ]:
-                replacement = rf"\1{marker}"
-                new_content, count = re.subn(pattern, replacement, new_content)
+            for cite_cmd in ["cite", "citep", "citet"]:
+                pattern = rf"(\\{cite_cmd}\{{{key}\}})(?!\s*\\textcolor)"
+                # Use a function replacement: a string replacement would have its backslash escapes processed by re.subn (e.g. the \t in \textcolor)
+                def _add_marker(m, _marker=marker):
+                    return m.group(1) + _marker
+                new_content, count = re.subn(pattern, _add_marker, new_content)
                 marked += count
 
-            # Also handle \cite{a,key,b} — key inside multi-cite
-            # This is harder; for now just handle the simple cases above
-
         if new_content != content:
             tex_file.write_text(new_content)
 
diff --git a/ark/execution.py b/ark/execution.py
@@ -973,27 +973,37 @@ def _get_literature_context_for_task(self, task: dict) -> str:
                     title = entry.get("title", "")
                     bibtex_key = entry.get("bibtex_key", entry.get("key", entry.get("cite_key", "")))
                     abstract = entry.get("abstract", entry.get("summary", ""))
+                    context = entry.get("context", "")
                     venue = entry.get("venue", "")
                     year = entry.get("year", "")
+                    importance = entry.get("importance", "")
                     if title:
                         cite_str = f" (cite: \\cite{{{bibtex_key}}})" if bibtex_key else ""
                         venue_str = f" [{venue} {year}]" if venue else ""
-                        lines.append(f"- **{title}**{cite_str}{venue_str}")
+                        imp_str = " **[MUST CITE]**" if importance == "critical" else ""
+                        lines.append(f"- **{title}**{cite_str}{venue_str}{imp_str}")
                         if abstract:
                             lines.append(f"  Abstract: {abstract[:400]}")
+                        elif context:
+                            lines.append(f"  Context: {context[:400]}")
                 elif isinstance(entry, str):
                     lines.append(f"- {entry}")
 
             # Add [NEEDS-CHECK] citations
             needs_check = lit_data.get("needs_check", [])
             if needs_check:
-                lines.append("\n## [NEEDS-CHECK] Citations (unverified — mark in text)")
+                lines.append("\n## [NEEDS-CHECK] Citations (unverified)")
                 for nc in needs_check:
                     if isinstance(nc, dict):
                         nc_title = nc.get("title", "")
                         nc_key = nc.get("bibtex_key", "")
+                        nc_importance = nc.get("importance", "")
+                        nc_context = nc.get("context", "")
                         if nc_title and nc_key:
-                            lines.append(f"- **{nc_title}** (cite: \\cite{{{nc_key}}}) — **[NEEDS-CHECK]**")
+                            imp_str = " **[MUST CITE]**" if nc_importance == "critical" else ""
+                            lines.append(f"- **{nc_title}** (cite: \\cite{{{nc_key}}}) [NEEDS-CHECK]{imp_str}")
+                            if nc_context:
+                                lines.append(f"  Context: {nc_context[:400]}")
 
             if len(lines) > 1:
                 lines.append(f"\nPlease cite these references in Related Work and other appropriate sections.")
diff --git a/ark/pipeline.py b/ark/pipeline.py
@@ -701,15 +701,17 @@ def _bootstrap_citations_from_deep_research(self):
 - "authors": first author surname (e.g. "Vaswani")
 - "year": publication year as integer (e.g. 2017)
 - "query": a search query to find it (title + author + year)
+- "context": a 1-2 sentence summary of what the report says about this paper (what it does, why it matters)
 
 Return a JSON array. Example:
 [
-  {{"title": "Attention Is All You Need", "authors": "Vaswani", "year": 2017, "query": "Attention Is All You Need Vaswani 2017"}},
-  {{"title": "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding", "authors": "Devlin", "year": 2019, "query": "BERT Pre-training Deep Bidirectional Transformers Devlin 2019"}}
+  {{"title": "Attention Is All You Need", "authors": "Vaswani", "year": 2017, "query": "Attention Is All You Need Vaswani 2017", "context": "Introduces the Transformer architecture based solely on attention mechanisms, replacing recurrence and convolutions."}},
+  {{"title": "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding", "authors": "Devlin", "year": 2019, "query": "BERT Pre-training Deep Bidirectional Transformers Devlin 2019", "context": "Proposes bidirectional pre-training for language representations, achieving SOTA on multiple NLP benchmarks."}}
 ]
 
 Rules:
 - "title" must be the paper's actual full title as it would appear on the paper itself
+- "context" should summarize what the report says about this paper, NOT what you think the paper is about
 - "query" should include the title plus first author surname and year to help search
 - If only an abbreviation is given (e.g. "TimeGAN by Yoon et al., 2019"), infer the full title for "title" and construct a rich "query"
 - Do NOT include book titles, dataset names, or tool names
@@ -732,12 +734,14 @@ def _bootstrap_citations_from_deep_research(self):
         queries = [p["query"] for p in papers_info]
         authors_list = [p.get("authors", "") for p in papers_info]
         years_list = [p.get("year", 0) for p in papers_info]
+        contexts_list = [p.get("context", "") for p in papers_info]
         self.log_step(f"Extracted {len(titles)} paper titles, searching APIs...", "progress")
 
         # Step 2: Search APIs and fetch BibTeX (use queries for search, titles for display)
         result = bootstrap_citations(
             titles, bib_path, literature_path,
             search_queries=queries, authors=authors_list, years=years_list,
+            contexts=contexts_list,
         )
 
         # Step 3: Log results
@@ -835,6 +839,7 @@ def _parse_paper_info_list(self, agent_output: str) -> list:
                                 "query": p.get("query", p.get("title", "")),
                                 "authors": p.get("authors", ""),
                                 "year": p.get("year", 0),
+                                "context": p.get("context", ""),
                             }
                             for p in parsed
                             if isinstance(p, dict) and p.get("title")