Skip to content

Commit 3e85ca5

Browse files
authored
Merge pull request #1 from dekrt/copilot/fix-calculate-and-send-job
Make arXiv retrieval resilient to transient 5xx API failures in `calculate-and-send`
2 parents: 4f73d5d + f0df45c; commit: 3e85ca5

2 files changed

Lines changed: 72 additions & 2 deletions

File tree

src/zotero_arxiv_daily/retriever/arxiv_retriever.py

Lines changed: 15 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@
1919
DOWNLOAD_TIMEOUT = (10, 60)
2020
PDF_EXTRACT_TIMEOUT = 180
2121
TAR_EXTRACT_TIMEOUT = 180
22+
RETRYABLE_ARXIV_STATUSES = {429, 500, 502, 503, 504}
2223

2324

2425
def _download_file(url: str, path: str) -> None:
@@ -137,19 +138,31 @@ def _retrieve_raw_papers(self) -> list[ArxivResult]:
137138
batch_retry_delay = 30
138139
for i in range(0, len(all_paper_ids), 20):
139140
search = arxiv.Search(id_list=all_paper_ids[i:i + 20])
141+
batch_succeeded = False
140142
for attempt in range(max_batch_retries):
141143
try:
142144
batch = list(client.results(search))
143145
bar.update(len(batch))
144146
raw_papers.extend(batch)
147+
batch_succeeded = True
145148
break
146149
except arxiv.HTTPError as exc:
147-
if exc.status == 429 and attempt < max_batch_retries - 1:
150+
status = getattr(exc, "status", None)
151+
if status in RETRYABLE_ARXIV_STATUSES and attempt < max_batch_retries - 1:
148152
wait = batch_retry_delay * (attempt + 1)
149-
logger.warning(f"arXiv API 429 on batch {i // 20}, retry {attempt + 1}/{max_batch_retries} in {wait}s")
153+
logger.warning(
154+
f"arXiv API {status} on batch {i // 20}, retry {attempt + 1}/{max_batch_retries} in {wait}s"
155+
)
150156
sleep(wait)
157+
elif status in RETRYABLE_ARXIV_STATUSES:
158+
logger.warning(
159+
f"Skipping batch {i // 20} after {max_batch_retries} retries due to arXiv API {status}"
160+
)
161+
break
151162
else:
152163
raise
164+
if not batch_succeeded:
165+
logger.warning(f"No papers retrieved for batch {i // 20}")
153166
if i + 20 < len(all_paper_ids):
154167
sleep(3)
155168
bar.close()

tests/retriever/test_arxiv_retriever.py

Lines changed: 57 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
from types import SimpleNamespace
55

66
import feedparser
7+
import pytest
78

89
from zotero_arxiv_daily.retriever.arxiv_retriever import ArxivRetriever, _run_with_hard_timeout
910
import zotero_arxiv_daily.retriever.arxiv_retriever as arxiv_retriever
@@ -88,3 +89,59 @@ def test_run_with_hard_timeout_returns_none_on_failure(monkeypatch):
8889
)
8990
assert result is None
9091
assert "boom" in warnings[0]
92+
93+
94+
def test_retrieve_raw_papers_skips_batch_after_retryable_http_error(config, mock_feedparser, monkeypatch):
95+
from omegaconf import open_dict
96+
97+
with open_dict(config):
98+
config.executor.debug = True
99+
100+
monkeypatch.setattr(arxiv_retriever, "sleep", lambda _: None)
101+
warnings: list[str] = []
102+
monkeypatch.setattr(arxiv_retriever, "logger", SimpleNamespace(warning=warnings.append))
103+
104+
class FakeHTTPError(Exception):
105+
def __init__(self, status):
106+
self.status = status
107+
108+
class FakeClient:
109+
def __init__(self, **kw):
110+
pass
111+
112+
def results(self, search):
113+
raise FakeHTTPError(503)
114+
115+
monkeypatch.setattr(arxiv_retriever.arxiv, "HTTPError", FakeHTTPError)
116+
monkeypatch.setattr(arxiv_retriever.arxiv, "Client", FakeClient)
117+
118+
retriever = ArxivRetriever(config)
119+
assert retriever._retrieve_raw_papers() == []
120+
assert any("Skipping batch 0 after 5 retries due to arXiv API 503" in warning for warning in warnings)
121+
122+
123+
def test_retrieve_raw_papers_raises_non_retryable_http_error(config, mock_feedparser, monkeypatch):
124+
from omegaconf import open_dict
125+
126+
with open_dict(config):
127+
config.executor.debug = True
128+
129+
monkeypatch.setattr(arxiv_retriever, "sleep", lambda _: None)
130+
131+
class FakeHTTPError(Exception):
132+
def __init__(self, status):
133+
self.status = status
134+
135+
class FakeClient:
136+
def __init__(self, **kw):
137+
pass
138+
139+
def results(self, search):
140+
raise FakeHTTPError(400)
141+
142+
monkeypatch.setattr(arxiv_retriever.arxiv, "HTTPError", FakeHTTPError)
143+
monkeypatch.setattr(arxiv_retriever.arxiv, "Client", FakeClient)
144+
145+
retriever = ArxivRetriever(config)
146+
with pytest.raises(FakeHTTPError):
147+
retriever._retrieve_raw_papers()

0 commit comments

Comments
 (0)