From 91e8649a539a241c848bd22c4eb13dfb948863bd Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 5 Jun 2026 06:15:44 +0000 Subject: [PATCH 1/2] Initial plan From f0df45c1a371c171c1d49cce190e60d42d2d3537 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 5 Jun 2026 06:19:17 +0000 Subject: [PATCH 2/2] Handle transient arXiv API 5xx errors in batch retrieval --- .../retriever/arxiv_retriever.py | 17 +++++- tests/retriever/test_arxiv_retriever.py | 57 +++++++++++++++++++ 2 files changed, 72 insertions(+), 2 deletions(-) diff --git a/src/zotero_arxiv_daily/retriever/arxiv_retriever.py b/src/zotero_arxiv_daily/retriever/arxiv_retriever.py index 13e0a002f..f8ee2f7a5 100644 --- a/src/zotero_arxiv_daily/retriever/arxiv_retriever.py +++ b/src/zotero_arxiv_daily/retriever/arxiv_retriever.py @@ -19,6 +19,7 @@ DOWNLOAD_TIMEOUT = (10, 60) PDF_EXTRACT_TIMEOUT = 180 TAR_EXTRACT_TIMEOUT = 180 +RETRYABLE_ARXIV_STATUSES = {429, 500, 502, 503, 504} def _download_file(url: str, path: str) -> None: @@ -137,19 +138,31 @@ def _retrieve_raw_papers(self) -> list[ArxivResult]: batch_retry_delay = 30 for i in range(0, len(all_paper_ids), 20): search = arxiv.Search(id_list=all_paper_ids[i:i + 20]) + batch_succeeded = False for attempt in range(max_batch_retries): try: batch = list(client.results(search)) bar.update(len(batch)) raw_papers.extend(batch) + batch_succeeded = True break except arxiv.HTTPError as exc: - if exc.status == 429 and attempt < max_batch_retries - 1: + status = getattr(exc, "status", None) + if status in RETRYABLE_ARXIV_STATUSES and attempt < max_batch_retries - 1: wait = batch_retry_delay * (attempt + 1) - logger.warning(f"arXiv API 429 on batch {i // 20}, retry {attempt + 1}/{max_batch_retries} in {wait}s") + logger.warning( + f"arXiv API {status} on batch {i // 20}, retry {attempt + 1}/{max_batch_retries} in {wait}s" + ) sleep(wait) + elif status in RETRYABLE_ARXIV_STATUSES: + logger.warning( + f"Skipping batch {i // 20} after {max_batch_retries} retries due to arXiv API {status}" + ) + break else: raise + if not batch_succeeded: + logger.warning(f"No papers retrieved for batch {i // 20}") if i + 20 < len(all_paper_ids): sleep(3) bar.close() diff --git a/tests/retriever/test_arxiv_retriever.py b/tests/retriever/test_arxiv_retriever.py index cf1c4137a..ed6c88987 100644 --- a/tests/retriever/test_arxiv_retriever.py +++ b/tests/retriever/test_arxiv_retriever.py @@ -4,6 +4,7 @@ from types import SimpleNamespace import feedparser +import pytest from zotero_arxiv_daily.retriever.arxiv_retriever import ArxivRetriever, _run_with_hard_timeout import zotero_arxiv_daily.retriever.arxiv_retriever as arxiv_retriever @@ -88,3 +89,59 @@ def test_run_with_hard_timeout_returns_none_on_failure(monkeypatch): ) assert result is None assert "boom" in warnings[0] + + +def test_retrieve_raw_papers_skips_batch_after_retryable_http_error(config, mock_feedparser, monkeypatch): + from omegaconf import open_dict + + with open_dict(config): + config.executor.debug = True + + monkeypatch.setattr(arxiv_retriever, "sleep", lambda _: None) + warnings: list[str] = [] + monkeypatch.setattr(arxiv_retriever, "logger", SimpleNamespace(warning=warnings.append)) + + class FakeHTTPError(Exception): + def __init__(self, status): + self.status = status + + class FakeClient: + def __init__(self, **kw): + pass + + def results(self, search): + raise FakeHTTPError(503) + + monkeypatch.setattr(arxiv_retriever.arxiv, "HTTPError", FakeHTTPError) + monkeypatch.setattr(arxiv_retriever.arxiv, "Client", FakeClient) + + retriever = ArxivRetriever(config) + assert retriever._retrieve_raw_papers() == [] + assert any("Skipping batch 0 after 5 retries due to arXiv API 503" in warning for warning in warnings) + + +def test_retrieve_raw_papers_raises_non_retryable_http_error(config, mock_feedparser, monkeypatch): + from omegaconf import open_dict + + with open_dict(config): + config.executor.debug = True + + monkeypatch.setattr(arxiv_retriever, "sleep", lambda _: None) + + class FakeHTTPError(Exception): + def __init__(self, status): + self.status = status + + class FakeClient: + def __init__(self, **kw): + pass + + def results(self, search): + raise FakeHTTPError(400) + + monkeypatch.setattr(arxiv_retriever.arxiv, "HTTPError", FakeHTTPError) + monkeypatch.setattr(arxiv_retriever.arxiv, "Client", FakeClient) + + retriever = ArxivRetriever(config) + with pytest.raises(FakeHTTPError): + retriever._retrieve_raw_papers()