Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 15 additions & 2 deletions src/zotero_arxiv_daily/retriever/arxiv_retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
# Timeout applied to file downloads; presumably a (connect, read) pair in
# seconds -- TODO confirm against the code that consumes it.
DOWNLOAD_TIMEOUT = (10, 60)
# Limit for PDF extraction, presumably in seconds -- the consumer is not
# visible here, verify before relying on the unit.
PDF_EXTRACT_TIMEOUT = 180
# Limit for tar extraction, presumably in seconds -- the consumer is not
# visible here, verify before relying on the unit.
TAR_EXTRACT_TIMEOUT = 180
# HTTP statuses from the arXiv API that the batch fetch loop treats as
# transient: such a batch is retried with a growing delay and, once the
# retries are used up, skipped with a warning instead of raising.
RETRYABLE_ARXIV_STATUSES = {429, 500, 502, 503, 504}


def _download_file(url: str, path: str) -> None:
Expand Down Expand Up @@ -137,19 +138,31 @@ def _retrieve_raw_papers(self) -> list[ArxivResult]:
batch_retry_delay = 30
for i in range(0, len(all_paper_ids), 20):
search = arxiv.Search(id_list=all_paper_ids[i:i + 20])
batch_succeeded = False
for attempt in range(max_batch_retries):
try:
batch = list(client.results(search))
bar.update(len(batch))
raw_papers.extend(batch)
batch_succeeded = True
break
except arxiv.HTTPError as exc:
if exc.status == 429 and attempt < max_batch_retries - 1:
status = getattr(exc, "status", None)
if status in RETRYABLE_ARXIV_STATUSES and attempt < max_batch_retries - 1:
wait = batch_retry_delay * (attempt + 1)
logger.warning(f"arXiv API 429 on batch {i // 20}, retry {attempt + 1}/{max_batch_retries} in {wait}s")
logger.warning(
f"arXiv API {status} on batch {i // 20}, retry {attempt + 1}/{max_batch_retries} in {wait}s"
)
sleep(wait)
elif status in RETRYABLE_ARXIV_STATUSES:
logger.warning(
f"Skipping batch {i // 20} after {max_batch_retries} retries due to arXiv API {status}"
)
break
else:
raise
if not batch_succeeded:
logger.warning(f"No papers retrieved for batch {i // 20}")
Comment on lines +157 to +165
if i + 20 < len(all_paper_ids):
sleep(3)
bar.close()
Expand Down
57 changes: 57 additions & 0 deletions tests/retriever/test_arxiv_retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from types import SimpleNamespace

import feedparser
import pytest

from zotero_arxiv_daily.retriever.arxiv_retriever import ArxivRetriever, _run_with_hard_timeout
import zotero_arxiv_daily.retriever.arxiv_retriever as arxiv_retriever
Expand Down Expand Up @@ -88,3 +89,59 @@ def test_run_with_hard_timeout_returns_none_on_failure(monkeypatch):
)
assert result is None
assert "boom" in warnings[0]


def test_retrieve_raw_papers_skips_batch_after_retryable_http_error(config, mock_feedparser, monkeypatch):
    """A batch that keeps failing with HTTP 503 is skipped instead of raising.

    The arXiv client is replaced by a double whose every fetch raises a fake
    HTTP error with status 503. The retriever is expected to return no papers
    and to log a warning that the batch was skipped after five retries.
    """
    from omegaconf import open_dict

    class FakeHTTPError(Exception):
        """Stand-in for ``arxiv.HTTPError`` that only carries ``status``."""

        def __init__(self, status):
            self.status = status

    class FakeClient:
        """Client double: accepts any keyword options and always fails."""

        def __init__(self, **_options):
            pass

        def results(self, search):
            raise FakeHTTPError(503)

    def _skip_sleep(_seconds):
        # Keep the retry back-off from actually pausing the test.
        return None

    # Collect every warning message the retriever emits.
    captured: list[str] = []
    fake_logger = SimpleNamespace(warning=captured.append)

    with open_dict(config):
        config.executor.debug = True

    monkeypatch.setattr(arxiv_retriever, "sleep", _skip_sleep)
    monkeypatch.setattr(arxiv_retriever, "logger", fake_logger)
    monkeypatch.setattr(arxiv_retriever.arxiv, "HTTPError", FakeHTTPError)
    monkeypatch.setattr(arxiv_retriever.arxiv, "Client", FakeClient)

    papers = ArxivRetriever(config)._retrieve_raw_papers()
    assert papers == []

    expected = "Skipping batch 0 after 5 retries due to arXiv API 503"
    assert any(expected in message for message in captured)


def test_retrieve_raw_papers_raises_non_retryable_http_error(config, mock_feedparser, monkeypatch):
    """An HTTP 400 from the arXiv client propagates out of the retriever.

    The arXiv client is replaced by a double whose every fetch raises a fake
    HTTP error with status 400; the test expects that error to reach the
    caller rather than being swallowed.
    """
    from omegaconf import open_dict

    class FakeHTTPError(Exception):
        """Stand-in for ``arxiv.HTTPError`` that only carries ``status``."""

        def __init__(self, status):
            self.status = status

    class FakeClient:
        """Client double: accepts any keyword options and always fails."""

        def __init__(self, **_options):
            pass

        def results(self, search):
            raise FakeHTTPError(400)

    def _skip_sleep(_seconds):
        # Keep any back-off from actually pausing the test.
        return None

    with open_dict(config):
        config.executor.debug = True

    monkeypatch.setattr(arxiv_retriever, "sleep", _skip_sleep)
    monkeypatch.setattr(arxiv_retriever.arxiv, "HTTPError", FakeHTTPError)
    monkeypatch.setattr(arxiv_retriever.arxiv, "Client", FakeClient)

    subject = ArxivRetriever(config)
    with pytest.raises(FakeHTTPError):
        subject._retrieve_raw_papers()