Skip to content

Commit 0db4eb5

Browse files
authored
Merge pull request #204 from TideDra/fix-pr-192-include-path-list-only
Require include_path to be a list
2 parents 168f2cb + da78e3b commit 0db4eb5

5 files changed

Lines changed: 89 additions & 11 deletions

File tree

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
- List of papers sorted by relevance with your recent research interest.
4040
- Fast deployment by forking this repo and setting environment variables on the GitHub Actions page.
4141
- Support LLM API for generating TL;DR of papers.
42-
- Ignore unwanted Zotero papers using glob pattern.
42+
- Ignore unwanted Zotero papers using a list of glob patterns.
4343
- Support multiple sources of papers to retrieve:
4444
- arxiv
4545
- biorxiv
@@ -76,7 +76,7 @@ Paste the following content into the value of `CUSTOM_CONFIG` variable:
7676
zotero:
7777
user_id: ${oc.env:ZOTERO_ID}
7878
api_key: ${oc.env:ZOTERO_KEY}
79-
include_path: null
79+
include_path: null # Or e.g. ["2026/survey/**", "2026/reading-group/**"]
8080

8181
email:
8282
sender: ${oc.env:SENDER}
@@ -110,7 +110,7 @@ Here is the full configuration, `???` means the value must be filled in:
110110
zotero:
111111
user_id: ??? # User ID of your Zotero account.
112112
api_key: ??? # A Zotero API key with read access.
113-
include_path: null # A glob pattern marking the Zotero collections that should be included. Example: "2026/survey/**"
113+
include_path: null # A list of glob patterns marking the Zotero collections that should be included. Example: ["2026/survey/**", "2026/reading-group/**"]
114114
115115
source:
116116
arxiv:

config/base.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
zotero:
22
user_id: ??? # User ID of your Zotero account.
33
api_key: ??? # A Zotero API key with read access.
4-
include_path: null # A glob pattern marking the Zotero collections that should be included. Example: "2026/survey/**"
4+
include_path: null # A list of glob patterns marking the Zotero collections that should be included. Example: ["2026/survey/**", "2026/reading-group/**"]
55

66
source:
77
arxiv:

config/custom.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
zotero:
22
user_id: ${oc.env:ZOTERO_ID}
33
api_key: ${oc.env:ZOTERO_KEY}
4-
include_path: null
4+
include_path: null # Or set a list of glob patterns, e.g. ["2026/survey/**", "2026/reading-group/**"]
55

66
email:
77
sender: ${oc.env:SENDER}
@@ -23,4 +23,4 @@ source:
2323

2424
executor:
2525
debug: ${oc.env:DEBUG,null}
26-
source: ['arxiv']
26+
source: ['arxiv']

src/zotero_arxiv_daily/executor.py

Lines changed: 28 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from loguru import logger
22
from pyzotero import zotero
3-
from omegaconf import DictConfig
3+
from omegaconf import DictConfig, ListConfig
44
from .utils import glob_match
55
from .retriever import get_retriever_cls
66
from .protocol import CorpusPaper
@@ -11,9 +11,28 @@
1111
from .utils import send_email
1212
from openai import OpenAI
1313
from tqdm import tqdm
14+
15+
16+
def normalize_include_path_patterns(include_path: list[str] | ListConfig | None) -> list[str] | None:
    """Validate ``config.zotero.include_path`` and return it as a plain list.

    Args:
        include_path: The raw config value: ``None``, a Python ``list`` or an
            OmegaConf ``ListConfig`` of glob pattern strings.

    Returns:
        ``None`` when ``include_path`` is ``None``; otherwise a new ``list``
        holding the same patterns in the same order.

    Raises:
        TypeError: If ``include_path`` is neither ``None`` nor a
            ``list``/``ListConfig`` (a bare string is rejected), or if any
            element is not a ``str``.
    """
    if include_path is None:
        return None

    is_pattern_list = isinstance(include_path, list) or isinstance(include_path, ListConfig)
    if not is_pattern_list:
        raise TypeError(
            "config.zotero.include_path must be a list of glob patterns or null, "
            'for example ["2026/survey/**"]. Single strings are not supported.'
        )

    # Fail on the first non-string entry, before building the result list.
    for entry in include_path:
        if not isinstance(entry, str):
            raise TypeError("config.zotero.include_path must contain only glob pattern strings.")

    return list(include_path)
30+
31+
1432
class Executor:
1533
def __init__(self, config:DictConfig):
1634
self.config = config
35+
self.include_path_patterns = normalize_include_path_patterns(config.zotero.include_path)
1736
self.retrievers = {
1837
source: get_retriever_cls(source)(config) for source in config.executor.source
1938
}
@@ -43,12 +62,16 @@ def get_collection_path(col_key:str) -> str:
4362
) for c in corpus]
4463

4564
def filter_corpus(self, corpus:list[CorpusPaper]) -> list[CorpusPaper]:
46-
if not self.config.zotero.include_path:
65+
if not self.include_path_patterns:
4766
return corpus
4867
new_corpus = []
49-
logger.info(f"Selecting zotero papers matching include_path: {self.config.zotero.include_path}")
68+
logger.info(f"Selecting zotero papers matching include_path: {self.include_path_patterns}")
5069
for c in corpus:
51-
match_results = [glob_match(p, self.config.zotero.include_path) for p in c.paths]
70+
match_results = [
71+
glob_match(path, pattern)
72+
for path in c.paths
73+
for pattern in self.include_path_patterns
74+
]
5275
if any(match_results):
5376
new_corpus.append(c)
5477
samples = random.sample(new_corpus, min(5, len(new_corpus)))
@@ -88,4 +111,4 @@ def run(self):
88111
logger.info("Sending email...")
89112
email_content = render_email(reranked_papers)
90113
send_email(self.config, email_content)
91-
logger.info("Email sent successfully")
114+
logger.info("Email sent successfully")

tests/test_include_path.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
from datetime import datetime
2+
from types import SimpleNamespace
3+
4+
import pytest
5+
from omegaconf import OmegaConf
6+
7+
from zotero_arxiv_daily.executor import Executor, normalize_include_path_patterns
8+
from zotero_arxiv_daily.protocol import CorpusPaper
9+
10+
11+
def test_normalize_include_path_patterns_rejects_single_string():
    """A bare string must be rejected with a TypeError instead of being accepted."""
    expected_message = "config.zotero.include_path must be a list of glob patterns or null"
    single_pattern = "2026/survey/**"

    with pytest.raises(TypeError, match=expected_message):
        normalize_include_path_patterns(single_pattern)
14+
15+
16+
def test_normalize_include_path_patterns_accepts_list_config():
    """An OmegaConf ListConfig of patterns compares equal to the same patterns as a plain list."""
    expected_patterns = ["2026/survey/**", "2026/reading-group/**"]
    list_config = OmegaConf.create(["2026/survey/**", "2026/reading-group/**"])

    normalized = normalize_include_path_patterns(list_config)

    assert normalized == expected_patterns
23+
24+
25+
def test_filter_corpus_matches_any_path_against_any_pattern():
    """A paper is kept when at least one of its paths matches at least one pattern."""
    # Skip Executor.__init__ and set only the attributes this test assigns directly.
    executor = Executor.__new__(Executor)
    zotero_config = SimpleNamespace(include_path=["2026/survey/**", "2026/reading-group/**"])
    executor.config = SimpleNamespace(zotero=zotero_config)
    executor.include_path_patterns = normalize_include_path_patterns(executor.config.zotero.include_path)

    # The matching path comes first for this paper ...
    survey_paper = CorpusPaper(
        title="Survey Paper",
        abstract="",
        added_date=datetime(2026, 1, 1),
        paths=["2026/survey/topic-a", "archive/misc"],
    )
    # ... and last for this one, so the position of the match must not matter.
    reading_group_paper = CorpusPaper(
        title="Reading Group Paper",
        abstract="",
        added_date=datetime(2026, 1, 2),
        paths=["notes/inbox", "2026/reading-group/week-1"],
    )
    # No path of this paper matches any pattern.
    excluded_paper = CorpusPaper(
        title="Excluded Paper",
        abstract="",
        added_date=datetime(2026, 1, 3),
        paths=["2025/other/topic"],
    )

    kept = executor.filter_corpus([survey_paper, reading_group_paper, excluded_paper])
    kept_titles = [paper.title for paper in kept]

    assert kept_titles == ["Survey Paper", "Reading Group Paper"]

0 commit comments

Comments
 (0)