Skip to content

Commit 91632d7

Browse files
authored
Merge pull request #219 from TideDra/copilot/add-ignore-path-option
Add `ignore_path` option to exclude Zotero papers by collection path
2 parents 89d7347 + 768c98b commit 91632d7

3 files changed

Lines changed: 128 additions & 29 deletions

File tree

config/base.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ zotero:
22
user_id: ??? # User ID of your Zotero account.
33
api_key: ??? # A Zotero API key with read access.
44
include_path: null # A list of glob patterns marking the Zotero collections that should be included. Example: ["2026/survey/**","2026/reading-group/**"]
5+
ignore_path: null # A list of glob patterns marking the Zotero collections that should be excluded. Example: ["2026/ignore/**","archive/**"]
56

67
source:
78
arxiv:

src/zotero_arxiv_daily/executor.py

Lines changed: 33 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -13,26 +13,27 @@
1313
from tqdm import tqdm
1414

1515

16-
def normalize_include_path_patterns(include_path: list[str] | ListConfig | None) -> list[str] | None:
17-
if include_path is None:
16+
def normalize_path_patterns(patterns: list[str] | ListConfig | None, config_key: str) -> list[str] | None:
17+
if patterns is None:
1818
return None
1919

20-
if not isinstance(include_path, (list, ListConfig)):
20+
if not isinstance(patterns, (list, ListConfig)):
2121
raise TypeError(
22-
"config.zotero.include_path must be a list of glob patterns or null, "
22+
f"config.zotero.{config_key} must be a list of glob patterns or null, "
2323
'for example ["2026/survey/**"]. Single strings are not supported.'
2424
)
2525

26-
if any(not isinstance(pattern, str) for pattern in include_path):
27-
raise TypeError("config.zotero.include_path must contain only glob pattern strings.")
26+
if any(not isinstance(pattern, str) for pattern in patterns):
27+
raise TypeError(f"config.zotero.{config_key} must contain only glob pattern strings.")
2828

29-
return list(include_path)
29+
return list(patterns)
3030

3131

3232
class Executor:
3333
def __init__(self, config:DictConfig):
3434
self.config = config
35-
self.include_path_patterns = normalize_include_path_patterns(config.zotero.include_path)
35+
self.include_path_patterns = normalize_path_patterns(config.zotero.include_path, "include_path")
36+
self.ignore_path_patterns = normalize_path_patterns(config.zotero.ignore_path, "ignore_path")
3637
self.retrievers = {
3738
source: get_retriever_cls(source)(config) for source in config.executor.source
3839
}
@@ -62,22 +63,31 @@ def get_collection_path(col_key:str) -> str:
6263
) for c in corpus]
6364

6465
def filter_corpus(self, corpus:list[CorpusPaper]) -> list[CorpusPaper]:
65-
if not self.include_path_patterns:
66-
return corpus
67-
new_corpus = []
68-
logger.info(f"Selecting zotero papers matching include_path: {self.include_path_patterns}")
69-
for c in corpus:
70-
match_results = [
71-
glob_match(path, pattern)
72-
for path in c.paths
73-
for pattern in self.include_path_patterns
66+
if self.include_path_patterns:
67+
logger.info(f"Selecting zotero papers matching include_path: {self.include_path_patterns}")
68+
corpus = [
69+
c for c in corpus
70+
if any(
71+
glob_match(path, pattern)
72+
for path in c.paths
73+
for pattern in self.include_path_patterns
74+
)
75+
]
76+
if self.ignore_path_patterns:
77+
logger.info(f"Excluding zotero papers matching ignore_path: {self.ignore_path_patterns}")
78+
corpus = [
79+
c for c in corpus
80+
if not any(
81+
glob_match(path, pattern)
82+
for path in c.paths
83+
for pattern in self.ignore_path_patterns
84+
)
7485
]
75-
if any(match_results):
76-
new_corpus.append(c)
77-
samples = random.sample(new_corpus, min(5, len(new_corpus)))
78-
samples = '\n'.join([c.title + ' - ' + '\n'.join(c.paths) for c in samples])
79-
logger.info(f"Selected {len(new_corpus)} zotero papers:\n{samples}\n...")
80-
return new_corpus
86+
if self.include_path_patterns or self.ignore_path_patterns:
87+
samples = random.sample(corpus, min(5, len(corpus)))
88+
samples = '\n'.join([c.title + ' - ' + '\n'.join(c.paths) for c in samples])
89+
logger.info(f"Selected {len(corpus)} zotero papers:\n{samples}\n...")
90+
return corpus
8191

8292

8393
def run(self):

tests/test_include_path.py

Lines changed: 94 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,19 +4,19 @@
44
import pytest
55
from omegaconf import OmegaConf
66

7-
from zotero_arxiv_daily.executor import Executor, normalize_include_path_patterns
7+
from zotero_arxiv_daily.executor import Executor, normalize_path_patterns
88
from zotero_arxiv_daily.protocol import CorpusPaper
99

1010

11-
def test_normalize_include_path_patterns_rejects_single_string():
11+
def test_normalize_path_patterns_rejects_single_string_for_include_path():
1212
with pytest.raises(TypeError, match="config.zotero.include_path must be a list of glob patterns or null"):
13-
normalize_include_path_patterns("2026/survey/**")
13+
normalize_path_patterns("2026/survey/**", "include_path")
1414

1515

16-
def test_normalize_include_path_patterns_accepts_list_config():
16+
def test_normalize_path_patterns_accepts_list_config_for_include_path():
1717
include_path = OmegaConf.create(["2026/survey/**", "2026/reading-group/**"])
1818

19-
assert normalize_include_path_patterns(include_path) == [
19+
assert normalize_path_patterns(include_path, "include_path") == [
2020
"2026/survey/**",
2121
"2026/reading-group/**",
2222
]
@@ -27,7 +27,8 @@ def test_filter_corpus_matches_any_path_against_any_pattern():
2727
executor.config = SimpleNamespace(
2828
zotero=SimpleNamespace(include_path=["2026/survey/**", "2026/reading-group/**"])
2929
)
30-
executor.include_path_patterns = normalize_include_path_patterns(executor.config.zotero.include_path)
30+
executor.include_path_patterns = normalize_path_patterns(executor.config.zotero.include_path, "include_path")
31+
executor.ignore_path_patterns = None
3132

3233
corpus = [
3334
CorpusPaper(
@@ -53,3 +54,90 @@ def test_filter_corpus_matches_any_path_against_any_pattern():
5354
filtered = executor.filter_corpus(corpus)
5455

5556
assert [paper.title for paper in filtered] == ["Survey Paper", "Reading Group Paper"]
57+
58+
59+
def test_normalize_path_patterns_rejects_single_string_for_ignore_path():
60+
with pytest.raises(TypeError, match="config.zotero.ignore_path must be a list of glob patterns or null"):
61+
normalize_path_patterns("archive/**", "ignore_path")
62+
63+
64+
def test_normalize_path_patterns_accepts_list_config_for_ignore_path():
65+
ignore_path = OmegaConf.create(["archive/**", "2025/**"])
66+
67+
assert normalize_path_patterns(ignore_path, "ignore_path") == ["archive/**", "2025/**"]
68+
69+
70+
def test_normalize_path_patterns_accepts_empty_list():
71+
assert normalize_path_patterns([], "ignore_path") == []
72+
73+
74+
def test_filter_corpus_excludes_papers_matching_ignore_path():
75+
executor = Executor.__new__(Executor)
76+
executor.include_path_patterns = None
77+
executor.ignore_path_patterns = normalize_path_patterns(["archive/**", "2025/**"], "ignore_path")
78+
79+
corpus = [
80+
CorpusPaper(
81+
title="Active Paper",
82+
abstract="",
83+
added_date=datetime(2026, 1, 1),
84+
paths=["2026/survey/topic-a"],
85+
),
86+
CorpusPaper(
87+
title="Archived Paper",
88+
abstract="",
89+
added_date=datetime(2026, 1, 2),
90+
paths=["archive/misc"],
91+
),
92+
CorpusPaper(
93+
title="Old Paper",
94+
abstract="",
95+
added_date=datetime(2026, 1, 3),
96+
paths=["2025/other/topic"],
97+
),
98+
]
99+
100+
filtered = executor.filter_corpus(corpus)
101+
102+
assert [paper.title for paper in filtered] == ["Active Paper"]
103+
104+
105+
def test_filter_corpus_ignore_path_takes_precedence_over_include_path():
106+
"""Papers matching both include_path and ignore_path should be excluded."""
107+
executor = Executor.__new__(Executor)
108+
executor.include_path_patterns = normalize_path_patterns(["2026/**"], "include_path")
109+
executor.ignore_path_patterns = normalize_path_patterns(["2026/ignore/**"], "ignore_path")
110+
111+
corpus = [
112+
CorpusPaper(
113+
title="Included Paper",
114+
abstract="",
115+
added_date=datetime(2026, 1, 1),
116+
paths=["2026/survey/topic-a"],
117+
),
118+
CorpusPaper(
119+
title="Ignored Paper",
120+
abstract="",
121+
added_date=datetime(2026, 1, 2),
122+
paths=["2026/ignore/topic-b"],
123+
),
124+
]
125+
126+
filtered = executor.filter_corpus(corpus)
127+
128+
assert [paper.title for paper in filtered] == ["Included Paper"]
129+
130+
131+
def test_filter_corpus_no_filters_returns_all():
132+
executor = Executor.__new__(Executor)
133+
executor.include_path_patterns = None
134+
executor.ignore_path_patterns = None
135+
136+
corpus = [
137+
CorpusPaper(title="Paper A", abstract="", added_date=datetime(2026, 1, 1), paths=["foo"]),
138+
CorpusPaper(title="Paper B", abstract="", added_date=datetime(2026, 1, 2), paths=["bar"]),
139+
]
140+
141+
filtered = executor.filter_corpus(corpus)
142+
143+
assert filtered == corpus

0 commit comments

Comments
 (0)