Merge pull request #219 from TideDra/copilot/add-ignore-path-option

TideDra · web-flow · commit 91632d71b4b0 · 2026-04-01T14:31:24.000+08:00
Add `ignore_path` option to exclude Zotero papers by collection path
diff --git a/config/base.yaml b/config/base.yaml
@@ -2,6 +2,7 @@ zotero:
   user_id: ??? # User ID of your Zotero account.
   api_key: ??? # An Zotero API key with read access.
   include_path: null # A list of glob patterns marking the Zotero collections that should be included. Example: ["2026/survey/**","2026/reading-group/**"]
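+  ignore_path: null # A list of glob patterns marking the Zotero collections that should be excluded. Applied after include_path, so a paper with any collection path matching an ignore pattern is excluded even if it also matches include_path. Example: ["2026/ignore/**","archive/**"]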
 
 source:
   arxiv:
diff --git a/src/zotero_arxiv_daily/executor.py b/src/zotero_arxiv_daily/executor.py
@@ -13,26 +13,27 @@
 from tqdm import tqdm
 
 
-def normalize_include_path_patterns(include_path: list[str] | ListConfig | None) -> list[str] | None:
-    if include_path is None:
+def normalize_path_patterns(patterns: list[str] | ListConfig | None, config_key: str) -> list[str] | None:
+    if patterns is None:
         return None
 
-    if not isinstance(include_path, (list, ListConfig)):
+    if not isinstance(patterns, (list, ListConfig)):
         raise TypeError(
-            "config.zotero.include_path must be a list of glob patterns or null, "
+            f"config.zotero.{config_key} must be a list of glob patterns or null, "
             'for example ["2026/survey/**"]. Single strings are not supported.'
         )
 
-    if any(not isinstance(pattern, str) for pattern in include_path):
-        raise TypeError("config.zotero.include_path must contain only glob pattern strings.")
+    if any(not isinstance(pattern, str) for pattern in patterns):
+        raise TypeError(f"config.zotero.{config_key} must contain only glob pattern strings.")
 
-    return list(include_path)
+    return list(patterns)
 
 
 class Executor:
     def __init__(self, config:DictConfig):
         self.config = config
-        self.include_path_patterns = normalize_include_path_patterns(config.zotero.include_path)
+        self.include_path_patterns = normalize_path_patterns(config.zotero.include_path, "include_path")
+        self.ignore_path_patterns = normalize_path_patterns(config.zotero.ignore_path, "ignore_path")
         self.retrievers = {
             source: get_retriever_cls(source)(config) for source in config.executor.source
         }
@@ -62,22 +63,31 @@ def get_collection_path(col_key:str) -> str:
         ) for c in corpus]
     
     def filter_corpus(self, corpus:list[CorpusPaper]) -> list[CorpusPaper]:
-        if not self.include_path_patterns:
-            return corpus
-        new_corpus = []
-        logger.info(f"Selecting zotero papers matching include_path: {self.include_path_patterns}")
-        for c in corpus:
-            match_results = [
-                glob_match(path, pattern)
-                for path in c.paths
-                for pattern in self.include_path_patterns
+        if self.include_path_patterns:
+            logger.info(f"Selecting zotero papers matching include_path: {self.include_path_patterns}")
+            corpus = [
+                c for c in corpus
+                if any(
+                    glob_match(path, pattern)
+                    for path in c.paths
+                    for pattern in self.include_path_patterns
+                )
+            ]
+        if self.ignore_path_patterns:
+            logger.info(f"Excluding zotero papers matching ignore_path: {self.ignore_path_patterns}")
+            corpus = [
+                c for c in corpus
+                if not any(
+                    glob_match(path, pattern)
+                    for path in c.paths
+                    for pattern in self.ignore_path_patterns
+                )
             ]
-            if any(match_results):
-                new_corpus.append(c)
-        samples = random.sample(new_corpus, min(5, len(new_corpus)))
-        samples = '\n'.join([c.title + ' - ' + '\n'.join(c.paths) for c in samples])
-        logger.info(f"Selected {len(new_corpus)} zotero papers:\n{samples}\n...")
-        return new_corpus
+        if self.include_path_patterns or self.ignore_path_patterns:
+            samples = random.sample(corpus, min(5, len(corpus)))
+            samples = '\n'.join([c.title + ' - ' + '\n'.join(c.paths) for c in samples])
+            logger.info(f"Selected {len(corpus)} zotero papers:\n{samples}\n...")
+        return corpus
 
     
     def run(self):
diff --git a/tests/test_include_path.py b/tests/test_include_path.py
@@ -4,19 +4,19 @@
 import pytest
 from omegaconf import OmegaConf
 
-from zotero_arxiv_daily.executor import Executor, normalize_include_path_patterns
+from zotero_arxiv_daily.executor import Executor, normalize_path_patterns
 from zotero_arxiv_daily.protocol import CorpusPaper
 
 
-def test_normalize_include_path_patterns_rejects_single_string():
+def test_normalize_path_patterns_rejects_single_string_for_include_path():
     with pytest.raises(TypeError, match="config.zotero.include_path must be a list of glob patterns or null"):
-        normalize_include_path_patterns("2026/survey/**")
+        normalize_path_patterns("2026/survey/**", "include_path")
 
 
-def test_normalize_include_path_patterns_accepts_list_config():
+def test_normalize_path_patterns_accepts_list_config_for_include_path():
     include_path = OmegaConf.create(["2026/survey/**", "2026/reading-group/**"])
 
-    assert normalize_include_path_patterns(include_path) == [
+    assert normalize_path_patterns(include_path, "include_path") == [
         "2026/survey/**",
         "2026/reading-group/**",
     ]
@@ -27,7 +27,8 @@ def test_filter_corpus_matches_any_path_against_any_pattern():
     executor.config = SimpleNamespace(
         zotero=SimpleNamespace(include_path=["2026/survey/**", "2026/reading-group/**"])
     )
-    executor.include_path_patterns = normalize_include_path_patterns(executor.config.zotero.include_path)
+    executor.include_path_patterns = normalize_path_patterns(executor.config.zotero.include_path, "include_path")
+    executor.ignore_path_patterns = None
 
     corpus = [
         CorpusPaper(
@@ -53,3 +54,90 @@ def test_filter_corpus_matches_any_path_against_any_pattern():
     filtered = executor.filter_corpus(corpus)
 
     assert [paper.title for paper in filtered] == ["Survey Paper", "Reading Group Paper"]
+
+
+def test_normalize_path_patterns_rejects_single_string_for_ignore_path():
+    with pytest.raises(TypeError, match="config.zotero.ignore_path must be a list of glob patterns or null"):
+        normalize_path_patterns("archive/**", "ignore_path")
+
+
+def test_normalize_path_patterns_accepts_list_config_for_ignore_path():
+    ignore_path = OmegaConf.create(["archive/**", "2025/**"])
+
+    assert normalize_path_patterns(ignore_path, "ignore_path") == ["archive/**", "2025/**"]
+
+
+def test_normalize_path_patterns_accepts_empty_list():
+    assert normalize_path_patterns([], "ignore_path") == []
+
+
+def test_filter_corpus_excludes_papers_matching_ignore_path():
+    executor = Executor.__new__(Executor)
+    executor.include_path_patterns = None
+    executor.ignore_path_patterns = normalize_path_patterns(["archive/**", "2025/**"], "ignore_path")
+
+    corpus = [
+        CorpusPaper(
+            title="Active Paper",
+            abstract="",
+            added_date=datetime(2026, 1, 1),
+            paths=["2026/survey/topic-a"],
+        ),
+        CorpusPaper(
+            title="Archived Paper",
+            abstract="",
+            added_date=datetime(2026, 1, 2),
+            paths=["archive/misc"],
+        ),
+        CorpusPaper(
+            title="Old Paper",
+            abstract="",
+            added_date=datetime(2026, 1, 3),
+            paths=["2025/other/topic"],
+        ),
+    ]
+
+    filtered = executor.filter_corpus(corpus)
+
+    assert [paper.title for paper in filtered] == ["Active Paper"]
+
+
+def test_filter_corpus_ignore_path_takes_precedence_over_include_path():
+    """Papers matching both include_path and ignore_path should be excluded."""
+    executor = Executor.__new__(Executor)
+    executor.include_path_patterns = normalize_path_patterns(["2026/**"], "include_path")
+    executor.ignore_path_patterns = normalize_path_patterns(["2026/ignore/**"], "ignore_path")
+
+    corpus = [
+        CorpusPaper(
+            title="Included Paper",
+            abstract="",
+            added_date=datetime(2026, 1, 1),
+            paths=["2026/survey/topic-a"],
+        ),
+        CorpusPaper(
+            title="Ignored Paper",
+            abstract="",
+            added_date=datetime(2026, 1, 2),
+            paths=["2026/ignore/topic-b"],
+        ),
+    ]
+
+    filtered = executor.filter_corpus(corpus)
+
+    assert [paper.title for paper in filtered] == ["Included Paper"]
+
+
+def test_filter_corpus_no_filters_returns_all():
+    executor = Executor.__new__(Executor)
+    executor.include_path_patterns = None
+    executor.ignore_path_patterns = None
+
+    corpus = [
+        CorpusPaper(title="Paper A", abstract="", added_date=datetime(2026, 1, 1), paths=["foo"]),
+        CorpusPaper(title="Paper B", abstract="", added_date=datetime(2026, 1, 2), paths=["bar"]),
+    ]
+
+    filtered = executor.filter_corpus(corpus)
+
+    assert filtered == corpus