Merge pull request #204 from TideDra/fix-pr-192-include-path-list-only

TideDra · web-flow · commit 0db4eb5ffd7b · 2026-03-18T15:58:36.000+08:00
Require include_path to be a list
diff --git a/README.md b/README.md
@@ -39,7 +39,7 @@
 - List of papers sorted by relevance with your recent research interest.
 - Fast deployment via fork this repo and set environment variables in the Github Action Page.
 - Support LLM API for generating TL;DR of papers.
-- Ignore unwanted Zotero papers using glob pattern.
+- Ignore unwanted Zotero papers using a list of glob patterns.
 - Support multiple sources of papers to retrieve:
   - arxiv
   - biorxiv
@@ -76,7 +76,7 @@ Paste the following content into the value of `CUSTOM_CONFIG` variable:
 zotero:
   user_id: ${oc.env:ZOTERO_ID}
   api_key: ${oc.env:ZOTERO_KEY}
-  include_path: null
+  include_path: null # Or e.g. ["2026/survey/**", "2026/reading-group/**"]
 
 email:
   sender: ${oc.env:SENDER}
@@ -110,7 +110,7 @@ Here is the full configuration, `???` means the value must be filled in:
 zotero:
   user_id: ??? # User ID of your Zotero account.
   api_key: ??? # An Zotero API key with read access.
-  include_path: null # A glob pattern marking the Zotero collections that should be included. Example: "2026/survey/**"
+  include_path: null # A list of glob patterns marking the Zotero collections that should be included. Example: ["2026/survey/**", "2026/reading-group/**"]
 
 source:
   arxiv:
diff --git a/config/base.yaml b/config/base.yaml
@@ -1,7 +1,7 @@
 zotero:
   user_id: ??? # User ID of your Zotero account.
   api_key: ??? # An Zotero API key with read access.
-  include_path: null # A glob pattern marking the Zotero collections that should be included. Example: "2026/survey/**"
+  include_path: null # A list of glob patterns marking the Zotero collections that should be included. Example: ["2026/survey/**", "2026/reading-group/**"]
 
 source:
   arxiv:
diff --git a/config/custom.yaml b/config/custom.yaml
@@ -1,7 +1,7 @@
 zotero:
   user_id: ${oc.env:ZOTERO_ID}
   api_key: ${oc.env:ZOTERO_KEY}
-  include_path: null
+  include_path: null # Or set a list of glob patterns, e.g. ["2026/survey/**", "2026/reading-group/**"]
 
 email:
   sender: ${oc.env:SENDER}
@@ -23,4 +23,4 @@ source:
 
 executor:
   debug: ${oc.env:DEBUG,null}
-  source: ['arxiv']
+  source: ['arxiv']
diff --git a/src/zotero_arxiv_daily/executor.py b/src/zotero_arxiv_daily/executor.py
@@ -1,6 +1,6 @@
 from loguru import logger
 from pyzotero import zotero
-from omegaconf import DictConfig
+from omegaconf import DictConfig, ListConfig
 from .utils import glob_match
 from .retriever import get_retriever_cls
 from .protocol import CorpusPaper
@@ -11,9 +11,28 @@
 from .utils import send_email
 from openai import OpenAI
 from tqdm import tqdm
+
+
+def normalize_include_path_patterns(include_path: list[str] | ListConfig | None) -> list[str] | None:
+    if include_path is None:
+        return None
+
+    if not isinstance(include_path, (list, ListConfig)):
+        raise TypeError(
+            "config.zotero.include_path must be a list of glob patterns or null, "
+            'for example ["2026/survey/**"]. Single strings are not supported.'
+        )
+
+    if any(not isinstance(pattern, str) for pattern in include_path):
+        raise TypeError("config.zotero.include_path must contain only glob pattern strings.")
+
+    return list(include_path)
+
+
 class Executor:
     def __init__(self, config:DictConfig):
         self.config = config
+        self.include_path_patterns = normalize_include_path_patterns(config.zotero.include_path)
         self.retrievers = {
             source: get_retriever_cls(source)(config) for source in config.executor.source
         }
@@ -43,12 +62,16 @@ def get_collection_path(col_key:str) -> str:
         ) for c in corpus]
     
     def filter_corpus(self, corpus:list[CorpusPaper]) -> list[CorpusPaper]:
-        if not self.config.zotero.include_path:
+        if not self.include_path_patterns:
             return corpus
         new_corpus = []
-        logger.info(f"Selecting zotero papers matching include_path: {self.config.zotero.include_path}")
+        logger.info(f"Selecting zotero papers matching include_path: {self.include_path_patterns}")
         for c in corpus:
-            match_results = [glob_match(p, self.config.zotero.include_path) for p in c.paths]
+            match_results = [
+                glob_match(path, pattern)
+                for path in c.paths
+                for pattern in self.include_path_patterns
+            ]
             if any(match_results):
                 new_corpus.append(c)
         samples = random.sample(new_corpus, min(5, len(new_corpus)))
@@ -88,4 +111,4 @@ def run(self):
         logger.info("Sending email...")
         email_content = render_email(reranked_papers)
         send_email(self.config, email_content)
-        logger.info("Email sent successfully")
+        logger.info("Email sent successfully")
diff --git a/tests/test_include_path.py b/tests/test_include_path.py
@@ -0,0 +1,55 @@
+from datetime import datetime
+from types import SimpleNamespace
+
+import pytest
+from omegaconf import OmegaConf
+
+from zotero_arxiv_daily.executor import Executor, normalize_include_path_patterns
+from zotero_arxiv_daily.protocol import CorpusPaper
+
+
+def test_normalize_include_path_patterns_rejects_single_string():
+    with pytest.raises(TypeError, match="config.zotero.include_path must be a list of glob patterns or null"):
+        normalize_include_path_patterns("2026/survey/**")
+
+
+def test_normalize_include_path_patterns_accepts_list_config():
+    include_path = OmegaConf.create(["2026/survey/**", "2026/reading-group/**"])
+
+    assert normalize_include_path_patterns(include_path) == [
+        "2026/survey/**",
+        "2026/reading-group/**",
+    ]
+
+
+def test_filter_corpus_matches_any_path_against_any_pattern():
+    executor = Executor.__new__(Executor)
+    executor.config = SimpleNamespace(
+        zotero=SimpleNamespace(include_path=["2026/survey/**", "2026/reading-group/**"])
+    )
+    executor.include_path_patterns = normalize_include_path_patterns(executor.config.zotero.include_path)
+
+    corpus = [
+        CorpusPaper(
+            title="Survey Paper",
+            abstract="",
+            added_date=datetime(2026, 1, 1),
+            paths=["2026/survey/topic-a", "archive/misc"],
+        ),
+        CorpusPaper(
+            title="Reading Group Paper",
+            abstract="",
+            added_date=datetime(2026, 1, 2),
+            paths=["notes/inbox", "2026/reading-group/week-1"],
+        ),
+        CorpusPaper(
+            title="Excluded Paper",
+            abstract="",
+            added_date=datetime(2026, 1, 3),
+            paths=["2025/other/topic"],
+        ),
+    ]
+
+    filtered = executor.filter_corpus(corpus)
+
+    assert [paper.title for paper in filtered] == ["Survey Paper", "Reading Group Paper"]