Merge pull request #2 from danielebarbaro/feat/stop-words

ezimuel · web-flow · commit f9a0319a09a6 · 2026-03-23T18:51:53.000+01:00
Add multi-language stop words with StopWordsProviderInterface
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -9,13 +9,29 @@ on:
 permissions:
   contents: read
 
-jobs:
-  build:
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
 
+jobs:
+  test:
     runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        php: ['8.2', '8.3', '8.4', '8.5']
+
+    name: PHP ${{ matrix.php }}
 
     steps:
-    - uses: actions/checkout@v4
+    - name: Checkout
+      uses: actions/checkout@v5
+
+    - name: Setup PHP
+      uses: shivammathur/setup-php@v2
+      with:
+        php-version: ${{ matrix.php }}
+        coverage: pcov
 
     - name: Validate composer.json and composer.lock
       run: composer validate --strict
@@ -37,3 +53,11 @@ jobs:
 
     - name: Run test suite
       run: composer run-script test
+
+    - name: Run test suite with coverage
+      run: ./vendor/bin/phpunit --coverage-clover coverage.xml
+
+    - name: Upload coverage to Codecov
+      uses: codecov/codecov-action@v4
+      with:
+        files: coverage.xml
diff --git a/.gitignore b/.gitignore
@@ -4,4 +4,7 @@ composer.lock
 
 # PHPUnit
 .phpunit.result.cache
-.phpunit.cache/
+.phpunit.cache/
+
+# Coverage
+.coverage/
diff --git a/README.md b/README.md
@@ -291,6 +291,62 @@ $db = VectorDatabase::open('/var/data/mydb', new HNSWConfig(M: 32));
 $results = $db->vectorSearch($queryVector, k: 10);
 ```
 
+## Multi-language stop words
+
+Stop words are provided via `StopWordsProviderInterface`. Built-in providers:
+
+```php
+use PHPVector\BM25\SimpleTokenizer;
+use PHPVector\BM25\StopWords\EnglishStopWords;
+use PHPVector\BM25\StopWords\ItalianStopWords;
+use PHPVector\BM25\StopWords\FileStopWords;
+use PHPVector\VectorDatabase;
+
+// English (default)
+$db = new VectorDatabase();
+
+// Italian
+$db = new VectorDatabase(
+    tokenizer: new SimpleTokenizer(new ItalianStopWords()),
+);
+
+// Load from file (one word per line, # for comments)
+$db = new VectorDatabase(
+    tokenizer: new SimpleTokenizer(new FileStopWords('/path/to/stopwords.txt')),
+);
+```
+
+### Stop words file format (`FileStopWords`)
+
+Use a plain UTF-8 text file with one stop word per line.
+
+Rules:
+- Empty lines are ignored
+- Lines starting with `#` are treated as comments
+- Words are normalized to lowercase when loaded
+
+Example (`stopwords-it.txt`):
+
+```txt
+# Italian stop words
+e
+di
+a
+che
+il
+la
+```
+
+```php
+// No stop words
+$db = new VectorDatabase(
+    tokenizer: new SimpleTokenizer(stopWords: []),
+);
+```
+
+Available providers:
+- `EnglishStopWords` - English stop words (default)
+- `ItalianStopWords` - Italian stop words
+- `FileStopWords` - Load from file
+
 ## Custom tokenizer
 
 Implement `TokenizerInterface` to plug in stemming, lemmatization, or any language-specific logic.
diff --git a/composer.json b/composer.json
@@ -4,7 +4,7 @@
     "type": "library",
     "license": "MIT",
     "require": {
-        "php": "^8.1"
+        "php": "^8.2"
     },
     "require-dev": {
         "phpunit/phpunit": "^11.0",
@@ -25,6 +25,9 @@
         "test" : [
             "vendor/bin/phpunit"
         ],
+        "coverage": [
+            "vendor/bin/phpunit --coverage-html .coverage/"
+        ],
         "phpstan": [
             "vendor/bin/phpstan analyse -l 5 src tests benchmark"
         ]
diff --git a/src/BM25/SimpleTokenizer.php b/src/BM25/SimpleTokenizer.php
@@ -4,6 +4,9 @@
 
 namespace PHPVector\BM25;
 
+use PHPVector\BM25\StopWords\EnglishStopWords;
+use PHPVector\BM25\StopWords\StopWordsProviderInterface;
+
 /**
  * A lightweight, language-agnostic tokenizer.
  *
@@ -19,15 +22,19 @@ final class SimpleTokenizer implements TokenizerInterface
     private readonly array $stopWords;
 
     /**
-     * @param string[] $stopWords     Words to discard (case-insensitive).
-     * @param int      $minTokenLength Minimum token length to keep (default: 2).
+     * Build the stop-word lookup table used by this tokenizer.
+     *
+     * When a provider is given, its getStopWords() result is used; when a plain
+     * array is given, the array itself is used. Every word is lowercased with
+     * mb_strtolower() and stored as an array key mapped to true.
+     *
+     * @param StopWordsProviderInterface|string[] $stopWords Stop words provider or array of words
+     *                                                       (default: a new EnglishStopWords instance).
+     * @param int $minTokenLength Minimum token length to keep (default: 2).
      */
     public function __construct(
-        array $stopWords = self::DEFAULT_STOP_WORDS,
+        StopWordsProviderInterface|array $stopWords = new EnglishStopWords(),
         private readonly int $minTokenLength = 2,
     ) {
+        $words = $stopWords instanceof StopWordsProviderInterface
+            ? $stopWords->getStopWords()
+            : $stopWords;
+
         $this->stopWords = array_fill_keys(
-            array_map('mb_strtolower', $stopWords),
+            array_map('mb_strtolower', $words),
             true,
         );
     }
@@ -49,25 +56,4 @@ public function tokenize(string $text): array
         }
         return $result;
     }
-
-    /**
-     * Common English stop words.
-     * Replace or extend via the constructor for other languages or domains.
-     */
-    public const DEFAULT_STOP_WORDS = [
-        'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your',
-        'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 
-        'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 
-        'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 
-        'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 
-        'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 
-        'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 
-        'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
-        'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
-        'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once',
-        'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each',
-        'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only',
-        'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just',
-        'don', 'should', 'now'
-    ];
 }
diff --git a/src/BM25/StopWords/EnglishStopWords.php b/src/BM25/StopWords/EnglishStopWords.php
@@ -0,0 +1,114 @@
+<?php
+
+declare(strict_types=1);
+
+namespace PHPVector\BM25\StopWords;
+
+/**
+ * English stop words provider.
+ *
+ * Includes articles, prepositions, pronouns, conjunctions, auxiliary verbs,
+ * and other high-frequency words that carry little semantic value.
+ */
+final class EnglishStopWords implements StopWordsProviderInterface
+{
+    /**
+     * Return the built-in English stop words.
+     *
+     * @return string[] Lowercase words, each listed exactly once.
+     */
+    public function getStopWords(): array
+    {
+        return self::WORDS;
+    }
+
+    /**
+     * Static access for use without instantiation.
+     *
+     * @return string[] The same list that getStopWords() returns.
+     */
+    public static function words(): array
+    {
+        return self::WORDS;
+    }
+
+    /**
+     * The stop words, grouped by grammatical role for readability.
+     *
+     * A word that belongs to several groups is listed only in the first group
+     * it fits, so the list contains no duplicates. The trailing comments name
+     * the words that are intentionally not repeated.
+     */
+    private const WORDS = [
+        // Articles
+        'a', 'an', 'the',
+
+        // Prepositions
+        'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
+        'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
+        'in', 'out', 'on', 'off', 'over', 'under', 'upon', 'within', 'without',
+        'along', 'among', 'around', 'across', 'behind', 'beyond', 'near', 'toward', 'towards',
+
+        // Personal pronouns (subject)
+        'i', 'you', 'he', 'she', 'it', 'we', 'they',
+
+        // Personal pronouns (object)
+        'me', 'him', 'her', 'us', 'them',
+
+        // Possessive adjectives and pronouns ('her' is listed with the object pronouns)
+        'my', 'your', 'his', 'its', 'our', 'their',
+        'mine', 'yours', 'hers', 'ours', 'theirs',
+
+        // Reflexive pronouns
+        'myself', 'yourself', 'yourselves', 'himself', 'herself', 'itself', 'ourselves', 'themselves',
+
+        // Demonstrative pronouns
+        'this', 'that', 'these', 'those',
+
+        // Interrogative and relative pronouns
+        'what', 'which', 'who', 'whom', 'whose',
+
+        // Indefinite pronouns and determiners
+        'all', 'any', 'both', 'each', 'every', 'few', 'many', 'more', 'most',
+        'other', 'others', 'some', 'such', 'none', 'several',
+        'anybody', 'anyone', 'anything', 'everybody', 'everyone', 'everything',
+        'nobody', 'nothing', 'somebody', 'someone', 'something',
+
+        // Coordinating conjunctions ('for' is listed with the prepositions)
+        'and', 'but', 'or', 'nor', 'yet', 'so',
+
+        // Subordinating conjunctions
+        'if', 'because', 'as', 'until', 'while', 'although', 'though', 'unless',
+        'since', 'when', 'where', 'whether', 'whereas', 'wherever', 'whenever',
+
+        // Verb "to be" (all forms)
+        'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
+
+        // Verb "to have" (all forms)
+        'have', 'has', 'had', 'having',
+
+        // Verb "to do" (all forms)
+        'do', 'does', 'did', 'doing', 'done',
+
+        // Modal verbs
+        'can', 'could', 'will', 'would', 'shall', 'should', 'may', 'might', 'must',
+
+        // Common auxiliary forms
+        'get', 'gets', 'got', 'getting',
+        'let', 'lets',
+
+        // Adverbs of place ('where' is listed with the subordinating conjunctions)
+        'here', 'there', 'anywhere', 'everywhere', 'somewhere', 'nowhere',
+
+        // Adverbs of time ('when', 'yet', 'before' and 'after' are listed above)
+        'now', 'then', 'always', 'never', 'often', 'sometimes', 'usually',
+        'already', 'still', 'again', 'once', 'ever',
+        'soon', 'later', 'recently', 'today', 'yesterday', 'tomorrow',
+
+        // Adverbs of degree ('more' and 'most' are listed with the determiners)
+        'very', 'too', 'quite', 'rather', 'almost', 'enough', 'just', 'only',
+        'even', 'also', 'well', 'much', 'less', 'least',
+
+        // Other common adverbs
+        'how', 'why', 'further', 'back',
+
+        // Negation
+        'no', 'not',
+
+        // Other function words
+        'own', 'same', 'than', 'like', 'per', 'via',
+
+        // Contractions (tokenizer splits on apostrophes, leaving these fragments;
+        // the 'can' of "can't" is listed with the modal verbs)
+        's', 't', 'd', 'm', 've', 'll', 're',
+        'don', 'doesn', 'didn', 'won', 'wouldn', 'shouldn', 'couldn',
+        'hasn', 'haven', 'hadn', 'isn', 'aren', 'wasn', 'weren', 'ain',
+    ];
+}
diff --git a/src/BM25/StopWords/FileStopWords.php b/src/BM25/StopWords/FileStopWords.php
@@ -0,0 +1,85 @@
+<?php
+
+declare(strict_types=1);
+
+namespace PHPVector\BM25\StopWords;
+
+/**
+ * Load stop words from a file.
+ *
+ * This class demonstrates the value of the StopWordsProviderInterface:
+ * stop words can come from any source, not just hardcoded arrays.
+ *
+ * Expected file format:
+ * - One word per line (LF, CRLF and bare CR line endings are all accepted)
+ * - Empty lines and lines starting with # are ignored
+ * - A leading UTF-8 byte order mark is ignored
+ * - Words are automatically lowercased
+ *
+ * Example file:
+ * ```
+ * # English stop words
+ * the
+ * a
+ * an
+ * is
+ * are
+ * ```
+ */
+final class FileStopWords implements StopWordsProviderInterface
+{
+    /** UTF-8 byte order mark that some editors prepend to text files. */
+    private const UTF8_BOM = "\xEF\xBB\xBF";
+
+    /** @var string[]|null Cached stop words (loaded once) */
+    private ?array $words = null;
+
+    /**
+     * @param string $filePath Path to the stop words file.
+     * @throws \InvalidArgumentException if the file does not exist.
+     */
+    public function __construct(
+        private readonly string $filePath,
+    ) {
+        if (!file_exists($filePath)) {
+            throw new \InvalidArgumentException(
+                sprintf('Stop words file not found: %s', $filePath)
+            );
+        }
+    }
+
+    /**
+     * Return the stop words, reading the file on the first call only.
+     *
+     * @return string[]
+     * @throws \RuntimeException if the file cannot be read.
+     */
+    public function getStopWords(): array
+    {
+        return $this->words ??= $this->loadFromFile();
+    }
+
+    /**
+     * Read and parse the stop words file.
+     *
+     * @return string[] Lowercased words in file order.
+     * @throws \RuntimeException if the file cannot be read.
+     */
+    private function loadFromFile(): array
+    {
+        // The warning is silenced on purpose: a failed read is reported through
+        // the exception thrown just below instead.
+        $content = @file_get_contents($this->filePath);
+        if ($content === false) {
+            throw new \RuntimeException(
+                sprintf('Failed to read stop words file: %s', $this->filePath)
+            );
+        }
+
+        // trim() does not remove a BOM, so without this the first line would
+        // keep three stray bytes and a leading "#" comment would not be recognised.
+        if (str_starts_with($content, self::UTF8_BOM)) {
+            $content = substr($content, strlen(self::UTF8_BOM));
+        }
+
+        // Normalise CRLF and bare CR to LF so every line-ending style splits correctly.
+        $content = str_replace(["\r\n", "\r"], "\n", $content);
+
+        $words = [];
+
+        foreach (explode("\n", $content) as $line) {
+            $line = trim($line);
+
+            // Skip empty lines and comments
+            if ($line === '' || str_starts_with($line, '#')) {
+                continue;
+            }
+
+            $words[] = mb_strtolower($line, 'UTF-8');
+        }
+
+        return $words;
+    }
+}
diff --git a/src/BM25/StopWords/ItalianStopWords.php b/src/BM25/StopWords/ItalianStopWords.php
diff --git a/src/BM25/StopWords/StopWordsProviderInterface.php b/src/BM25/StopWords/StopWordsProviderInterface.php
diff --git a/tests/BM25/StopWordsTest.php b/tests/BM25/StopWordsTest.php