Skip to content

Commit f9a0319

Browse files
authored
Merge pull request #2 from danielebarbaro/feat/stop-words
Add multi-language stop words with StopWordsProviderInterface
2 parents 8308567 + 7918353 commit f9a0319

File tree

10 files changed

+792
-30
lines changed

10 files changed

+792
-30
lines changed

.github/workflows/test.yml

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,29 @@ on:
99
permissions:
1010
contents: read
1111

12-
jobs:
13-
build:
12+
concurrency:
13+
group: ${{ github.workflow }}-${{ github.ref }}
14+
cancel-in-progress: true
1415

16+
jobs:
17+
test:
1518
runs-on: ubuntu-latest
19+
strategy:
20+
fail-fast: false
21+
matrix:
22+
php: ['8.2', '8.3', '8.4', '8.5']
23+
24+
name: PHP ${{ matrix.php }}
1625

1726
steps:
18-
- uses: actions/checkout@v4
27+
- name: Checkout
28+
uses: actions/checkout@v5
29+
30+
- name: Setup PHP
31+
uses: shivammathur/setup-php@v2
32+
with:
33+
php-version: ${{ matrix.php }}
34+
coverage: pcov
1935

2036
- name: Validate composer.json and composer.lock
2137
run: composer validate --strict
@@ -37,3 +53,11 @@ jobs:
3753

3854
- name: Run test suite
3955
run: composer run-script test
56+
57+
- name: Run test suite with coverage
58+
run: ./vendor/bin/phpunit --coverage-clover coverage.xml
59+
60+
- name: Upload coverage to Codecov
61+
uses: codecov/codecov-action@v4
62+
with:
63+
files: coverage.xml

.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,7 @@ composer.lock
44

55
# PHPUnit
66
.phpunit.result.cache
7-
.phpunit.cache/
7+
.phpunit.cache/
8+
9+
# Coverage
10+
.coverage/

README.md

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,62 @@ $db = VectorDatabase::open('/var/data/mydb', new HNSWConfig(M: 32));
291291
$results = $db->vectorSearch($queryVector, k: 10);
292292
```
293293

294+
## Multi-language stop words
295+
296+
Stop words are provided via `StopWordsProviderInterface`. Built-in providers:
297+
298+
```php
299+
use PHPVector\BM25\SimpleTokenizer;
300+
use PHPVector\BM25\StopWords\EnglishStopWords;
301+
use PHPVector\BM25\StopWords\ItalianStopWords;
302+
use PHPVector\BM25\StopWords\FileStopWords;
303+
use PHPVector\VectorDatabase;
304+
305+
// English (default)
306+
$db = new VectorDatabase();
307+
308+
// Italian
309+
$db = new VectorDatabase(
310+
tokenizer: new SimpleTokenizer(new ItalianStopWords()),
311+
);
312+
313+
// Load from file (one word per line, # for comments)
314+
$db = new VectorDatabase(
315+
tokenizer: new SimpleTokenizer(new FileStopWords('/path/to/stopwords.txt')),
316+
);
```
317+
318+
### Stop words file format (`FileStopWords`)
319+
320+
Use a plain UTF-8 text file with one stop word per line.
321+
322+
Rules:
323+
- Empty lines are ignored
324+
- Lines starting with `#` are treated as comments
325+
- Words are normalized to lowercase when loaded
326+
327+
Example (`stopwords-it.txt`):
328+
329+
```txt
330+
# Italian stop words
331+
e
332+
di
333+
a
334+
che
335+
il
336+
la
337+
```
338+
339+
```php
// No stop words
340+
$db = new VectorDatabase(
341+
tokenizer: new SimpleTokenizer(stopWords: []),
342+
);
343+
```
344+
345+
Available providers:
346+
- `EnglishStopWords` - English stop words (default)
347+
- `ItalianStopWords` - Italian stop words
348+
- `FileStopWords` - Load from file
349+
294350
## Custom tokenizer
295351
296352
Implement `TokenizerInterface` to plug in stemming, lemmatization, or any language-specific logic.

composer.json

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
"type": "library",
55
"license": "MIT",
66
"require": {
7-
"php": "^8.1"
7+
"php": "^8.2"
88
},
99
"require-dev": {
1010
"phpunit/phpunit": "^11.0",
@@ -25,6 +25,9 @@
2525
"test" : [
2626
"vendor/bin/phpunit"
2727
],
28+
"coverage": [
29+
"vendor/bin/phpunit --coverage-html .coverage/"
30+
],
2831
"phpstan": [
2932
"vendor/bin/phpstan analyse -l 5 src tests benchmark"
3033
]

src/BM25/SimpleTokenizer.php

Lines changed: 11 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44

55
namespace PHPVector\BM25;
66

7+
use PHPVector\BM25\StopWords\EnglishStopWords;
8+
use PHPVector\BM25\StopWords\StopWordsProviderInterface;
9+
710
/**
811
* A lightweight, language-agnostic tokenizer.
912
*
@@ -19,15 +22,19 @@ final class SimpleTokenizer implements TokenizerInterface
1922
private readonly array $stopWords;
2023

2124
/**
22-
* @param string[] $stopWords Words to discard (case-insensitive).
23-
* @param int $minTokenLength Minimum token length to keep (default: 2).
25+
* @param StopWordsProviderInterface|string[] $stopWords Stop words provider or array of words.
26+
* @param int $minTokenLength Minimum token length to keep (default: 2).
2427
*/
2528
public function __construct(
26-
array $stopWords = self::DEFAULT_STOP_WORDS,
29+
StopWordsProviderInterface|array $stopWords = new EnglishStopWords(),
2730
private readonly int $minTokenLength = 2,
2831
) {
32+
$words = $stopWords instanceof StopWordsProviderInterface
33+
? $stopWords->getStopWords()
34+
: $stopWords;
35+
2936
$this->stopWords = array_fill_keys(
30-
array_map('mb_strtolower', $stopWords),
37+
array_map('mb_strtolower', $words),
3138
true,
3239
);
3340
}
@@ -49,25 +56,4 @@ public function tokenize(string $text): array
4956
}
5057
return $result;
5158
}
52-
53-
/**
54-
* Common English stop words.
55-
* Replace or extend via the constructor for other languages or domains.
56-
*/
57-
public const DEFAULT_STOP_WORDS = [
58-
'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your',
59-
'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she',
60-
'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their',
61-
'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that',
62-
'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
63-
'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',
64-
'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
65-
'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
66-
'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
67-
'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once',
68-
'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each',
69-
'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only',
70-
'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just',
71-
'don', 'should', 'now'
72-
];
7359
}
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace PHPVector\BM25\StopWords;
6+
7+
/**
 * English stop words provider.
 *
 * Supplies articles, prepositions, pronouns, conjunctions, auxiliary verbs,
 * and other high-frequency words that carry little semantic value.
 *
 * Every word is listed exactly once. Words that belong to more than one
 * grammatical category (e.g. "for", "when", "yet") are kept only under the
 * first category that mentions them, so the returned list has no duplicates.
 */
final class EnglishStopWords implements StopWordsProviderInterface
{
    /**
     * Lowercase English stop words, grouped by grammatical role.
     *
     * @var string[]
     */
    private const WORDS = [
        // Articles
        'a', 'an', 'the',

        // Prepositions
        'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
        'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
        'in', 'out', 'on', 'off', 'over', 'under', 'upon', 'within', 'without',
        'along', 'among', 'around', 'across', 'behind', 'beyond', 'near', 'toward', 'towards',

        // Personal pronouns (subject)
        'i', 'you', 'he', 'she', 'it', 'we', 'they',

        // Personal pronouns (object)
        'me', 'him', 'her', 'us', 'them',

        // Possessive adjectives and pronouns ("her" is listed with the object pronouns)
        'my', 'your', 'his', 'its', 'our', 'their',
        'mine', 'yours', 'hers', 'ours', 'theirs',

        // Reflexive pronouns
        'myself', 'yourself', 'yourselves', 'himself', 'herself', 'itself', 'ourselves', 'themselves',

        // Demonstrative pronouns
        'this', 'that', 'these', 'those',

        // Interrogative and relative pronouns
        'what', 'which', 'who', 'whom', 'whose',

        // Indefinite pronouns and determiners
        'all', 'any', 'both', 'each', 'every', 'few', 'many', 'more', 'most',
        'other', 'others', 'some', 'such', 'none', 'several',
        'anybody', 'anyone', 'anything', 'everybody', 'everyone', 'everything',
        'nobody', 'nothing', 'somebody', 'someone', 'something',

        // Coordinating conjunctions ("for" is listed with the prepositions)
        'and', 'but', 'or', 'nor', 'yet', 'so',

        // Subordinating conjunctions
        'if', 'because', 'as', 'until', 'while', 'although', 'though', 'unless',
        'since', 'when', 'where', 'whether', 'whereas', 'wherever', 'whenever',

        // Verb "to be" (all forms)
        'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',

        // Verb "to have" (all forms)
        'have', 'has', 'had', 'having',

        // Verb "to do" (all forms)
        'do', 'does', 'did', 'doing', 'done',

        // Modal verbs
        'can', 'could', 'will', 'would', 'shall', 'should', 'may', 'might', 'must',

        // Common auxiliary forms
        'get', 'gets', 'got', 'getting',
        'let', 'lets',

        // Adverbs of place ("where" is listed with the subordinating conjunctions)
        'here', 'there', 'anywhere', 'everywhere', 'somewhere', 'nowhere',

        // Adverbs of time ("when", "yet", "before" and "after" are listed above)
        'now', 'then', 'always', 'never', 'often', 'sometimes', 'usually',
        'already', 'still', 'again', 'once', 'ever',
        'soon', 'later', 'recently', 'today', 'yesterday', 'tomorrow',

        // Adverbs of degree ("more" and "most" are listed with the determiners)
        'very', 'too', 'quite', 'rather', 'almost', 'enough', 'just', 'only',
        'even', 'also', 'well', 'much', 'less', 'least',

        // Other common adverbs
        'how', 'why', 'further', 'back',

        // Negation
        'no', 'not',

        // Other function words
        'own', 'same', 'than', 'like', 'per', 'via',

        // Contractions (tokenizer splits on apostrophes, leaving these fragments);
        // the "can" fragment of "can't" is already listed with the modal verbs.
        's', 't', 'd', 'm', 've', 'll', 're',
        'don', 'doesn', 'didn', 'won', 'wouldn', 'shouldn', 'couldn',
        'hasn', 'haven', 'hadn', 'isn', 'aren', 'wasn', 'weren', 'ain',
    ];

    /**
     * @return string[] The English stop word list, each word appearing once.
     */
    public function getStopWords(): array
    {
        return self::WORDS;
    }

    /**
     * Static access for use without instantiation.
     *
     * @return string[] The same list that getStopWords() returns.
     */
    public static function words(): array
    {
        return self::WORDS;
    }
}
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace PHPVector\BM25\StopWords;
6+
7+
/**
 * Load stop words from a plain text file.
 *
 * Expected file format:
 * - One word per line (LF, CRLF and CR line endings are all accepted)
 * - Empty lines and lines starting with # are ignored
 * - Surrounding whitespace is trimmed and words are lowercased
 * - A leading UTF-8 byte order mark, if present, is discarded
 *
 * Example file:
 * ```
 * # English stop words
 * the
 * a
 * an
 * is
 * are
 * ```
 *
 * The file is read lazily on the first call to getStopWords() and the parsed
 * list is cached on the instance afterwards.
 */
final class FileStopWords implements StopWordsProviderInterface
{
    /** UTF-8 byte order mark that some editors prepend to text files. */
    private const UTF8_BOM = "\xEF\xBB\xBF";

    /** @var string[]|null Cached stop words (loaded once) */
    private ?array $words = null;

    /**
     * @param string $filePath Path to the stop words file.
     * @throws \InvalidArgumentException if the path does not point to an existing regular file.
     */
    public function __construct(
        private readonly string $filePath,
    ) {
        // is_file() instead of file_exists(): a directory "exists" but can never
        // be read as a word list, so reject it here rather than at load time.
        if (!is_file($filePath)) {
            throw new \InvalidArgumentException(
                sprintf('Stop words file not found: %s', $filePath)
            );
        }
    }

    /**
     * @return string[] Lowercased stop words in file order.
     * @throws \RuntimeException if the file cannot be read.
     */
    public function getStopWords(): array
    {
        return $this->words ??= $this->loadFromFile();
    }

    /**
     * Read and parse the stop words file.
     *
     * @return string[]
     * @throws \RuntimeException if the file cannot be read.
     */
    private function loadFromFile(): array
    {
        // The warning is suppressed because the failure is reported as an exception below.
        $content = @file_get_contents($this->filePath);
        if ($content === false) {
            throw new \RuntimeException(
                sprintf('Failed to read stop words file: %s', $this->filePath)
            );
        }

        // Without this, a BOM would stick to the first entry: the first word would
        // never match, or a first-line "#" comment would be taken as a stop word.
        if (str_starts_with($content, self::UTF8_BOM)) {
            $content = substr($content, strlen(self::UTF8_BOM));
        }

        // Normalize CRLF and lone CR to LF so every line-ending style splits correctly.
        $content = str_replace(["\r\n", "\r"], "\n", $content);

        $words = [];

        foreach (explode("\n", $content) as $line) {
            $line = trim($line);

            // Skip empty lines and comments
            if ($line === '' || str_starts_with($line, '#')) {
                continue;
            }

            $words[] = mb_strtolower($line, 'UTF-8');
        }

        return $words;
    }
}

0 commit comments

Comments
 (0)