Skip to content

Commit 7740538

Browse files
add stop words interface for italian words and file import
1 parent 8308567 commit 7740538

File tree

6 files changed

+382
-25
lines changed

6 files changed

+382
-25
lines changed

README.md

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,41 @@ $db = VectorDatabase::open('/var/data/mydb', new HNSWConfig(M: 32));
291291
$results = $db->vectorSearch($queryVector, k: 10);
292292
```
293293

294+
## Multi-language stop words
295+
296+
Stop words are supplied to `SimpleTokenizer` either as a `StopWordsProviderInterface` implementation or as a plain array of words. Usage with the built-in providers:
297+
298+
```php
299+
use PHPVector\BM25\SimpleTokenizer;
300+
use PHPVector\BM25\StopWords\EnglishStopWords;
301+
use PHPVector\BM25\StopWords\ItalianStopWords;
302+
use PHPVector\BM25\StopWords\FileStopWords;
303+
use PHPVector\VectorDatabase;
304+
305+
// English (default)
306+
$db = new VectorDatabase();
307+
308+
// Italian
309+
$db = new VectorDatabase(
310+
tokenizer: new SimpleTokenizer(new ItalianStopWords()),
311+
);
312+
313+
// Load from file (one word per line, # for comments)
314+
$db = new VectorDatabase(
315+
tokenizer: new SimpleTokenizer(new FileStopWords('/path/to/stopwords.txt')),
316+
);
317+
318+
// No stop words
319+
$db = new VectorDatabase(
320+
tokenizer: new SimpleTokenizer(stopWords: []),
321+
);
322+
```
323+
324+
Available providers:
325+
- `EnglishStopWords` - English stop words (default)
326+
- `ItalianStopWords` - Italian stop words
327+
- `FileStopWords` - Stop words loaded from a text file (one word per line; empty lines and lines starting with `#` are ignored)
328+
294329
## Custom tokenizer
295330

296331
Implement `TokenizerInterface` to plug in stemming, lemmatization, or any language-specific logic.

src/BM25/SimpleTokenizer.php

Lines changed: 11 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44

55
namespace PHPVector\BM25;
66

7+
use PHPVector\BM25\StopWords\EnglishStopWords;
8+
use PHPVector\BM25\StopWords\StopWordsProviderInterface;
9+
710
/**
811
* A lightweight, language-agnostic tokenizer.
912
*
@@ -19,15 +22,19 @@ final class SimpleTokenizer implements TokenizerInterface
1922
private readonly array $stopWords;
2023

2124
/**
22-
* @param string[] $stopWords Words to discard (case-insensitive).
23-
* @param int $minTokenLength Minimum token length to keep (default: 2).
25+
* @param StopWordsProviderInterface|string[] $stopWords Stop words provider or array of words.
26+
* @param int $minTokenLength Minimum token length to keep (default: 2).
2427
*/
2528
public function __construct(
26-
array $stopWords = self::DEFAULT_STOP_WORDS,
29+
StopWordsProviderInterface|array $stopWords = new EnglishStopWords(),
2730
private readonly int $minTokenLength = 2,
2831
) {
32+
$words = $stopWords instanceof StopWordsProviderInterface
33+
? $stopWords->getStopWords()
34+
: $stopWords;
35+
2936
$this->stopWords = array_fill_keys(
30-
array_map('mb_strtolower', $stopWords),
37+
array_map('mb_strtolower', $words),
3138
true,
3239
);
3340
}
@@ -49,25 +56,4 @@ public function tokenize(string $text): array
4956
}
5057
return $result;
5158
}
52-
53-
/**
54-
* Common English stop words.
55-
* Replace or extend via the constructor for other languages or domains.
56-
*/
57-
public const DEFAULT_STOP_WORDS = [
58-
'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your',
59-
'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she',
60-
'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their',
61-
'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that',
62-
'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
63-
'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',
64-
'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
65-
'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
66-
'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
67-
'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once',
68-
'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each',
69-
'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only',
70-
'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just',
71-
'don', 'should', 'now'
72-
];
7359
}
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace PHPVector\BM25\StopWords;
6+
7+
/**
8+
* English stop words provider.
9+
*
10+
* Includes articles, prepositions, pronouns, conjunctions, auxiliary verbs,
11+
* and other high-frequency words that carry little semantic value.
12+
*/
13+
final class EnglishStopWords implements StopWordsProviderInterface
14+
{
15+
public function getStopWords(): array
16+
{
17+
return self::WORDS;
18+
}
19+
20+
/**
21+
* Static access for use without instantiation.
22+
*
23+
* @return string[]
24+
*/
25+
public static function words(): array
26+
{
27+
return self::WORDS;
28+
}
29+
30+
private const WORDS = [
31+
// Articles
32+
'a', 'an', 'the',
33+
34+
// Prepositions
35+
'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
36+
'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
37+
'in', 'out', 'on', 'off', 'over', 'under', 'upon', 'within', 'without',
38+
'along', 'among', 'around', 'across', 'behind', 'beyond', 'near', 'toward', 'towards',
39+
40+
// Personal pronouns (subject)
41+
'i', 'you', 'he', 'she', 'it', 'we', 'they',
42+
43+
// Personal pronouns (object)
44+
'me', 'him', 'her', 'us', 'them',
45+
46+
// Possessive adjectives and pronouns
47+
'my', 'your', 'his', 'her', 'its', 'our', 'their',
48+
'mine', 'yours', 'hers', 'ours', 'theirs',
49+
50+
// Reflexive pronouns
51+
'myself', 'yourself', 'yourselves', 'himself', 'herself', 'itself', 'ourselves', 'themselves',
52+
53+
// Demonstrative pronouns
54+
'this', 'that', 'these', 'those',
55+
56+
// Interrogative and relative pronouns
57+
'what', 'which', 'who', 'whom', 'whose',
58+
59+
// Indefinite pronouns and determiners
60+
'all', 'any', 'both', 'each', 'every', 'few', 'many', 'more', 'most',
61+
'other', 'others', 'some', 'such', 'none', 'several',
62+
'anybody', 'anyone', 'anything', 'everybody', 'everyone', 'everything',
63+
'nobody', 'nothing', 'somebody', 'someone', 'something',
64+
65+
// Coordinating conjunctions
66+
'and', 'but', 'or', 'nor', 'yet', 'so', 'for',
67+
68+
// Subordinating conjunctions
69+
'if', 'because', 'as', 'until', 'while', 'although', 'though', 'unless',
70+
'since', 'when', 'where', 'whether', 'whereas', 'wherever', 'whenever',
71+
72+
// Verb "to be" (all forms)
73+
'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
74+
75+
// Verb "to have" (all forms)
76+
'have', 'has', 'had', 'having',
77+
78+
// Verb "to do" (all forms)
79+
'do', 'does', 'did', 'doing', 'done',
80+
81+
// Modal verbs
82+
'can', 'could', 'will', 'would', 'shall', 'should', 'may', 'might', 'must',
83+
84+
// Common auxiliary forms
85+
'get', 'gets', 'got', 'getting',
86+
'let', 'lets',
87+
88+
// Adverbs of place
89+
'here', 'there', 'where', 'anywhere', 'everywhere', 'somewhere', 'nowhere',
90+
91+
// Adverbs of time
92+
'now', 'then', 'when', 'always', 'never', 'often', 'sometimes', 'usually',
93+
'already', 'still', 'yet', 'again', 'once', 'ever',
94+
'before', 'after', 'soon', 'later', 'recently', 'today', 'yesterday', 'tomorrow',
95+
96+
// Adverbs of degree
97+
'very', 'too', 'quite', 'rather', 'almost', 'enough', 'just', 'only',
98+
'even', 'also', 'well', 'much', 'more', 'most', 'less', 'least',
99+
100+
// Other common adverbs
101+
'how', 'why', 'further', 'back',
102+
103+
// Negation
104+
'no', 'not',
105+
106+
// Other function words
107+
'own', 'same', 'than', 'like', 'per', 'via',
108+
109+
// Contractions (tokenizer splits on apostrophes, leaving these fragments)
110+
's', 't', 'd', 'm', 've', 'll', 're',
111+
'don', 'doesn', 'didn', 'won', 'wouldn', 'shouldn', 'couldn', 'can',
112+
'hasn', 'haven', 'hadn', 'isn', 'aren', 'wasn', 'weren', 'ain',
113+
];
114+
}
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace PHPVector\BM25\StopWords;
6+
7+
/**
8+
* Load stop words from a file.
9+
*
10+
* This class demonstrates the value of the StopWordsProviderInterface:
11+
* stop words can come from any source, not just hardcoded arrays.
12+
*
13+
* Expected file format:
14+
* - One word per line
15+
* - Empty lines and lines starting with # are ignored
16+
* - Words are automatically lowercased
17+
*
18+
* Example file:
19+
* ```
20+
* # English stop words
21+
* the
22+
* a
23+
* an
24+
* is
25+
* are
26+
* ```
27+
*/
28+
final class FileStopWords implements StopWordsProviderInterface
29+
{
30+
/** @var string[]|null Cached stop words (loaded once) */
31+
private ?array $words = null;
32+
33+
/**
34+
* @param string $filePath Path to the stop words file.
35+
* @throws \InvalidArgumentException if the file does not exist.
36+
*/
37+
public function __construct(
38+
private readonly string $filePath,
39+
) {
40+
if (!file_exists($filePath)) {
41+
throw new \InvalidArgumentException(
42+
sprintf('Stop words file not found: %s', $filePath)
43+
);
44+
}
45+
}
46+
47+
public function getStopWords(): array
48+
{
49+
if ($this->words !== null) {
50+
return $this->words;
51+
}
52+
53+
$this->words = $this->loadFromFile();
54+
return $this->words;
55+
}
56+
57+
/**
58+
* @return string[]
59+
*/
60+
private function loadFromFile(): array
61+
{
62+
$content = file_get_contents($this->filePath);
63+
if ($content === false) {
64+
throw new \RuntimeException(
65+
sprintf('Failed to read stop words file: %s', $this->filePath)
66+
);
67+
}
68+
69+
$lines = explode("\n", $content);
70+
$words = [];
71+
72+
foreach ($lines as $line) {
73+
$line = trim($line);
74+
75+
// Skip empty lines and comments
76+
if ($line === '' || str_starts_with($line, '#')) {
77+
continue;
78+
}
79+
80+
$words[] = mb_strtolower($line, 'UTF-8');
81+
}
82+
83+
return $words;
84+
}
85+
}

0 commit comments

Comments
 (0)