|
| 1 | +<?php |
| 2 | + |
| 3 | +declare(strict_types=1); |
| 4 | + |
| 5 | +namespace PHPVector\BM25\StopWords; |
| 6 | + |
/**
 * English stop words provider.
 *
 * Supplies articles, prepositions, pronouns, conjunctions, auxiliary verbs,
 * and other high-frequency words that carry little semantic value, together
 * with the fragments left over when contractions are split on apostrophes.
 *
 * Every entry is lowercase and appears exactly once in the list.
 */
final class EnglishStopWords implements StopWordsProviderInterface
{
    /**
     * The stop word list, grouped by grammatical role for readability.
     *
     * A word that plays several roles (e.g. "for" is both a preposition and a
     * conjunction) is listed only under the first group it belongs to, so the
     * list never contains duplicate values.
     *
     * @var string[]
     */
    private const WORDS = [
        // Articles
        'a', 'an', 'the',

        // Prepositions
        'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
        'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
        'in', 'out', 'on', 'off', 'over', 'under', 'upon', 'within', 'without',
        'along', 'among', 'around', 'across', 'behind', 'beyond', 'near', 'toward', 'towards',

        // Personal pronouns (subject)
        'i', 'you', 'he', 'she', 'it', 'we', 'they',

        // Personal pronouns (object)
        'me', 'him', 'her', 'us', 'them',

        // Possessive adjectives and pronouns ("her" is listed with the object pronouns)
        'my', 'your', 'his', 'its', 'our', 'their',
        'mine', 'yours', 'hers', 'ours', 'theirs',

        // Reflexive pronouns
        'myself', 'yourself', 'yourselves', 'himself', 'herself', 'itself', 'ourselves', 'themselves',

        // Demonstrative pronouns
        'this', 'that', 'these', 'those',

        // Interrogative and relative pronouns
        'what', 'which', 'who', 'whom', 'whose',

        // Indefinite pronouns and determiners
        'all', 'any', 'both', 'each', 'every', 'few', 'many', 'more', 'most',
        'other', 'others', 'some', 'such', 'none', 'several',
        'anybody', 'anyone', 'anything', 'everybody', 'everyone', 'everything',
        'nobody', 'nothing', 'somebody', 'someone', 'something',

        // Coordinating conjunctions ("for" is listed with the prepositions)
        'and', 'but', 'or', 'nor', 'yet', 'so',

        // Subordinating conjunctions
        'if', 'because', 'as', 'until', 'while', 'although', 'though', 'unless',
        'since', 'when', 'where', 'whether', 'whereas', 'wherever', 'whenever',

        // Verb "to be" (all forms)
        'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',

        // Verb "to have" (all forms)
        'have', 'has', 'had', 'having',

        // Verb "to do" (all forms)
        'do', 'does', 'did', 'doing', 'done',

        // Modal verbs
        'can', 'could', 'will', 'would', 'shall', 'should', 'may', 'might', 'must',

        // Common auxiliary forms
        'get', 'gets', 'got', 'getting',
        'let', 'lets',

        // Adverbs of place ("where" is listed with the subordinating conjunctions)
        'here', 'there', 'anywhere', 'everywhere', 'somewhere', 'nowhere',

        // Adverbs of time ("when", "yet", "before" and "after" are listed in earlier groups)
        'now', 'then', 'always', 'never', 'often', 'sometimes', 'usually',
        'already', 'still', 'again', 'once', 'ever',
        'soon', 'later', 'recently', 'today', 'yesterday', 'tomorrow',

        // Adverbs of degree ("more" and "most" are listed with the determiners)
        'very', 'too', 'quite', 'rather', 'almost', 'enough', 'just', 'only',
        'even', 'also', 'well', 'much', 'less', 'least',

        // Other common adverbs
        'how', 'why', 'further', 'back',

        // Negation
        'no', 'not',

        // Other function words
        'own', 'same', 'than', 'like', 'per', 'via',

        // Contractions (tokenizer splits on apostrophes, leaving these fragments).
        // "can't" leaves "can", which is already listed with the modal verbs.
        's', 't', 'd', 'm', 've', 'll', 're',
        'don', 'doesn', 'didn', 'won', 'wouldn', 'shouldn', 'couldn', 'mustn', 'mightn',
        'hasn', 'haven', 'hadn', 'isn', 'aren', 'wasn', 'weren', 'ain',
    ];

    /**
     * Returns the English stop word list.
     *
     * @return string[] Lowercase stop words, without duplicates.
     */
    public function getStopWords(): array
    {
        return self::WORDS;
    }

    /**
     * Static access for use without instantiation.
     *
     * Returns the same list as getStopWords().
     *
     * @return string[] Lowercase stop words, without duplicates.
     */
    public static function words(): array
    {
        return self::WORDS;
    }
}
0 commit comments