Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ So far the algorithms that have been implemented are:
- [x] Okapi BM25
- [x] BM25L
- [x] BM25+
- [x] BM25F
- [ ] BM25-Adpt
- [ ] BM25T

Expand Down Expand Up @@ -71,3 +72,28 @@ bm25.get_top_n(tokenized_query, corpus, n=1)
# ['It is quite windy in London']
```
And that's pretty much it!

### Multi-field ranking with BM25F

If your documents have multiple fields (e.g. title, body, tags), you can use `BM25F` to score them with per-field boosting. Unlike scoring each field separately and summing, BM25F combines term frequencies across fields *before* applying saturation, which avoids over-estimating the importance of terms that appear in multiple fields.

```python
from rank_bm25 import BM25F

corpus = [
{"title": ["machine", "learning"], "body": ["introduction", "to", "machine", "learning"]},
{"title": ["deep", "neural", "networks"], "body": ["image", "recognition", "with", "CNNs"]},
{"title": ["natural", "language"], "body": ["text", "classification", "using", "transformers"]},
]

bm25f = BM25F(corpus, field_weights={"title": 2.0, "body": 1.0})

scores = bm25f.get_scores(["machine", "learning"])
```

Parameters:
- `field_weights`: boost weight per field (default 1.0 for all fields)
- `field_b`: dict mapping field name to its length-normalization parameter `b` (0.0–1.0, default 0.75 per field)
- `k1`: saturation parameter (default 1.5)

Documents can have different fields — missing fields are treated as empty. Like the other algorithms, `BM25F` also supports `get_batch_scores()`, `get_top_n()`, and the `tokenizer` parameter.
156 changes: 156 additions & 0 deletions rank_bm25.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,162 @@ def get_batch_scores(self, query, doc_ids):
return score.tolist()


class BM25F:
"""BM25F: field-aware BM25 that combines term frequencies across fields
before applying saturation, avoiding the over-estimation that occurs when
scoring each field independently and summing.

Reference: Robertson, S., Zaragoza, H., & Taylor, M. (2004).
Simple BM25 extension to multiple weighted fields.
"""

def __init__(self, corpus, field_weights=None, field_b=None, k1=1.5, epsilon=0.25, tokenizer=None):
"""
Parameters
----------
corpus : list of dicts
Each document is a dict mapping field names to token lists, e.g.
[{"title": ["hello", "world"], "body": ["foo", "bar", "baz"]}]
field_weights : dict, optional
Boost weight per field, e.g. {"title": 2.0, "body": 1.0}.
Fields not listed default to 1.0.
field_b : dict, optional
Length-normalization parameter per field (0.0-1.0), e.g. {"title": 0.75, "body": 0.75}.
Fields not listed default to 0.75.
k1 : float
Saturation parameter (shared across fields). Default 1.5.
epsilon : float
Floor for IDF values, as a fraction of average IDF. Default 0.25.
tokenizer : callable, optional
If provided, each field value is passed through this function.
"""
self.k1 = k1
self.epsilon = epsilon
self.tokenizer = tokenizer
self.corpus_size = 0
self.idf = {}

if tokenizer:
corpus = [{f: tokenizer(v) if isinstance(v, str) else v
for f, v in doc.items()} for doc in corpus]

# Discover all field names from corpus
self.fields = sorted({f for doc in corpus for f in doc})
self.field_weights = {f: (field_weights or {}).get(f, 1.0) for f in self.fields}
self.field_b = {f: (field_b or {}).get(f, 0.75) for f in self.fields}

nd = self._initialize(corpus)
self._calc_idf(nd)

def _initialize(self, corpus):
nd = {} # word -> number of documents containing word (in any field)
self.corpus_size = len(corpus)

# Per-field: doc lengths and term frequencies
self.field_doc_len = {f: [] for f in self.fields}
self.field_avgdl = {}
self.field_doc_freqs = {f: [] for f in self.fields}

for doc in corpus:
doc_terms = set()
for field in self.fields:
tokens = doc.get(field, [])
self.field_doc_len[field].append(len(tokens))

frequencies = {}
for word in tokens:
if word not in frequencies:
frequencies[word] = 0
frequencies[word] += 1
doc_terms.add(word)
self.field_doc_freqs[field].append(frequencies)

for word in doc_terms:
try:
nd[word] += 1
except KeyError:
nd[word] = 1

for field in self.fields:
total = sum(self.field_doc_len[field])
self.field_avgdl[field] = total / self.corpus_size if self.corpus_size else 0

return nd

def _calc_idf(self, nd):
idf_sum = 0
negative_idfs = []
for word, freq in nd.items():
idf = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)
self.idf[word] = idf
idf_sum += idf
if idf < 0:
negative_idfs.append(word)
self.average_idf = idf_sum / len(self.idf) if self.idf else 0

eps = self.epsilon * self.average_idf
for word in negative_idfs:
self.idf[word] = eps

def get_scores(self, query):
"""Score all documents against a tokenized query.

For each query term, computes a combined term frequency across fields:
tf_combined = sum over fields of: weight_f * tf(t,d,f) / (1 - b_f + b_f * dl_f / avgdl_f)
Then applies BM25 saturation once:
score += idf(t) * tf_combined * (k1 + 1) / (tf_combined + k1)
"""
score = np.zeros(self.corpus_size)
for q in query:
# Compute combined TF across all fields
tf_combined = np.zeros(self.corpus_size)
for field in self.fields:
w = self.field_weights[field]
b = self.field_b[field]
avgdl = self.field_avgdl[field]

q_freq = np.array([(doc.get(q) or 0) for doc in self.field_doc_freqs[field]])
dl = np.array(self.field_doc_len[field])

if avgdl > 0:
tf_combined += w * q_freq / (1 - b + b * dl / avgdl)
else:
tf_combined += w * q_freq

idf = self.idf.get(q) or 0
score += idf * (tf_combined * (self.k1 + 1)) / (tf_combined + self.k1)
return score

def get_batch_scores(self, query, doc_ids):
"""Score a subset of documents against a tokenized query."""
assert all(di < self.corpus_size for di in doc_ids)
score = np.zeros(len(doc_ids))
for q in query:
tf_combined = np.zeros(len(doc_ids))
for field in self.fields:
w = self.field_weights[field]
b = self.field_b[field]
avgdl = self.field_avgdl[field]

q_freq = np.array([(self.field_doc_freqs[field][di].get(q) or 0) for di in doc_ids])
dl = np.array(self.field_doc_len[field])[doc_ids]

if avgdl > 0:
tf_combined += w * q_freq / (1 - b + b * dl / avgdl)
else:
tf_combined += w * q_freq

idf = self.idf.get(q) or 0
score += idf * (tf_combined * (self.k1 + 1)) / (tf_combined + self.k1)
return score.tolist()

def get_top_n(self, query, documents, n=5):
assert self.corpus_size == len(documents), "The documents given don't match the index corpus!"
scores = self.get_scores(query)
top_n = np.argsort(scores)[::-1][:n]
return [documents[i] for i in top_n]


# BM25Adpt and BM25T are a bit more complicated than the previous algorithms here. Here a term-specific k1
# parameter is calculated before scoring is done

Expand Down
117 changes: 116 additions & 1 deletion tests/test_loading.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
myPath = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, myPath + '/../')

from rank_bm25 import BM25Okapi, BM25L, BM25Plus
from rank_bm25 import BM25Okapi, BM25L, BM25Plus, BM25F
import numpy as np
import re


Expand Down Expand Up @@ -37,3 +38,117 @@ def test_tokenizer():
assert bm25.corpus_size == 3
assert bm25.avgdl == 5
assert bm25.doc_len == [4, 6, 5]


# BM25F tests

# Shared fixture: six two-field documents (title + body) used by the BM25F tests below.
field_corpus = [
    {"title": ["machine", "learning", "basics"], "body": ["introduction", "to", "machine", "learning", "algorithms"]},
    {"title": ["deep", "neural", "networks"], "body": ["convolutional", "neural", "networks", "for", "image", "recognition"]},
    {"title": ["natural", "language", "processing"], "body": ["text", "classification", "using", "transformers"]},
    {"title": ["reinforcement", "learning"], "body": ["policy", "gradient", "methods", "for", "control"]},
    {"title": ["computer", "vision"], "body": ["object", "detection", "and", "segmentation"]},
    {"title": ["data", "engineering"], "body": ["building", "data", "pipelines", "at", "scale"]},
]


def test_bm25f_corpus_loading():
    """Index construction records corpus size, field names, and per-field lengths."""
    model = BM25F(field_corpus)
    assert model.corpus_size == 6
    assert set(model.fields) == {"title", "body"}
    expected_lens = {"title": [3, 3, 3, 2, 2, 2], "body": [5, 6, 4, 5, 4, 5]}
    for field, lens in expected_lens.items():
        assert model.field_doc_len[field] == lens


def test_bm25f_field_weights():
    """Explicit per-field weights are stored as given."""
    weights = {"title": 3.0, "body": 1.0}
    model = BM25F(field_corpus, field_weights=weights)
    assert model.field_weights == weights


def test_bm25f_default_params():
    """Unspecified fields fall back to weight 1.0 and b 0.75."""
    model = BM25F(field_corpus)
    assert all(model.field_weights[f] == 1.0 for f in model.fields)
    assert all(model.field_b[f] == 0.75 for f in model.fields)


def test_bm25f_scoring():
    """Only documents containing the query term receive a positive score."""
    model = BM25F(field_corpus, field_weights={"title": 3.0, "body": 1.0})
    scores = model.get_scores(["learning"])
    # "learning" appears in doc 0 (title + body) and doc 3 (title only).
    matching = {0, 3}
    for i, s in enumerate(scores):
        if i in matching:
            assert s > 0
        else:
            assert s == 0


def test_bm25f_title_boost_ranking():
    """Documents with query terms in a boosted field should rank higher."""
    model = BM25F(field_corpus, field_weights={"title": 3.0, "body": 1.0})
    scores = model.get_scores(["learning"])
    # Doc 0 matches in both fields; doc 3 matches in the title alone.
    assert scores[0] > scores[3]


def test_bm25f_matches_okapi_single_field():
    """BM25F with a single field should produce identical scores to BM25Okapi."""
    docs = [
        ["hello", "world", "foo"],
        ["bar", "baz", "qux"],
        ["hello", "bar", "world", "quux"],
        ["foo", "baz"],
        ["hello", "qux", "quux", "bar", "world"],
    ]
    query = ["hello", "world"]

    reference = BM25Okapi(docs, k1=1.5, b=0.75).get_scores(query)
    wrapped = BM25F([{"text": d} for d in docs],
                    field_weights={"text": 1.0}, field_b={"text": 0.75}, k1=1.5)

    assert np.allclose(reference, wrapped.get_scores(query))


def test_bm25f_batch_scores():
    """Batch scoring of a subset agrees with full-corpus scoring."""
    model = BM25F(field_corpus, field_weights={"title": 2.0, "body": 1.0})
    all_scores = model.get_scores(["learning"])
    subset = model.get_batch_scores(["learning"], [0, 3])
    for got, doc_id in zip(subset, [0, 3]):
        assert np.isclose(got, all_scores[doc_id])


def test_bm25f_get_top_n():
    """get_top_n returns the highest-scoring documents in rank order."""
    model = BM25F(field_corpus, field_weights={"title": 3.0, "body": 1.0})
    names = [f"doc{i}" for i in range(6)]
    assert model.get_top_n(["learning"], names, n=2) == ["doc0", "doc3"]


def test_bm25f_sparse_fields():
    """Documents with missing fields should work correctly."""
    docs = [
        {"title": ["hello"]},
        {"title": ["world"], "body": ["hello", "foo"]},
        {"body": ["bar", "baz"]},
    ]
    scores = BM25F(docs).get_scores(["hello"])
    # Docs 0 and 1 contain "hello" (in different fields); doc 2 does not.
    assert scores[0] > 0
    assert scores[1] > 0
    assert scores[2] == 0


def test_bm25f_tokenizer():
    """String field values are tokenized by the supplied tokenizer."""
    docs = [
        {"title": "hello world", "body": "foo bar baz"},
        {"title": "test doc", "body": "hello again"},
    ]
    model = BM25F(docs, tokenizer=str.split)
    assert model.corpus_size == 2
    assert model.field_doc_len["title"] == [2, 2]
    assert model.field_doc_len["body"] == [3, 2]