Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ So far the algorithms that have been implemented are:
- [x] Okapi BM25
- [x] BM25L
- [x] BM25+
- [x] BM25F
- [ ] BM25-Adpt
- [ ] BM25T

Expand Down Expand Up @@ -71,3 +72,28 @@ bm25.get_top_n(tokenized_query, corpus, n=1)
# ['It is quite windy in London']
```
And that's pretty much it!

### Multi-field ranking with BM25F

If your documents have multiple fields (e.g. title, body, tags), you can use `BM25F` to score them with per-field boosting. Unlike scoring each field separately and summing, BM25F combines term frequencies across fields *before* applying saturation, which avoids over-estimating the importance of terms that appear in multiple fields.

```python
from rank_bm25 import BM25F

corpus = [
{"title": ["machine", "learning"], "body": ["introduction", "to", "machine", "learning"]},
{"title": ["deep", "neural", "networks"], "body": ["image", "recognition", "with", "CNNs"]},
{"title": ["natural", "language"], "body": ["text", "classification", "using", "transformers"]},
]

bm25f = BM25F(corpus, field_weights={"title": 2.0, "body": 1.0})

scores = bm25f.get_scores(["machine", "learning"])
```

Parameters:
- `field_weights`: boost weight per field (default 1.0 for all fields)
- `field_b`: dict mapping field name to its length-normalization parameter `b` (0.0–1.0, default 0.75 per field)
- `k1`: saturation parameter (default 1.5)

Documents can have different fields — missing fields are treated as empty. Like the other algorithms, `BM25F` also supports `get_batch_scores()`, `get_top_n()`, and the `tokenizer` parameter.
156 changes: 156 additions & 0 deletions rank_bm25.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,162 @@ def get_batch_scores(self, query, doc_ids):
return score.tolist()


class BM25F:
"""BM25F: field-aware BM25 that combines term frequencies across fields
before applying saturation, avoiding the over-estimation that occurs when
scoring each field independently and summing.

Reference: Robertson, S., Zaragoza, H., & Taylor, M. (2004).
Simple BM25 extension to multiple weighted fields.
"""

def __init__(self, corpus, field_weights=None, field_b=None, k1=1.5, epsilon=0.25, tokenizer=None):
"""
Parameters
----------
corpus : list of dicts
Each document is a dict mapping field names to token lists, e.g.
[{"title": ["hello", "world"], "body": ["foo", "bar", "baz"]}]
field_weights : dict, optional
Boost weight per field, e.g. {"title": 2.0, "body": 1.0}.
Fields not listed default to 1.0.
field_b : dict, optional
Length-normalization parameter per field (0.0-1.0), e.g. {"title": 0.75, "body": 0.75}.
Fields not listed default to 0.75.
k1 : float
Saturation parameter (shared across fields). Default 1.5.
epsilon : float
Floor for IDF values, as a fraction of average IDF. Default 0.25.
tokenizer : callable, optional
If provided, each field value is passed through this function.
"""
self.k1 = k1
self.epsilon = epsilon
self.tokenizer = tokenizer
self.corpus_size = 0
self.idf = {}

if tokenizer:
corpus = [{f: tokenizer(v) if isinstance(v, str) else v
for f, v in doc.items()} for doc in corpus]

# Discover all field names from corpus
self.fields = sorted({f for doc in corpus for f in doc})
self.field_weights = {f: (field_weights or {}).get(f, 1.0) for f in self.fields}
self.field_b = {f: (field_b or {}).get(f, 0.75) for f in self.fields}

nd = self._initialize(corpus)
self._calc_idf(nd)

def _initialize(self, corpus):
nd = {} # word -> number of documents containing word (in any field)
self.corpus_size = len(corpus)

# Per-field: doc lengths and term frequencies
self.field_doc_len = {f: [] for f in self.fields}
self.field_avgdl = {}
self.field_doc_freqs = {f: [] for f in self.fields}

for doc in corpus:
doc_terms = set()
for field in self.fields:
tokens = doc.get(field, [])
self.field_doc_len[field].append(len(tokens))

frequencies = {}
for word in tokens:
if word not in frequencies:
frequencies[word] = 0
frequencies[word] += 1
doc_terms.add(word)
self.field_doc_freqs[field].append(frequencies)

for word in doc_terms:
try:
nd[word] += 1
except KeyError:
nd[word] = 1

for field in self.fields:
total = sum(self.field_doc_len[field])
self.field_avgdl[field] = total / self.corpus_size if self.corpus_size else 0

return nd

def _calc_idf(self, nd):
idf_sum = 0
negative_idfs = []
for word, freq in nd.items():
idf = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)
self.idf[word] = idf
idf_sum += idf
if idf < 0:
negative_idfs.append(word)
self.average_idf = idf_sum / len(self.idf) if self.idf else 0

eps = self.epsilon * self.average_idf
for word in negative_idfs:
self.idf[word] = eps

def get_scores(self, query):
"""Score all documents against a tokenized query.

For each query term, computes a combined term frequency across fields:
tf_combined = sum over fields of: weight_f * tf(t,d,f) / (1 - b_f + b_f * dl_f / avgdl_f)
Then applies BM25 saturation once:
score += idf(t) * tf_combined * (k1 + 1) / (tf_combined + k1)
"""
score = np.zeros(self.corpus_size)
for q in query:
# Compute combined TF across all fields
tf_combined = np.zeros(self.corpus_size)
for field in self.fields:
w = self.field_weights[field]
b = self.field_b[field]
avgdl = self.field_avgdl[field]

q_freq = np.array([(doc.get(q) or 0) for doc in self.field_doc_freqs[field]])
dl = np.array(self.field_doc_len[field])

if avgdl > 0:
tf_combined += w * q_freq / (1 - b + b * dl / avgdl)
else:
tf_combined += w * q_freq

idf = self.idf.get(q) or 0
score += idf * (tf_combined * (self.k1 + 1)) / (tf_combined + self.k1)
return score

def get_batch_scores(self, query, doc_ids):
"""Score a subset of documents against a tokenized query."""
assert all(di < self.corpus_size for di in doc_ids)
score = np.zeros(len(doc_ids))
for q in query:
tf_combined = np.zeros(len(doc_ids))
for field in self.fields:
w = self.field_weights[field]
b = self.field_b[field]
avgdl = self.field_avgdl[field]

q_freq = np.array([(self.field_doc_freqs[field][di].get(q) or 0) for di in doc_ids])
dl = np.array(self.field_doc_len[field])[doc_ids]

if avgdl > 0:
tf_combined += w * q_freq / (1 - b + b * dl / avgdl)
else:
tf_combined += w * q_freq

idf = self.idf.get(q) or 0
score += idf * (tf_combined * (self.k1 + 1)) / (tf_combined + self.k1)
return score.tolist()

def get_top_n(self, query, documents, n=5):
assert self.corpus_size == len(documents), "The documents given don't match the index corpus!"
scores = self.get_scores(query)
top_n = np.argsort(scores)[::-1][:n]
return [documents[i] for i in top_n]


# BM25Adpt and BM25T are a bit more complicated than the previous algorithms here. Here a term-specific k1
# parameter is calculated before scoring is done

Expand Down
117 changes: 116 additions & 1 deletion tests/test_loading.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
myPath = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, myPath + '/../')

from rank_bm25 import BM25Okapi, BM25L, BM25Plus
from rank_bm25 import BM25Okapi, BM25L, BM25Plus, BM25F
import numpy as np
import re


Expand Down Expand Up @@ -37,3 +38,117 @@ def test_tokenizer():
assert bm25.corpus_size == 3
assert bm25.avgdl == 5
assert bm25.doc_len == [4, 6, 5]


# BM25F tests

# Shared fixture: six two-field documents (title + body) used by the BM25F tests below.
field_corpus = [
    {"title": ["machine", "learning", "basics"], "body": ["introduction", "to", "machine", "learning", "algorithms"]},
    {"title": ["deep", "neural", "networks"], "body": ["convolutional", "neural", "networks", "for", "image", "recognition"]},
    {"title": ["natural", "language", "processing"], "body": ["text", "classification", "using", "transformers"]},
    {"title": ["reinforcement", "learning"], "body": ["policy", "gradient", "methods", "for", "control"]},
    {"title": ["computer", "vision"], "body": ["object", "detection", "and", "segmentation"]},
    {"title": ["data", "engineering"], "body": ["building", "data", "pipelines", "at", "scale"]},
]


def test_bm25f_corpus_loading():
    """Index construction records corpus size, field names, and per-field lengths."""
    model = BM25F(field_corpus)
    assert model.corpus_size == 6
    assert set(model.fields) == {"title", "body"}
    expected_lens = {"title": [3, 3, 3, 2, 2, 2], "body": [5, 6, 4, 5, 4, 5]}
    for field, lens in expected_lens.items():
        assert model.field_doc_len[field] == lens


def test_bm25f_field_weights():
    """Explicit per-field weights are stored as given."""
    weights = {"title": 3.0, "body": 1.0}
    model = BM25F(field_corpus, field_weights=weights)
    assert model.field_weights == weights


def test_bm25f_default_params():
    """Unspecified fields fall back to weight 1.0 and b 0.75."""
    model = BM25F(field_corpus)
    assert all(model.field_weights[f] == 1.0 for f in model.fields)
    assert all(model.field_b[f] == 0.75 for f in model.fields)


def test_bm25f_scoring():
    """Only documents containing the query term receive a positive score."""
    model = BM25F(field_corpus, field_weights={"title": 3.0, "body": 1.0})
    scores = model.get_scores(["learning"])
    # "learning" appears in doc 0 (title + body) and doc 3 (title only).
    matching = {0, 3}
    for i, s in enumerate(scores):
        if i in matching:
            assert s > 0
        else:
            assert s == 0


def test_bm25f_title_boost_ranking():
    """Documents with query terms in a boosted field should rank higher."""
    model = BM25F(field_corpus, field_weights={"title": 3.0, "body": 1.0})
    scores = model.get_scores(["learning"])
    # Doc 0 matches in both fields; doc 3 matches in the title alone.
    assert scores[0] > scores[3]


def test_bm25f_matches_okapi_single_field():
    """BM25F with a single field should produce identical scores to BM25Okapi."""
    docs = [
        ["hello", "world", "foo"],
        ["bar", "baz", "qux"],
        ["hello", "bar", "world", "quux"],
        ["foo", "baz"],
        ["hello", "qux", "quux", "bar", "world"],
    ]
    query = ["hello", "world"]

    reference = BM25Okapi(docs, k1=1.5, b=0.75).get_scores(query)
    wrapped = BM25F([{"text": d} for d in docs],
                    field_weights={"text": 1.0}, field_b={"text": 0.75}, k1=1.5)

    assert np.allclose(reference, wrapped.get_scores(query))


def test_bm25f_batch_scores():
    """Batch scoring of a subset agrees with full-corpus scoring."""
    model = BM25F(field_corpus, field_weights={"title": 2.0, "body": 1.0})
    all_scores = model.get_scores(["learning"])
    subset = model.get_batch_scores(["learning"], [0, 3])
    for got, doc_id in zip(subset, [0, 3]):
        assert np.isclose(got, all_scores[doc_id])


def test_bm25f_get_top_n():
    """get_top_n returns the highest-scoring documents in rank order."""
    model = BM25F(field_corpus, field_weights={"title": 3.0, "body": 1.0})
    names = [f"doc{i}" for i in range(6)]
    assert model.get_top_n(["learning"], names, n=2) == ["doc0", "doc3"]


def test_bm25f_sparse_fields():
    """Documents with missing fields should work correctly."""
    docs = [
        {"title": ["hello"]},
        {"title": ["world"], "body": ["hello", "foo"]},
        {"body": ["bar", "baz"]},
    ]
    scores = BM25F(docs).get_scores(["hello"])
    # Docs 0 and 1 contain "hello" (in different fields); doc 2 does not.
    assert scores[0] > 0
    assert scores[1] > 0
    assert scores[2] == 0


def test_bm25f_tokenizer():
    """String field values are tokenized by the supplied tokenizer."""
    docs = [
        {"title": "hello world", "body": "foo bar baz"},
        {"title": "test doc", "body": "hello again"},
    ]
    model = BM25F(docs, tokenizer=str.split)
    assert model.corpus_size == 2
    assert model.field_doc_len["title"] == [2, 2]
    assert model.field_doc_len["body"] == [3, 2]