Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions python-package/xgboost/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -2981,6 +2981,70 @@ def inplace_predict(
"Data type:" + str(type(data)) + " not supported by inplace prediction."
)

def compute_leaf_similarity(
    self,
    data: DMatrix,
    reference: DMatrix,
    weight_type: str = "gain",
) -> np.ndarray:
    """Score how often query rows and reference rows share a leaf.

    Every row of ``data`` and ``reference`` is mapped to the leaf it reaches in
    each tree.  The score of a (query, reference) pair is the weighted share of
    trees in which both rows reach the same leaf, normalised by the total tree
    weight.  This is similar to Random Forest proximity matrices.

    Parameters
    ----------
    data :
        Query dataset (m samples).
    reference :
        Reference dataset (n samples).
    weight_type :
        How each tree is weighted: "gain" sums the ``Gain`` column over the
        tree's split nodes (loss improvement), "cover" sums the ``Cover``
        column (hessian sum, approximately sample count for regression).

    Returns
    -------
    similarity : ndarray of shape (m, n)
        float32 similarity scores in [0, 1].

    Raises
    ------
    ValueError
        If ``weight_type`` is neither "gain" nor "cover".
    """
    if weight_type not in ("gain", "cover"):
        raise ValueError(
            f"weight_type must be 'gain' or 'cover', got '{weight_type}'"
        )

    # Leaf index reached by every sample in every tree, for both datasets.
    query_leaves, ref_leaves = (
        self.predict(dmat, pred_leaf=True) for dmat in (data, reference)
    )
    # A 1-D result is treated as a single column of leaf indices.
    query_leaves, ref_leaves = (
        leaves.reshape(-1, 1) if leaves.ndim == 1 else leaves
        for leaves in (query_leaves, ref_leaves)
    )
    n_trees = query_leaves.shape[1]

    # One weight per tree: the chosen statistic summed over its split nodes.
    frame = self.trees_to_dataframe()
    column = "Cover" if weight_type != "gain" else "Gain"
    is_split = frame["Feature"] != "Leaf"
    per_tree = frame[is_split].groupby("Tree")[column].sum()

    # Trees without split nodes keep a weight of zero; ids beyond the
    # predicted tree count are ignored.
    weights = np.zeros(n_trees, dtype=np.float32)
    tree_ids = per_tree.index.to_numpy()
    in_range = tree_ids < n_trees
    weights[tree_ids[in_range].astype(int)] = per_tree.to_numpy()[in_range]

    total_weight = weights.sum()
    if total_weight == 0:
        # No usable statistic at all: weight every tree equally instead.
        weights = np.ones(n_trees, dtype=np.float32)
        total_weight = weights.sum()

    n_query, n_ref = query_leaves.shape[0], ref_leaves.shape[0]
    similarity = np.zeros((n_query, n_ref), dtype=np.float32)
    # One query row at a time keeps the temporary at (n_ref, n_trees).
    for row, leaf_ids in enumerate(query_leaves):
        shared = ref_leaves == leaf_ids
        similarity[row] = np.sum(shared * weights, axis=1) / total_weight

    return similarity

def save_model(self, fname: PathLike) -> None:
"""Save the model to a file.

Expand Down
47 changes: 47 additions & 0 deletions tests/python/test_leaf_similarity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
"""Tests for leaf similarity computation."""

import numpy as np
import pytest

import xgboost as xgb
from xgboost import testing as tm

# Module-level NumPy RandomState seeded with 1994; the test class in this
# module does not reference it.
rng = np.random.RandomState(1994)


class TestLeafSimilarity:
    """Checks for ``Booster.compute_leaf_similarity``."""

    def test_leaf_similarity(self) -> None:
        """Cover shape, value range, self-similarity, weighting and validation."""
        dtrain, _ = tm.load_agaricus(__file__)
        booster = xgb.train(
            {"max_depth": 4, "eta": 0.3, "objective": "binary:logistic"},
            dtrain,
            num_boost_round=10,
        )

        features = dtrain.get_data()
        queries = xgb.DMatrix(features[:10])
        references = xgb.DMatrix(features[100:150])

        # Output is (n_query, n_reference) with every score inside [0, 1].
        scores = booster.compute_leaf_similarity(queries, references)
        assert scores.shape == (10, 50)
        assert scores.min() >= 0.0
        assert scores.max() <= 1.0

        # A sample compared with itself shares every leaf, so the diagonal is 1.
        same_rows = xgb.DMatrix(features[:20])
        self_scores = booster.compute_leaf_similarity(same_rows, same_rows)
        np.testing.assert_allclose(np.diag(self_scores), 1.0, rtol=1e-5)

        # Both supported weightings produce a matrix of the same shape.
        by_gain = booster.compute_leaf_similarity(
            queries, references, weight_type="gain"
        )
        by_cover = booster.compute_leaf_similarity(
            queries, references, weight_type="cover"
        )
        assert by_gain.shape == by_cover.shape

        # Omitting weight_type must behave exactly like "gain".
        by_default = booster.compute_leaf_similarity(queries, references)
        np.testing.assert_array_equal(by_default, by_gain)

        # Any other weight_type is rejected.
        with pytest.raises(ValueError, match="weight_type must be"):
            booster.compute_leaf_similarity(
                queries, references, weight_type="invalid"
            )
Loading