dmlc · mfdel · Jan 14, 2026
diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
@@ -2981,6 +2981,70 @@ def inplace_predict(
             "Data type:" + str(type(data)) + " not supported by inplace prediction."
         )
 
+    def compute_leaf_similarity(
+        self,
+        data: DMatrix,
+        reference: DMatrix,
+        weight_type: str = "gain",
+    ) -> np.ndarray:
+        """Return tree-weighted leaf co-occurrence similarity between two datasets.
+
+        A tree adds its weight to a (query, reference) pair when both rows land in the
+        same leaf of that tree; the sum is divided by the total weight of all trees.
+        If the tree weights sum to zero, every tree is weighted equally instead.
+
+        Parameters
+        ----------
+        data :
+            Query dataset (m samples).
+        reference :
+            Reference dataset (n samples).
+        weight_type :
+            Which split-node statistic is summed per tree to form its weight:
+            "gain" or "cover".
+
+        Returns
+        -------
+        similarity : ndarray of shape (m, n)
+            float32 share of the total tree weight on which each pair agrees.
+
+        Raises
+        ------
+        ValueError
+            If ``weight_type`` is neither "gain" nor "cover".
+        """
+        if weight_type not in ("gain", "cover"):
+            raise ValueError(
+                f"weight_type must be 'gain' or 'cover', got '{weight_type}'"
+            )
+
+        # Leaf ids per sample; a 1-D result is reshaped into a single tree column.
+        leaf_ids = []
+        for dmat in (data, reference):
+            leaves = self.predict(dmat, pred_leaf=True)
+            leaf_ids.append(leaves.reshape(-1, 1) if leaves.ndim == 1 else leaves)
+        query_leaves, ref_leaves = leaf_ids
+        n_trees = query_leaves.shape[1]
+
+        # Tree weight = chosen column summed over rows whose Feature is not "Leaf".
+        nodes = self.trees_to_dataframe()
+        column = {"gain": "Gain", "cover": "Cover"}[weight_type]
+        per_tree = nodes[nodes["Feature"] != "Leaf"].groupby("Tree")[column].sum()
+        weights = np.zeros(n_trees, dtype=np.float32)
+        for tree_id, tree_total in zip(per_tree.index, per_tree.to_numpy()):
+            if tree_id < n_trees:
+                weights[int(tree_id)] = tree_total
+
+        total_weight = weights.sum()
+        if total_weight == 0:
+            weights = np.ones(n_trees, dtype=np.float32)
+            total_weight = weights.sum()
+
+        similarity = np.zeros((len(query_leaves), len(ref_leaves)), dtype=np.float32)
+        for i, row in enumerate(query_leaves):
+            similarity[i] = ((ref_leaves == row) * weights).sum(axis=1) / total_weight
+        return similarity
+
     def save_model(self, fname: PathLike) -> None:
         """Save the model to a file.
 

diff --git a/tests/python/test_leaf_similarity.py b/tests/python/test_leaf_similarity.py
@@ -0,0 +1,47 @@
+"""Tests for leaf similarity computation."""
+
+import numpy as np
+import pytest
+
+import xgboost as xgb
+from xgboost import testing as tm
+
+rng = np.random.RandomState(1994)  # NOTE(review): not referenced by the test below
+
+
+class TestLeafSimilarity:
+    """Checks for ``Booster.compute_leaf_similarity``."""
+
+    def test_leaf_similarity(self) -> None:
+        """Exercise output shape, value range, weighting modes and validation."""
+        dtrain, _ = tm.load_agaricus(__file__)
+        params = {"max_depth": 4, "eta": 0.3, "objective": "binary:logistic"}
+        booster = xgb.train(params, dtrain, num_boost_round=10)
+
+        features = dtrain.get_data()
+        query = xgb.DMatrix(features[:10])
+        ref = xgb.DMatrix(features[100:150])
+
+        # One row per query sample, one column per reference sample, all in [0, 1].
+        scores = booster.compute_leaf_similarity(query, ref)
+        assert scores.shape == (10, 50)
+        assert scores.min() >= 0.0
+        assert scores.max() <= 1.0
+
+        # A sample compared with itself shares every leaf, so the diagonal is 1.
+        same = xgb.DMatrix(features[:20])
+        self_scores = booster.compute_leaf_similarity(same, same)
+        np.testing.assert_allclose(np.diag(self_scores), 1.0, rtol=1e-5)
+
+        # Both supported weightings produce a matrix of the same shape.
+        by_gain = booster.compute_leaf_similarity(query, ref, weight_type="gain")
+        by_cover = booster.compute_leaf_similarity(query, ref, weight_type="cover")
+        assert by_gain.shape == by_cover.shape
+
+        # Omitting weight_type must behave exactly like weight_type="gain".
+        by_default = booster.compute_leaf_similarity(query, ref)
+        np.testing.assert_array_equal(by_default, by_gain)
+
+        # Any other weight_type is rejected.
+        with pytest.raises(ValueError, match="weight_type must be"):
+            booster.compute_leaf_similarity(query, ref, weight_type="invalid")