Skip to content

xgboost 3.2.0 crashes with cudf 26.02 when there are categorical features #12138

@sktin

Description

@sktin

The latest stable version of xgboost (3.2.0) does not work with cudf 26.02 when the input has categorical features: fitting a model raises a TypeError. It works fine with cudf 25.12.

Code to replicate:

import numpy as np
import cudf, xgboost
from xgboost import XGBRegressor

print(F'{cudf.__version__=}')
print(F'{xgboost.__version__=}')

rng = np.random.default_rng(0)
X = cudf.DataFrame(rng.random((100,10))).astype(str).astype('category')
y = cudf.Series(rng.random(100))

model = XGBRegressor(enable_categorical=True, device='cuda').fit(X, y)

Actual output:

cudf.__version__='26.02.01'
xgboost.__version__='3.2.0'

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/tmp/ipykernel_55/3028577614.py in <cell line: 0>()
     10 y = cudf.Series(rng.random(100))
     11 
---> 12 model = XGBRegressor(enable_categorical=True, device='cuda').fit(X, y)

/usr/local/lib/python3.12/dist-packages/xgboost/core.py in inner_f(*args, **kwargs)
    749             for k, arg in zip(sig.parameters, args):
    750                 kwargs[k] = arg
--> 751             return func(**kwargs)
    752 
    753         return inner_f

/usr/local/lib/python3.12/dist-packages/xgboost/sklearn.py in fit(self, X, y, sample_weight, base_margin, eval_set, verbose, xgb_model, sample_weight_eval_set, base_margin_eval_set, feature_weights)
   1341 
   1342             evals_result: EvalsLog = {}
-> 1343             train_dmatrix, evals = _wrap_evaluation_matrices(
   1344                 missing=self.missing,
   1345                 X=X,

/usr/local/lib/python3.12/dist-packages/xgboost/sklearn.py in _wrap_evaluation_matrices(missing, X, y, group, qid, sample_weight, base_margin, feature_weights, eval_set, sample_weight_eval_set, base_margin_eval_set, eval_group, eval_qid, create_dmatrix, enable_categorical, feature_types)
    698     """
    699     # Feature_types contains the optional reference categories from the booster object.
--> 700     train_dmatrix = create_dmatrix(
    701         data=X,
    702         label=y,

/usr/local/lib/python3.12/dist-packages/xgboost/sklearn.py in _create_dmatrix(self, ref, **kwargs)
   1260             except TypeError:  # `QuantileDMatrix` supports lesser types than DMatrix
   1261                 pass
-> 1262         return DMatrix(**kwargs, nthread=self.n_jobs)
   1263 
   1264     def _set_evaluation_result(self, evals_result: EvalsLog) -> None:

/usr/local/lib/python3.12/dist-packages/xgboost/core.py in inner_f(*args, **kwargs)
    749             for k, arg in zip(sig.parameters, args):
    750                 kwargs[k] = arg
--> 751             return func(**kwargs)
    752 
    753         return inner_f

/usr/local/lib/python3.12/dist-packages/xgboost/core.py in __init__(self, data, label, weight, base_margin, missing, silent, feature_names, feature_types, nthread, group, qid, label_lower_bound, label_upper_bound, feature_weights, enable_categorical, data_split_mode)
    974             return
    975 
--> 976         handle, feature_names, feature_types = dispatch_data_backend(
    977             data=data,
    978             missing=self.missing,

/usr/local/lib/python3.12/dist-packages/xgboost/data.py in dispatch_data_backend(data, missing, threads, feature_names, feature_types, enable_categorical, data_split_mode)
   1437         )
   1438     if _is_cudf_df(data) or _is_cudf_ser(data):
-> 1439         return _from_cudf_df(
   1440             data=data,
   1441             missing=missing,

/usr/local/lib/python3.12/dist-packages/xgboost/data.py in _from_cudf_df(data, missing, nthread, feature_names, feature_types, enable_categorical)
   1130     enable_categorical: bool,
   1131 ) -> DispatchedDataBackendReturnType:
-> 1132     df, feature_names, feature_types = _transform_cudf_df(
   1133         data, feature_names, feature_types, enable_categorical
   1134     )

/usr/local/lib/python3.12/dist-packages/xgboost/data.py in _transform_cudf_df(data, feature_names, feature_types, enable_categorical)
   1115 
   1116     return (
-> 1117         CudfTransformed(result, ref_categories=ref_categories),
   1118         feature_names,
   1119         feature_types,

/usr/local/lib/python3.12/dist-packages/xgboost/data.py in __init__(self, columns, ref_categories)
   1026 
   1027         for col in self.columns:
-> 1028             push_series(col)
   1029 
   1030         super().__init__(

/usr/local/lib/python3.12/dist-packages/xgboost/data.py in push_series(ser)
   1017             if _is_df_cat(ser):
   1018                 cats, codes = ser.categories, ser.codes
-> 1019                 cats_ainf, codes_ainf, buf = cudf_cat_inf(cats, codes)
   1020                 temporary_buffers.append(buf)
   1021                 aitfs.append((cats_ainf, codes_ainf))

/usr/local/lib/python3.12/dist-packages/xgboost/_data_utils.py in cudf_cat_inf(cats, codes)
    596 
    597     # pylint: disable=protected-access
--> 598     arrow_col = cats._column.to_pylibcudf(mode="read")
    599     # Tuple[types.CapsuleType, types.CapsuleType]
    600     schema, array = arrow_col.__arrow_c_device_array__()

TypeError: ColumnBase.to_pylibcudf() got an unexpected keyword argument 'mode'

Metadata

Metadata

Assignees

Labels

No labels
No labels

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions