Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitattributes
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,4 @@

*.rst text eol=lf
*.md text eol=lf
*.csv text eol=lf
*.csv text eol=lf
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -165,4 +165,4 @@ Rplots.pdf

# nsys
*.nsys-rep
rmm_log.dev*
rmm_log.dev*
6 changes: 3 additions & 3 deletions CONTRIBUTORS.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@ Contributors of DMLC/XGBoost
============================
XGBoost has been developed and used by a group of active community members. Everyone is more than welcome to contribute, which is a great way to make the project better and more accessible to more users.

Project Management Committee(PMC)
Project Management Committee(PMC)
----------
The Project Management Committee (PMC) consists of a group of active committers that moderate the discussion, manage the project release, and propose new committer/PMC members.
The Project Management Committee (PMC) consists of a group of active committers that moderate the discussion, manage the project release, and propose new committer/PMC members.

* [Tianqi Chen](https://github.com/tqchen), University of Washington
- Tianqi is a Ph.D. student working on large-scale machine learning. He is the creator of the project.
Expand All @@ -19,7 +19,7 @@ The Project Management Committee(PMC) consists group of active committers that m
* [Hyunsu Cho](http://hyunsu-cho.io/), NVIDIA
- Hyunsu is the maintainer of the XGBoost Python package. He also manages the Jenkins continuous integration system (https://xgboost-ci.net/). He is the initial author of the CPU 'hist' updater.
* [Rory Mitchell](https://github.com/RAMitchell), University of Waikato
- Rory is a Ph.D. student at University of Waikato. He is the original creator of the GPU training algorithms. He improved the CMake build system and continuous integration.
- Rory is a Ph.D. student at University of Waikato. He is the original creator of the GPU training algorithms. He improved the CMake build system and continuous integration.
* [Hongliang Liu](https://github.com/phunterlau)


Expand Down
2 changes: 1 addition & 1 deletion cmake/RPackageInstall.cmake.in
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,4 @@ set(XGB_DEPS_SCRIPT
check_call(COMMAND "${LIBR_EXECUTABLE}" -q -e "${XGB_DEPS_SCRIPT}")

# Install the XGBoost R package
check_call(COMMAND "${LIBR_EXECUTABLE}" CMD INSTALL --no-multiarch --build "${build_dir}/R-package")
check_call(COMMAND "${LIBR_EXECUTABLE}" CMD INSTALL --no-multiarch --build "${build_dir}/R-package")
69 changes: 39 additions & 30 deletions demo/aft_survival/aft_survival_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,54 +9,63 @@

import numpy as np
import pandas as pd
from sklearn.model_selection import ShuffleSplit

import xgboost as xgb
from sklearn.model_selection import ShuffleSplit

# The Veterans' Administration Lung Cancer Trial
# The Statistical Analysis of Failure Time Data by Kalbfleisch J. and Prentice R (1980)
CURRENT_DIR = os.path.dirname(__file__)
df = pd.read_csv(os.path.join(CURRENT_DIR, '../data/veterans_lung_cancer.csv'))
print('Training data:')
df = pd.read_csv(os.path.join(CURRENT_DIR, "../data/veterans_lung_cancer.csv"))
print("Training data:")
print(df)

# Split features and labels
y_lower_bound = df['Survival_label_lower_bound']
y_upper_bound = df['Survival_label_upper_bound']
X = df.drop(['Survival_label_lower_bound', 'Survival_label_upper_bound'], axis=1)
y_lower_bound = df["Survival_label_lower_bound"]
y_upper_bound = df["Survival_label_upper_bound"]
X = df.drop(["Survival_label_lower_bound", "Survival_label_upper_bound"], axis=1)

# Split data into training and validation sets
rs = ShuffleSplit(n_splits=2, test_size=.7, random_state=0)
rs = ShuffleSplit(n_splits=2, test_size=0.7, random_state=0)
train_index, valid_index = next(rs.split(X))
dtrain = xgb.DMatrix(X.values[train_index, :])
dtrain.set_float_info('label_lower_bound', y_lower_bound[train_index])
dtrain.set_float_info('label_upper_bound', y_upper_bound[train_index])
dtrain.set_float_info("label_lower_bound", y_lower_bound[train_index])
dtrain.set_float_info("label_upper_bound", y_upper_bound[train_index])
dvalid = xgb.DMatrix(X.values[valid_index, :])
dvalid.set_float_info('label_lower_bound', y_lower_bound[valid_index])
dvalid.set_float_info('label_upper_bound', y_upper_bound[valid_index])
dvalid.set_float_info("label_lower_bound", y_lower_bound[valid_index])
dvalid.set_float_info("label_upper_bound", y_upper_bound[valid_index])

# Train gradient boosted trees using AFT loss and metric
params = {'verbosity': 0,
'objective': 'survival:aft',
'eval_metric': 'aft-nloglik',
'tree_method': 'hist',
'learning_rate': 0.05,
'aft_loss_distribution': 'normal',
'aft_loss_distribution_scale': 1.20,
'max_depth': 6,
'lambda': 0.01,
'alpha': 0.02}
bst = xgb.train(params, dtrain, num_boost_round=10000,
evals=[(dtrain, 'train'), (dvalid, 'valid')],
early_stopping_rounds=50)
params = {
"verbosity": 0,
"objective": "survival:aft",
"eval_metric": "aft-nloglik",
"tree_method": "hist",
"learning_rate": 0.05,
"aft_loss_distribution": "normal",
"aft_loss_distribution_scale": 1.20,
"max_depth": 6,
"lambda": 0.01,
"alpha": 0.02,
}
bst = xgb.train(
params,
dtrain,
num_boost_round=10000,
evals=[(dtrain, "train"), (dvalid, "valid")],
early_stopping_rounds=50,
)

# Run prediction on the validation set
df = pd.DataFrame({'Label (lower bound)': y_lower_bound[valid_index],
'Label (upper bound)': y_upper_bound[valid_index],
'Predicted label': bst.predict(dvalid)})
df = pd.DataFrame(
{
"Label (lower bound)": y_lower_bound[valid_index],
"Label (upper bound)": y_upper_bound[valid_index],
"Predicted label": bst.predict(dvalid),
}
)
print(df)
# Show only data points with right-censored labels
print(df[np.isinf(df['Label (upper bound)'])])
print(df[np.isinf(df["Label (upper bound)"])])

# Save trained model
bst.save_model('aft_model.json')
bst.save_model("aft_model.json")
106 changes: 68 additions & 38 deletions demo/aft_survival/aft_survival_demo_with_optuna.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,78 +6,108 @@
using Optuna to tune hyperparameters

"""

import numpy as np
import optuna
import pandas as pd
from sklearn.model_selection import ShuffleSplit

import xgboost as xgb
from sklearn.model_selection import ShuffleSplit

# The Veterans' Administration Lung Cancer Trial
# The Statistical Analysis of Failure Time Data by Kalbfleisch J. and Prentice R (1980)
df = pd.read_csv('../data/veterans_lung_cancer.csv')
print('Training data:')
df = pd.read_csv("../data/veterans_lung_cancer.csv")
print("Training data:")
print(df)

# Split features and labels
y_lower_bound = df['Survival_label_lower_bound']
y_upper_bound = df['Survival_label_upper_bound']
X = df.drop(['Survival_label_lower_bound', 'Survival_label_upper_bound'], axis=1)
y_lower_bound = df["Survival_label_lower_bound"]
y_upper_bound = df["Survival_label_upper_bound"]
X = df.drop(["Survival_label_lower_bound", "Survival_label_upper_bound"], axis=1)

# Split data into training and validation sets
rs = ShuffleSplit(n_splits=2, test_size=.7, random_state=0)
rs = ShuffleSplit(n_splits=2, test_size=0.7, random_state=0)
train_index, valid_index = next(rs.split(X))
dtrain = xgb.DMatrix(X.values[train_index, :])
dtrain.set_float_info('label_lower_bound', y_lower_bound[train_index])
dtrain.set_float_info('label_upper_bound', y_upper_bound[train_index])
dtrain.set_float_info("label_lower_bound", y_lower_bound[train_index])
dtrain.set_float_info("label_upper_bound", y_upper_bound[train_index])
dvalid = xgb.DMatrix(X.values[valid_index, :])
dvalid.set_float_info('label_lower_bound', y_lower_bound[valid_index])
dvalid.set_float_info('label_upper_bound', y_upper_bound[valid_index])
dvalid.set_float_info("label_lower_bound", y_lower_bound[valid_index])
dvalid.set_float_info("label_upper_bound", y_upper_bound[valid_index])

# Define hyperparameter search space
base_params = {'verbosity': 0,
'objective': 'survival:aft',
'eval_metric': 'aft-nloglik',
'tree_method': 'hist'} # Hyperparameters common to all trials
base_params = {
"verbosity": 0,
"objective": "survival:aft",
"eval_metric": "aft-nloglik",
"tree_method": "hist",
} # Hyperparameters common to all trials


def objective(trial):
params = {'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
'aft_loss_distribution': trial.suggest_categorical('aft_loss_distribution',
['normal', 'logistic', 'extreme']),
'aft_loss_distribution_scale': trial.suggest_loguniform('aft_loss_distribution_scale', 0.1, 10.0),
'max_depth': trial.suggest_int('max_depth', 3, 8),
'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0)} # Search space
params = {
"learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 1.0),
"aft_loss_distribution": trial.suggest_categorical(
"aft_loss_distribution", ["normal", "logistic", "extreme"]
),
"aft_loss_distribution_scale": trial.suggest_loguniform(
"aft_loss_distribution_scale", 0.1, 10.0
),
"max_depth": trial.suggest_int("max_depth", 3, 8),
"lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
"alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
} # Search space
params.update(base_params)
pruning_callback = optuna.integration.XGBoostPruningCallback(trial, 'valid-aft-nloglik')
bst = xgb.train(params, dtrain, num_boost_round=10000,
evals=[(dtrain, 'train'), (dvalid, 'valid')],
early_stopping_rounds=50, verbose_eval=False, callbacks=[pruning_callback])
pruning_callback = optuna.integration.XGBoostPruningCallback(
trial, "valid-aft-nloglik"
)
bst = xgb.train(
params,
dtrain,
num_boost_round=10000,
evals=[(dtrain, "train"), (dvalid, "valid")],
early_stopping_rounds=50,
verbose_eval=False,
callbacks=[pruning_callback],
)
if bst.best_iteration >= 25:
return bst.best_score
else:
return np.inf # Reject models with < 25 trees


# Run hyperparameter search
study = optuna.create_study(direction='minimize')
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=200)
print('Completed hyperparameter tuning with best aft-nloglik = {}.'.format(study.best_trial.value))
print(
"Completed hyperparameter tuning with best aft-nloglik = {}.".format(
study.best_trial.value
)
)
params = {}
params.update(base_params)
params.update(study.best_trial.params)

# Re-run training with the best hyperparameter combination
print('Re-running the best trial... params = {}'.format(params))
bst = xgb.train(params, dtrain, num_boost_round=10000,
evals=[(dtrain, 'train'), (dvalid, 'valid')],
early_stopping_rounds=50)
print("Re-running the best trial... params = {}".format(params))
bst = xgb.train(
params,
dtrain,
num_boost_round=10000,
evals=[(dtrain, "train"), (dvalid, "valid")],
early_stopping_rounds=50,
)

# Run prediction on the validation set
df = pd.DataFrame({'Label (lower bound)': y_lower_bound[valid_index],
'Label (upper bound)': y_upper_bound[valid_index],
'Predicted label': bst.predict(dvalid)})
df = pd.DataFrame(
{
"Label (lower bound)": y_lower_bound[valid_index],
"Label (upper bound)": y_upper_bound[valid_index],
"Predicted label": bst.predict(dvalid),
}
)
print(df)
# Show only data points with right-censored labels
print(df[np.isinf(df['Label (upper bound)'])])
print(df[np.isinf(df["Label (upper bound)"])])

# Save trained model
bst.save_model('aft_best_model.json')
bst.save_model("aft_best_model.json")
1 change: 0 additions & 1 deletion demo/aft_survival/aft_survival_viz_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@

import matplotlib.pyplot as plt
import numpy as np

import xgboost as xgb

plt.rcParams.update({"font.size": 13})
Expand Down
2 changes: 1 addition & 1 deletion demo/c-api/external-memory/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@ In the example, we define a custom data iterator with 2 methods: `reset` and `ne
its end, and the `reset` method resets iterations. One important detail when using the C
API for data iterator is users need to make sure that the data passed into `next` method
must be kept in memory until the next iteration or `reset` is called. The external memory
DMatrix is not limited to training, but also valid for other features like prediction.
DMatrix is not limited to training, but also valid for other features like prediction.
1 change: 0 additions & 1 deletion demo/dask/cpu_survival.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import dask.array as da
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster

from xgboost import dask as dxgb
from xgboost.dask import DaskDMatrix

Expand Down
1 change: 0 additions & 1 deletion demo/dask/cpu_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

from dask import array as da
from dask.distributed import Client, LocalCluster

from xgboost import dask as dxgb
from xgboost.dask import DaskDMatrix

Expand Down
5 changes: 2 additions & 3 deletions demo/dask/dask_callbacks.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,11 @@
from typing import Any

import numpy as np
import xgboost as xgb
import xgboost.dask as dxgb
from dask.distributed import Client, LocalCluster
from dask_ml.datasets import make_regression
from dask_ml.model_selection import train_test_split

import xgboost as xgb
import xgboost.dask as dxgb
from xgboost.dask import DaskDMatrix


Expand Down
1 change: 0 additions & 1 deletion demo/dask/dask_learning_to_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
from dask import dataframe as dd
from distributed import Client, LocalCluster, wait
from sklearn.datasets import load_svmlight_file

from xgboost import dask as dxgb


Expand Down
1 change: 0 additions & 1 deletion demo/dask/forward_logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
from dask import array as da
from dask_cuda import LocalCUDACluster
from distributed import Client

from xgboost import dask as dxgb
from xgboost.callback import EvaluationMonitor

Expand Down
1 change: 0 additions & 1 deletion demo/dask/gpu_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from dask import dataframe as dd
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

from xgboost import dask as dxgb
from xgboost.dask import DaskDMatrix

Expand Down
1 change: 0 additions & 1 deletion demo/dask/sklearn_cpu_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

from dask import array as da
from dask.distributed import Client, LocalCluster

from xgboost import dask as dxgb


Expand Down
1 change: 0 additions & 1 deletion demo/dask/sklearn_gpu_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@

# It's recommended to use dask_cuda for GPU assignment
from dask_cuda import LocalCUDACluster

from xgboost import dask as dxgb


Expand Down
Loading