-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfit_rfc.Rd
More file actions
120 lines (108 loc) · 3.09 KB
/
fit_rfc.Rd
File metadata and controls
120 lines (108 loc) · 3.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/fit_predict_rfc.R
\name{fit_rfc}
\alias{fit_rfc}
\alias{fit_rf}
\title{Hidden genome random forest classifier (rfc)}
\usage{
fit_rfc(
X,
Y,
backend = "ranger",
tune = TRUE,
mtry = NULL,
n_mtry = 6,
max.depth = c(0, 10^(-4:1)),
num.trees = 1000,
...
)
fit_rf(
X,
Y,
backend = "ranger",
tune = TRUE,
mtry = NULL,
n_mtry = 6,
max.depth = c(0, 10^(-4:1)),
num.trees = 1000,
...
)
}
\arguments{
\item{X}{design matrix with observations across rows and predictors across
columns. For a typical hidden genome classifier, each row represents a tumor and
the columns represent binary (1/0) presence/absence indicators of raw variants,
counts of mutations at specific genes, counts of mutations corresponding to
specific mutation signatures, etc., each possibly normalized by some function
of the total mutation burden in the tumor.}
\item{Y}{character vector or factor denoting the cancer type of tumors whose
mutation profiles are listed across the rows of \code{X}.}
\item{backend}{Which backend to use? Available options are
\code{"ranger"} and \code{"randomForest"}, corresponding to the respective R packages.
NOTE: randomForest does not support sparseMatrix input, so the predictor matrix
is coerced into an ordinary matrix. This means using randomForest will likely
be more memory intensive, and hence slower, than ranger. NOTE: the \pkg{ranger} and
\pkg{randomForest} packages must be installed separately.}
\item{tune}{logical. Tune the random forest hyperparameters? Only used if
\code{backend = "ranger"}. Defaults to TRUE. If TRUE, a collection of models is
trained with various mtry and num.trees parameters, and the fitted model with the
minimum out-of-bag (OOB) prediction error is returned.}
\item{...}{additional arguments passed to ranger::ranger or randomForest::randomForest (depending on backend).}
}
\description{
Hidden genome random forest classifier (rfc)
}
\details{
Lightweight wrapper around randomForest or ranger for use in hidden
genome classification.
}
\examples{
data("impact")
top_v <- variant_screen_mi(
maf = impact,
variant_col = "Variant",
cancer_col = "CANCER_SITE",
sample_id_col = "patient_id",
mi_rank_thresh = 50,
return_prob_mi = FALSE
)
var_design <- extract_design(
maf = impact,
variant_col = "Variant",
sample_id_col = "patient_id",
variant_subset = top_v
)
canc_resp <- extract_cancer_response(
maf = impact,
cancer_col = "CANCER_SITE",
sample_id_col = "patient_id"
)
pid <- names(canc_resp)
# create five stratified random folds
# based on the response cancer categories
set.seed(42)
folds <- data.table::data.table(
resp = canc_resp
)[,
foldid := sample(rep(1:5, length.out = .N)),
by = resp
]$foldid
# 80\%-20\% stratified separation of training and
# test set tumors
idx_train <- pid[folds != 5]
idx_test <- pid[folds == 5]
\dontrun{
# train a classifier on the training set
# using only variants (will have low accuracy
# -- no meta-feature information used)
fit0 <- fit_rfc(
X = var_design[idx_train, ],
Y = canc_resp[idx_train],
tune = FALSE
)
pred0 <- predict_rfc(
fit = fit0,
Xnew = var_design[idx_test, ]
)
}
}