-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfit_nnc.Rd
More file actions
140 lines (126 loc) · 4.56 KB
/
fit_nnc.Rd
File metadata and controls
140 lines (126 loc) · 4.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/fit_predict_nnc.R
\name{fit_nnc}
\alias{fit_nnc}
\alias{fit_nn}
\title{Train a fully-connected multi-class neural network}
\usage{
fit_nnc(
X,
Y,
val_split = 1/3,
trials = 200,
epochs = 50,
batch_size = 128,
verbose_mbo = TRUE,
seed = 1
)
fit_nn(
X,
Y,
val_split = 1/3,
trials = 200,
epochs = 50,
batch_size = 128,
verbose_mbo = TRUE,
seed = 1
)
}
\arguments{
\item{X}{data design matrix with observations across rows and predictors across
columns. For a typical hidden genome classifier each row represents a tumor and
the columns represent (possibly normalized by some functions of
the total mutation burden in tumors) binary 1-0 presence/absence indicators
of raw variants, counts of mutations at specific genes and counts of mutations
corresponding to specific mutation signatures etc.}
\item{Y}{character vector or factor denoting the cancer type of tumors whose
mutation profiles are listed across the rows of \code{X}.}
\item{val_split}{Fraction of the data to be used as a validation set for hyperparameter tuning}
\item{trials}{Number of trials for hyperparameter tuning}
\item{epochs}{Number of training epochs}
\item{batch_size}{Batch size (number of samples per gradient update) used during training}
\item{verbose_mbo}{Bayesian optimization verbosity mode (logical)}
\item{seed}{Random seed}
\item{...}{Unused}
}
\value{
Object of class "nn", a named list of length 7 with the components of the neural network training process
\item{X}{Input matrix}
\item{Y}{Response vector}
\item{map_df}{Dataframe with columns "original" and "numeric". The "original" column contains the original class names in Y and the "numeric" column contains the numeric representation of the classes used during training}
\item{model}{Final Keras model trained on X and Y (see https://keras.rstudio.com/articles/about_keras_models.html for more details)}
\item{ind_val}{Vector of indices of X corresponding to validation set used to tune hyperparameters}
\item{tuning_results}{Named list with the results from the hyperparameter search (output of mbo() from mlrMBO). The list elements include "x", a named list with the best hyperparameters found, and "y", the validation accuracy corresponding to the best hyperparameters. See description of MBOSingleObjResult from mlrMBO for more details.}
\item{preproc}{Named list with the parameters of the min-max pre-processing transformation applied to X prior to training (output of preProcess() from caret)}
}
\description{
This function first splits the data into a training and validation set and tunes hyperparameters using Bayesian optimization (similar to the approach used in Jiao et al. 2020), then uses the best hyperparameters to train a model on the entire dataset.
}
\note{
\enumerate{
\item The function uses packages \pkg{keras} and \pkg{tensorflow} for fitting neural networks, which
require a Python environment in the backend. See the installation notes for
the \pkg{keras} R package for more details.
\item In addition to \pkg{keras} and \pkg{tensorflow} the function makes use of
several functions from packages
\pkg{caret}, \pkg{mlrMBO}, \pkg{lhs}, \pkg{ParamHelpers}, \pkg{smoof}, and \pkg{mlr}
under the hood. These packages must be installed separately before using
\code{fit_nnc}.
}
}
\examples{
data("impact")
top_v <- variant_screen_mi(
maf = impact,
variant_col = "Variant",
cancer_col = "CANCER_SITE",
sample_id_col = "patient_id",
mi_rank_thresh = 50,
return_prob_mi = FALSE
)
var_design <- extract_design(
maf = impact,
variant_col = "Variant",
sample_id_col = "patient_id",
variant_subset = top_v
)
canc_resp <- extract_cancer_response(
maf = impact,
cancer_col = "CANCER_SITE",
sample_id_col = "patient_id"
)
pid <- names(canc_resp)
# create five stratified random folds
# based on the response cancer categories
set.seed(42)
folds <- data.table::data.table(
resp = canc_resp
)[,
foldid := sample(rep(1:5, length.out = .N)),
by = resp
]$foldid
# 80\%-20\% stratified separation of training and
# test set tumors
idx_train <- pid[folds != 5]
idx_test <- pid[folds == 5]
\dontrun{
# train a classifier on the training set
# using only variants (will have low accuracy
# -- no meta-feature information used)
fit0 <- fit_nnc(
X = var_design[idx_train, ],
Y = canc_resp[idx_train],
trials = 10,
epochs = 5
)
pred0 <- predict_nnc(
fit = fit0,
Xnew = var_design[idx_test, ]
)
}
}
\references{
Jiao W, Atwal G, Polak P, Karlic R, Cuppen E, Danyi A, De Ridder J, van Herpen C, Lolkema MP, Steeghs N, Getz G. A deep learning system accurately classifies primary and metastatic cancers using passenger mutation patterns. Nature communications. 2020 Feb 5;11(1):1-2.
}
\author{
Zoe Guan. Email: guanZ@mskcc.org
}