hidgenclassifier/man/fit_smlc.Rd at master · c7rishi/hidgenclassifier · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/fit_predict_smlc.R
\name{fit_smlc}
\alias{fit_smlc}
\alias{fit_mlogit}
\title{Hidden genome sparse multinomial logistic classifier (smlc)}
\usage{
fit_smlc(X, Y, grouped = TRUE, alpha = 1, normalize_rows = NULL, ...)

fit_mlogit(X, Y, grouped = TRUE, alpha = 1, normalize_rows = NULL, ...)
}
\arguments{
\item{X}{data design matrix with observations across rows and predictors across
columns. For a typical hidden genome classifier each row represents a tumor and
the columns represent (possibly normalized by some functions of
the total mutation burden in tumors) binary 1-0 presence/absence indicators
of raw variants, counts of mutations at specific genes and counts of mutations
corresponding to specific mutation signatures etc.}

\item{Y}{character vector or factor denoting the cancer type of tumors whose
mutation profiles are listed across the rows of \code{X}.}

\item{grouped}{logical. Use group-lasso penalty instead of the ordinary lasso
penalty? Defaults to TRUE.}

\item{alpha}{The elasticnet mixing parameter. Passed to \code{cv.glmnet}.}

\item{normalize_rows}{vector of the same length as \code{nrow(X)} to be used
to normalize the rows of \code{X}. If NULL (default), no normalization is performed.}

\item{...}{additional arguments passed to \code{cv.glmnet}.}
}
\value{
Returns a list containing the fitted \code{cv.glmnet} object,
the original X and Y, and the estimated
intercept vector alpha and regression coefficients matrix beta.
}
\description{
Hidden genome sparse multinomial logistic classifier (smlc)
}
\note{
The function is a light wrapper around \code{cv.glmnet} with
\code{family = "multinomial"}, and \code{type.multinomial = "grouped"} if
\code{grouped = TRUE}. \code{cv.glmnet} tunes the sparsity hyper-parameter using
cross-validation. By default, \code{fit_smlc} uses 10-fold cross-validation,
matching the default of \code{cv.glmnet} (this can be changed by supplying
\code{nfolds} in \code{...}), but with a stratified random partition
(based on the categories of \code{Y}) instead of the simple random
partition that \code{cv.glmnet} uses by default. Override this by supplying
\code{foldid} in \code{...}. In addition, \code{fit_smlc} by default sets
\code{maxit = 1e6} (instead of the default \code{maxit = 1e5} set in
\code{glmnet}) and \code{trace.it = TRUE} in \code{...}.
}
\examples{
# End-to-end example: screen variants, build a design matrix,
# split tumors into training and test sets, fit the classifier
# on the training set and predict on the test set.

# load the "impact" data set used throughout this example
data("impact")

# screen the variants in the data; the result is passed below as
# `variant_subset` when building the design matrix
# NOTE(review): presumably keeps the 50 top-ranked variants
# (mi_rank_thresh = 50) -- confirm against variant_screen_mi docs
top_v <- variant_screen_mi(
  maf = impact,
  variant_col = "Variant",
  cancer_col = "CANCER_SITE",
  sample_id_col = "patient_id",
  mi_rank_thresh = 50,
  return_prob_mi = FALSE
)

# design matrix restricted to the screened variants; it is indexed
# by row below using the names of the response vector
var_design <- extract_design(
  maf = impact,
  variant_col = "Variant",
  sample_id_col = "patient_id",
  variant_subset = top_v
)

# response taken from the CANCER_SITE column; its names are used
# as the tumor identifiers (`pid`) for the train/test split
canc_resp <- extract_cancer_response(
  maf = impact,
  cancer_col = "CANCER_SITE",
  sample_id_col = "patient_id"
)
pid <- names(canc_resp)
# create five stratified random folds
# based on the response cancer categories
# (the seed is fixed so the fold assignment is reproducible;
# within each response category the labels 1-5 are recycled to
# the group size and then shuffled by sample())
set.seed(42)
folds <- data.table::data.table(
  resp = canc_resp
)[,
  foldid := sample(rep(1:5, length.out = .N)),
  by = resp
]$foldid

# 80\%-20\% stratified separation of training and
# test set tumors
# (folds 1-4 form the training set, fold 5 the test set)
idx_train <- pid[folds != 5]
idx_test <- pid[folds == 5]

# train a classifier on the training set
# using only variants (will have low accuracy
# -- no meta-feature information used)
fit0 <- fit_mlogit(
  X = var_design[idx_train, ],
  Y = canc_resp[idx_train]
)

# predict for the held-out test set tumors using the fitted model
pred0 <- predict_mlogit(
  fit = fit0,
  Xnew = var_design[idx_test, ]
)


}