hidgenclassifier/man/screen_variant_mi.Rd at master · c7rishi/hidgenclassifier · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/screen_variant_mi.R
\name{screen_variant_mi}
\alias{screen_variant_mi}
\alias{variant_screen_mi}
\title{Mutual Information based feature screening of variants from a mutation
annotation file}
\usage{
screen_variant_mi(
  maf,
  variant_col = "variant",
  cancer_col = "cancer",
  sample_id_col = "sample",
  equal_cancer_prob_mi = TRUE,
  return_prob_mi = TRUE,
  mi_rank_thresh = 250,
  normalize_mi = FALSE,
  do_freq_screen = FALSE,
  thresh_freq_screen = 1/length(unique(maf[[sample_id_col]])),
  ...
)

variant_screen_mi(
  maf,
  variant_col = "variant",
  cancer_col = "cancer",
  sample_id_col = "sample",
  equal_cancer_prob_mi = TRUE,
  return_prob_mi = TRUE,
  mi_rank_thresh = 250,
  normalize_mi = FALSE,
  do_freq_screen = FALSE,
  thresh_freq_screen = 1/length(unique(maf[[sample_id_col]])),
  ...
)
}
\arguments{
\item{maf}{mutation annotation file --
a data frame-like object with at least three columns containing
variant labels, sample IDs, and cancer sites associated with the sample IDs.
NOTE: the rows of \code{maf} are assumed to be unique.}

\item{variant_col}{name of the column in \code{maf} containing variant labels.}

\item{cancer_col}{name of the column in \code{maf} that corresponds to cancer
sites for the tumor samples.}

\item{sample_id_col}{name of the column in \code{maf} containing tumor sample IDs.}

\item{equal_cancer_prob_mi}{logical. Should the marginal probabilities of
cancer sites be assumed equal (i.e., uniform) while computing mutual
information? If \code{FALSE}, the relative frequencies of cancer sites in
\code{maf} are used. CAUTION: the (sample) relative frequencies of cancer sites
in \code{maf} may not necessarily be good approximations of the truth.}

\item{return_prob_mi}{logical. Should the computed mutual information and the
cancer site specific probabilities for these screened variants be returned?
Defaults to TRUE.}

\item{mi_rank_thresh}{rank threshold for screening variants. The top variants
with \code{rank(MI_values) <= mi_rank_thresh} are returned. Defaults to 250.}

\item{normalize_mi}{logical. Should mutual information be normalized by the
product of the square roots of the marginal Shannon entropies? Defaults to FALSE.}

\item{do_freq_screen}{logical. Should an overall (relative) frequency-based
screening be performed prior to MI based screening?
This may reduce the computation load substantially for whole genome
data where potentially tens of millions of variants are observed only once. Defaults
to FALSE.}

\item{thresh_freq_screen}{Threshold for overall pan-cancer relative frequency
to use if a frequency-based screening is performed before MI based
screening. Defaults to 1/n_sample where n_sample is the pan-cancer
total number of tumors. Ignored if \code{do_freq_screen = FALSE}.}

\item{...}{Unused.}
}
\value{
a character vector listing the screened variant labels (sorted with the first
one having the highest MI) with ranks <= \code{mi_rank_thresh}.
Optionally, if \code{return_prob_mi = TRUE}, then
a data table named \code{prob_mi} listing cancer site specific probabilities
of ALL variants and the associated MIs is also returned.
}
\description{
Mutual Information based feature screening of variants from a mutation
annotation file
}
\details{
The function first estimates via relative frequencies the cancer site
specific probabilities of encountering EACH variant in the maf file. Then using
these estimated probabilities and the marginal probabilities of cancer sites,
the (possibly normalized) mutual information between (a) the occurrence of a
variant-"j" in a randomly chosen tumor and (b) the cancer site of the associated
tumor is computed for each variant-j in \code{maf}.
These MIs are then ranked and the variant labels associated with
MI rank <= \code{mi_rank_thresh} are returned.
}
\examples{
data("impact")
top_v <- screen_variant_mi(
  maf = impact,
  variant_col = "Variant",
  cancer_col = "CANCER_SITE",
  sample_id_col = "patient_id",
  mi_rank_thresh = 200,
  return_prob_mi = FALSE
)
top_v


}