-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPAV.R
More file actions
182 lines (147 loc) · 6.26 KB
/
PAV.R
File metadata and controls
182 lines (147 loc) · 6.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
############
## PavEdr ##
############
PavEdr <- function(y, X, Z, tuning.parameters = NULL, num.tuning = 300) {
  # Ridge tuning-parameter calibration based on personalized adaptive
  # validation (PAV) for Euclidean-distance Ridge.
  #
  # Args:
  #   y                 : Observation vector of length n.
  #   X                 : Design matrix of dimension n x p.
  #   Z                 : Testing matrix of dimension p x t; each column is a
  #                       testing vector of dimension p.
  #   tuning.parameters : Optional grid of ridge tuning parameters. When NULL
  #                       (default), a log-spaced grid 10^seq(-3, 3) of length
  #                       num.tuning is generated automatically.
  #   num.tuning        : Grid size used when tuning.parameters is NULL
  #                       (default 300); overridden by
  #                       length(tuning.parameters) when a grid is supplied.
  #
  # Returns a list with:
  #   beta.hat      : p x t matrix; column k is the ridge estimator selected
  #                   for testing vector Z[, k].
  #   y.hat         : Length-t vector; element k is t(Z[, k]) %*% beta.hat[, k].
  #   tuning.chosen : Length-t vector of selected tuning parameters.
  #   time.const    : Seconds spent on the part independent of Z.
  #   time.testing  : Mean seconds per testing vector for the selection step.

  # Set constants
  num.obs  <- nrow(X)  # number of observations (n)
  num.par  <- ncol(X)  # number of parameters (p)
  num.data <- ncol(Z)  # number of testing vectors (t)
  # Validate dimensional consistency early, with informative messages.
  if (length(y) != num.obs) {
    stop("length(y) must equal nrow(X).", call. = FALSE)
  }
  if (nrow(Z) != num.par) {
    stop("nrow(Z) must equal ncol(X).", call. = FALSE)
  }
  # Utility function: Euclidean norm ||x||_2 of a vector.
  EuclideanNorm <- function(x) {
    as.numeric(sqrt(t(x) %*% x))
  }

  ##### Step 1 : Generate a set of tuning parameters
  if (is.null(tuning.parameters)) {
    # Default: log-spaced grid over [1e-3, 1e3].
    power <- seq(from = -3, to = 3, length.out = num.tuning)
    tuning.parameters <- 10.0^power
  } else {
    # A grid was supplied; its length overrides num.tuning.
    num.tuning <- length(tuning.parameters)
  }
  # The selection loop below needs at least two candidates to compare.
  if (num.tuning < 2) {
    stop("number of tuning parameters not sufficient (must be > 1).",
         call. = FALSE)
  }

  ##### Step 2 : Compute the ridge solution path
  start.time <- Sys.time()
  # Singular value decomposition of X. In both regimes U, D, V are sized so
  # that V %*% D^+ %*% t(U) %*% y below is conformable.
  if (num.obs < num.par) {
    SVD <- svd(X)
    U <- SVD$u
    V <- SVD$v
  } else {
    SVD <- svd(X, nu = num.obs, nv = num.par)
    U <- SVD$u
    V <- SVD$v
  }
  # Ridge estimator for each tuning parameter, via the SVD identity
  # beta(lambda) = V diag(d / (d^2 + lambda)) t(U) y.
  estimators <- matrix(nrow = num.par, ncol = num.tuning)
  if (num.obs < num.par) {
    D.pseudo.Inverse <- matrix(0, nrow = num.obs, ncol = num.obs)
  } else {
    D.pseudo.Inverse <- matrix(0, nrow = num.par, ncol = num.obs)
  }
  for (i in seq_len(num.tuning)) {
    diag(D.pseudo.Inverse) <- SVD$d / (SVD$d^2 + tuning.parameters[i])
    estimators[, i] <- as.vector(V %*% D.pseudo.Inverse %*% t(U) %*% y)
  }

  ##### Step 3 : Transform the ridge tuning parameters to their edr
  #              counterparts
  # lambda_edr = || 2 X' (y - X beta(lambda)) ||_2 (gradient-norm at the
  # ridge solution).
  tuning.edr <- rep(0, num.tuning)
  for (i in seq_len(num.tuning)) {
    tuning.edr[i] <- EuclideanNorm(2 * t(X) %*% (y - X %*% estimators[, i]))
  }
  # Preallocate output arrays.
  tuning.hat    <- rep(NA, num.data)
  estimator.hat <- matrix(data = NA, nrow = num.par, ncol = num.data)
  outcome.hat   <- rep(NA, num.data)
  # Timing for the part independent of testing vectors.
  time.const <- as.numeric(difftime(Sys.time(), start.time, units = "secs"))

  ##### Step 4 : Compute the optimal tuning parameter for each testing vector
  start.time <- Sys.time()  # timing for the testing-dependent part
  for (k in seq_len(num.data)) {
    z <- Z[, k]  # current testing vector
    z.norm <- EuclideanNorm(z)
    # Bound for candidate i: |cos(z, beta_i)| * lambda_edr[i], where
    # cos(z, beta_i) is the cosine similarity between z and estimator i.
    bounds <- rep(0, num.tuning)
    for (i in seq_len(num.tuning)) {
      cosine <- as.numeric(abs(z %*% estimators[, i]) /
                             (z.norm * EuclideanNorm(estimators[, i])))
      bounds[i] <- cosine * tuning.edr[i]
    }
    # Ordering that makes the bounds increasing.
    ordering <- sort(bounds,
                     decreasing = FALSE,
                     index.return = TRUE,
                     method = "quick")$ix
    bounds.ordered     <- bounds[ordering]
    estimators.ordered <- estimators[, ordering]
    i.optimal <- NaN
    i.optimal.found <- FALSE
    # Scan from the largest bound downwards; for each candidate i check the
    # PAV admissibility condition against every j > i.
    for (i in (num.tuning - 1):1) {
      for (j in (i + 1):num.tuning) {
        # If the condition fails at index i, the selected candidate is the
        # smallest index for which all comparisons passed, i.e. i + 1.
        # NOTE(review): the original comment said "(i - 1)" while the code
        # selects i + 1; the code's behavior is preserved here — confirm
        # against the PAV-edr reference.
        if (abs(t(z) %*% (estimators.ordered[, i] - estimators.ordered[, j]))
            > z.norm * (abs(bounds.ordered[i] + bounds.ordered[j]))) {
          i.optimal <- i + 1
          i.optimal.found <- TRUE
          break
        }
      }
      if (i.optimal.found) {
        break
      }
    }
    # If no index was flagged, every candidate is admissible: fall back to
    # the smallest tuning parameter on the grid.
    if (!i.optimal.found) {
      i.optimal <- which.min(tuning.parameters[ordering])
      # TODO(YD): Output warning?
    }
    # Record the chosen tuning parameter, estimator, and fitted outcome.
    tuning.hat[k]      <- tuning.parameters[ordering][i.optimal]
    estimator.hat[, k] <- estimators.ordered[, i.optimal]
    outcome.hat[k]     <- t(z) %*% estimator.hat[, k]
  }
  # Mean calibration run time per testing vector.
  time.testing <- as.numeric(difftime(Sys.time(), start.time,
                                      units = "secs")) / num.data
  list(beta.hat      = estimator.hat,
       y.hat         = outcome.hat,
       tuning.chosen = tuning.hat,
       time.const    = time.const,
       time.testing  = time.testing)
}