-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPAV.R
More file actions
182 lines (147 loc) · 6.26 KB
/
PAV.R
File metadata and controls
182 lines (147 loc) · 6.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
############
## PavEdr ##
############
PavEdr <- function(y, X, Z, tuning.parameters = NULL, num.tuning = 300) {
  # Ridge tuning-parameter calibration based on personalized adaptive
  # validation (PAV) for Euclidean-distance Ridge.
  #
  # Args:
  #   y                 : Observation vector of length n.
  #   X                 : Design matrix of dimension n x p.
  #   Z                 : Testing matrix of dimension p x t; each column is a
  #                       testing vector of dimension p.
  #   tuning.parameters : Optional grid of ridge tuning parameters. When NULL
  #                       (default), a log-spaced grid 10^seq(-3, 3) of length
  #                       num.tuning is generated automatically.
  #   num.tuning        : Grid size used when tuning.parameters is NULL
  #                       (default 300); overridden by
  #                       length(tuning.parameters) when a grid is supplied.
  #
  # Returns a list with:
  #   beta.hat      : p x t matrix; column k is the ridge estimator selected
  #                   for testing vector Z[, k].
  #   y.hat         : Length-t vector; element k is t(Z[, k]) %*% beta.hat[, k].
  #   tuning.chosen : Length-t vector of selected tuning parameters.
  #   time.const    : Seconds spent on the part independent of Z.
  #   time.testing  : Mean seconds per testing vector for the selection step.

  # Set constants
  num.obs  <- nrow(X)  # number of observations (n)
  num.par  <- ncol(X)  # number of parameters (p)
  num.data <- ncol(Z)  # number of testing vectors (t)
  # Validate dimensional consistency early, with informative messages.
  if (length(y) != num.obs) {
    stop("length(y) must equal nrow(X).", call. = FALSE)
  }
  if (nrow(Z) != num.par) {
    stop("nrow(Z) must equal ncol(X).", call. = FALSE)
  }
  # Utility function: Euclidean norm ||x||_2 of a vector.
  EuclideanNorm <- function(x) {
    as.numeric(sqrt(t(x) %*% x))
  }

  ##### Step 1 : Generate a set of tuning parameters
  if (is.null(tuning.parameters)) {
    # Default: log-spaced grid over [1e-3, 1e3].
    power <- seq(from = -3, to = 3, length.out = num.tuning)
    tuning.parameters <- 10.0^power
  } else {
    # A grid was supplied; its length overrides num.tuning.
    num.tuning <- length(tuning.parameters)
  }
  # The selection loop below needs at least two candidates to compare.
  if (num.tuning < 2) {
    stop("number of tuning parameters not sufficient (must be > 1).",
         call. = FALSE)
  }

  ##### Step 2 : Compute the ridge solution path
  start.time <- Sys.time()
  # Singular value decomposition of X. In both regimes U, D, V are sized so
  # that V %*% D^+ %*% t(U) %*% y below is conformable.
  if (num.obs < num.par) {
    SVD <- svd(X)
    U <- SVD$u
    V <- SVD$v
  } else {
    SVD <- svd(X, nu = num.obs, nv = num.par)
    U <- SVD$u
    V <- SVD$v
  }
  # Ridge estimator for each tuning parameter, via the SVD identity
  # beta(lambda) = V diag(d / (d^2 + lambda)) t(U) y.
  estimators <- matrix(nrow = num.par, ncol = num.tuning)
  if (num.obs < num.par) {
    D.pseudo.Inverse <- matrix(0, nrow = num.obs, ncol = num.obs)
  } else {
    D.pseudo.Inverse <- matrix(0, nrow = num.par, ncol = num.obs)
  }
  for (i in seq_len(num.tuning)) {
    diag(D.pseudo.Inverse) <- SVD$d / (SVD$d^2 + tuning.parameters[i])
    estimators[, i] <- as.vector(V %*% D.pseudo.Inverse %*% t(U) %*% y)
  }

  ##### Step 3 : Transform the ridge tuning parameters to their edr
  #              counterparts
  # lambda_edr = || 2 X' (y - X beta(lambda)) ||_2 (gradient-norm at the
  # ridge solution).
  tuning.edr <- rep(0, num.tuning)
  for (i in seq_len(num.tuning)) {
    tuning.edr[i] <- EuclideanNorm(2 * t(X) %*% (y - X %*% estimators[, i]))
  }
  # Preallocate output arrays.
  tuning.hat    <- rep(NA, num.data)
  estimator.hat <- matrix(data = NA, nrow = num.par, ncol = num.data)
  outcome.hat   <- rep(NA, num.data)
  # Timing for the part independent of testing vectors.
  time.const <- as.numeric(difftime(Sys.time(), start.time, units = "secs"))

  ##### Step 4 : Compute the optimal tuning parameter for each testing vector
  start.time <- Sys.time()  # timing for the testing-dependent part
  for (k in seq_len(num.data)) {
    z <- Z[, k]  # current testing vector
    z.norm <- EuclideanNorm(z)
    # Bound for candidate i: |cos(z, beta_i)| * lambda_edr[i], where
    # cos(z, beta_i) is the cosine similarity between z and estimator i.
    bounds <- rep(0, num.tuning)
    for (i in seq_len(num.tuning)) {
      cosine <- as.numeric(abs(z %*% estimators[, i]) /
                             (z.norm * EuclideanNorm(estimators[, i])))
      bounds[i] <- cosine * tuning.edr[i]
    }
    # Ordering that makes the bounds increasing.
    ordering <- sort(bounds,
                     decreasing = FALSE,
                     index.return = TRUE,
                     method = "quick")$ix
    bounds.ordered     <- bounds[ordering]
    estimators.ordered <- estimators[, ordering]
    i.optimal <- NaN
    i.optimal.found <- FALSE
    # Scan from the largest bound downwards; for each candidate i check the
    # PAV admissibility condition against every j > i.
    for (i in (num.tuning - 1):1) {
      for (j in (i + 1):num.tuning) {
        # If the condition fails at index i, the selected candidate is the
        # smallest index for which all comparisons passed, i.e. i + 1.
        # NOTE(review): the original comment said "(i - 1)" while the code
        # selects i + 1; the code's behavior is preserved here — confirm
        # against the PAV-edr reference.
        if (abs(t(z) %*% (estimators.ordered[, i] - estimators.ordered[, j]))
            > z.norm * (abs(bounds.ordered[i] + bounds.ordered[j]))) {
          i.optimal <- i + 1
          i.optimal.found <- TRUE
          break
        }
      }
      if (i.optimal.found) {
        break
      }
    }
    # If no index was flagged, every candidate is admissible: fall back to
    # the smallest tuning parameter on the grid.
    if (!i.optimal.found) {
      i.optimal <- which.min(tuning.parameters[ordering])
      # TODO(YD): Output warning?
    }
    # Record the chosen tuning parameter, estimator, and fitted outcome.
    tuning.hat[k]      <- tuning.parameters[ordering][i.optimal]
    estimator.hat[, k] <- estimators.ordered[, i.optimal]
    outcome.hat[k]     <- t(z) %*% estimator.hat[, k]
  }
  # Mean calibration run time per testing vector.
  time.testing <- as.numeric(difftime(Sys.time(), start.time,
                                      units = "secs")) / num.data
  list(beta.hat      = estimator.hat,
       y.hat         = outcome.hat,
       tuning.chosen = tuning.hat,
       time.const    = time.const,
       time.testing  = time.testing)
}