36-462/662, Spring 2022
17 February 2022
\[ \newcommand{\Prob}[1]{\mathbb{P}\left( #1 \right)} \newcommand{\Expect}[1]{\mathbb{E}\left[ #1 \right]} \newcommand{\Var}[1]{\mathrm{Var}\left[ #1 \right]} \newcommand{\Cov}[1]{\mathrm{Cov}\left[ #1 \right]} \newcommand{\Risk}{r} \newcommand{\EmpRisk}{\hat{r}} \newcommand{\Loss}{\ell} \newcommand{\OptimalStrategy}{\sigma} \DeclareMathOperator*{\argmin}{argmin} \newcommand{\ModelClass}{S} \newcommand{\OptimalModel}{s^*} \DeclareMathOperator{\tr}{tr} \newcommand{\optimand}{\theta} \newcommand{\optimum}{\optimand^*} \newcommand{\OptimalParameter}{\optimand^*} \newcommand{\OptDim}{{p}} \newcommand{\ERM}{\hat{\optimand}} \newcommand{\ObjFunc}{{M}} \newcommand{\Hessian}{\mathbf{k}} \]
Make sure the training and testing data are independent
Deaths per day in Chicago, from all causes except accidents, plus measurements of air pollution and temperature.
death | pm10median | pm25median | o3median | so2median | time | tmpd | date |
---|---|---|---|---|---|---|---|
130 | -7.4335443 | NA | -19.59234 | 1.9280426 | -2556.5 | 31.5 | 1986-12-31 |
150 | NA | NA | -19.03861 | -0.9855631 | -2555.5 | 33.0 | 1987-01-01 |
101 | -0.8265306 | NA | -20.21734 | -1.8914161 | -2554.5 | 33.0 | 1987-01-02 |
135 | 5.5664557 | NA | -19.67567 | 6.1393413 | -2553.5 | 29.0 | 1987-01-03 |
126 | NA | NA | -19.21734 | 2.2784649 | -2552.5 | 32.0 | 1987-01-04 |
130 | 6.5664557 | NA | -17.63400 | 9.8585839 | -2551.5 | 40.0 | 1987-01-05 |
# Fit a linear model of daily deaths on the pollution measures and temperature.
# Fix: lm()'s data-frame argument is `data`, not `death` -- the original
# `death = chicago` passed the data frame as an unmatched argument, so the
# model was not fit against the chicago data frame as intended.
chicago.lm <- lm(death ~ pm10median + o3median + so2median + tmpd, data = chicago)
# Randomly assign each row to "train" or "test" (roughly 50/50, since
# sample() with replace = TRUE draws the two labels with equal probability).
# NOTE(review): no set.seed() is visible here, so the particular labels
# shown below will differ from run to run.
chicago$split <- sample(c("train", "test"), size = nrow(chicago), replace = TRUE)
head(chicago$split)
## [1] "train" "test" "train" "test" "train" "test"
death | pm10median | pm25median | o3median | so2median | time | tmpd | date | split |
---|---|---|---|---|---|---|---|---|
130 | -7.4335443 | NA | -19.59234 | 1.9280426 | -2556.5 | 31.5 | 1986-12-31 | train |
150 | NA | NA | -19.03861 | -0.9855631 | -2555.5 | 33.0 | 1987-01-01 | test |
101 | -0.8265306 | NA | -20.21734 | -1.8914161 | -2554.5 | 33.0 | 1987-01-02 | train |
135 | 5.5664557 | NA | -19.67567 | 6.1393413 | -2553.5 | 29.0 | 1987-01-03 | test |
126 | NA | NA | -19.21734 | 2.2784649 | -2552.5 | 32.0 | 1987-01-04 | train |
130 | 6.5664557 | NA | -17.63400 | 9.8585839 | -2551.5 | 40.0 | 1987-01-05 | test |
# Preview the first few rows that landed in the training set
kable(head(chicago[chicago$split == "train", ]))
death | pm10median | pm25median | o3median | so2median | time | tmpd | date | split | |
---|---|---|---|---|---|---|---|---|---|
1 | 130 | -7.4335443 | NA | -19.592338 | 1.928043 | -2556.5 | 31.5 | 1986-12-31 | train |
3 | 101 | -0.8265306 | NA | -20.217338 | -1.891416 | -2554.5 | 33.0 | 1987-01-02 | train |
5 | 126 | NA | NA | -19.217338 | 2.278465 | -2552.5 | 32.0 | 1987-01-04 | train |
8 | 109 | -5.4335443 | NA | -12.170527 | -5.107941 | -2549.5 | 29.0 | 1987-01-07 | train |
10 | 153 | NA | NA | -18.580280 | -2.046929 | -2547.5 | 32.5 | 1987-01-09 | train |
11 | 124 | -19.4335443 | NA | -5.712194 | -1.600999 | -2546.5 | 29.5 | 1987-01-10 | train |
# Preview the first few rows that landed in the testing set
kable(head(chicago[chicago$split == "test", ]))
death | pm10median | pm25median | o3median | so2median | time | tmpd | date | split | |
---|---|---|---|---|---|---|---|---|---|
2 | 150 | NA | NA | -19.03861 | -0.9855631 | -2555.5 | 33.0 | 1987-01-01 | test |
4 | 135 | 5.5664557 | NA | -19.67567 | 6.1393413 | -2553.5 | 29.0 | 1987-01-03 | test |
6 | 130 | 6.5664557 | NA | -17.63400 | 9.8585839 | -2551.5 | 40.0 | 1987-01-05 | test |
7 | 129 | -0.4335443 | NA | -15.37440 | -5.8189921 | -2550.5 | 34.5 | 1987-01-06 | test |
9 | 125 | -0.5714286 | NA | -20.09234 | 0.1822373 | -2548.5 | 26.5 | 1987-01-08 | test |
12 | 111 | -15.4335443 | NA | -15.62886 | 2.9379306 | -2545.5 | 34.5 | 1987-01-11 | test |
# rep() with length.out recycles the labels 1,2,3 until every row is covered,
# giving (as near as possible) equal-sized folds
head(rep(1:3, length.out = nrow(chicago)))
## [1] 1 2 3 1 2 3
# Shuffle the recycled labels so fold membership is random rather than
# following the row order (which here is time order)
chicago$fold <- sample(rep(1:3, length.out = nrow(chicago)))
head(chicago$fold)
## [1] 1 1 1 3 1 3
# Preview the first few rows assigned to fold 1
kable(head(chicago[chicago$fold == 1, ]))
death | pm10median | pm25median | o3median | so2median | time | tmpd | date | split | fold | |
---|---|---|---|---|---|---|---|---|---|---|
1 | 130 | -7.4335443 | NA | -19.59234 | 1.9280426 | -2556.5 | 31.5 | 1986-12-31 | train | 1 |
2 | 150 | NA | NA | -19.03861 | -0.9855631 | -2555.5 | 33.0 | 1987-01-01 | test | 1 |
3 | 101 | -0.8265306 | NA | -20.21734 | -1.8914161 | -2554.5 | 33.0 | 1987-01-02 | train | 1 |
5 | 126 | NA | NA | -19.21734 | 2.2784649 | -2552.5 | 32.0 | 1987-01-04 | train | 1 |
9 | 125 | -0.5714286 | NA | -20.09234 | 0.1822373 | -2548.5 | 26.5 | 1987-01-08 | test | 1 |
10 | 153 | NA | NA | -18.58028 | -2.0469293 | -2547.5 | 32.5 | 1987-01-09 | train | 1 |
# Preview the first few rows assigned to fold 2
kable(head(chicago[chicago$fold == 2, ]))
death | pm10median | pm25median | o3median | so2median | time | tmpd | date | split | fold | |
---|---|---|---|---|---|---|---|---|---|---|
8 | 109 | -5.433544 | NA | -12.170527 | -5.1079414 | -2549.5 | 29.0 | 1987-01-07 | train | 2 |
11 | 124 | -19.433544 | NA | -5.712194 | -1.6009988 | -2546.5 | 29.5 | 1987-01-10 | train | 2 |
15 | 109 | -7.256018 | NA | -18.712194 | -1.1414161 | -2542.5 | 32.5 | 1987-01-14 | train | 2 |
17 | 128 | NA | NA | -17.378861 | 1.8072373 | -2540.5 | 27.0 | 1987-01-16 | test | 2 |
19 | 130 | -9.433544 | NA | -7.634004 | -0.7186133 | -2538.5 | 23.0 | 1987-01-18 | test | 2 |
20 | 133 | -3.433544 | NA | -18.413614 | 0.6816042 | -2537.5 | 20.5 | 1987-01-19 | train | 2 |
# Preview the first few rows assigned to fold 3
kable(head(chicago[chicago$fold == 3, ]))
death | pm10median | pm25median | o3median | so2median | time | tmpd | date | split | fold | |
---|---|---|---|---|---|---|---|---|---|---|
4 | 135 | 5.5664557 | NA | -19.67567 | 6.139341 | -2553.5 | 29.0 | 1987-01-03 | test | 3 |
6 | 130 | 6.5664557 | NA | -17.63400 | 9.858584 | -2551.5 | 40.0 | 1987-01-05 | test | 3 |
7 | 129 | -0.4335443 | NA | -15.37440 | -5.818992 | -2550.5 | 34.5 | 1987-01-06 | test | 3 |
12 | 111 | -15.4335443 | NA | -15.62886 | 2.937931 | -2545.5 | 34.5 | 1987-01-11 | test | 3 |
13 | 104 | 11.5664557 | NA | -17.04553 | 3.641800 | -2544.5 | 34.0 | 1987-01-12 | train | 3 |
18 | 141 | -2.4335443 | NA | -17.00386 | 0.562692 | -2539.5 | 17.5 | 1987-01-17 | train | 3 |
Instead, let’s simulate!
“Synthetic” (i.e., fake) data example, so we know what the real answer is, and can tell which methods do and do not work
# Estimate the out-of-sample MSE of one or more linear-model specifications
# by k-fold cross-validation.
#
# Arguments:
#   data     -- data frame; rows with any NA are dropped before splitting
#   formulae -- vector or list of model formulas (or strings coercible to
#               formulas by as.formula)
#   nfolds   -- number of folds (default 5)
# Returns: a vector of cross-validated MSEs, one per formula, named by the
#   formula text.
cv.lm <- function(data, formulae, nfolds = 5) {
  data <- na.omit(data)
  # lapply (not sapply) so we reliably get a list of formulas back
  formulae <- lapply(formulae, as.formula)
  # response.name() (defined elsewhere in this file) extracts the name of
  # the response variable from a formula
  responses <- vapply(formulae, response.name, character(1))
  names(responses) <- as.character(formulae)
  n <- nrow(data)
  # Random, nearly-balanced assignment of rows to folds
  fold.labels <- sample(rep(1:nfolds, length.out = n))
  mses <- matrix(NA, nrow = nfolds, ncol = length(formulae))
  # Fix: label the columns of the results matrix so the returned colMeans
  # vector is named by formula.  The original line assigned to a *local
  # variable* named `colnames`, which had no effect on `mses` at all.
  colnames(mses) <- as.character(formulae)
  for (fold in 1:nfolds) {
    # Fold `fold` is held out for testing; everything else trains
    test.rows <- which(fold.labels == fold)
    train <- data[-test.rows, ]
    test <- data[test.rows, ]
    for (form in seq_along(formulae)) {
      current.model <- lm(formula = formulae[[form]], data = train)
      predictions <- predict(current.model, newdata = test)
      test.responses <- test[, responses[form]]
      test.errors <- test.responses - predictions
      mses[fold, form] <- mean(test.errors^2)
    }
  }
  return(colMeans(mses))
}
# Extract the name of the response variable from a model formula.
# all.vars() lists a formula's variables left-to-right, so the first
# entry is the left-hand side (the response).
response.name <- function(formula) {
  all.vars(formula)[[1]]
}
Arlot, Sylvain, and Alain Celisse. 2010. “A Survey of Cross-Validation Procedures for Model Selection.” Statistics Surveys 4:40–79. https://doi.org/10.1214/09-SS054.
Burman, Prabir, Edmond Chow, and Deborah Nolan. 1994. “A Cross-Validatory Method for Dependent Data.” Biometrika 81:351–58. https://doi.org/10.1093/biomet/81.2.351.
Claeskens, Gerda, and Nils Lid Hjort. 2008. Model Selection and Model Averaging. Cambridge, England: Cambridge University Press.
Geisser, Seymour. 1975. “The Predictive Sample Reuse Method with Applications.” Journal of the American Statistical Association 70:320–28. https://doi.org/10.1080/01621459.1975.10479865.
Geisser, Seymour, and William F. Eddy. 1979. “A Predictive Approach to Model Selection.” Journal of the American Statistical Association 74:153–60. https://doi.org/10.1080/01621459.1979.10481632.
Györfi, László, Michael Kohler, Adam Krzyżak, and Harro Walk. 2002. A Distribution-Free Theory of Nonparametric Regression. New York: Springer-Verlag.
Laan, Mark J. van der, and Sandrine Dudoit. 2003. “Unified Cross-Validation Methodology for Selection Among Estimators and a General Cross-Validated Adaptive Epsilon-Net Estimator: Finite Sample Oracle Inequalities and Examples.” 130. U.C. Berkeley Division of Biostatistics Working Paper Series. http://www.bepress.com/ucbbiostat/paper130/.
Racine, Jeff. 2000. “Consistent Cross-Validatory Model-Selection for Dependent Data: Hv-Block Cross-Validation.” Journal of Econometrics 99:39–61. https://doi.org/10.1016/S0304-4076(00)00030-0.
Stone, M. 1974. “Cross-Validatory Choice and Assessment of Statistical Predictions.” Journal of the Royal Statistical Society B 36:111–47. http://www.jstor.org/stable/2984809.
Thaler, Richard H. 1994. The Winner’s Curse: Paradoxes and Anomalies of Economic Life. Princeton, New Jersey: Princeton University Press.
Tibshirani, Ryan J., and Robert Tibshirani. 2009. “A Bias Correction for the Minimum Error Rate in Cross-Validation.” Annals of Applied Statistics 3:822–29. http://arxiv.org/abs/0908.2904.
Vaart, Aad W. van der, Sandrine Dudoit, and Mark J. van der Laan. 2006. “Oracle Inequalities for Multi-Fold Cross Validation.” Statistics and Decisions 24:1001–21. https://doi.org/10.1524/stnd.2006/24.3.351.
Wahba, Grace. 1990. Spline Models for Observational Data. Philadelphia: Society for Industrial and Applied Mathematics.
Yang, Yuhong. 2005. “Can the Strengths of AIC and BIC Be Shared? A Conflict Between Model Identification and Regression Estimation.” Biometrika 92:937–50. https://doi.org/10.1093/biomet/92.4.937.