library(MASS) # needed to use mvrnorm
set.seed(12345)
## GENERATING COVARIATE AND TREATMENT DATA
## Matching treatment units to control units from a reservoir
n_treatment = 50
n_control = 150
## Suppose the covariates have the same covariance but different means
## depending on whether or not they get treatment
x_rho = 0.5
x_sigma = rbind(c(1,x_rho),c(x_rho,1))
x_mean_control = c(0,0)
x_mean_treatment = c(1,1)
x_control = mvrnorm(n = n_control, mu = x_mean_control, Sigma = x_sigma)
x_treatment = mvrnorm(n = n_treatment, mu = x_mean_treatment, Sigma = x_sigma)
x_all = rbind(x_control, x_treatment)
z_all = c(rep(0, n_control), rep(1, n_treatment))
## model the propensity
propensity_model = glm(z_all ~ x_all[,1] + x_all[,2], family = binomial)
propensity_scores = propensity_model$fitted.values
# Match each treatment unit to a control unit
# via nearest-neighbor in the propensity score
pair_indices = rep(NA, n_treatment)
for (treatment_iter in 1:n_treatment) {
this_propensity = propensity_scores[n_control + treatment_iter]
pair_indices[treatment_iter] = which.min(abs(propensity_scores[1:n_control] - this_propensity))
}
####################################
## VISUALIZATION
unit_pch = c(rep("O",n_control),rep("X",n_treatment))
## Just plot the covariates, showing who did/didn't get treatment
plot(x_all[,1], x_all[,2], pch = unit_pch, xlab = expression(x[1]), ylab = expression(x[2]))
# plot(x_all[,1], x_all[,2], pch = unit_pch, xlab = expression(x[1]), ylab = expression(x[2]), cex.axis = 2, cex.lab = 2, cex = 3, main = "Distribution of Covariates", cex.main = 2)
## pause here; now, color them by propensity scores
red_to_blue_palette = rev(rainbow(25))[10:1]
propensity_cols = red_to_blue_palette[ceiling(propensity_scores*10)]
plot(x_all[,1], x_all[,2], col = propensity_cols, pch = unit_pch, main = "Colored by Propensity Score", xlab = expression(x[1]), ylab = expression(x[2]),)
# plot(x_all[,1], x_all[,2], col = propensity_cols, pch = unit_pch, main = "Colored by Propensity Score", xlab = expression(x[1]), ylab = expression(x[2]), cex.axis = 2, cex.lab = 2, cex = 3, cex.main = 2)
## pause here, now, draw arrows to indicate the matches
plot(x_all[,1], x_all[,2], col = propensity_cols, pch = unit_pch, main = "Propensity Score Matching, With Replacement", xlab = expression(x[1]), ylab = expression(x[2]))
arrows(x0 = x_treatment[,1], y0 = x_treatment[,2], x1 = x_all[pair_indices,1], y1 = x_all[pair_indices,2], length = 0.1, code = 2)
# plot(x_all[,1], x_all[,2], col = propensity_cols, pch = unit_pch, main = "Propensity Score Matching, With Replacement", xlab = expression(x[1]), ylab = expression(x[2]), cex.axis = 2, cex.lab = 2, cex = 3, cex.main = 2)
# arrows(x0 = x_treatment[,1], y0 = x_treatment[,2], x1 = x_all[pair_indices,1], y1 = x_all[pair_indices,2], length = 0.1, code = 2, lwd = 3)
####################################
## NOW ACTUALLY CONSIDER THE RESPONSE...
true_effect_size = 3
## confounding: dependence of r on first covariate
beta_x_1 = 2 # no confounding
# beta_x_1 = 2 # confounding
# response given control
r_control = rnorm(n_control, mean = beta_x_1*x_control[,1], sd = 0.25)
# response given treatment
r_treatment = rnorm(n_treatment, mean = true_effect_size + beta_x_1*x_treatment[,1], sd = 0.25)
# estimate the effect size by taking the difference in means
mean(r_treatment - r_control[pair_indices])
## ideally, this would be closer to the effect size than...
mean(r_treatment) - mean(r_control)
## hey, it works...