# Optimizing matching and analysis combinations for estimating causal effects
# K. Ellicott Colson, Kara E. Rudolph, Scott C. Zimmerman, Dana E. Goin, Elizabeth A. Stuart, Mark van der Laan, Jennifer Ahern
# Scientific Reports

# This script provides example code for the (I) data generating mechanisms, (II) matching methods,
# and (III) analysis procedures implemented in this paper.

#############################################################
# I: Data generation
#############################################################

#------------------
# generateData function
# input:
#   n: desired sample size
#   pscore.type: type of propensity score, reflecting the degree of support.
#     Select 1 for good support, 2 for medium support, or 3 for poor support.
# output: full data
#------------------

generateData <- function(n, pscore.type = 1) {

  # Generate baseline covariates
  W1 <- runif(n, min = 0.02, max = 0.7)
  W2 <- rnorm(n, mean = (0.2 + 0.125*W1), sd = 1)

  # Generate the binary treatment indicator
  if (pscore.type == 1) {
    A <- rbinom(n, 1, plogis(-.5 + .5*W1 + .1*I(W1^2) + .5*I(W1*W2) - .5*W2))
  } else if (pscore.type == 2) {
    A <- rbinom(n, 1, plogis(-.7 + 1.8*W1 - .1*I(W1^2) + 1.7*I(W1*W2) - 1.4*W2))
  } else if (pscore.type == 3) {
    A <- rbinom(n, 1, plogis(-.3 + 2*W1 - 2*I(W1^2) + 2*I(W1*W2) - 2.5*W2))
  }

  # Generate the potential outcomes
  Y.0 <- rnorm(n, mean = (-.5 + 2*poly(W1,2) - W2), sd = 1)
  Y.1 <- rnorm(n, mean = (-.5 + 2*poly(W1,2) - W2 + 1.5 + 2*poly(W1,2) - W2), sd = 1)

  # Generate the observed outcome
  Y <- ifelse(A == 1, Y.1, Y.0)

  return(data.frame(W1, W2, A, Y, Y.1, Y.0))
}

# To create a simulated sample of size 1000 with good support:
sample <- generateData(n = 1000, pscore.type = 1)
summary(sample)

# To create a simulated sample of size 1000 with medium support:
sample <- generateData(n = 1000, pscore.type = 2)
summary(sample)

# To create a simulated sample of size 1000 with poor support:
sample <- generateData(n = 1000, pscore.type = 3)
summary(sample)

#############################################################
# II: Implementation of matching procedures
#############################################################

########################
# Inverse probability of treatment weighting (IPTW)
########################

# Generate a sample dataset
sample <- generateData(n = 1000, pscore.type = 1)

# Load required packages:
require("MatchIt")
require("SuperLearner")

# Estimate the propensity score, either parametrically or semi-parametrically.

# Parametric estimation of the propensity score
model  <- glm(A ~ W1 + W2, data = sample, family = 'binomial')
pscore <- predict(model, type = 'response')

# Semi-parametric estimation of the propensity score
SL.library <- c("SL.glm", "SL.glm.interaction", "SL.mean",
                "SL.step", "SL.step.interaction")      # select desired SuperLearner libraries
SL.out <- SuperLearner(Y = sample$A,
                       X = sample[, c("W1","W2")],     # specify the data
                       SL.library = SL.library,        # specify the algorithms
                       family = 'binomial',            # specify the data type
                       cvControl = list(V = 10))       # specify 10-fold cross-validation
pscore <- as.vector(SL.out$SL.predict)                 # SL.predict is an n x 1 matrix; coerce to a vector

# Then create weights to be used in the analysis step to estimate the ATT:
# treated units receive weight 1, and control units receive pscore/(1 - pscore).
sample$weights <- ifelse(sample$A == 1, 1, pscore/(1 - pscore))
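
# Optional diagnostic (a minimal sketch using base R and the objects created above;
# not required by the workflow): inspecting the estimated propensity scores and the
# resulting control weights helps gauge the degree of support before analysis.
summary(pscore[sample$A == 1])          # propensity scores among treated units
summary(pscore[sample$A == 0])          # propensity scores among control units
summary(sample$weights[sample$A == 0])  # very large control weights can signal poor support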
########################
# Greedy nearest neighbor matching
########################

# Generate a sample dataset
sample <- generateData(n = 1000, pscore.type = 1)

# Load required packages:
require("MatchIt")
require("SuperLearner")

# Estimate the propensity score, either parametrically or semi-parametrically.

# Parametric estimation of the propensity score
model  <- glm(A ~ W1 + W2, data = sample, family = 'binomial')
pscore <- predict(model, type = 'response')

# Semi-parametric estimation of the propensity score
SL.library <- c("SL.glm", "SL.glm.interaction", "SL.mean",
                "SL.step", "SL.step.interaction")      # select desired SuperLearner libraries
SL.out <- SuperLearner(Y = sample$A,
                       X = sample[, c("W1","W2")],     # specify the data
                       SL.library = SL.library,        # specify the algorithms
                       family = 'binomial',            # specify the data type
                       cvControl = list(V = 10))       # specify 10-fold cross-validation
pscore <- as.vector(SL.out$SL.predict)                 # coerce the n x 1 prediction matrix to a vector

# Then implement the matching procedure. The matched data can be used in the analysis step to estimate the ATT.
# Note that a warning is produced because we have included a custom propensity score rather than the default.
# This warning can be ignored. If desired, you may suppress it by wrapping the call to matchit() in suppressWarnings().
match <- matchit(A ~ W1 + W2, data = sample, method = "nearest",
                 replace = TRUE, distance = pscore)
matched.data <- match.data(match)

########################
# Optimal nearest neighbor matching
########################

# Generate a sample dataset
sample <- generateData(n = 1000, pscore.type = 1)

# Load required packages:
require("MatchIt")
require("SuperLearner")

# Estimate the propensity score, either parametrically or semi-parametrically.

# Parametric estimation of the propensity score
model  <- glm(A ~ W1 + W2, data = sample, family = 'binomial')
pscore <- predict(model, type = 'response')

# Semi-parametric estimation of the propensity score
SL.library <- c("SL.glm", "SL.glm.interaction", "SL.mean",
                "SL.step", "SL.step.interaction")      # select desired SuperLearner libraries
SL.out <- SuperLearner(Y = sample$A,
                       X = sample[, c("W1","W2")],     # specify the data
                       SL.library = SL.library,        # specify the algorithms
                       family = 'binomial',            # specify the data type
                       cvControl = list(V = 10))       # specify 10-fold cross-validation
pscore <- as.vector(SL.out$SL.predict)                 # coerce the n x 1 prediction matrix to a vector

# Then implement the matching procedure. The matched data can be used in the analysis step to estimate the ATT.
# Note that 1:1 optimal nearest neighbor matching without replacement only works if there are more control units
# than treated units.
# Note that two warnings are produced. The first is because we have included a custom propensity score rather
# than the default; it can be ignored. The second refers to replicability of the procedure, which can be addressed
# by setting a seed prior to implementation. If desired, you may suppress these warnings by wrapping the call to
# matchit() in suppressWarnings().
set.seed(123)
match <- matchit(A ~ W1 + W2, data = sample, method = "optimal", distance = pscore)
matched.data <- match.data(match)
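
# Optional diagnostic (a minimal sketch; `match` is the matchit object from the step above):
# MatchIt's summary method reports covariate balance before and after matching, and
# standardize = TRUE reports standardized mean differences.
summary(match, standardize = TRUE)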
########################
# Genetic matching
########################

# Generate a sample dataset
sample <- generateData(n = 1000, pscore.type = 1)

# Load required packages (the "genetic" method also requires the Matching and rgenoud packages):
require("MatchIt")
require("SuperLearner")

# Note that the genetic matching procedure can take some time, particularly for larger population sizes.

# To implement this procedure parametrically:
match <- matchit(A ~ W1 + W2, data = sample, method = "genetic",
                 print.level = 0, pop.size = 1000)
matched.data <- match.data(match)
# This matched data can be used in the analysis step to estimate the ATT.

# To implement this procedure semi-parametrically, first estimate the propensity score semi-parametrically:
SL.library <- c("SL.glm", "SL.glm.interaction", "SL.mean",
                "SL.step", "SL.step.interaction")      # select desired SuperLearner libraries
SL.out <- SuperLearner(Y = sample$A,
                       X = sample[, c("W1","W2")],     # specify the data
                       SL.library = SL.library,        # specify the algorithms
                       family = 'binomial',            # specify the data type
                       cvControl = list(V = 10))       # specify 10-fold cross-validation
pscore <- as.vector(SL.out$SL.predict)                 # coerce the n x 1 prediction matrix to a vector

# Then implement the matching procedure. The matched data can be used in the analysis step to estimate the ATT.
# Note that a warning is produced because we have included a custom propensity score rather than the default.
# This warning can be ignored. If desired, you may suppress it by wrapping the call to matchit() in suppressWarnings().
match <- matchit(A ~ W1 + W2, data = sample, method = "genetic",
                 print.level = 0, pop.size = 1000, distance = pscore)
matched.data <- match.data(match)

########################
# Subclassification
########################

# Generate a sample dataset
sample <- generateData(n = 1000, pscore.type = 1)

# Load required packages:
require("MatchIt")
require("SuperLearner")

# Estimate the propensity score, either parametrically or semi-parametrically.

# Parametric estimation of the propensity score
model  <- glm(A ~ W1 + W2, data = sample, family = 'binomial')
pscore <- predict(model, type = 'response')

# Semi-parametric estimation of the propensity score
SL.library <- c("SL.glm", "SL.glm.interaction", "SL.mean",
                "SL.step", "SL.step.interaction")      # select desired SuperLearner libraries
SL.out <- SuperLearner(Y = sample$A,
                       X = sample[, c("W1","W2")],     # specify the data
                       SL.library = SL.library,        # specify the algorithms
                       family = 'binomial',            # specify the data type
                       cvControl = list(V = 10))       # specify 10-fold cross-validation
pscore <- as.vector(SL.out$SL.predict)                 # coerce the n x 1 prediction matrix to a vector

# Select the desired number of subclasses
n.subclass <- 10

# Then implement the matching procedure. The matched data can be used in the analysis step to estimate the ATT.
# To do so, implement the analysis procedure within each subclass, then combine estimates across subclasses by
# taking the weighted mean of the estimates, with weights equal to the number of treated units in each subclass
# (see Part III below).
# Note that a warning is produced because we have included a custom propensity score rather than the default.
# This warning can be ignored. If desired, you may suppress it by wrapping the call to matchit() in suppressWarnings().
match <- matchit(A ~ W1 + W2, data = sample, method = "subclass",
                 subclass = n.subclass, distance = pscore)
matched.data <- match.data(match)
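
# Optional diagnostic (a minimal sketch using base R): tabulate treated (A = 1) and control (A = 0)
# counts within each subclass. Because the ATT combines subclass-specific estimates weighted by the
# number of treated units, sparse subclasses are worth spotting early.
table(matched.data$subclass, matched.data$A)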
########################
# Full matching
########################

# Generate a sample dataset
sample <- generateData(n = 1000, pscore.type = 1)

# Load required packages:
require("MatchIt")
require("SuperLearner")

# Estimate the propensity score, either parametrically or semi-parametrically.

# Parametric estimation of the propensity score
model  <- glm(A ~ W1 + W2, data = sample, family = 'binomial')
pscore <- predict(model, type = 'response')

# Semi-parametric estimation of the propensity score
SL.library <- c("SL.glm", "SL.glm.interaction", "SL.mean",
                "SL.step", "SL.step.interaction")      # select desired SuperLearner libraries
SL.out <- SuperLearner(Y = sample$A,
                       X = sample[, c("W1","W2")],     # specify the data
                       SL.library = SL.library,        # specify the algorithms
                       family = 'binomial',            # specify the data type
                       cvControl = list(V = 10))       # specify 10-fold cross-validation
pscore <- as.vector(SL.out$SL.predict)                 # coerce the n x 1 prediction matrix to a vector

# Then implement the matching procedure. The matched data can be used in the analysis step to estimate the ATT.
# Note that two warnings are produced. The first is because we have included a custom propensity score rather
# than the default; it can be ignored. The second refers to replicability of the procedure, which can be addressed
# by setting a seed prior to implementation. If desired, you may suppress these warnings by wrapping the call to
# matchit() in suppressWarnings().
set.seed(123)
match <- matchit(A ~ W1 + W2, data = sample, method = "full", distance = pscore)
matched.data <- match.data(match)

#############################################################
# III: Implementation of analysis procedures
#############################################################

# Use either a freshly generated sample or the matched dataset from Part II.
# Run ONE of the following two lines:
data <- generateData(n = 1000, pscore.type = 1)
data <- matched.data

########################
# Naive analysis
########################

# First, identify the indices of treated and control units in the analysis dataset
tx.indices      <- (data$A == 1)
control.indices <- (data$A == 0)

# If no weighting is needed, calculate the ATT as the difference in means
ATT <- mean(data$Y[tx.indices]) - mean(data$Y[control.indices])
ATT

# If weights must be incorporated, as in inverse probability of treatment weighting,
# calculate the ATT as the difference in weighted means
ATT <- weighted.mean(data$Y[tx.indices],      w = data$weights[tx.indices]) -
       weighted.mean(data$Y[control.indices], w = data$weights[control.indices])
ATT
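
# Optional benchmark (a minimal sketch; it relies on the fact that generateData() returns both
# potential outcomes): the true sample ATT can be computed directly from Y.1 and Y.0 among the
# treated units and compared against the estimates produced in this section.
true.ATT <- mean(data$Y.1[tx.indices] - data$Y.0[tx.indices])
true.ATT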
########################
# G-computation
########################

# If the analysis does NOT need to be weighted (as it would be for IPTW), set all weights to 1.
data$weights <- 1

# First, identify the indices of treated and control units in the analysis dataset
tx.indices      <- (data$A == 1)
control.indices <- (data$A == 0)

# Then create datasets representing the counterfactual scenarios where all units are treated,
# or all units are untreated
cf.treated <- cf.control <- data
cf.treated$A <- 1
cf.control$A <- 0

# To apply parametric g-computation with a misspecified linear model.
# Predictions under treatment are averaged over the treated units, and predictions under control
# are averaged over the (matched or weighted) control units.
model <- glm(Y ~ A + W2 + W1, data = data, weights = data$weights)
EY1 <- mean(predict(model, newdata = cf.treated[tx.indices,], type = 'response'))
EY0 <- mean(predict(model, newdata = cf.control[control.indices,], type = 'response'))
ATT <- EY1 - EY0
ATT

########################
# Targeted minimum loss-based estimation (TMLE)
########################

# If the analysis does NOT need to be weighted (as it would be for IPTW), set all weights to 1.
data$weights <- 1

# Load the modified version of TMLE that incorporates weights.
# Adjust this path to the directory containing TMLE_ATT_edited.R.
setwd("C:/Users/kecolson/Documents/sample code")
source("TMLE_ATT_edited.R")

# Scale the outcome variable to meet the format requirements of TMLE for the ATT.
data$Y.scaled <- (data$Y - min(data$Y)) / (max(data$Y) - min(data$Y))

# To implement TMLE for the ATT with a misspecified parametric model:
tmle.out <- tmle.att2(Y = data$Y.scaled,
                      A = data$A,
                      W = data[, c("W1","W2")],
                      family = "gaussian",
                      Delta = rep(1, nrow(data)),
                      gDelta.method = "user",
                      gDelta.1 = 1/data$weights,
                      g.method = "glm", g.formula = A ~ W1 + W2,
                      Q.method = "glm", Q.formula = Y ~ A + W1 + W2)
ATT <- tmle.out$psi * (max(data$Y) - min(data$Y)) # extract the ATT estimate and transform back to the original scale
ATT

# To implement TMLE for the ATT with semi-parametric estimation:
require("SuperLearner")
SL.library <- c("SL.glm", "SL.glm.interaction", "SL.mean",
                "SL.step", "SL.step.interaction")      # select desired SuperLearner libraries
tmle.out <- tmle.att2(Y = data$Y.scaled,
                      A = data$A,
                      W = data[, c("W1","W2")],
                      family = "gaussian",
                      Delta = rep(1, nrow(data)),
                      gDelta.method = "user",
                      gDelta.1 = 1/data$weights,
                      g.method = "SL", g.SL.library = SL.library,
                      Q.method = "SL", Q.SL.library = SL.library)
ATT <- tmle.out$psi * (max(data$Y) - min(data$Y)) # extract the ATT estimate and transform back to the original scale
ATT

########################
# Analysis under subclassification
########################

# To estimate the ATT using data that have been subclassified, implement the analysis procedure within each
# subclass. Then combine estimates across subclasses by taking the weighted mean of the estimates, with weights
# equal to the number of treated units in each subclass. By default, the matchit() function produces subclasses
# with the same number of treated units in each subclass, so in that case an unweighted mean is appropriate.

# For example, for a weighted naive analysis of data with 10 subclasses, do the following:
n.subclass <- 10 # specify the number of subclasses
estimates  <- rep(NA, n.subclass)
for (s in 1:n.subclass) {
  subdata <- data[data$subclass == s,]
  tx.indices      <- (subdata$A == 1)
  control.indices <- (subdata$A == 0)
  EY1 <- weighted.mean(subdata$Y[tx.indices],      w = subdata$weights[tx.indices])
  EY0 <- weighted.mean(subdata$Y[control.indices], w = subdata$weights[control.indices])
  estimates[s] <- EY1 - EY0
}
ATT <- mean(estimates)
ATT
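
########################
# Optional: Monte Carlo check (illustrative sketch)
########################

# The paper evaluates estimator performance across many simulated samples. The loop below is a
# minimal illustrative sketch of that idea (not the paper's full simulation): it draws repeated
# samples, computes the unadjusted difference in means, and compares it to the true sample ATT
# computed from the stored potential outcomes.
set.seed(123)
n.sims <- 100
naive.est <- true.att <- rep(NA, n.sims)
for (i in 1:n.sims) {
  sim <- generateData(n = 1000, pscore.type = 1)
  naive.est[i] <- mean(sim$Y[sim$A == 1]) - mean(sim$Y[sim$A == 0])
  true.att[i]  <- mean(sim$Y.1[sim$A == 1] - sim$Y.0[sim$A == 1])
}
mean(naive.est - true.att) # approximate bias of the naive estimator

# END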