# Optimizing matching and analysis combinations for estimating causal effects
# K. Ellicott Colson, Kara E. Rudolph, Scott C. Zimmerman, Dana E. Goin, Elizabeth A. Stuart, Mark van der Laan, Jennifer Ahern
# Scientific Reports

# This script provides example code for the (I) data generating mechanisms, (II) matching methods,
# and (III) analysis procedures implemented in this paper.

#############################################################
# I: Data generation
#############################################################

#------------------
# generateData function
# input:
#   n: desired sample size
#   pscore.type: type of propensity score, reflecting the degree of support.
#     Select 1 for good support, 2 for medium support, or 3 for poor support.
# output: full data
#------------------

generateData <- function(n, pscore.type = 1) {

  # Generate baseline covariates
  W1 <- runif(n, min = 0.02, max = 0.7)
  W2 <- rnorm(n, mean = (0.2 + 0.125*W1), sd = 1)

  # Generate the binary treatment indicator
  if (pscore.type == 1) {
    A <- rbinom(n, 1, plogis(-.5 + .5*W1 + .1*I(W1^2) + .5*I(W1*W2) - .5*W2))
  } else if (pscore.type == 2) {
    A <- rbinom(n, 1, plogis(-.7 + 1.8*W1 - .1*I(W1^2) + 1.7*I(W1*W2) - 1.4*W2))
  } else if (pscore.type == 3) {
    A <- rbinom(n, 1, plogis(-.3 + 2*W1 - 2*I(W1^2) + 2*I(W1*W2) - 2.5*W2))
  }

  # Generate the potential outcomes
  Y.0 <- rnorm(n, mean = (-.5 + 2*poly(W1,2) - W2), sd = 1)
  Y.1 <- rnorm(n, mean = (-.5 + 2*poly(W1,2) - W2 + 1.5 + 2*poly(W1,2) - W2), sd = 1)

  # Generate the observed outcome
  Y <- ifelse(A == 1, Y.1, Y.0)

  return(data.frame(W1, W2, A, Y, Y.1, Y.0))
}

# To create a simulated sample of size 1000 with good support:
sample <- generateData(n = 1000, pscore.type = 1)
summary(sample)

# To create a simulated sample of size 1000 with medium support:
sample <- generateData(n = 1000, pscore.type = 2)
summary(sample)

# To create a simulated sample of size 1000 with poor support:
sample <- generateData(n = 1000, pscore.type = 3)
summary(sample)

#############################################################
# II: Implementation of matching procedures
#############################################################

########################
# Inverse probability of treatment weighting (IPTW)
########################

# Generate a sample dataset
sample <- generateData(n = 1000, pscore.type = 1)

# Load required packages:
require("MatchIt")
require("SuperLearner")

# Estimate the propensity score, either parametrically or semi-parametrically.

# Parametric estimation of the propensity score
model  <- glm(A ~ W1 + W2, data = sample, family = 'binomial')
pscore <- predict(model, type = 'response')

# Semi-parametric estimation of the propensity score
SL.library <- c("SL.glm", "SL.glm.interaction", "SL.mean",
                "SL.step", "SL.step.interaction")      # select desired SuperLearner libraries
SL.out <- SuperLearner(Y = sample$A,
                       X = sample[, c("W1","W2")],     # specify the data
                       SL.library = SL.library,        # specify the algorithms
                       family = 'binomial',            # specify the data type
                       cvControl = list(V = 10))       # specify 10-fold cross-validation
pscore <- as.vector(SL.out$SL.predict)                 # SL.predict is an n x 1 matrix; coerce to a vector

# Then create weights to be used in the analysis step to estimate the ATT:
# treated units receive weight 1, and control units receive pscore/(1 - pscore).
sample$weights <- ifelse(sample$A == 1, 1, pscore/(1 - pscore))
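
# Optional diagnostic (a minimal sketch using base R and the objects created above;
# not required by the workflow): inspecting the estimated propensity scores and the
# resulting control weights helps gauge the degree of support before analysis.
summary(pscore[sample$A == 1])          # propensity scores among treated units
summary(pscore[sample$A == 0])          # propensity scores among control units
summary(sample$weights[sample$A == 0])  # very large control weights can signal poor support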
########################
# Greedy nearest neighbor matching
########################

# Generate a sample dataset
sample <- generateData(n = 1000, pscore.type = 1)

# Load required packages:
require("MatchIt")
require("SuperLearner")

# Estimate the propensity score, either parametrically or semi-parametrically.

# Parametric estimation of the propensity score
model  <- glm(A ~ W1 + W2, data = sample, family = 'binomial')
pscore <- predict(model, type = 'response')

# Semi-parametric estimation of the propensity score
SL.library <- c("SL.glm", "SL.glm.interaction", "SL.mean",
                "SL.step", "SL.step.interaction")      # select desired SuperLearner libraries
SL.out <- SuperLearner(Y = sample$A,
                       X = sample[, c("W1","W2")],     # specify the data
                       SL.library = SL.library,        # specify the algorithms
                       family = 'binomial',            # specify the data type
                       cvControl = list(V = 10))       # specify 10-fold cross-validation
pscore <- as.vector(SL.out$SL.predict)                 # coerce the n x 1 prediction matrix to a vector

# Then implement the matching procedure. The matched data can be used in the analysis step to estimate the ATT.
# Note that a warning is produced because we have included a custom propensity score rather than the default.
# This warning can be ignored. If desired, you may suppress it by wrapping the call to matchit() in suppressWarnings().
match <- matchit(A ~ W1 + W2, data = sample, method = "nearest",
                 replace = TRUE, distance = pscore)
matched.data <- match.data(match)

########################
# Optimal nearest neighbor matching
########################

# Generate a sample dataset
sample <- generateData(n = 1000, pscore.type = 1)

# Load required packages:
require("MatchIt")
require("SuperLearner")

# Estimate the propensity score, either parametrically or semi-parametrically.

# Parametric estimation of the propensity score
model  <- glm(A ~ W1 + W2, data = sample, family = 'binomial')
pscore <- predict(model, type = 'response')

# Semi-parametric estimation of the propensity score
SL.library <- c("SL.glm", "SL.glm.interaction", "SL.mean",
                "SL.step", "SL.step.interaction")      # select desired SuperLearner libraries
SL.out <- SuperLearner(Y = sample$A,
                       X = sample[, c("W1","W2")],     # specify the data
                       SL.library = SL.library,        # specify the algorithms
                       family = 'binomial',            # specify the data type
                       cvControl = list(V = 10))       # specify 10-fold cross-validation
pscore <- as.vector(SL.out$SL.predict)                 # coerce the n x 1 prediction matrix to a vector

# Then implement the matching procedure. The matched data can be used in the analysis step to estimate the ATT.
# Note that 1:1 optimal nearest neighbor matching without replacement only works if there are more control units
# than treated units.
# Note that two warnings are produced. The first is because we have included a custom propensity score rather
# than the default; it can be ignored. The second refers to replicability of the procedure, which can be addressed
# by setting a seed prior to implementation. If desired, you may suppress these warnings by wrapping the call to
# matchit() in suppressWarnings().
set.seed(123)
match <- matchit(A ~ W1 + W2, data = sample, method = "optimal", distance = pscore)
matched.data <- match.data(match)
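
# Optional diagnostic (a minimal sketch; `match` is the matchit object from the step above):
# MatchIt's summary method reports covariate balance before and after matching, and
# standardize = TRUE reports standardized mean differences.
summary(match, standardize = TRUE)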
########################
# Genetic matching
########################

# Generate a sample dataset
sample <- generateData(n = 1000, pscore.type = 1)

# Load required packages (the "genetic" method also requires the Matching and rgenoud packages):
require("MatchIt")
require("SuperLearner")

# Note that the genetic matching procedure can take some time, particularly for larger population sizes.

# To implement this procedure parametrically:
match <- matchit(A ~ W1 + W2, data = sample, method = "genetic",
                 print.level = 0, pop.size = 1000)
matched.data <- match.data(match)
# This matched data can be used in the analysis step to estimate the ATT.

# To implement this procedure semi-parametrically, first estimate the propensity score semi-parametrically:
SL.library <- c("SL.glm", "SL.glm.interaction", "SL.mean",
                "SL.step", "SL.step.interaction")      # select desired SuperLearner libraries
SL.out <- SuperLearner(Y = sample$A,
                       X = sample[, c("W1","W2")],     # specify the data
                       SL.library = SL.library,        # specify the algorithms
                       family = 'binomial',            # specify the data type
                       cvControl = list(V = 10))       # specify 10-fold cross-validation
pscore <- as.vector(SL.out$SL.predict)                 # coerce the n x 1 prediction matrix to a vector

# Then implement the matching procedure. The matched data can be used in the analysis step to estimate the ATT.
# Note that a warning is produced because we have included a custom propensity score rather than the default.
# This warning can be ignored. If desired, you may suppress it by wrapping the call to matchit() in suppressWarnings().
match <- matchit(A ~ W1 + W2, data = sample, method = "genetic",
                 print.level = 0, pop.size = 1000, distance = pscore)
matched.data <- match.data(match)

########################
# Subclassification
########################

# Generate a sample dataset
sample <- generateData(n = 1000, pscore.type = 1)

# Load required packages:
require("MatchIt")
require("SuperLearner")

# Estimate the propensity score, either parametrically or semi-parametrically.

# Parametric estimation of the propensity score
model  <- glm(A ~ W1 + W2, data = sample, family = 'binomial')
pscore <- predict(model, type = 'response')

# Semi-parametric estimation of the propensity score
SL.library <- c("SL.glm", "SL.glm.interaction", "SL.mean",
                "SL.step", "SL.step.interaction")      # select desired SuperLearner libraries
SL.out <- SuperLearner(Y = sample$A,
                       X = sample[, c("W1","W2")],     # specify the data
                       SL.library = SL.library,        # specify the algorithms
                       family = 'binomial',            # specify the data type
                       cvControl = list(V = 10))       # specify 10-fold cross-validation
pscore <- as.vector(SL.out$SL.predict)                 # coerce the n x 1 prediction matrix to a vector

# Select the desired number of subclasses
n.subclass <- 10

# Then implement the matching procedure. The matched data can be used in the analysis step to estimate the ATT.
# To do so, implement the analysis procedure within each subclass, then combine estimates across subclasses by
# taking the weighted mean of the estimates, with weights equal to the number of treated units in each subclass
# (see Part III below).
# Note that a warning is produced because we have included a custom propensity score rather than the default.
# This warning can be ignored. If desired, you may suppress it by wrapping the call to matchit() in suppressWarnings().
match <- matchit(A ~ W1 + W2, data = sample, method = "subclass",
                 subclass = n.subclass, distance = pscore)
matched.data <- match.data(match)
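
# Optional diagnostic (a minimal sketch using base R): tabulate treated (A = 1) and control (A = 0)
# counts within each subclass. Because the ATT combines subclass-specific estimates weighted by the
# number of treated units, sparse subclasses are worth spotting early.
table(matched.data$subclass, matched.data$A)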
########################
# Full matching
########################

# Generate a sample dataset
sample <- generateData(n = 1000, pscore.type = 1)

# Load required packages:
require("MatchIt")
require("SuperLearner")

# Estimate the propensity score, either parametrically or semi-parametrically.

# Parametric estimation of the propensity score
model  <- glm(A ~ W1 + W2, data = sample, family = 'binomial')
pscore <- predict(model, type = 'response')

# Semi-parametric estimation of the propensity score
SL.library <- c("SL.glm", "SL.glm.interaction", "SL.mean",
                "SL.step", "SL.step.interaction")      # select desired SuperLearner libraries
SL.out <- SuperLearner(Y = sample$A,
                       X = sample[, c("W1","W2")],     # specify the data
                       SL.library = SL.library,        # specify the algorithms
                       family = 'binomial',            # specify the data type
                       cvControl = list(V = 10))       # specify 10-fold cross-validation
pscore <- as.vector(SL.out$SL.predict)                 # coerce the n x 1 prediction matrix to a vector

# Then implement the matching procedure. The matched data can be used in the analysis step to estimate the ATT.
# Note that two warnings are produced. The first is because we have included a custom propensity score rather
# than the default; it can be ignored. The second refers to replicability of the procedure, which can be addressed
# by setting a seed prior to implementation. If desired, you may suppress these warnings by wrapping the call to
# matchit() in suppressWarnings().
set.seed(123)
match <- matchit(A ~ W1 + W2, data = sample, method = "full", distance = pscore)
matched.data <- match.data(match)

#############################################################
# III: Implementation of analysis procedures
#############################################################

# Use either a freshly generated sample or the matched dataset from Part II.
# Run ONE of the following two lines:
data <- generateData(n = 1000, pscore.type = 1)
data <- matched.data

########################
# Naive analysis
########################

# First, identify the indices of treated and control units in the analysis dataset
tx.indices      <- (data$A == 1)
control.indices <- (data$A == 0)

# If no weighting is needed, calculate the ATT as the difference in means
ATT <- mean(data$Y[tx.indices]) - mean(data$Y[control.indices])
ATT

# If weights must be incorporated, as in inverse probability of treatment weighting,
# calculate the ATT as the difference in weighted means
ATT <- weighted.mean(data$Y[tx.indices],      w = data$weights[tx.indices]) -
       weighted.mean(data$Y[control.indices], w = data$weights[control.indices])
ATT
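
# Optional benchmark (a minimal sketch; it relies on the fact that generateData() returns both
# potential outcomes): the true sample ATT can be computed directly from Y.1 and Y.0 among the
# treated units and compared against the estimates produced in this section.
true.ATT <- mean(data$Y.1[tx.indices] - data$Y.0[tx.indices])
true.ATT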
########################
# G-computation
########################

# If the analysis does NOT need to be weighted (as it would be for IPTW), set all weights to 1.
data$weights <- 1

# First, identify the indices of treated and control units in the analysis dataset
tx.indices      <- (data$A == 1)
control.indices <- (data$A == 0)

# Then create datasets representing the counterfactual scenarios where all units are treated,
# or all units are untreated
cf.treated <- cf.control <- data
cf.treated$A <- 1
cf.control$A <- 0

# To apply parametric g-computation with a misspecified linear model.
# Predictions under treatment are averaged over the treated units, and predictions under control
# are averaged over the (matched or weighted) control units.
model <- glm(Y ~ A + W2 + W1, data = data, weights = data$weights)
EY1 <- mean(predict(model, newdata = cf.treated[tx.indices,], type = 'response'))
EY0 <- mean(predict(model, newdata = cf.control[control.indices,], type = 'response'))
ATT <- EY1 - EY0
ATT

########################
# Targeted minimum loss-based estimation (TMLE)
########################

# If the analysis does NOT need to be weighted (as it would be for IPTW), set all weights to 1.
data$weights <- 1

# Load the modified version of TMLE that incorporates weights.
# Adjust this path to the directory containing TMLE_ATT_edited.R.
setwd("C:/Users/kecolson/Documents/sample code")
source("TMLE_ATT_edited.R")

# Scale the outcome variable to meet the format requirements of TMLE for the ATT.
data$Y.scaled <- (data$Y - min(data$Y)) / (max(data$Y) - min(data$Y))

# To implement TMLE for the ATT with a misspecified parametric model:
tmle.out <- tmle.att2(Y = data$Y.scaled,
                      A = data$A,
                      W = data[, c("W1","W2")],
                      family = "gaussian",
                      Delta = rep(1, nrow(data)),
                      gDelta.method = "user",
                      gDelta.1 = 1/data$weights,
                      g.method = "glm", g.formula = A ~ W1 + W2,
                      Q.method = "glm", Q.formula = Y ~ A + W1 + W2)
ATT <- tmle.out$psi * (max(data$Y) - min(data$Y)) # extract the ATT estimate and transform back to the original scale
ATT

# To implement TMLE for the ATT with semi-parametric estimation:
require("SuperLearner")
SL.library <- c("SL.glm", "SL.glm.interaction", "SL.mean",
                "SL.step", "SL.step.interaction")      # select desired SuperLearner libraries
tmle.out <- tmle.att2(Y = data$Y.scaled,
                      A = data$A,
                      W = data[, c("W1","W2")],
                      family = "gaussian",
                      Delta = rep(1, nrow(data)),
                      gDelta.method = "user",
                      gDelta.1 = 1/data$weights,
                      g.method = "SL", g.SL.library = SL.library,
                      Q.method = "SL", Q.SL.library = SL.library)
ATT <- tmle.out$psi * (max(data$Y) - min(data$Y)) # extract the ATT estimate and transform back to the original scale
ATT

########################
# Analysis under subclassification
########################

# To estimate the ATT using data that have been subclassified, implement the analysis procedure within each
# subclass. Then combine estimates across subclasses by taking the weighted mean of the estimates, with weights
# equal to the number of treated units in each subclass. By default, the matchit() function produces subclasses
# with the same number of treated units in each subclass, so in that case an unweighted mean is appropriate.

# For example, for a weighted naive analysis of data with 10 subclasses, do the following:
n.subclass <- 10 # specify the number of subclasses
estimates  <- rep(NA, n.subclass)
for (s in 1:n.subclass) {
  subdata <- data[data$subclass == s,]
  tx.indices      <- (subdata$A == 1)
  control.indices <- (subdata$A == 0)
  EY1 <- weighted.mean(subdata$Y[tx.indices],      w = subdata$weights[tx.indices])
  EY0 <- weighted.mean(subdata$Y[control.indices], w = subdata$weights[control.indices])
  estimates[s] <- EY1 - EY0
}
ATT <- mean(estimates)
ATT
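
########################
# Optional: Monte Carlo check (illustrative sketch)
########################

# The paper evaluates estimator performance across many simulated samples. The loop below is a
# minimal illustrative sketch of that idea (not the paper's full simulation): it draws repeated
# samples, computes the unadjusted difference in means, and compares it to the true sample ATT
# computed from the stored potential outcomes.
set.seed(123)
n.sims <- 100
naive.est <- true.att <- rep(NA, n.sims)
for (i in 1:n.sims) {
  sim <- generateData(n = 1000, pscore.type = 1)
  naive.est[i] <- mean(sim$Y[sim$A == 1]) - mean(sim$Y[sim$A == 0])
  true.att[i]  <- mean(sim$Y.1[sim$A == 1] - sim$Y.0[sim$A == 1])
}
mean(naive.est - true.att) # approximate bias of the naive estimator

# END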