ACIC2016


# function to simulate data
simulate_data <- function(truth, covariates, n, random.seed, sigma_y = 1) {

    # set seed
    random.seed <- list(seed = random.seed, kind = "Mersenne-Twister", normal.kind = "Inversion", sample.kind = "Rounding")
    suppressWarnings(do.call("set.seed", random.seed))
    
    # sample data n = 100, 500, 2500
    selected_sample <- sample(seq_along(truth$mu.0), n, replace = FALSE)
    mu.0 = truth$mu.0[selected_sample]
    mu.1 = truth$mu.1[selected_sample]
    x = covariates[selected_sample,]
    
    # assign treatment
    n1=round(n/2)
    n0=n-n1
    zind=sample(1:n,size=n1)
    z=numeric(n)
    z[zind]=1
    mu = z * mu.1 + (1-z) * mu.0
    y <- mu + sigma_y*rnorm(n)

    return(
        list(
            x = x,
            z = z,
            y = y,
            mu.1 = mu.1,
            mu.0 = mu.0
        )
    )
}

# function to get true GATE
trueGATE <- function(truth, tau, ngates = 5) {
  n = length(tau)
  fd_label = ntile(tau, ngates)
  gates = numeric(ngates)
  for (i in 1:ngates) {
    gates[i] = mean(truth$mu.1[fd_label == i] - truth$mu.0[fd_label == i])
  }
  return(gates)
}

# function to get true GATE via cross-fitting
compute_true_GATE <- function(truth, input_x, full_data, n_sample, nfolds, ngates, iter) {
  n_train = round(n_sample*(nfolds-1)/nfolds)
  train_data = simulate_data(truth, input_x, n_train, iter)

  cf_train = causal_forest(
    model.matrix(~.-1, data = train_data$x),
    train_data$y,
    train_data$z,
    num.trees = 10,
    num.threads = 1
  )

  tau_test = predict(cf_train, newdata = model.matrix(~.-1, data = full_data$x))$predictions
  gate_test = trueGATE(full_data, tau_test, ngates)

  return(tibble(
    iter = iter,
    group = 1:ngates,
    true_gate = gate_test  # FIX: Rename here to match your join logic
  ))
}

make_synth_acic_population <- function(n_pop = 5000, seed = 1,
                                      p_cont = 10, p_bin = 10,
                                      sigma_mu = 1) {
  set.seed(seed)

  # correlated continuous covariates
  rho <- 0.4
  Sigma <- rho ^ abs(outer(seq_len(p_cont), seq_len(p_cont), "-"))
  Xc <- matrix(rnorm(n_pop * p_cont), n_pop, p_cont) %*% chol(Sigma)
  colnames(Xc) <- paste0("x", seq_len(p_cont))

  # binary covariates correlated with continuous ones
  Xb <- matrix(0, n_pop, p_bin)
  colnames(Xb) <- paste0("b", seq_len(p_bin))
  for (j in seq_len(p_bin)) {
    lin <- 0.6 * Xc[, ((j - 1) %% p_cont) + 1] - 0.3 * Xc[, ((j) %% p_cont) + 1] + rnorm(n_pop, sd = 0.5)
    pr <- plogis(lin)
    Xb[, j] <- rbinom(n_pop, 1, pr)
  }

  # a couple categorical covariates (factors)
  cut3 <- cut(Xc[, 3], breaks = quantile(Xc[, 3], probs = c(0, 1/3, 2/3, 1)),
              include.lowest = TRUE, labels = c("A", "B", "C"))
  cut4 <- cut(Xc[, 4], breaks = quantile(Xc[, 4], probs = c(0, 0.25, 0.5, 0.75, 1)),
              include.lowest = TRUE, labels = c("L1", "L2", "L3", "L4"))

  x <- data.frame(
    Xc,
    Xb,
    cat1 = factor(cut3),
    cat2 = factor(cut4)
  )

  # define mu0(x) and heterogeneous tau(x) with nonlinearities and interactions
  mu0 <- 0.5 * Xc[, 1] -
    0.25 * (Xc[, 2]^2) +
    sin(Xc[, 3]) +
    0.6 * Xb[, 1] +
    0.4 * (cut3 == "B") -
    0.3 * (cut3 == "C") +
    0.5 * Xc[, 5] * Xb[, 2] +
    0.3 * pmax(Xc[, 6], 0)

  tau <- 0.2 +
    0.35 * Xc[, 1] +
    0.25 * (Xc[, 7] > 0) +
    0.2 * Xb[, 3] -
    0.25 * Xc[, 2] * Xb[, 1] +
    0.3 * (cut4 %in% c("L3", "L4")) +
    0.15 * cos(Xc[, 8])

  mu0 <- sigma_mu * mu0
  tau <- sigma_mu * tau
  mu1 <- mu0 + tau

  truth <- list(
    mu.0 = as.numeric(mu0),
    mu.1 = as.numeric(mu1)
  )

  list(x = x, truth = truth)
}

# function to compute GATE bounds with michael's original code
compute_GATE_bounds <- function(truth, input_x, n_sample, nfolds, ngates, n_iter) {

  # get training data
  full_data = simulate_data(truth, input_x, n_sample, n_iter)

  # create folds
  folds = caret::createFolds(full_data$y, k = nfolds)
  indcv = numeric(n_sample)
  tauCV = matrix(0, nrow = n_sample, ncol = nfolds)

  for (i in 1:nfolds) {
    x_full = model.matrix(~.-1, data = full_data$x)
    x_train = x_full[-folds[[i]], ]
    y_train = full_data$y[-folds[[i]]]
    z_train = full_data$z[-folds[[i]]]

    x_test = x_full[folds[[i]], ]
    y_test = full_data$y[folds[[i]]]
    z_test = full_data$z[folds[[i]]]

    # fit causal forest
    cf_train = causal_forest(
      x_train,
      y_train,
      z_train,
      num.trees = 20,
      num.threads = 1
    )

    # compute tau
    tauCV[, i] = predict(cf_train, x_full)$predictions 
    indcv[folds[[i]]] = rep(i, nrow(x_test))
  }
  
  # compute GATE
  output = GATEcv(full_data$z, tauCV, full_data$y, indcv, ngates = ngates)

  
  return(
    list(
      group = 1:ngates,
      gate = output$gate,
      sd = output$sd,
      upper = output$gate + 1.96 * output$sd,
      lower = output$gate - 1.96 * output$sd
    )
  )
}

# function to compute GATE bounds with evalHTE package
compute_GATE_bounds_pkg <- function(truth, input_x, n_sample, nfolds, ngates, n_iter, algs) {
  # get training data
  full_data = simulate_data(truth, input_x, n_sample, n_iter)

  # create data
  df = data.frame(
    full_data$x,
    y = full_data$y,
    z = full_data$z
  )

  # specify formula
  cov_names = input_x %>% names()
  user_formula <- paste0("y ~ z*(", paste(cov_names, collapse = " + "), ")")

  # run with evalITR package
  fit <- estimate_hte(
    treatment = "z",
    form = user_formula,
    data = df,
    algorithms = algs,
    n_folds = nfolds,
    meta_learner = "slearner")

  # evaluate HTE
  est <- evaluate_hte(fit)
  gate_pkg <- summary(est)$GATE %>% 
      mutate(iter = n_iter)

  return(gate_pkg)
  
}

# function to loop over n_sim for true GATE
run_simluation_gate <- function(truth, input_x, full_data, n_sample, nfolds, ngates, n_sim) {

  # loop over n_sim
  results = future_map(1:n_sim, ~compute_true_GATE(truth, input_x, full_data, n_sample, nfolds, ngates, .x),
    .options = furrr_options(seed = 123)
  ) %>% bind_rows()

  return(results)
}


# function to loop over n_sim for gate sd
run_simluation_gate_sd <- function(truth, input_x, n_sample, nfolds, ngates, n_sim, pkg = FALSE, algs = c("causal_forest")) {

    # if pkg == FALSE, compute GATE bounds using michael's original code
    if (pkg == FALSE) {
        results = future_map(1:n_sim, ~compute_GATE_bounds(truth, input_x, n_sample, nfolds, ngates, .x),
        .options = furrr_options(seed = 123)
  ) %>% bind_rows()

    } else {
        # if pkg == TRUE, compute GATE bounds using evalITR package
        results = future_map(1:n_sim, ~compute_GATE_bounds_pkg(truth, input_x, n_sample, nfolds, ngates, .x, algs = algs),
        .options = furrr_options(seed = 123)
  ) %>% bind_rows()

    }

  return(results)
}

Sys.setenv(
OMP_NUM_THREADS = 1,
OPENBLAS_NUM_THREADS = 1,
MKL_NUM_THREADS = 1,
VECLIB_MAXIMUM_THREADS = 1
)
library(evalHTE)
#> Loading required package: dplyr
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(future)
future::plan(future::sequential)
library(furrr)
library(magrittr)
# load ACIC 2016 data
library(grf)      # causal_forest
options(grf.num.threads = 1)
library(dplyr)    # ntile, left_join, mutate, summarise
library(tibble)
synth <- make_synth_acic_population(n_pop = 5000, seed = 28, p_cont = 10, p_bin = 10)

input_x <- synth$x
truth <- synth$truth

full_data <- simulate_data(truth, input_x, nrow(input_x), 2021)

# parameters
n_sample = 50
n_sim = 5
n_folds = 2
n_gates = 2

# compute true GATE
results_gate = run_simluation_gate(truth, input_x, full_data, n_sample, n_folds, n_gates, n_sim)

# compute GATE bounds
results_sd_pkg = run_simluation_gate_sd(truth, input_x, n_sample, n_folds, n_gates, n_sim, pkg = TRUE, algs = c("causal_forest"))
#> Evaluate ITR with cross-validation ...
#> Evaluate ITR with cross-validation ...
#> Evaluate ITR with cross-validation ...
#> Evaluate ITR with cross-validation ...
#> Evaluate ITR with cross-validation ...

# check coverage
results_sd_pkg %>%
  left_join(results_gate, by = c("iter", "group")) %>%
  mutate(
    upper = estimate + 1.96 * std.deviation,
    lower = estimate - 1.96 * std.deviation,
    coverage = ((upper > true_gate) & (lower < true_gate)) * 1
  ) %>%
  summarise(
    coverage = mean(coverage, na.rm = TRUE)
  )
#> # A tibble: 1 × 1
#>   coverage
#>      <dbl>
#> 1      0.9

mirror server hosted at Truenetwork, Russian Federation.