Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# varimpact 1.2.5 (development version)

## Bug fixes

* Fixed Issue #8: Continuous outcome estimates are now correctly transformed back to the original scale instead of being reported on the [0, 1] scale. This makes results much more interpretable for users working with continuous outcomes. The fix modifies `estimate_pooled_results()` to accept bounds information and transform final estimates using the inverse of the TMLE transformation: `thetas = thetas * diff(Qbounds) + Qbounds[1]`.

## Internal changes

* Added `Qbounds` and `map_to_ystar` parameters to `estimate_pooled_results()` function
* Updated `vim_numerics()` and `vim_factors()` to pass bounds information for continuous outcome transformation
* Enhanced influence curve calculations to use original scale values

# varimpact 1.2.4

Previous releases...
25 changes: 22 additions & 3 deletions R/estimate_pooled_results.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
estimate_pooled_results = function(fold_results,
fluctuation = "logistic",
verbose = FALSE) {
verbose = FALSE,
Qbounds = NULL,
map_to_ystar = FALSE) {
# Fold results is a list with test results from each fold.

# Each fold result should have at least this element:
Expand Down Expand Up @@ -134,13 +136,30 @@ estimate_pooled_results = function(fold_results,

# Estimate treatment-specific mean parameter on every validation fold.
thetas = tapply(Q_star, data$fold_num, mean, na.rm = TRUE)

# Transform thetas back to original scale if needed
if (map_to_ystar && !is.null(Qbounds)) {
if (verbose) cat("Transforming thetas back to original scale using Qbounds:", Qbounds, "\n")
thetas = thetas * diff(Qbounds) + Qbounds[1]
}

if (verbose) cat(thetas, "\n")

# Take average across folds to get final estimate.
#theta = mean(thetas)

# Transform Q_star back to original scale if needed for influence curve calculation
if (map_to_ystar && !is.null(Qbounds)) {
Q_star_original = Q_star * diff(Qbounds) + Qbounds[1]
# Also transform Y_star back to original scale for influence curve
data$Y_original = data$Y_star * diff(Qbounds) + Qbounds[1]
} else {
Q_star_original = Q_star
data$Y_original = data$Y_star
}

# Move Q_star into the data so that it can be analyzed per-fold.
data$Q_star = Q_star
data$Q_star = Q_star_original
rm(Q_star)

if (verbose) cat("Calculating per-fold influence curves\n")
Expand All @@ -156,7 +175,7 @@ estimate_pooled_results = function(fold_results,
"Q_star:", length(Q_star), "\n"))
}
#with(fold_data, (A / g1W_hat) * (Y - Q_star) + Q_star - theta)
result = with(fold_data, (A / g1W_hat) * (Y_star - Q_star) +
result = with(fold_data, (A / g1W_hat) * (Y_original - Q_star) +
Q_star - mean(Q_star, na.rm = TRUE))
#if (verbose) cat("Result:", class(result), "Length:", length(result), "\n")
result
Expand Down
30 changes: 27 additions & 3 deletions R/vim-factors.R
Original file line number Diff line number Diff line change
Expand Up @@ -672,7 +672,18 @@ vim_factors =
# bin_df can be NULL if the variable is skipped due to errors,
# e.g. lack of variation.
if (!is.null(bin_df) && nrow(bin_df) > 0L) {
pooled_bin = estimate_pooled_results(bin_list, verbose = verbose)
# Determine if we need to transform back to original scale
map_to_ystar = FALSE
if (!is.null(Qbounds) && length(Qbounds) == 2) {
# Check if this is a continuous outcome (not binary)
if (family == "binomial" && length(unique(Y)) > 2) {
map_to_ystar = TRUE
}
}

pooled_bin = estimate_pooled_results(bin_list, verbose = verbose,
Qbounds = Qbounds,
map_to_ystar = map_to_ystar)
# Now we have $thetas and $influence_curves

# Save the vector of estimates into the appropriate spot.
Expand Down Expand Up @@ -741,13 +752,26 @@ vim_factors =

# TODO: compile results into the new estimate.

# Determine if we need to transform back to original scale
map_to_ystar = FALSE
if (!is.null(Qbounds) && length(Qbounds) == 2) {
# Check if this is a continuous outcome (not binary)
if (family == "binomial" && length(unique(Y)) > 2) {
map_to_ystar = TRUE
}
}

if (verbose) cat("Estimating pooled min.\n")
pooled_min = estimate_pooled_results(lapply(fold_results, function(x) x$level_min),
verbose = verbose)
verbose = verbose,
Qbounds = Qbounds,
map_to_ystar = map_to_ystar)
cat("\n")
if (verbose) cat("Estimating pooled max.\n")
pooled_max = estimate_pooled_results(lapply(fold_results, function(x) x$level_max),
verbose = verbose)
verbose = verbose,
Qbounds = Qbounds,
map_to_ystar = map_to_ystar)
cat("\n")

var_results$EY0V = pooled_min$thetas
Expand Down
22 changes: 19 additions & 3 deletions R/vim-numerics.R
Original file line number Diff line number Diff line change
Expand Up @@ -725,7 +725,9 @@ vim_numerics =
# e.g. lack of variation.

if (!is.null(bin_df) && nrow(bin_df) > 0L) {
pooled_bin = estimate_pooled_results(bin_list, verbose = verbose)
pooled_bin = estimate_pooled_results(bin_list, verbose = verbose,
Qbounds = Qbounds,
map_to_ystar = map_to_ystar)
# Now we have $thetas and $influence_curves

# Save the vector of estimates into the appropriate spot.
Expand Down Expand Up @@ -790,10 +792,24 @@ vim_numerics =

# TODO: compile results into the new estimate.

# Determine if we need to transform back to original scale
# For continuous outcomes, the estimates are on [0,1] scale and need to be transformed back
map_to_ystar = FALSE
if (!is.null(Qbounds) && length(Qbounds) == 2) {
# Check if this is a continuous outcome (not binary)
if (family == "binomial" && length(unique(Y)) > 2) {
map_to_ystar = TRUE
}
}

pooled_min = estimate_pooled_results(lapply(fold_results, function(x) x$level_min),
verbose = verbose)
verbose = verbose,
Qbounds = Qbounds,
map_to_ystar = map_to_ystar)
pooled_max = estimate_pooled_results(lapply(fold_results, function(x) x$level_max),
verbose = verbose)
verbose = verbose,
Qbounds = Qbounds,
map_to_ystar = map_to_ystar)

var_results$EY0V = pooled_min$thetas
var_results$EY1V = pooled_max$thetas
Expand Down
124 changes: 124 additions & 0 deletions tests/testthat/test-continuous-outcome-scale.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
library(varimpact)
library(testthat)

context("Continuous outcome scale transformation")

test_that("continuous outcomes are reported on original scale, not [0,1] scale", {
# Set multicore-compatible seed
set.seed(42, "L'Ecuyer-CMRG")

# Create test dataset with continuous outcome in a known range
N <- 100
num_vars <- 3
X <- as.data.frame(matrix(rnorm(N * num_vars), N, num_vars))
colnames(X) <- paste0("X", 1:num_vars)

# Create a continuous outcome with values roughly between 20 and 80
# This gives us a clear range that's far from [0, 1]
Y_continuous <- 50 + 15 * (.3*X[, 1] + .2*X[, 2] - .1*X[, 3]) + rnorm(N, 0, 3)

# Verify our outcome is in the expected range
expect_true(min(Y_continuous) > 10, "Y should be well above 0")
expect_true(max(Y_continuous) < 100, "Y should be well below 100")
expect_true(max(Y_continuous) - min(Y_continuous) > 10, "Y should have substantial range")

# Run varimpact with gaussian family
# Use sequential execution and simple libraries for faster testing
future::plan("sequential")

vim <- varimpact(Y = Y_continuous,
data = X,
family = "gaussian",
V = 2L, # Use fewer folds for faster testing
verbose = FALSE,
Q.library = c("SL.mean", "SL.glm"),
g.library = c("SL.mean", "SL.glm"),
bins_numeric = 2L) # Fewer bins for faster testing

# Check that the function completed successfully
expect_s3_class(vim, "varimpact")
expect_true(!is.null(vim$results_all))

# Extract estimates
estimates <- vim$results_all$Estimate
estimates <- estimates[!is.na(estimates)]

# The key test: estimates should NOT be on [0, 1] scale
# If the bug exists, all estimates would be between 0 and 1
# With the fix, estimates should be on the original scale

# Check that not all estimates are in [0, 1] range
# (some estimates might legitimately be in [0, 1] by chance, but not all)
estimates_in_01_range <- estimates >= 0 & estimates <= 1
proportion_in_01 <- mean(estimates_in_01_range)

# If more than 90% of estimates are in [0, 1], likely the bug still exists
expect_lt(proportion_in_01, 0.9,
paste("Too many estimates in [0,1] range. Estimates:",
paste(round(estimates, 3), collapse = ", ")))

# Additional check: the range of estimates should be reasonable
# relative to the original outcome range
original_range <- max(Y_continuous) - min(Y_continuous)
estimate_range <- max(estimates) - min(estimates)

# The estimate range should be a reasonable fraction of the original range
# (not tiny like it would be if stuck on [0, 1] scale)
expect_gt(estimate_range, original_range * 0.01,
"Estimate range seems too small relative to original outcome range")

# Test fold-specific estimates if available
if (!is.null(vim$results_by_fold)) {
fold_est_cols <- grep("Est_v", colnames(vim$results_by_fold), value = TRUE)
if (length(fold_est_cols) > 0) {
fold_estimates <- as.matrix(vim$results_by_fold[, fold_est_cols])
fold_estimates <- fold_estimates[!is.na(fold_estimates)]

if (length(fold_estimates) > 0) {
fold_estimates_in_01 <- fold_estimates >= 0 & fold_estimates <= 1
fold_proportion_in_01 <- mean(fold_estimates_in_01)

expect_lt(fold_proportion_in_01, 0.9,
"Too many fold estimates in [0,1] range")
}
}
}
})

test_that("binary outcomes still work correctly", {
# Set seed for reproducibility
set.seed(43, "L'Ecuyer-CMRG")

# Create test dataset with binary outcome
N <- 100
X <- as.data.frame(matrix(rnorm(N * 2), N, 2))
colnames(X) <- c("X1", "X2")

# Create binary outcome
Y_binary <- rbinom(N, 1, plogis(.5*X[, 1] - .3*X[, 2]))

# Run varimpact with binomial family
future::plan("sequential")

vim <- varimpact(Y = Y_binary,
data = X,
family = "binomial",
V = 2L,
verbose = FALSE,
Q.library = c("SL.mean", "SL.glm"),
g.library = c("SL.mean", "SL.glm"))

# Check that the function completed successfully
expect_s3_class(vim, "varimpact")
expect_true(!is.null(vim$results_all))

# For binary outcomes, estimates should be in a reasonable range
# (typically between -1 and 1 for risk differences)
estimates <- vim$results_all$Estimate
estimates <- estimates[!is.na(estimates)]

if (length(estimates) > 0) {
expect_true(all(estimates >= -1 & estimates <= 1),
"Binary outcome estimates should be reasonable risk differences")
}
})
Loading