More efficient sampling, for large EFIM.S

Kss2k · Dec 9, 2024 · fd695e4 · fd695e4
1 parent 781b015
commit fd695e4
Show file tree

Hide file tree

Showing 7 changed files with 69 additions and 31 deletions.
diff --git a/R/calc_se_da.R b/R/calc_se_da.R
@@ -12,6 +12,7 @@ calcFIM_da <- function(model,
                        NA__ = -999,
                        EFIM.S = 3e4,
                        epsilon = 1e-8,
+                       R.max = 1e6,
                        verbose = FALSE) {
   if (!calc.se) return(list(FIM = NULL, vcov = NULL, vcov.sub = NULL, type = "none",
                             raw.labels = names(theta), n.additions = 0))
@@ -24,15 +25,17 @@ calcFIM_da <- function(model,
                                   epsilon = epsilon, hessian = hessian),
           expected = calcEFIM_LMS(model, finalModel = finalModel, theta = theta, 
                                   data = data, epsilon = epsilon, S = EFIM.S, 
-                                  parametric = EFIM.parametric, verbose = verbose),
+                                  parametric = EFIM.parametric, verbose = verbose,
+                                  R.max = R.max),
           stop2("FIM must be either expected or observed")),
      qml =
        switch(FIM,
           observed = calcOFIM_QML(model, theta = theta, data = data,
                                   hessian = hessian, epsilon = epsilon),
           expected = calcEFIM_QML(model, finalModel = finalModel, theta = theta, 
                                   data = data, epsilon = epsilon, S = EFIM.S,
-                                  parametric = EFIM.parametric, verbose = verbose),
+                                  parametric = EFIM.parametric, verbose = verbose,
+                                  R.max = R.max),
           stop2("FIM must be either expected or observed")),
      stop2("Unrecognized method: ", method)
   )
@@ -138,42 +141,51 @@ calcOFIM_LMS <- function(model, theta, data, hessian = FALSE,
 
 
 calcEFIM_LMS <- function(model, finalModel = NULL, theta, data, S = 3e4,
-                         parametric = TRUE, epsilon = 1e-6, verbose = FALSE) {
-  N <- nrow(data)
+                         parametric = TRUE, epsilon = 1e-6, verbose = FALSE,
+                         R.max = 1e6) {
+  N      <- nrow(model$data)
+  R.ceil <- N * S > R.max
+  R      <- ifelse(R.ceil, yes = R.max, no = N * S) 
+
+  if (R < N) {
+    warning2("Population size is smaller than sample size, please increase it using the `R.max` argument!")
+    R <- N
+  }
 
   if (parametric) {
     if (is.null(finalModel)) stop2("finalModel must be included in calcEFIM_LMS")
     parTable   <- modelToParTable(finalModel, method = "lms")
     population <- tryCatch(
-      simulateDataParTable(parTable, N = S * N, colsOVs = colnames(data))$oV,
+      simulateDataParTable(parTable, N = R, colsOVs = colnames(data))$oV,
       error = function(e) {
         warning2("Unable to simulate data for EFIM, using stochastic sampling instead")
         calcEFIM_LMS(model = model, theta = theta, data = data, S = S, 
                      parametric = FALSE, epsilon = epsilon)
       })
 
-  } else {
-    population <- data[sample(N * S, N * S, replace = TRUE), ]
-  }
+  } else population <- data[sample(R, N, replace = TRUE), ]
 
   I <- matrix(0, nrow = length(theta), ncol = length(theta))
 
   P <- estepLms(model = model, theta=theta, data = population)
   for (i in seq_len(S)) {
     if (verbose) printf("\rMonte-Carlo: Iteration = %d/%d", i, S)
 
-    n1 <- (i - 1) * N + 1
-    nn <- n1 + N - 1
+    if (!R.ceil) {
+      n1  <- (i - 1) * N + 1
+      nn  <- n1 + N - 1
+      sub <- n1:nn
+    } else sub <- sample(R, N)
 
-    J <- gradientLogLikLms(theta = theta, model = model, data = population[n1:nn, ],
-                           P = P[n1:nn, ], sign = 1, epsilon = epsilon)
+    J <- gradientLogLikLms(theta = theta, model = model, data = population[sub, ],
+                           P = P[sub, ], sign = 1, epsilon = epsilon)
 
     I <- I + J %*% t(J)
   }
 
   if (verbose) {
     cat("\n")
-    if (S <= 100) cat("Consider increasing the iterations, using the `EFIM.S` argument!\n")
+    if (S <= 100) message("Consider increasing the Monte-Carlo Monte-Carloiterations, using the `EFIM.S` argument!")
   }
 
   I / S
@@ -199,37 +211,50 @@ calcOFIM_QML <- function(model, theta, data, hessian = FALSE,
 
 
 calcEFIM_QML <- function(model, finalModel = NULL, theta, data, S = 100,
-                         parametric = TRUE, epsilon = 1e-8, verbose = FALSE) {
-  N <- nrow(model$data)
+                         parametric = TRUE, epsilon = 1e-8, verbose = FALSE,
+                         R.max = 1e6) {
+  N      <- nrow(model$data)
+  R.ceil <- N * S > R.max
+  R      <- ifelse(R.ceil, yes = R.max, no = N * S) 
+
+  if (R < N) {
+    warning2("Population size is smaller than sample size, please increase it using the `R.max` argument!")
+    R <- N
+  }
 
   if (parametric) {
     if (is.null(finalModel)) stop2("finalModel must be included in calcEFIM_QML")
     parTable <- modelToParTable(finalModel, method = "qml")
     population <- tryCatch(
-      simulateDataParTable(parTable, N = N * S, colsOVs = colnames(data))$oV,
+      simulateDataParTable(parTable, N = R, colsOVs = colnames(data))$oV,
       error = function(e) {
         warning2("Unable to simulate data for EFIM, using stochastic sampling instead")
         calcEFIM_QML(model = model, theta = theta, data = data, S = S, 
                      parametric = FALSE, epsilon = epsilon)
-      })
-  } else {
-    population <- data[sample(N * S, N * S, replace = TRUE), ]
-  }
+      }
+    )
+
+  } else population <- data[sample(R, N, replace = TRUE), ]
+
 
   I <- matrix(0, nrow = length(theta), ncol = length(theta))
   for (i in seq_len(S)) {
     if (verbose) printf("\rMonte-Carlo: Iteration = %d/%d", i, S)
-    n1 <- (i - 1) * N + 1
-    nn <- n1 + N - 1
+
+    if (!R.ceil) {
+      n1  <- (i - 1) * N + 1
+      nn  <- n1 + N - 1
+      sub <- n1:nn
+    } else sub <- sample(R, N)
 
     J <- gradientLogLikQml(theta = theta, model = model, sign = 1, epsilon = epsilon,
-                           data = population[n1:nn, ])
+                           data = population[sub, ])
     I <- I + J %*% t(J)
   }
 
   if (verbose) {
     cat("\n")
-    if (S <= 100) cat("Consider increasing the iterations, using the `EFIM.S` argument!\n")
+    if (S <= 100) message("Consider increasing the Monte-Carlo iterations, using the `EFIM.S` argument!")
   }
 
   I / S

diff --git a/R/est_lms.R b/R/est_lms.R
@@ -13,6 +13,7 @@ emLms <- function(model,
                   epsilon = 1e-6,
                   optimizer = "nlminb",
                   fix.estep = TRUE,
+                  R.max = 1e6,
                   ...) {
   data <- model$data
   stopif(anyNA(data), "Remove or replace missing values from data")
@@ -106,7 +107,7 @@ emLms <- function(model,
                     hessian = OFIM.hessian, calc.se = calc.se,
                     EFIM.parametric = EFIM.parametric, verbose = verbose,
                     FIM = FIM, robust.se = robust.se, epsilon = epsilon,
-                    NA__ = -999)
+                    R.max = R.max, NA__ = -999)
   SE <- calcSE_da(calc.se = calc.se, FIM$vcov, rawLabels = FIM$raw.labels,
                   NA__ = -999)
   modelSE <- getSE_Model(model, se = SE, method = "lms",

diff --git a/R/est_qml.R b/R/est_qml.R
@@ -10,6 +10,7 @@ estQml <- function(model,
                    robust.se = FALSE,
                    epsilon = 1e-6,
                    optimizer = "nlminb",
+                   R.max = 1e6,
                    ...) {
   startTheta <- model$theta
   final <- mstepQml(model = model, theta = startTheta, max.iter = max.iter,
@@ -36,7 +37,7 @@ estQml <- function(model,
                     hessian = OFIM.hessian, calc.se = calc.se,
                     EFIM.parametric = EFIM.parametric, verbose = verbose,
                     FIM = FIM, robust.se = robust.se, NA__ = -999,
-                    epsilon = epsilon)
+                    epsilon = epsilon, R.max = R.max)
   SE <- calcSE_da(calc.se = calc.se, FIM$vcov, rawLabels = FIM$raw.labels,
                   NA__ = -999)
   modelSE <- getSE_Model(model, se = SE, method = "qml",

diff --git a/R/method_settings_da.R b/R/method_settings_da.R
@@ -17,6 +17,7 @@ getMethodSettingsDA <- function(method, args = NULL) {
                    EFIM.S = 100,
                    EFIM.parametric = TRUE,
                    robust.se = FALSE,
+                   R.max = 1e6,
                    max.iter = 500,
                    max.step = 1,
                    fix.estep = TRUE,
@@ -40,6 +41,7 @@ getMethodSettingsDA <- function(method, args = NULL) {
                    EFIM.S = 100,
                    EFIM.parametric = TRUE,
                    robust.se = FALSE,
+                   R.max = 1e6,
                    max.iter = 500,
                    max.step = NULL,
                    fix.estep = NULL,

diff --git a/R/modsem_da.R b/R/modsem_da.R
@@ -72,6 +72,9 @@
 #' from the model), or non-parametrically (stochastically sampled)? If you believe that
 #' normality assumptions are violated, \code{EFIM.parametric = FALSE} might be the better option.
 #'
+#' @param R.max Maximum population size (not sample size) used in the calculated of the expected
+#' fischer information matrix.
+#'
 #' @param robust.se should robust standard errors be computed? Meant to be used for QML,
 #' can be unreliable with the LMS approach.
 #'
@@ -178,6 +181,7 @@ modsem_da <- function(model.syntax = NULL,
                       OFIM.hessian = NULL,
                       EFIM.parametric = NULL,
                       robust.se = NULL,
+                      R.max = NULL,
                       max.iter = NULL,
                       max.step = NULL,
                       fix.estep = NULL,
@@ -221,6 +225,7 @@ modsem_da <- function(model.syntax = NULL,
           OFIM.hessian = OFIM.hessian,
           EFIM.parametric = EFIM.parametric,
           robust.se = robust.se,
+          R.max = R.max,
           max.iter = max.iter,
           max.step = max.step,
           fix.estep = fix.estep,
@@ -230,12 +235,10 @@ modsem_da <- function(model.syntax = NULL,
         )
     )
 
-  if (!method %in% c("lms", "qml")) {
-    stop2("Method must be either 'lms' or 'qml'")
-  }
+  stopif(!method %in% c("lms", "qml"), "Method must be either 'lms' or 'qml'")
 
   if (args$center.data) {
-    data <- lapplyDf(data, FUN = function(x) x - mean(x))
+    data <- lapplyDf(data, FUN = function(x) x - mean(x, na.rm = TRUE))
   }
   if (args$standardize.data) {
     data <- lapplyDf(data, FUN = scaleIfNumeric, scaleFactor = FALSE)
@@ -280,6 +283,7 @@ modsem_da <- function(model.syntax = NULL,
       max.iter = args$max.iter,
       epsilon = args$epsilon,
       optimizer = args$optimizer,
+      R.max = args$R.max,
       ...
     ),
     "lms" = emLms(model,
@@ -296,6 +300,7 @@ modsem_da <- function(model.syntax = NULL,
       epsilon = args$epsilon,
       optimizer = args$optimizer,
       fix.estep = args$fix.estep,
+      R.max = args$R.max,
       ...
     )
   )

diff --git a/man/modsem_da.Rd b/man/modsem_da.Rd
diff --git a/tests/testthat/test_lms.R b/tests/testthat/test_lms.R
@@ -14,7 +14,7 @@ m1 <- "
 "
 
 est1 <- modsem(m1, oneInt, method = "lms", optimize = TRUE, verbose = TRUE,
-               convergence = 1e-2)
+               convergence = 1e-2, R.max = 50000)
 plot_interaction("X", "Z", "Y", "X:Z", -3:3, c(-0.5, 0.5), est1)
 print(summary(est1, adjusted.stat = TRUE))