diff --git a/README.md b/README.md
index d025daa..891ef63 100644
--- a/README.md
+++ b/README.md
@@ -19,12 +19,32 @@ The parametric Waiting Time Distribution has been developed to provide a data-dr
 This implementation of estimation procedures mimics that found in the corresponding Stata package (wtdttt). The model is fit using maximum likelihood estimation.
 
+## Example
+
+```R
+library(haven)
+df <- read_dta(system.file("extdata", "ranwtddat_discdates.dta", package="Rwtdttt"))
+
+fit_r <- ranwtdttt(data = df,
+                   rxdate ~ dlnorm(logitp, mu, lnsigma),
+                   id = "pid",
+                   start = as.Date('2014-01-01'),
+                   end = as.Date('2014-12-31'),
+                   reverse = T, robust = F
+)
+
+summary(fit_r)
+```
+Please see [examples.R](sandpit/examples.R) for more.
+
 ## More
 
 ### Known bugs
 
 Robust variance calculation, used by default in the ranwtdttt() function, is very slow. We are working on improving this.
 
+Other planned improvements are listed in [TODO](sandpit/TODO).
+
 ### Bug reports, feature requests etc.
 
 This is a young project with some rough edges. Please submit any bug reports, feature requests, and comments as a GitHub issue.
 
@@ -40,6 +60,8 @@ The package was developed by:
 * Malcolm Gillies
 * Olga Paoletti
 
+Thank you to Jesper Hallas for kindly allowing us to reproduce the synthetic dispensing data in the `drugpakud.dta` dataset.
+
 ### License
 
 Rwtdttt is licensed under the GNU General Public License, Version 3.
diff --git a/inst/extdata/score_ex.xlsx b/inst/extdata/score_ex.xlsx
new file mode 100644
index 0000000..9190808
Binary files /dev/null and b/inst/extdata/score_ex.xlsx differ
diff --git a/sandpit/d1functions.R b/sandpit/d1functions.R
deleted file mode 100644
index 5defc42..0000000
--- a/sandpit/d1functions.R
+++ /dev/null
@@ -1,60 +0,0 @@
-#' Derivative of WTD log-likelihood wrt omega (=logitp)
-
-d1domega <- function(dfun, x, logitp, param1, param2 = 0, delta = 1) {
-  prob <- exp(logitp) / (1 + exp(logitp))
-
-
-  if (dfun == dlnorm) {
-    mu <- param1
-    sigma <- exp(param2)
-    mean <- exp(mu + exp(2 * lnsigma)/2)
-
-    gfun <- pnorm(log(x), mu, sigma, lower.tail = FALSE) / mean
-  }
-
-  if (dfun == dweib) {
-    beta <- exp(param1)
-    alpha <- exp(param2)
-
-    gfun <- exp(-((x*exp(lnbeta))^exp(lnalpha)) -
-                  lgamma(1+1/exp(lnalpha))) * exp(lnbeta)
-  }
-
-  if (dfun == dexp) {
-    beta <- exp(param1)
-    gfun <- exp(-((x*exp(lnbeta))) ) * exp(lnbeta)
-  }
-
-
-  d1 <- prob * (delta * gfun - 1) / (delta * gfun * exp(logitp) + 1)
-
-  return(d1lomega = d1)
-}
-
-#' Derivative of WTD log-likelihood wrt param1 (and param2)
-
-d1dtheta <- function(dfun, logitp, param1, param2 = 0, delta = 1) {
-  prob <- exp(logitp) / (1 + exp(logitp))
-
-  if (dfun == dlnorm) {
-    mu <- param1
-    # s is also known as lnsigma
-    s <- param2
-    sigma <- exp(param2)
-    mean <- exp(mu + exp(2 * lnsigma)/2)
-
-    gfun <- pnorm(log(x), mu, sigma, lower.tail = FALSE) / mean
-
-
-    y <- exp(-s) * (log(x) - mu)
-
-    d1FRDdmu <-(exp(-s) * dnorm(-y) - pnorm(-y)) / mean
-    d1FRDds <- (exp(-s) * y * dnorm(-y) - exp(2 * s) * pnorm(-y)) / mean
-
-    d1ldmu <- d1FRDmu / (gfun + exp(-logitp) + 1)
-    d1lds < d1FRDs / (gfun + exp(-logitp) + 1)
-    return(d1lmu = d1ldmu, d1lnsigma = d1lds)
-  }
-
-
-}
diff --git a/sandpit/try2.R b/sandpit/examples.R
similarity index 99%
rename from sandpit/try2.R
rename to sandpit/examples.R
index 758d822..b78aeab 100644
--- a/sandpit/try2.R
+++ b/sandpit/examples.R
@@ -1,7 +1,7 @@
 # Quick test
 
 library(bbmle)
-library(Rwtdttt)
+#library(Rwtdttt)
 library(haven)
 library(data.table)
 library(tidyverse)
@@ -71,7 +71,6 @@ summary(fit1)
 df <-
   haven::read_dta(system.file("extdata", "wtddat_covar.dta", package="Rwtdttt"))
 ### fit waiting time distribution (exp)
-# XXXX need to change wtdttt(... subset= ...) to use non-standard evaluation
 fit3 <- wtdttt(data = df,
                last_rxtime ~ dexp(logitp, lnbeta),
                start = 0, end = 1, reverse = T, subset = packsize==200
diff --git a/sandpit/good_practises.R b/sandpit/good_practises.R
deleted file mode 100644
index 3211826..0000000
--- a/sandpit/good_practises.R
+++ /dev/null
@@ -1,31 +0,0 @@
-#######################################################
-#                                                     #
-# Implementing good practises for creating a package  #
-#                                                     #
-#######################################################
-
-# why we moved all .R files out of the R folder?
-
-# open a new .R file within the R folder, where the new function will be stored
-use_r("dlnorm")
-# makes functions available, although it does not exist in the global environment
-load_all()
-# check if function exists in the Global Environment
-exists("dlnorm", where = globalenv(), inherits = FALSE)
-
-# If you see TRUE instead of FALSE, that indicates you’re still using a script-oriented workflow and sourcing your functions. Here’s how to get back on track:
-
-# 1) Clean out the global environment and restart R.
-# 2) Re-attach devtools with library(devtools) and re-load regexcite with load_all().
-# 3) Redefine the test input x and call strsplit1(x, split = ",") again. This should work!
-# 4) Run exists("strsplit1", where = globalenv(), inherits = FALSE) again and you should see FALSE
-
-# gold standard for checking that an R package is in full working order
-check()
-
-# converting dlnorm()’s special comment into man/dlnorm.Rd
-# updates the NAMESPACE file, based on @export tags found in roxygen comments
-document()
-
-# preview of the help file
-help(dlnorm)
diff --git a/sandpit/mlwtdttt_lnorm.R b/sandpit/mlwtdttt_lnorm.R
deleted file mode 100644
index f5e0d3d..0000000
--- a/sandpit/mlwtdttt_lnorm.R
+++ /dev/null
@@ -1,6 +0,0 @@
-mlwtdttt_lnorm <- function(logitp, mu, lnsigma) {
-
-  a <- -sum(log(plogis(logitp) * pnorm(-(log(obstime)-mu)/exp(lnsigma))/exp(mu + exp(2*lnsigma)/2)+(plogis(-logitp))/delta))
-  return(a)
-
-}
diff --git a/sandpit/sandwich_estimation_code_malcolm.R b/sandpit/sandwich_estimation_code_malcolm.R
deleted file mode 100644
index c78d645..0000000
--- a/sandpit/sandwich_estimation_code_malcolm.R
+++ /dev/null
@@ -1,80 +0,0 @@
-# proof of concept sandwich estimation
-
-library(haven)
-library(bbmle)
-library(numDeriv)
-
-
-tmp <- read_dta("inst/extdata/score_ex.dta")
-
-tmp$packcat <- factor(tmp$packsize)
-
-delta <- 1
-
-dlnorm2 <- function(x, logitp, mu, lnsigma, log = TRUE) {
-
-  prob <- exp(logitp) / (1 + exp(logitp))
-  sigma <- exp(lnsigma)
-  mean <- exp(mu + exp(2 * lnsigma)/2)
-
-  if (log == FALSE) {
-
-    prob * pnorm(log(x), mu, sigma, lower.tail = FALSE) / mean + (1 - prob) / delta
-
-  } else {
-
-    log(prob * pnorm(log(x), mu, sigma, lower.tail = FALSE) / mean + (1 - prob) / delta)
-
-  }
-
-}
-
-
-fit1 <- mle2(rxtime ~ dlnorm2(logitp, mu, lnsigma),
-             parameters = list(logitp ~ packcat, mu ~ packcat, lnsigma ~ packcat),
-             start = list(logitp = 0, mu = log(delta/5), lnsigma = 0),
-             data = tmp)
-
-summary(fit1)
-
-score_mat <- t(sapply(1:nrow(tmp),
-
-  function(i) {
-
-    logl <- function(x) {
-
-      mm <- model.matrix(~packcat, data=tmp)[i,]
-
-      dlnorm2(as.numeric(tmp[i, "rxtime"]),
-
-        logitp=t(x[1:2]) %*% mm,
-
-        mu=t(x[3:4]) %*% mm,
-
-        lnsigma=t(x[5:6]) %*% mm,
-
-        log=T)
-
-    }
-
-    grad(logl, fit1@coef)
-
-  }
-
-))
-
-colnames(score_mat) <- names(fit1@coef)
-
-bread <- fit1@vcov
-
-applyTapplySum <- function(X,index) apply(X, 2, function(col) tapply(col, index, sum))
-
-psi <- applyTapplySum(as.matrix(score_mat), tmp$pid) # sum by cluster
-
-meat <- crossprod(as.matrix(psi))
-
-rownames(meat) <- colnames(meat) <- colnames(psi)
-
-sand <- (NROW(unique(tmp$pid))/(NROW(unique(tmp$pid))-1)) * (bread %*% meat %*% bread)
-
-sand
diff --git a/sandpit/try.R b/sandpit/try.R
deleted file mode 100644
index 12b5149..0000000
--- a/sandpit/try.R
+++ /dev/null
@@ -1,97 +0,0 @@
-rm(list = ls())
-
-library(data.table)
-library(tidyverse)
-library(haven)
-library(bbmle)
-
-# load data
-df <- haven::read_dta("data/wtddat_dates.dta")
-df <- as.data.table(df)
-
-# create time variable
-df_2 <- create_time(data = df, event.date.colname = rx1time, start = as.Date('2014-01-01'))
-
-
-# fit waiting time distribution
-fit1 <- wtdttt(data = df_2,
-               obstime ~ dlnorm(logitp, mu, lnsigma),
-               event.date.colname = rx1time,
-               event.time.colname = obstime,
-               start = as.Date('2014-01-01'), end = as.Date('2014-12-31'),
-               init = list(mu = 4.23, lnsigma = -1.26),
-               )
-
-###### ERROR
-# if the user provides the init argument, the following error is thrown:
-# Error in minuslogl(logitp = 1.65, logitp = 1.61280168302535, mu = 4.23, :
-#   formal argument "logitp" matched by multiple actual arguments
-#
-# SOLVED removing lpinit as element of the list argument 'init'
-#
-# N.B. fit1@fullcoef includes the estimates of three parameters (logitp, mu, lnsigma) if init values are NOT provided by the user
-# fit2@fullcoef includes the estimates of two parameters (mu, lnsigma) if init values ARE provided by the user
-#
-# commented Henrik rows in wtdttt on redefinition of density functions
-# run again external dlnorm function in dfunctions.R (with delta=365 in the definition of the function)
-
-#############
-
-summary(fit1)
-
-predict(fit1, family = "lnorm")
-predict(fit1, family = "lnorm", type = "prob", distrx=c(2,100))
-
-plot(fit1)
-
-########################### RANDOM INEDX DATE
-
-# by hand
-
-# create sampling window and define a random index date for each individual
-set.seed(84)
-
-df_r <- df %>%
-  mutate(r_index_date = sample(as.Date(as.Date("2014-01-01"):as.Date("2014-12-31")), nrow(df), replace=T),
-         obstime = as.numeric(rx1time - r_index_date))
-
-# remove dispensations not within the observation window
-df_sel <- df_r %>%
-  filter(obstime>=0)
-
-########## ERROR
-# i) if init values provided:
-# Error in minuslogl(logitp = 1.76, logitp = 1.76026080216868, mu = 4.23, :
-#   formal argument "logitp" matched by multiple actual arguments
-#
-# --> i) SOLVED: revoming lpinit from the init values provided
-#
-# ii) if not provided:
-# Error in optim(par = c(logitp = 1.76026080216868, mu = -Inf, lnsigma = NaN :
-#   valore non finito passato da optim
-###########
-
-# calling the created function
-df_sel <- create_time_random(data = df, event.date.colname = rx1time)
-
-###########
-
-# fit waiting time distribution
-fit2 <- wtdttt(data = df_sel,
-               obstime ~ dlnorm(logitp, mu, lnsigma),
-               event.date.colname = rx1time,
-               event.time.colname = obstime,
-               start = as.Date('2014-01-01'), end = as.Date('2014-12-31'),
-               init = list(mu = 4.23, lnsigma = -1.26)
-)
-
-summary(fit2)
-
-predict(fit2, family = "lnorm")
-predict(fit2, family = "lnorm", type = "prob", distrx=c(2,100))
-
-plot(fit2)
-
-################
-
-
diff --git a/sandpit/try_sandwich.R b/sandpit/try_sandwich.R
deleted file mode 100644
index 19f9767..0000000
--- a/sandpit/try_sandwich.R
+++ /dev/null
@@ -1,43 +0,0 @@
-# proof of concept sandwich estimation
-
-library(bbmle)
-library(Rwtdttt)
-library(haven)
-library(data.table)
-library(readxl)
-
-# load data
-# test data for sandwich from Henrik
-tmp <- haven::read_dta(system.file("extdata", "score_ex.dta", package="Rwtdttt"))
-
-tmp$packcat <- factor(tmp$packsize)
-
-cl <- tmp[order(tmp$pid),]
-
-fit1 <- wtdttt(data = cl,
-               rxtime ~ dlnorm(logitp, mu, lnsigma),
-               parameters = list(logitp ~ packcat, mu ~ packcat, lnsigma ~ packcat),
-               #id = "pid", # do this to stop wtdttt() from filtering the data
-               start = 0, end = 1
-)
-
-summary(fit1)
-
-fit1@idvar <- "pid" # XXXX nasty hack to add the ids back in again for the sandwich
-
-Rwtdttt:::sand_vcov(fit1)
-
-# Equivalent Stata output
-
-# . matrix list Dcl
-#
-# symmetric Dcl[6,6]
-#                   logitp:     logitp:         mu:         mu:    lnsigma:    lnsigma:
-#                    ipack       _cons       ipack       _cons       ipack       _cons
-#  logitp:ipack   .61015454
-#  logitp:_cons  -.2340662    .2340662
-#      mu:ipack   .01835992  -.00759435   .04215239
-#      mu:_cons  -.00759435   .00759435  -.032557     .032557
-# lnsigma:ipack   .0696015   -.045143    -.03004152   .02833014   .12762532
-# lnsigma:_cons  -.045143     .045143     .02833014  -.02833014  -.07785109   .07785109
-
diff --git a/sandpit/wtd_bbmle_try.R b/sandpit/wtd_bbmle_try.R
deleted file mode 100644
index 617857c..0000000
--- a/sandpit/wtd_bbmle_try.R
+++ /dev/null
@@ -1,62 +0,0 @@
-library(tidyverse)
-library(bbmle)
-library(haven)
-
-
-wtddat <- read_dta("data/wtddat_covar.dta")
-
-wtddat <- mutate(wtddat,
-                 wtdtimes = 1 - last_rxtime,
-                 packcat = factor(packsize))
-
-delta <- 1
-
-dwtdttt <- function(x, logitp, par1, par2, log = FALSE, family = "lnorm") {
-
-  if (family=="lnorm") {
-    prob <- exp(logitp) / (1 + exp(logitp))
-    sigma <- exp(par2)
-    mean <- exp(par1 + exp(2 * par2)/2)
-
-    if (log == FALSE) {
-      prob * pnorm(log(x), par1, sigma, lower.tail = FALSE) / mean + (1 - prob) / delta
-    } else {
-      log(prob * pnorm(log(x), par1, sigma, lower.tail = FALSE) / mean + (1 - prob) / delta)
-    }
-
-  } else if (family=="weibull") {
-    prob <- exp(logitp) / (1 + exp(logitp))
-    alpha <- exp(par1)
-    beta <- exp(par2)
-
-    if (log == FALSE) {
-      prob * exp(-((x*exp(par2))^exp(par1)) - lgamma(1+1/exp(par1))) * exp(par2) + (1-prob)/delta
-    } else {
-      log(prob * exp(-((x*exp(par2))^exp(par1)) - lgamma(1+1/exp(par1))) * exp(par2) + (1-prob)/delta)
-    }
-  } else if (family=="exponential") {
-    prob <- exp(logitp) / (1 + exp(logitp))
-    alpha <- par1^0
-    beta <- exp(par2)
-
-    if (log == FALSE) {
-      prob * exp(-((x*exp(par2))^alpha) - lgamma(1+1/alpha)) * exp(par2) + (1-prob)/delta
-    } else {
-      log(prob * exp(-((x*exp(par2))^alpha) - lgamma(1+1/alpha)) * exp(par2) + (1-prob)/delta)
-    }
-  }
-}
-
-## Testing the density function - does it make sense?
-wtddat$wtddens <- dwtdttt(wtddat$wtdtimes,
-                          logitp = 1, par1 = log(.15), par2 = -0.4, log = FALSE, family = "exponential")
-ggplot(data = wtddat) +
-  geom_point(mapping = aes(x=wtdtimes, y=wtddens))
-
-
-## Covariates can now be included (SIC!!!)
-fit1 <- mle2(wtdtimes ~ dwtdttt(logitp, par1, par2, family = "exponential"),
-             parameters = list(logitp ~ packcat, par1 ~ packcat, par2 ~ 1),
-             start = list(logitp = 0, par1 = log(delta/5), par2 = 0),
-             data = wtddat)
-summary(fit1)
diff --git a/sandpit/wtd_class_test.R b/sandpit/wtd_class_test.R
deleted file mode 100644
index 0312f69..0000000
--- a/sandpit/wtd_class_test.R
+++ /dev/null
@@ -1,45 +0,0 @@
-# Rwtdttt
-# Combine Henrik, Malcolm & Sabrina's progress so far
-# Create a "wtd" model class and use it to set up a 'predict' method
-
-# Test it
-
-library(tidyverse)
-library(bbmle)
-library(haven)
-
-x <- read_dta("data/wtddat_dates.dta")
-
-# XXXX a bunch of preprocessing that should actually be
-# implemented in wtdttt()
-
-start <- as.Date('2014-01-01'); end<-as.Date('2014-12-31')
-x$obstime <- 0.5 + as.double(x$rx1time - start, units="days")
-
-delta <- as.double(end - start, units="days") + 1
-ntot <- nrow(x)
-nonprevend <- sum(x$obstime > (delta * 2/3))
-prp <- 1 - 3 * nonprevend / ntot
-lpinit <- qlogis(prp)
-
-muinit <- mean(log(x$obstime[x$obstime < 0.5 * delta]))
-lnsinit <- log(sd(log(x$obstime[x$obstime < 0.5 * delta])))
-
-# ###############
-
-x.w <- wtdttt(obstime ~ dlnorm(logitp, mu, lnsigma), data=x,
-              start=as.Date('2014-01-01'), end=as.Date('2014-12-31'), reverse=F,
-              init=list(logitp=lpinit, mu = muinit, lnsigma = lnsinit))
-
-summary(x.w)
-
-# Predict duration at default quantile=0.8
-predict(x.w)
-
-# Predict probability
-predict(x.w, type="prob", distrx=c(10,100))
-
-# Diagnostic plot
-plot(x.w)
-
-
diff --git a/sandpit/wtdttt.R b/sandpit/wtdttt.R
deleted file mode 100644
index 53a677f..0000000
--- a/sandpit/wtdttt.R
+++ /dev/null
@@ -1,50 +0,0 @@
-if (!require("zoo")) install.packages("zoo")
-library(zoo)
-if (!require("haven")) install.packages("haven")
-library(haven)
-if (!require("tidyverse")) install.packages("tidyverse")
-library(tidyverse)
-
-wtdttt <- function(time, start, end) {
-
-  # continuity correction for discrete data
-  tstart <- as.numeric(start) - 0.5
-  tend <- as.numeric(end) + 0.5
-  delta <- tend - tstart
-
-  # if(reverse==TRUE) {
-  #   obstime <- as.numeric(as.Date(tend)-as.Date(time)) }
-  # else {
-  #   obstime <- as.numeric(as.Date(time)-as.Date(tstart))
-  # }
-
-  time <- as.numeric(time)
-  obstime <- as.numeric(as.Date(time)-as.Date(tstart))
-
-  # initial values
-  ntot <- length(time)
-  nonprevend <- sum(obstime > (delta*2/3))
-  prp <- 1 - 3*nonprevend/ntot
-  lpinit <- log((prp)/(1-prp)) # logit
-
-  # df <- df %>%
-  #   mutate(logtime = case_when(
-  #     obstime
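
Note on the robust variance machinery prototyped in the deleted scripts above: `sandwich_estimation_code_malcolm.R` assembles the cluster-robust estimate in the standard way. Per-observation score vectors (numerical gradients of each observation's log-likelihood at the MLE) are summed within clusters, the cross-product of those cluster sums forms the "meat", and the model-based variance-covariance matrix forms the "bread". A minimal base-R sketch of that final assembly follows; the function name `sandwich_vcov` and its arguments are illustrative placeholders, not Rwtdttt API.

```R
# Minimal sketch of cluster-robust ("sandwich") variance assembly, mirroring
# the deleted proof-of-concept code. Assumed inputs (placeholders, not API):
#   score_mat: n x p matrix of per-observation score contributions
#   vc:        p x p model-based variance-covariance matrix (the "bread")
#   id:        length-n vector of cluster identifiers (e.g. patient ids)
sandwich_vcov <- function(score_mat, vc, id) {
  psi  <- rowsum(score_mat, group = id)  # sum score rows within each cluster
  meat <- crossprod(psi)                 # "meat": cross-product of cluster sums
  G    <- nrow(psi)                      # number of clusters
  (G / (G - 1)) * (vc %*% meat %*% vc)   # small-sample-corrected sandwich
}
```

The slowness flagged under Known bugs most likely comes from building the score matrix (one numerical gradient per observation, via numDeriv in the deleted script) rather than from this final matrix algebra, which is cheap.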