diff --git a/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/CITATION b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/CITATION index f4e73712e..61386be8b 100644 --- a/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/CITATION +++ b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/CITATION @@ -3,11 +3,11 @@ citHeader("To cite StanHeaders in publications use:") citEntry(entry = "Misc", title = "{StanHeaders}: Headers for the {R} interface to {Stan}", author = person(given = "Stan Development Team"), - note = "R package version 2.19.0", - year = "2019", + note = "R package version 2.18.0", + year = "2018", url = "http://mc-stan.org/", textVersion = - paste("Stan Development Team (2019).", + paste("Stan Development Team (2018).", title = "StanHeaders: Headers for the {R} interface to {Stan}.", "http://mc-stan.org/.") ) diff --git a/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/DESCRIPTION b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/DESCRIPTION index fd0690bbc..9f5d4528a 100644 --- a/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/DESCRIPTION +++ b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/DESCRIPTION @@ -1,5 +1,5 @@ Package: StanHeaders -Date: 2019-09-05 +Date: 2020-06-09 Title: C++ Header Files for Stan Authors@R: c(person("Ben",family="Goodrich", email="benjamin.goodrich@columbia.edu", role=c('cre','aut')), person("Joshua", "Pritikin", role = "ctb"), @@ -29,16 +29,18 @@ Authors@R: c(person("Ben",family="Goodrich", email="benjamin.goodrich@columbia.e person("The Regents of the", "University of California", role = "cph", comment = "CVODES"), person("Southern Methodist", "University", role = "cph", comment = "CVODES")) URL: http://mc-stan.org/ -Description: The C++ header files of the Stan project are provided by this package, but it contains no R code or function documentation. 
There is a shared object containing part of the 'CVODES' library, but it is not accessible from R. 'StanHeaders' is only useful for developers who want to utilize the 'LinkingTo' directive of their package's DESCRIPTION file to build on the Stan library without incurring unnecessary dependencies. The Stan project develops a probabilistic programming language that implements full or approximate Bayesian statistical inference via Markov Chain Monte Carlo or 'variational' methods and implements (optionally penalized) maximum likelihood estimation via optimization. The Stan library includes an advanced automatic differentiation scheme, 'templated' statistical and linear algebra functions that can handle the automatically 'differentiable' scalar types (and doubles, 'ints', etc.), and a parser for the Stan language. The 'rstan' package provides user-facing R functions to parse, compile, test, estimate, and analyze Stan models. -Suggests: Rcpp, RcppEigen, BH, knitr (>= 1.15.1), rmarkdown, Matrix, - methods, rstan +Description: The C++ header files of the Stan project are provided by this package, but it contains little R code or documentation. The main reference is the vignette. There is a shared object containing part of the 'CVODES' library, but its functionality is not accessible from R. 'StanHeaders' is only useful for developers who want to utilize the 'LinkingTo' directive of their package's DESCRIPTION file to build on the Stan library without incurring unnecessary dependencies. The Stan project develops a probabilistic programming language that implements full or approximate Bayesian statistical inference via Markov Chain Monte Carlo or 'variational' methods and implements (optionally penalized) maximum likelihood estimation via optimization. 
The Stan library includes an advanced automatic differentiation scheme, 'templated' statistical and linear algebra functions that can handle the automatically 'differentiable' scalar types (and doubles, 'ints', etc.), and a parser for the Stan language. The 'rstan' package provides user-facing R functions to parse, compile, test, estimate, and analyze Stan models. +Imports: RcppParallel (>= 5.0.1) +Suggests: Rcpp, BH, knitr (>= 1.15.1), rmarkdown, Matrix, methods, + rstan +LinkingTo: RcppEigen, RcppParallel (>= 5.0.1) VignetteBuilder: knitr SystemRequirements: pandoc Depends: R (>= 3.4.0) -Version: 2.19.0 +Version: 2.21.0-5 License: BSD_3_clause + file LICENSE NeedsCompilation: yes -Packaged: 2019-09-05 18:28:20 UTC; ben +Packaged: 2020-06-09 07:05:55 UTC; ben Author: Ben Goodrich [cre, aut], Joshua Pritikin [ctb], Andrew Gelman [aut], @@ -68,5 +70,5 @@ Author: Ben Goodrich [cre, aut], Southern Methodist University [cph] (CVODES) Maintainer: Ben Goodrich Repository: CRAN -Date/Publication: 2019-09-07 07:00:17 UTC -Built: R 3.6.0; x86_64-apple-darwin15.6.0; 2019-09-08 13:01:16 UTC; unix +Date/Publication: 2020-06-09 14:30:10 UTC +Built: R 3.6.1; x86_64-apple-darwin15.6.0; 2020-06-26 11:59:42 UTC; unix diff --git a/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/INDEX b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/INDEX new file mode 100644 index 000000000..bc52e8c52 --- /dev/null +++ b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/INDEX @@ -0,0 +1 @@ +CxxFlags Compilation flags for StanHeaders diff --git a/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/Meta/Rd.rds b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/Meta/Rd.rds index 6461be8bf..3ff41e22d 100644 Binary files a/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/Meta/Rd.rds and b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/Meta/Rd.rds differ diff --git 
a/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/Meta/features.rds b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/Meta/features.rds index 46b41cc5d..195fa1095 100644 Binary files a/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/Meta/features.rds and b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/Meta/features.rds differ diff --git a/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/Meta/hsearch.rds b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/Meta/hsearch.rds index 229ed48fd..9ac2db792 100644 Binary files a/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/Meta/hsearch.rds and b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/Meta/hsearch.rds differ diff --git a/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/Meta/links.rds b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/Meta/links.rds index 2837b0b0f..68d5c8aeb 100644 Binary files a/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/Meta/links.rds and b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/Meta/links.rds differ diff --git a/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/Meta/nsInfo.rds b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/Meta/nsInfo.rds index 45a2a6dc5..c28c2be05 100644 Binary files a/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/Meta/nsInfo.rds and b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/Meta/nsInfo.rds differ diff --git a/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/Meta/package.rds b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/Meta/package.rds index 8fdf53b30..8630c571d 100644 Binary files a/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/Meta/package.rds and 
b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/Meta/package.rds differ diff --git a/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/Meta/vignette.rds b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/Meta/vignette.rds index 206a54af8..f743a2a4f 100644 Binary files a/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/Meta/vignette.rds and b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/Meta/vignette.rds differ diff --git a/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/NAMESPACE b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/NAMESPACE index 71f4b1ce8..d12bde30e 100644 --- a/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/NAMESPACE +++ b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/NAMESPACE @@ -1,2 +1,3 @@ if(.Platform$OS.type == "windows") useDynLib(StanHeaders, .registration = TRUE) - +importFrom(RcppParallel, RcppParallelLibs) +#export(CxxFlags, LdFlags) diff --git a/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/R/StanHeaders b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/R/StanHeaders new file mode 100644 index 000000000..3b65e3cbb --- /dev/null +++ b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/R/StanHeaders @@ -0,0 +1,27 @@ +# File share/R/nspackloader.R +# Part of the R package, http://www.R-project.org +# +# Copyright (C) 1995-2012 The R Core Team +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for more details. +# +# A copy of the GNU General Public License is available at +# http://www.r-project.org/Licenses/ + +local({ + info <- loadingNamespaceInfo() + pkg <- info$pkgname + ns <- .getNamespace(as.name(pkg)) + if (is.null(ns)) + stop("cannot find namespace environment for ", pkg, domain = NA); + dbbase <- file.path(info$libname, pkg, "R", pkg) + lazyLoad(dbbase, ns, filter = function(n) n != ".__NAMESPACE__.") +}) diff --git a/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/R/StanHeaders.rdb b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/R/StanHeaders.rdb new file mode 100644 index 000000000..85650007e Binary files /dev/null and b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/R/StanHeaders.rdb differ diff --git a/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/R/StanHeaders.rdx b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/R/StanHeaders.rdx new file mode 100644 index 000000000..479f7ae57 Binary files /dev/null and b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/R/StanHeaders.rdx differ diff --git a/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/doc/stanmath.R b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/doc/stanmath.R index 9302ede32..1a07ac0ca 100644 --- a/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/doc/stanmath.R +++ b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/doc/stanmath.R @@ -1,4 +1,4 @@ -## ----setup, include = FALSE---------------------------------------------- +## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" @@ -6,7 +6,15 @@ knitr::opts_chunk$set( Sys.setenv(USE_CXX14 = "1") set.seed(12345) -## ------------------------------------------------------------------------ +## 
----------------------------------------------------------------------------- +Sys.setenv(PKG_CXXFLAGS = StanHeaders:::CxxFlags(as_character = TRUE)) +SH <- system.file(ifelse(.Platform$OS.type == "windows", "libs", "lib"), .Platform$r_arch, + package = "StanHeaders", mustWork = TRUE) +Sys.setenv(PKG_LIBS = paste0(StanHeaders:::LdFlags(as_character = TRUE), + " -L", shQuote(SH), " -lStanHeaders")) + + +## ----------------------------------------------------------------------------- x <- optim(rnorm(3), fn = f, gr = g, a = 1:3, method = "BFGS", hessian = TRUE) x$par x$hessian @@ -14,32 +22,26 @@ H(x$par, a = 1:3) J(x$par, a = 1:3) solution(a = 1:3, guess = rnorm(3)) -## ------------------------------------------------------------------------ -StanHeaders_pkg_libs <- system.file(ifelse(.Platform$OS.type == "windows", "libs", "lib"), - .Platform$r_arch, package = "StanHeaders") -Sys.setenv(PKG_LIBS = paste(paste0("-L", shQuote(StanHeaders_pkg_libs)), "-lStanHeaders")) - -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- all.equal(1, Cauchy(rexp(1)), tol = 1e-15) -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- A <- matrix(c(0, -1, 1, -runif(1)), nrow = 2, ncol = 2) y0 <- rexp(2) all.equal(nonstiff(A, y0), c(0, 0), tol = 1e-5) all.equal( stiff(A, y0), c(0, 0), tol = 1e-8) -## ------------------------------------------------------------------------ -Sys.setenv(PKG_CXXFLAGS = "-DSTAN_THREADS") +## ----------------------------------------------------------------------------- Sys.setenv(STAN_NUM_THREADS = 2) # specify -1 to use all available cores -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- odd <- seq.int(from = 2^25 - 1, to = 2^26 - 1, by = 2) 
tail(psapply(n = as.list(odd))) == 1 # check your process manager while this is running -## ---- echo = FALSE, comment = ""----------------------------------------- +## ---- echo = FALSE, comment = ""---------------------------------------------- cat(readLines("sparselm_stan.hpp"), sep = "\n") -## ---- message = FALSE---------------------------------------------------- +## ---- message = FALSE--------------------------------------------------------- library(Rcpp) tf <- tempfile(fileext = "Module.cpp") exposeClass("sparselm_stan", @@ -50,24 +52,24 @@ exposeClass("sparselm_stan", rename = c(log_prob = "log_prob<>", gradient = "gradient<>"), header = c("// [[Rcpp::depends(BH)]]", "// [[Rcpp::depends(RcppEigen)]]", + "// [[Rcpp::depends(RcppParallel)]", "// [[Rcpp::depends(StanHeaders)]]", "// [[Rcpp::plugins(cpp14)]]", paste0("#include <", file.path(getwd(), "sparselm_stan.hpp"), ">")), file = tf, Rfile = FALSE) -Sys.unsetenv("PKG_CXXFLAGS") # don't specify -DSTAN_THREADS if you are not using them -Sys.setenv(PKG_CPPFLAGS = paste0("-I", +Sys.setenv(PKG_CXXFLAGS = paste0(Sys.getenv("PKG_CXXFLAGS"), " -I", system.file("include", "src", package = "StanHeaders", mustWork = TRUE))) sourceCpp(tf) sparselm_stan -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- dd <- data.frame(a = gl(3, 4), b = gl(4, 1, 12)) X <- Matrix::sparse.model.matrix(~ a + b, data = dd) X -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- sm <- new(sparselm_stan, X = X, y = rnorm(nrow(X))) sm$log_prob(c(beta = rnorm(ncol(X)), log_sigma = log(pi))) round(sm$gradient(c(beta = rnorm(ncol(X)), log_sigma = log(pi))), digits = 4) diff --git a/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/doc/stanmath.Rmd 
b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/doc/stanmath.Rmd index 557580d6f..0b4b43663 100644 --- a/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/doc/stanmath.Rmd +++ b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/doc/stanmath.Rmd @@ -25,25 +25,27 @@ set.seed(12345) The **StanHeaders** package contains no R functions. To use the Stan Math Library in other packages, it is often sufficient to specify ``` -LinkingTo: StanHeaders (>= 2.19.0) +LinkingTo: StanHeaders (>= 2.21.0), RcppParallel (>= 5.0.1) ``` -in the DESCRIPTION file of another package and +in the DESCRIPTION file of another package and put something like ``` CXX_STD = CXX14 +PKG_CXXFLAGS = $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "RcppParallel::CxxFlags()") \ + $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "StanHeaders:::CxxFlags()") +PKG_LIBS = $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "RcppParallel::RcppParallelLibs()") \ + $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "StanHeaders:::LdFlags()") ``` -in the src/Makevars and src/Makevars.win files. If, in addition, the other package needs to utilize +in the src/Makevars and src/Makevars.win files and put `GNU make` in the `SystemRequirements:` +field of the package's DESCRIPTION file. 
If, in addition, the other package needs to utilize the MCMC, optimization, variational inference, or parsing facilities of the Stan Library, then it is -also necessary to include the `src` directory of **StanHeaders** in the other package's `PKG_CPPFLAGS` +also necessary to include the `src` directory of **StanHeaders** in the other package's `PKG_CXXFLAGS` in the src/Makevars and src/Makevars.win files with something like ``` STANHEADERS_SRC = $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "message()" \ -e "cat(system.file('include', 'src', package = 'StanHeaders', mustWork = TRUE))" \ -e "message()" | grep "StanHeaders") -PKG_CPPFLAGS = -I"$(STANHEADERS_SRC)" +PKG_CXXFLAGS += -I"$(STANHEADERS_SRC)" ``` -and put `GNU make` in the `SystemRequirements:` field of the package's DESCRIPTION file. Finally, -if integrating systems of stiff Ordinary Differential Equations, then you need to link to the -shared library of StanHeaders as is shown below. # Using Higher-Order Functions in the **StanHeaders** Package @@ -63,17 +65,27 @@ could reconceptualize the problem as solving a homogeneous system of equations w equal to a vector of zeros. The `stan::math::algebra_solver` function can solve such a system using autodifferentiation to obtain the Jacobian, which we know to be the identity matrix in this case. +```{r} +Sys.setenv(PKG_CXXFLAGS = StanHeaders:::CxxFlags(as_character = TRUE)) +SH <- system.file(ifelse(.Platform$OS.type == "windows", "libs", "lib"), .Platform$r_arch, + package = "StanHeaders", mustWork = TRUE) +Sys.setenv(PKG_LIBS = paste0(StanHeaders:::LdFlags(as_character = TRUE), + " -L", shQuote(SH), " -lStanHeaders")) + +``` + Here is C++ code that does all of the above, except for the part of finding the optimum, which is done using the R function `optim` below. 
```{Rcpp} // [[Rcpp::depends(BH)]] // [[Rcpp::depends(RcppEigen)]] +// [[Rcpp::depends(RcppParallel)]] // [[Rcpp::depends(StanHeaders)]] #include // stuff from fwd/ must come first #include // then stuff from mix/ must come next #include // finally pull in everything from rev/ & prim/ #include -#include +#include // do this AFTER including stan/math // [[Rcpp::plugins(cpp14)]] @@ -162,22 +174,15 @@ solution(a = 1:3, guess = rnorm(3)) The Stan Math library can do one-dimensional numerical integration and can solve stiff and non-stiff systems of differential equations, such as the harmonic oscillator example -below. Solving stiff systems utilizes the CVODES package, which is included in **StanHeaders**, -but requires linking against its shared library with the following R code. - -```{r} -StanHeaders_pkg_libs <- system.file(ifelse(.Platform$OS.type == "windows", "libs", "lib"), - .Platform$r_arch, package = "StanHeaders") -Sys.setenv(PKG_LIBS = paste(paste0("-L", shQuote(StanHeaders_pkg_libs)), "-lStanHeaders")) -``` - +below. Solving stiff systems utilizes the CVODES library, which is included in **StanHeaders**. 
```{Rcpp} // [[Rcpp::depends(BH)]] // [[Rcpp::depends(RcppEigen)]] +// [[Rcpp::depends(RcppParallel)]] // [[Rcpp::depends(StanHeaders)]] #include // pulls in everything from rev/ and prim/ #include -#include +#include // do this AFTER including stan/math // [[Rcpp::plugins(cpp14)]] @@ -201,11 +206,13 @@ auto nonstiff(Eigen::MatrixXd A, Eigen::VectorXd y0) { using stan::math::to_vector; using stan::math::to_array_1d; std::vector theta; + std::vector times = {1, 2}; auto y = integrate_ode_rk45([&A](auto t, auto y, auto theta, auto x_r, auto x_i, std::ostream *msgs) { return to_array_1d( (A * to_vector(y)).eval() ); - }, to_array_1d(y0), 0, {1, 2}, theta, {}, {}); - return (to_vector(y[0]) - stan::math::matrix_exp(A) * y0).eval(); + }, to_array_1d(y0), 0, times, theta, {}, {}); + Eigen::VectorXd truth = stan::math::matrix_exp(A) * y0; + return (to_vector(y[0]) - truth).eval(); // should be "zero" } // [[Rcpp::export]] @@ -214,11 +221,13 @@ auto stiff(Eigen::MatrixXd A, Eigen::VectorXd y0) { // not actually stiff using stan::math::to_vector; using stan::math::to_array_1d; std::vector theta; + std::vector times = {1, 2}; auto y = integrate_ode_bdf([&A](auto t, auto y, auto theta, auto x_r, auto x_i, std::ostream *msgs) { return to_array_1d( (A * to_vector(y)).eval() ); - }, to_array_1d(y0), 0, {1, 2}, theta, {}, {}); - return (to_vector(y[0]) - stan::math::matrix_exp(A) * y0).eval(); + }, to_array_1d(y0), 0, times, theta, {}, {}); + Eigen::VectorXd truth = stan::math::matrix_exp(A) * y0; + return (to_vector(y[0]) - truth).eval(); // should be "zero" } ``` @@ -262,7 +271,6 @@ function. 
However, `map_rect` can also be executed in parallel by defining the p directive `STAN_THREADS` and then setting the `STAN_NUM_THREADS` environmental variable to be the number of threads to use, as in ```{r} -Sys.setenv(PKG_CXXFLAGS = "-DSTAN_THREADS") Sys.setenv(STAN_NUM_THREADS = 2) # specify -1 to use all available cores ``` @@ -271,10 +279,11 @@ and running it in parallel via `map_rect`. ```{Rcpp} // [[Rcpp::depends(BH)]] // [[Rcpp::depends(RcppEigen)]] +// [[Rcpp::depends(RcppParallel)]] // [[Rcpp::depends(StanHeaders)]] #include // pulls in everything from rev/ and prim/ #include -#include +#include // do this AFTER including stan/math // [[Rcpp::plugins(cpp14)]] @@ -360,13 +369,13 @@ exposeClass("sparselm_stan", rename = c(log_prob = "log_prob<>", gradient = "gradient<>"), header = c("// [[Rcpp::depends(BH)]]", "// [[Rcpp::depends(RcppEigen)]]", + "// [[Rcpp::depends(RcppParallel)]", "// [[Rcpp::depends(StanHeaders)]]", "// [[Rcpp::plugins(cpp14)]]", paste0("#include <", file.path(getwd(), "sparselm_stan.hpp"), ">")), file = tf, Rfile = FALSE) -Sys.unsetenv("PKG_CXXFLAGS") # don't specify -DSTAN_THREADS if you are not using them -Sys.setenv(PKG_CPPFLAGS = paste0("-I", +Sys.setenv(PKG_CXXFLAGS = paste0(Sys.getenv("PKG_CXXFLAGS"), " -I", system.file("include", "src", package = "StanHeaders", mustWork = TRUE))) sourceCpp(tf) diff --git a/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/doc/stanmath.html b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/doc/stanmath.html index 3fe786927..3f9279112 100644 --- a/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/doc/stanmath.html +++ b/Frameworks/R.framework/Versions/3.6/Resources/library/StanHeaders/doc/stanmath.html @@ -1,25 +1,25 @@ - + - + - + - + Using the Stan Math C++ Library - + @@ -96,9 +118,6 @@ font-size: 14px; line-height: 1.35; } -#header { -text-align: center; -} #TOC { clear: both; margin: 0 0 10px 10px; @@ -266,9 +285,13 @@ code > span.co { color: 
#888888; font-style: italic; } code > span.ot { color: #007020; } code > span.al { color: #ff0000; font-weight: bold; } -code > span.fu { color: #900; font-weight: bold; } code > span.er { color: #a61717; background-color: #e3d2d2; } +code > span.fu { color: #900; font-weight: bold; } +code > span.er { color: #a61717; background-color: #e3d2d2; } + + + @@ -277,23 +300,26 @@

Using the Stan Math C++ Library

-

Stan Development Team

-

2019-09-05

+

Stan Development Team

+

2020-06-09

Using the StanHeaders Package from Other R Packages

The StanHeaders package contains no R functions. To use the Stan Math Library in other packages, it is often sufficient to specify

-
LinkingTo: StanHeaders (>= 2.19.0)
-

in the DESCRIPTION file of another package and

-
CXX_STD = CXX14
-

in the src/Makevars and src/Makevars.win files. If, in addition, the other package needs to utilize the MCMC, optimization, variational inference, or parsing facilities of the Stan Library, then it is also necessary to include the src directory of StanHeaders in the other package’s PKG_CPPFLAGS in the src/Makevars and src/Makevars.win files with something like

+
LinkingTo: StanHeaders (>= 2.21.0), RcppParallel (>= 5.0.1)
+

in the DESCRIPTION file of another package and put something like

+
CXX_STD = CXX14
+PKG_CXXFLAGS = $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "RcppParallel::CxxFlags()") \
+               $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "StanHeaders:::CxxFlags()")
+PKG_LIBS = $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "RcppParallel::RcppParallelLibs()") \
+           $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "StanHeaders:::LdFlags()")
+

in the src/Makevars and src/Makevars.win files and put GNU make in the SystemRequirements: field of the package’s DESCRIPTION file. If, in addition, the other package needs to utilize the MCMC, optimization, variational inference, or parsing facilities of the Stan Library, then it is also necessary to include the src directory of StanHeaders in the other package’s PKG_CXXFLAGS in the src/Makevars and src/Makevars.win files with something like

STANHEADERS_SRC = $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "message()" \
   -e "cat(system.file('include', 'src', package = 'StanHeaders', mustWork = TRUE))" \
   -e "message()" | grep "StanHeaders")
-PKG_CPPFLAGS = -I"$(STANHEADERS_SRC)"
-

and put GNU make in the SystemRequirements: field of the package’s DESCRIPTION file. Finally, if integrating systems of stiff Ordinary Differential Equations, then you need to link to the shared library of StanHeaders as is shown below.

+PKG_CXXFLAGS += -I"$(STANHEADERS_SRC)"

Using Higher-Order Functions in the StanHeaders Package

@@ -301,161 +327,170 @@

Using Higher-Order Functions in the StanHeaders Package

Derivatives and Minimization

The following is a toy example of using the Stan Math library via Rcpp::sourceCpp: to minimize the function \[\left(\mathbf{x} - \mathbf{a}\right)^\top \left(\mathbf{x} - \mathbf{a}\right)\] which has a global minimum when \(\mathbf{x} = \mathbf{a}\). To find this minimum with autodifferentiation, we need to define the objective function. Then, its gradient with respect to \(\mathbf{x}\), which we know is \(2\left(\mathbf{x} - \mathbf{a}\right)\) in this case, can be calculated by autodifferentiation. At the optimum (or on the way to the optimum), we might want to evaluate the Hessian matrix, which we know is \(2\mathbf{I}\), but would need an additional function to evaluate it via autodifferentiation. Finally, one could reconceptualize the problem as solving a homogeneous system of equations where the gradient is set equal to a vector of zeros. The stan::math::algebra_solver function can solve such a system using autodifferentiation to obtain the Jacobian, which we know to be the identity matrix in this case.

+

Here is C++ code that does all of the above, except for the part of finding the optimum, which is done using the R function optim below.

-
// [[Rcpp::depends(BH)]]
-// [[Rcpp::depends(RcppEigen)]]
-// [[Rcpp::depends(StanHeaders)]]
-#include <stan/math/fwd/mat/fun/dot_self.hpp>    // stuff from fwd/ must come first
-#include <stan/math/mix/mat/functor/hessian.hpp> // then stuff from mix/ must come next
-#include <stan/math.hpp>                         // finally pull in everything from rev/ & prim/
-#include <Rcpp.h>
-#include <RcppEigen.h>
-
-// [[Rcpp::plugins(cpp14)]]
-
-/* Objective function */
-
-// [[Rcpp::export]]
-auto f(Eigen::VectorXd x, Eigen::VectorXd a) { // objective function in doubles
-  using stan::math::dot_self;                  // dot_self() is a dot product with self
-  return dot_self( (x - a).eval() );           // .eval() yields a Eigen::VectorXd
-}
-
-/* Gradient */
-
-// [[Rcpp::export]]
-auto g(Eigen::VectorXd x, Eigen::VectorXd a) {  // gradient by AD using Stan
-  double fx;
-  Eigen::VectorXd grad_fx;
-  using stan::math::dot_self;
-  stan::math::gradient([&a](auto x) { return dot_self( (x - a).eval() ); },
-                       x, fx, grad_fx);
-  return grad_fx;
-}
-
-/* Hessian */
-
-// [[Rcpp::export]]
-auto H(Eigen::VectorXd x, Eigen::VectorXd a) { // Hessian by AD using Stan
-  double fx;
-  Eigen::VectorXd grad_fx;
-  Eigen::MatrixXd H;
-  using stan::math::dot_self;
-  using stan::math::subtract; // necessary to get the type promotion correct
-  stan::math::hessian([&a](auto x) { return dot_self(subtract(x, a)); },
-                      x, fx, grad_fx, H);
-  return H;
-}
-
-/* Jacobian */
-
-// [[Rcpp::export]]
-auto J(Eigen::VectorXd x, Eigen::VectorXd a) { // not actually used
-  Eigen::VectorXd fx;
-  Eigen::MatrixXd J;
-  using stan::math::dot_self;
-  stan::math::jacobian([&a](auto x) {
-    return (2 * (x - a)).eval();
-  }, x, fx, J);
-  return J;
-}
-
-struct equations_functor {
-  template <typename T0, typename T1>
-  inline Eigen::Matrix<T0, Eigen::Dynamic, 1>
-  operator()(const Eigen::Matrix<T0, Eigen::Dynamic, 1>& x,
-             const Eigen::Matrix<T1, Eigen::Dynamic, 1>& theta,
-             const std::vector<double>& x_r, const std::vector<int>& x_i,
-             std::ostream* pstream__) const {
-    return 2 * (x - stan::math::to_vector(x_r)).eval();
-  }
-};
-
-// [[Rcpp::export]]
-auto solution(Eigen::VectorXd a, Eigen::VectorXd guess) {
-  Eigen::VectorXd theta;
-  auto x_r = stan::math::to_array_1d(a);
-  equations_functor f;
-  auto x = stan::math::algebra_solver(f, guess, theta, x_r, {});
-  return x;
-}
+
// [[Rcpp::depends(BH)]]
+// [[Rcpp::depends(RcppEigen)]]
+// [[Rcpp::depends(RcppParallel)]]
+// [[Rcpp::depends(StanHeaders)]]
+#include <stan/math/fwd/mat/fun/dot_self.hpp>    // stuff from fwd/ must come first
+#include <stan/math/mix/mat/functor/hessian.hpp> // then stuff from mix/ must come next
+#include <stan/math.hpp>                         // finally pull in everything from rev/ & prim/
+#include <Rcpp.h>
+#include <RcppEigen.h>                           // do this AFTER including stan/math
+
+// [[Rcpp::plugins(cpp14)]]
+
+/* Objective function */
+
+// [[Rcpp::export]]
+auto f(Eigen::VectorXd x, Eigen::VectorXd a) { // objective function in doubles
+  using stan::math::dot_self;                  // dot_self() is a dot product with self
+  return dot_self( (x - a).eval() );           // .eval() yields a Eigen::VectorXd
+}
+
+/* Gradient */
+
+// [[Rcpp::export]]
+auto g(Eigen::VectorXd x, Eigen::VectorXd a) {  // gradient by AD using Stan
+  double fx;
+  Eigen::VectorXd grad_fx;
+  using stan::math::dot_self;
+  stan::math::gradient([&a](auto x) { return dot_self( (x - a).eval() ); },
+                       x, fx, grad_fx);
+  return grad_fx;
+}
+
+/* Hessian */
+
+// [[Rcpp::export]]
+auto H(Eigen::VectorXd x, Eigen::VectorXd a) { // Hessian by AD using Stan
+  double fx;
+  Eigen::VectorXd grad_fx;
+  Eigen::MatrixXd H;
+  using stan::math::dot_self;
+  using stan::math::subtract; // necessary to get the type promotion correct
+  stan::math::hessian([&a](auto x) { return dot_self(subtract(x, a)); },
+                      x, fx, grad_fx, H);
+  return H;
+}
+
+/* Jacobian */
+
+// [[Rcpp::export]]
+auto J(Eigen::VectorXd x, Eigen::VectorXd a) { // not actually used
+  Eigen::VectorXd fx;
+  Eigen::MatrixXd J;
+  using stan::math::dot_self;
+  stan::math::jacobian([&a](auto x) {
+    return (2 * (x - a)).eval();
+  }, x, fx, J);
+  return J;
+}
+
+struct equations_functor {
+  template <typename T0, typename T1>
+  inline Eigen::Matrix<T0, Eigen::Dynamic, 1>
+  operator()(const Eigen::Matrix<T0, Eigen::Dynamic, 1>& x,
+             const Eigen::Matrix<T1, Eigen::Dynamic, 1>& theta,
+             const std::vector<double>& x_r, const std::vector<int>& x_i,
+             std::ostream* pstream__) const {
+    return 2 * (x - stan::math::to_vector(x_r)).eval();
+  }
+};
+
+// [[Rcpp::export]]
+auto solution(Eigen::VectorXd a, Eigen::VectorXd guess) {
+  Eigen::VectorXd theta;
+  auto x_r = stan::math::to_array_1d(a);
+  equations_functor f;
+  auto x = stan::math::algebra_solver(f, guess, theta, x_r, {});
+  return x;
+}

In this compiled RMarkdown document, the knitr package has exported functions f, g, H, J and solution (but not equations_functor) to R’s global environment using the sourceCpp function in the Rcpp package, so that they can now be called from R. Here we find the optimum starting from a random point in three dimensions:

- +

Integrals and Ordinary Differential Equations

-

The Stan Math library can do one-dimensional numerical integration and can solve stiff and non-stiff systems of differential equations, such as the harmonic oscillator example below. Solving stiff systems utilizes the CVODES package, which is included in StanHeaders, but requires linking against its shared library with the following R code.

- +

The Stan Math library can do one-dimensional numerical integration and can solve stiff and non-stiff systems of differential equations, such as the harmonic oscillator example below. Solving stiff systems utilizes the CVODES library, which is included in StanHeaders.

// [[Rcpp::depends(BH)]]
 // [[Rcpp::depends(RcppEigen)]]
-// [[Rcpp::depends(StanHeaders)]]
-#include <stan/math.hpp>                         // pulls in everything from rev/ and prim/
-#include <Rcpp.h>
-#include <RcppEigen.h>
-
-// [[Rcpp::plugins(cpp14)]]
-
-/* Definite integrals */
-
-// [[Rcpp::export]]
-double Cauchy(double scale) {
-  std::vector<double> theta;
-  auto half = stan::math::integrate_1d([](auto x, auto xc, auto theta,
-                                          auto x_r, auto x_i, auto msgs) {
-    return exp(stan::math::cauchy_lpdf(x, 0, x_r[0]));
-  }, -scale, scale, theta, {scale}, {}, Rcpp::Rcout, 1e-7);
-  return half * 2; // should equal 1 for any positive scale
-}
-
-/* Ordinary Differential Equations */
-
-// [[Rcpp::export]]
-auto nonstiff(Eigen::MatrixXd A, Eigen::VectorXd y0) {
-  using stan::math::integrate_ode_rk45;
-  using stan::math::to_vector;
-  using stan::math::to_array_1d;
-  std::vector<double> theta;
-  auto y = integrate_ode_rk45([&A](auto t, auto y, 
-                                   auto theta, auto x_r, auto x_i, std::ostream *msgs) {
-    return to_array_1d( (A * to_vector(y)).eval() );
-  }, to_array_1d(y0), 0, {1, 2}, theta, {}, {});
-  return (to_vector(y[0]) - stan::math::matrix_exp(A) * y0).eval();
-}
-
-// [[Rcpp::export]]
-auto stiff(Eigen::MatrixXd A, Eigen::VectorXd y0) { // not actually stiff
-  using stan::math::integrate_ode_bdf;              // but use the stiff solver anyways
-  using stan::math::to_vector;
-  using stan::math::to_array_1d;
-  std::vector<double> theta;
-  auto y = integrate_ode_bdf([&A](auto t, auto y, 
-                                  auto theta, auto x_r, auto x_i, std::ostream *msgs) {
-    return to_array_1d( (A * to_vector(y)).eval() );
-  }, to_array_1d(y0), 0, {1, 2}, theta, {}, {});
-  return (to_vector(y[0]) - stan::math::matrix_exp(A) * y0).eval();
-}
+// [[Rcpp::depends(RcppParallel)]] +// [[Rcpp::depends(StanHeaders)]] +#include <stan/math.hpp> // pulls in everything from rev/ and prim/ +#include <Rcpp.h> +#include <RcppEigen.h> // do this AFTER including stan/math + +// [[Rcpp::plugins(cpp14)]] + +/* Definite integrals */ + +// [[Rcpp::export]] +double Cauchy(double scale) { + std::vector<double> theta; + auto half = stan::math::integrate_1d([](auto x, auto xc, auto theta, + auto x_r, auto x_i, auto msgs) { + return exp(stan::math::cauchy_lpdf(x, 0, x_r[0])); + }, -scale, scale, theta, {scale}, {}, Rcpp::Rcout, 1e-7); + return half * 2; // should equal 1 for any positive scale +} + +/* Ordinary Differential Equations */ + +// [[Rcpp::export]] +auto nonstiff(Eigen::MatrixXd A, Eigen::VectorXd y0) { + using stan::math::integrate_ode_rk45; + using stan::math::to_vector; + using stan::math::to_array_1d; + std::vector<double> theta; + std::vector<double> times = {1, 2}; + auto y = integrate_ode_rk45([&A](auto t, auto y, + auto theta, auto x_r, auto x_i, std::ostream *msgs) { + return to_array_1d( (A * to_vector(y)).eval() ); + }, to_array_1d(y0), 0, times, theta, {}, {}); + Eigen::VectorXd truth = stan::math::matrix_exp(A) * y0; + return (to_vector(y[0]) - truth).eval(); // should be "zero" +} + +// [[Rcpp::export]] +auto stiff(Eigen::MatrixXd A, Eigen::VectorXd y0) { // not actually stiff + using stan::math::integrate_ode_bdf; // but use the stiff solver anyways + using stan::math::to_vector; + using stan::math::to_array_1d; + std::vector<double> theta; + std::vector<double> times = {1, 2}; + auto y = integrate_ode_bdf([&A](auto t, auto y, + auto theta, auto x_r, auto x_i, std::ostream *msgs) { + return to_array_1d( (A * to_vector(y)).eval() ); + }, to_array_1d(y0), 0, times, theta, {}, {}); + Eigen::VectorXd truth = stan::math::matrix_exp(A) * y0; + return (to_vector(y[0]) - truth).eval(); // should be "zero" +}

Again, in this compiled RMarkdown document, the knitr package has exported the Cauchy, nonstiff and stiff functions to R’s global environment using the sourceCpp function in the Rcpp package so that they can be called from R.

First, we numerically integrate the Cauchy PDF over its interquartile range — which has an area of \(\frac{1}{2}\) — that we then double to verify that it is almost within machine precision of \(1\).

all.equal(1, Cauchy(rexp(1)), tol = 1e-15)
@@ -472,57 +507,57 @@ 

Integrals and Ordinary Differential Equations

Map and Parallelization

The Stan Math Library includes the map_rect function, which applies a function to each element of rectangular arrays and returns a vector, making it a bit like a restricted version of R’s sapply function. However, map_rect can also be executed in parallel by defining the pre-processor directive STAN_THREADS and then setting the STAN_NUM_THREADS environmental variable to be the number of threads to use, as in

- +

Below is C++ code to test whether an integer is prime, using a rather brute-force algorithm and running it in parallel via map_rect.

+// [[Rcpp::depends(RcppParallel)]] +// [[Rcpp::depends(StanHeaders)]] +#include <stan/math.hpp> // pulls in everything from rev/ and prim/ +#include <Rcpp.h> +#include <RcppEigen.h> // do this AFTER including stan/math + +// [[Rcpp::plugins(cpp14)]] + +// see https://en.wikipedia.org/wiki/Primality_test#Pseudocode +struct is_prime { + is_prime() {} + template <typename T1, typename T2> + auto + operator()(const Eigen::Matrix<T1, Eigen::Dynamic, 1>& eta, + const Eigen::Matrix<T2, Eigen::Dynamic, 1>& theta, + const std::vector<double>& x_r, const std::vector<int>& x_i, + std::ostream* msgs = 0) const { + Eigen::VectorXd res(1); // can only return double or var vectors + int n = x_i[0]; + if (n <= 3) { + res.coeffRef(0) = n > 1; + return res; + } else if ( (n % 2 == 0) || (n % 3 == 0) ) { + res.coeffRef(0) = false; + return res; + } + int i = 5; + while (i * i <= n) { + if ( (n % i == 0) || (n % (i + 2) == 0) ) { + res.coeffRef(0) = false; + return res; + } + i += 6; + } + res.coeffRef(0) = true; + return res; + } +}; + +/* parallelization */ +// [[Rcpp::export]] +auto psapply(std::vector<std::vector<int> > n) { + std::vector<Eigen::VectorXd> eta(n.size()); // these all have to be the same size + Eigen::VectorXd theta; + std::vector<std::vector<double> > x_d(n.size()); + return stan::math::map_rect<0, is_prime>(theta, eta, x_d, n, &Rcpp::Rcout); +}

Since the signature for n is a std::vector<std::vector<int> >, we have to pass it from R as a list (which is converted to the outer std::vector<>) of integer vectors (which is converted to the inner std::vector<int>) that happen to be of size one in this case.

odd <- seq.int(from = 2^25 - 1, to = 2^26 - 1, by = 2)
 tail(psapply(n = as.list(odd))) == 1 # check your process manager while this is running
@@ -606,18 +641,18 @@ 

Defining a Stan Model in C++

rename = c(log_prob = "log_prob<>", gradient = "gradient<>"), header = c("// [[Rcpp::depends(BH)]]", "// [[Rcpp::depends(RcppEigen)]]", - "// [[Rcpp::depends(StanHeaders)]]", - "// [[Rcpp::plugins(cpp14)]]", - paste0("#include <", file.path(getwd(), "sparselm_stan.hpp"), ">")), - file = tf, - Rfile = FALSE) -Sys.unsetenv("PKG_CXXFLAGS") # don't specify -DSTAN_THREADS if you are not using them -Sys.setenv(PKG_CPPFLAGS = paste0("-I", + "// [[Rcpp::depends(RcppParallel)]", + "// [[Rcpp::depends(StanHeaders)]]", + "// [[Rcpp::plugins(cpp14)]]", + paste0("#include <", file.path(getwd(), "sparselm_stan.hpp"), ">")), + file = tf, + Rfile = FALSE) +Sys.setenv(PKG_CXXFLAGS = paste0(Sys.getenv("PKG_CXXFLAGS"), " -I", system.file("include", "src", package = "StanHeaders", mustWork = TRUE))) sourceCpp(tf) sparselm_stan -#> C++ class 'sparselm_stan' <0x55bd465987a0> +#> C++ class 'sparselm_stan' <0x55d576b50470> #> Constructors: #> sparselm_stan(Eigen::Map<Eigen::SparseMatrix<double, 0, int>, 0, Eigen::Stride<0, 0> >, Eigen::Matrix<double, -1, 1, 0, -1, 1>) #> @@ -658,6 +693,9 @@

Defining a Stan Model in C++

+ + + @@ -269,6 +291,9 @@ code > span.fu { color: #900; font-weight: bold; } code > span.er { color: #a61717; background-color: #e3d2d2; } + + + @@ -276,9 +301,9 @@ -

MRP in rstanarm

-

Lauren Kennedy and Jonah Gabry

-

2019-10-01

+

MRP with rstanarm

+

Lauren Kennedy and Jonah Gabry

+

2020-02-11

-

Inference about the population is one the main aims of statistical methodology. Multilevel regression and post-stratification (MRP) (Little 1993; Lax and Phillips 2009; Park, Gelman, and Bafumi 2004) has been shown to be an effective method of adjusting the sample so that it is representative of the population for a set of key variables. Recent work has demonstrated the effectiveness of MRP when there are a number of suspected interactions between these variables (Ghitza and Gelman 2013), replicated by Lei, Gelman, and Ghitza (2017). While Ghitza and Gelman (2013) use approximate marginal maximum likelihood estimates; Lei, Gelman, and Ghitza (2017) implement a fully Bayesian approach through Stan.

+

Inference about the population is one of the main aims of statistical methodology. Multilevel regression and post-stratification (MRP) (Little 1993; Lax and Phillips 2009; Park, Gelman, and Bafumi 2004) has been shown to be an effective method of adjusting the sample to be more representative of the population for a set of key variables. Recent work has demonstrated the effectiveness of MRP when there are a number of suspected interactions between these variables (Ghitza and Gelman 2013), replicated by Lei, Gelman, and Ghitza (2017). While Ghitza and Gelman (2013) use approximate marginal maximum likelihood estimates, Lei, Gelman, and Ghitza (2017) implement a fully Bayesian approach through Stan.

The rstanarm package allows the user to conduct complicated regression analyses in Stan with the simplicity of standard formula notation in R. The purpose of this vignette is to demonstrate the utility of rstanarm when conducting MRP analyses. We will not delve into the details of conducting logistic regression with rstanarm as this is already covered in other vignettes.

Most of the code for data manipulation and plotting is not shown in the text but is available in the R markdown source code on GitHub.

The Data

-

Three data sets are simulated by the function simulate_mrp_data(), which is defined in the source code for this R markdown document (and printed in the appendix). The first, sample, contains \(n\) observations from the individuals that form our sample (i.e., \(n\) rows). For each individual we have their age (recorded as membership within a specific age bracket), ethnicity, income level (recorded as membership within a specific bracket), and male/gender. Participants were randomly sampled from a state. The outcome variable of interest is a binary variable (MRP can also be used with a continuous variable outcome variable). Oftentimes this is the outcome of a two option fixed choice question (for example McCain’s share of two party vote (Ghitza and Gelman 2013); support for George W Bush, (Park, Gelman, and Bafumi 2004); or support for the death penalty (Shirley and Gelman 2015)).

-

As this is a simple toy example, we will describe the proportion of the population who would choose to adopt a cat over a dog, given the opportunity. We will simulate data using a function that is included in the appendix of this document. This function simulates sample from a much larger population. It returns a list including the sample, population poststratification matrix and the true population preference for cats.

+

Three data sets are simulated by the function simulate_mrp_data(), which is defined in the source code for this R markdown document (and printed in the appendix). The first, sample, contains \(n\) observations from the individuals that form our sample (i.e., \(n\) rows). For each individual we have their age (recorded as membership within a specific age bracket), ethnicity, income level (recorded as membership within a specific bracket), and gender. Participants were randomly sampled from a state.

+

MRP is often used for dichotomous fixed choice questions (e.g., McCain’s share of two party vote (Ghitza and Gelman 2013); support for George W Bush, (Park, Gelman, and Bafumi 2004); or support for the death penalty (Shirley and Gelman 2015)), so we will use a binary variable as the outcome in this vignette. However, MRP can also be used if there are more than two categories or if the outcome is continuous.

+

As this is a simple toy example, we will describe the proportion of the population who would choose to adopt a cat over a dog, given the opportunity. We will simulate data using a function that is included in the appendix of this document. The simulate_mrp_data() function simulates a sample from a much larger population. It returns a list including the sample, population poststratification matrix and the true population preference for cats.

- -

The variables describing the individual (age, ethnicity, income level and gender) will be used to match the sample to the population of interest. To do this we will need to form a post-stratification table, which contains the number of people in each possible combination of post-stratification variable. As we have 4 variables with 2 (male), 7 (age), 3 (ethnicity) and 3 (income) levels, there are 2x7x3x3 different levels. Participants are also selected from a state (50), increasing the number of possible levels to \(6300\).

-

To make inference about the population, we will also need the proportion of the population in each post stratification cell at the population level. We will use this information to update the estimate of our outcome variable from the sample so that is more representative of the population. This is particularly helpful if there is a belief that the sample has some bias (i.e., a greater proportion of females responded than males), and that bias impacts the outcome variable (i.e, women are more likely to adopt a cat than men). For each possible combination of factors, the post-stratification table shows the proportion/number of the population in that cell (rather than the proportion/number in the sample in the cell). Below we read in the poststrat data our simulated data list.

- -

One of the benefits of using a simulated data set for this example is that the actual, population level probability of cat preference is known for each post-stratification cell. In real world data analysis, we don’t have this luxury, but we will use it later in this case study to check the predictions of the model. Details regarding the simulation of this data are available in the appendix.

- +
List of 3
+ $ sample   :'data.frame':  1200 obs. of  7 variables:
+  ..$ cat_pref: num [1:1200] 1 1 1 1 0 0 1 1 1 1 ...
+  ..$ male    : num [1:1200] 0 0 0 0 0 0 0 1 0 0 ...
+  ..$ age     : num [1:1200] 5 6 3 6 1 5 7 6 5 7 ...
+  ..$ eth     : num [1:1200] 3 1 2 1 1 1 3 3 2 3 ...
+  ..$ income  : num [1:1200] 3 1 2 1 3 1 2 1 1 1 ...
+  ..$ state   : num [1:1200] 19 45 21 47 12 38 2 20 11 34 ...
+  ..$ id      : num [1:1200] 1 2 3 4 5 6 7 8 9 10 ...
+ $ poststrat:'data.frame':  6300 obs. of  6 variables:
+  ..$ male  : num [1:6300] 0 0 0 0 0 0 0 0 0 0 ...
+  ..$ eth   : num [1:6300] 1 1 1 1 1 1 1 1 1 1 ...
+  ..$ age   : num [1:6300] 1 1 1 1 1 1 1 1 1 1 ...
+  ..$ income: num [1:6300] 1 1 1 1 1 1 1 1 1 1 ...
+  ..$ state : num [1:6300] 1 2 3 4 5 6 7 8 9 10 ...
+  ..$ N     : num [1:6300] 103741 104862 164704 133049 167578 ...
+ $ true_popn:'data.frame':  6300 obs. of  6 variables:
+  ..$ male    : num [1:6300] 0 0 0 0 0 0 0 0 0 0 ...
+  ..$ eth     : num [1:6300] 1 1 1 1 1 1 1 1 1 1 ...
+  ..$ age     : num [1:6300] 1 1 1 1 1 1 1 1 1 1 ...
+  ..$ income  : num [1:6300] 1 1 1 1 1 1 1 1 1 1 ...
+  ..$ state   : num [1:6300] 1 2 3 4 5 6 7 8 9 10 ...
+  ..$ cat_pref: num [1:6300] 0.5 0.426 0.269 0.574 0.332 ...
+ +
     cat_pref male age eth income state   id
+1           1    0   5   3      3    19    1
+2           1    0   6   1      1    45    2
+3           1    0   3   2      2    21    3
+4           1    0   6   1      1    47    4
+5           0    0   1   1      3    12    5
+6           0    0   5   1      1    38    6
+1195        0    0   6   3      2    21 1195
+1196        1    0   3   3      1    46 1196
+1197        1    0   5   1      2    48 1197
+1198        0    1   1   1      1    14 1198
+1199        0    0   1   3      1    12 1199
+1200        0    1   3   2      2    12 1200
+

The variables describing the individual (age, ethnicity, income level and gender) will be used to match the sample to the population of interest. To do this we will need to form a post-stratification table, which contains the number of people in each possible combination of the post-stratification variables. We have 4 variables with 2 (male), 7 (age), 3 (ethnicity) and 3 (income) levels, so there are 2x7x3x3 different levels. Participants are also selected from a state (50), increasing the number of possible levels to \(6300\).

+

To make inference about the population, we will also need the proportion of individuals in each post stratification cell at the population level. We will use this information to update the estimate of our outcome variable from the sample so that it is more representative of the population. This is particularly helpful if there is a belief that the sample has some bias (e.g., a greater proportion of females responded than males), and that the bias impacts the outcome variable (e.g., maybe women are more likely to pick a cat than men). For each possible combination of factors, the post-stratification table shows the proportion/number of the population in that cell (rather than the proportion/number in the sample in the cell).

+

Below we read in the poststrat data from our simulated data list.

+ +
     male eth age income state      N
+1       0   1   1      1     1 103741
+2       0   1   1      1     2 104862
+3       0   1   1      1     3 164704
+4       0   1   1      1     4 133049
+5       0   1   1      1     5 167578
+6       0   1   1      1     6 109814
+6295    1   3   7      3    45  10061
+6296    1   3   7      3    46  13055
+6297    1   3   7      3    47  12578
+6298    1   3   7      3    48  13754
+6299    1   3   7      3    49   9937
+6300    1   3   7      3    50   9646
+

One of the benefits of using a simulated data set for this example is that the actual population level probability of cat preference is known for each post-stratification cell. In real world data analysis, we don’t have this luxury, but we will use it later in this case study to check the predictions of the model. Details regarding the simulation of this data are available in the appendix.

+ +
     male eth age income state  cat_pref
+1       0   1   1      1     1 0.5000000
+2       0   1   1      1     2 0.4255575
+3       0   1   1      1     3 0.2689414
+4       0   1   1      1     4 0.5744425
+5       0   1   1      1     5 0.3318122
+6       0   1   1      1     6 0.6224593
+6295    1   3   7      3    45 0.7502601
+6296    1   3   7      3    46 0.8581489
+6297    1   3   7      3    47 0.9241418
+6298    1   3   7      3    48 0.6456563
+6299    1   3   7      3    49 0.4255575
+6300    1   3   7      3    50 0.9308616

Exploring Graphically

@@ -340,106 +431,225 @@

Exploring Graphically

Comparing sample to population

The aim of this analysis is to obtain a population estimation of cat preference given our sample of \(4626\). We can see in the following plot the difference in proportions between the sample and the population. Horizontal panels represent each variable. Bars represent the proportion of the sample (solid) and population (dashed) in each category (represented by colour and the x-axis). For ease of viewing, we ordered the states in terms of the proportion of the sample in that state that was observed. We will continue this formatting choice throughout this vignette.

- + +

Effect of the post-stratification variable on preference for cats

Secondly, we consider the evidence of different proportions across different levels of a post-stratification variable, which we should consider for each of the post-stratification variables. Here we break down the proportion of individuals who would prefer a cat (y-axis) by different levels (x-axis) of the post-stratification variable (horizontal panels). We can see from this figure that there appear to be differences in cat preference for the different levels of post-stratification variables. Given the previous figure, which suggested that the sample was different to the population in the share of different levels of these variables, this should suggest that using the sample to estimate cat preference may not give accurate estimates of cat preference in the population.

+

Interaction effect

Thirdly, we demonstrate visually that there is an interaction between age and gender and compare to a case where there is no interaction. Here a simulated interaction effect between age (x-axis) and gender (color), right panel, is contrasted with no interaction effect (left panel). While both panels demonstrate a difference between the genders on the outcome variable (y-axis), only the second panel shows this difference changing with the variable on the x-axis.

+

Design effect

Lastly we look at the difference in cat preference between states, which will form the basis for the multi-level component of our analysis. Participants were randomly selected from particular states. Plotting the state (x-axis) against the overall proportion of participants who prefer cats (y-axis) demonstrates state differences. The downward slope is because we ordered the x-axis by the proportion of cat preference for ease of viewing. We also include second plot with a horizontal line to represent the overall preference for cats in the total population, according to the sample.

+

-
-

MRP in RStanArm

+
+

MRP with rstanarm

From visual inspection, it appears that different levels of post-stratification variable have different preferences for cats. Our survey also appears to have sampling bias; indicating that some groups were over/under sampled relative to the population. The net effect of this is that we could not make good population level estimates of cat preference straight from our sample. Our aim is to infer the preference for cats in the population using the post-stratification variables to account for systematic differences between the sample and population. Using rstanarm, this becomes a simple procedure.

-

The first step is to use a multi-level generalized logistic regression model to predict preference for cats in the sample given the variables that we wish to post-stratify with. Note that we actually have more rows in the post-stratification matrix than the we have observed units, so there are some cells in the poststrat matrix that we don’t observe. We can use a multi-level model to partially pool information across the different levels within each variable to assist with this. In the model described below, we use a fixed intercept for male, and random intercepts for each of the other factors. This model is included below, with \(\theta_{j}\) representing the preference for cats in the poststratification cell \(j\), and \(X_j\) representing the predictor male

-

\[\theta_j= logit^{-1}(X_{j}\beta)\]

-

The multi level component part of the model fits for each post-stratification variable (other than male) a variable \(\sigma_{S[j]}\) so that we model for each variable an intercept \(\alpha_{S[j]}\) but constrain \(\alpha_{S[j]}\) so that it is distributed \(N(0,\sigma_{S[j]})\). This is beneficial as it means we share information between the levels of each variable, we can prevent little observed levels from being fit too tightly to the observed values, and for those levels that we don’t observe, we can make a good estimate based on the distribution of levels that we do observe. For more benefits of this type of model, see Gelman and others (2005). For simplicity we will use an intercept only model with states, ethnicity, income and age allowed to have different baseline cat preferences, but Ghitza and Gelman (2013) extend this for other possible models.

-

\[\theta_j = logit^{-1}(X_{j}\beta + \alpha_{S[j]}^{S})\]

-

where:

-

\[\alpha_{S[j]}^{S} \sim N(0, \sigma^2_{S[j]})\]

-

In the following code we predict y (preference for cats) using fixed effects for gender and an interaction between age and gender, which allows the relationship between gender and cat preference to differ with age. We chose to use fixed effects for gender because with two levels it is difficult to fit a varying effect. An alternative would be to use a strong prior as in Si et al. (2017). Lastly we include a term that suggests that the intercept might differ by state, ethnicity, age and income. In the appendix a number of different model structures are tabled for ease of use. They are all highly similar to the formulae used by the glmer in the lme4 package.

-

Finally we specify the relationship between the predictors and outcome variable, in this case a logit function. The final element of the input specifies the data frame that contains the variables in the formula and manually sets the adapt_delta value.

- +

The first step is to use a multi-level logistic regression model to predict preference for cats in the sample given the variables that we will use to post-stratify. Note that we actually have more rows in the post-stratification matrix than we have observed units, so there are some cells in the poststrat matrix that we don’t observe. We can use a multi-level model to partially pool information across the different levels within each variable to assist with this. In the model described below, we use a fixed intercept for gender, and hierarchically modeled varying intercepts for each of the other factors.

+

Let \(\theta_{j}\) denote the preference for cats in the \(j\)th poststratification cell. The non-hierarchical part of the model can be written as

+

\[\theta_j= logit^{-1}(X_{j}\beta),\]

+

where here \(X\) only contains an indicator for male or female and an interaction term with age.

+

Adding the varying intercepts for the other variables the model becomes

+

\[ +\theta_j = logit^{-1}( +X_{j}\beta ++ \alpha_{\rm state[j]}^{\rm state} ++ \alpha_{\rm age[j]}^{\rm age} ++ \alpha_{\rm eth[j]}^{\rm eth} ++ \alpha_{\rm inc[j]}^{\rm inc} +) +\] with

+

$$ \[\begin{align*} +\alpha_{\rm state[j]}^{\rm state} & \sim N(0,\sigma^{\rm state}) \\ +\alpha_{\rm age[j]}^{\rm age} & \sim N(0,\sigma^{\rm age})\\ +\alpha_{\rm eth[j]}^{\rm eth} & \sim N(0,\sigma^{\rm eth})\\ +\alpha_{\rm inc[j]}^{\rm inc} &\sim N(0,\sigma^{\rm inc}) \\ +\end{align*}\]

+

$$

+

Each of \(\sigma^{\rm state}\), \(\sigma^{\rm age}\), \(\sigma^{\rm eth}\), and \(\sigma^{\rm inc}\) is estimated from the data (in this case using rstanarm’s default priors), which is beneficial as it means we share information between the levels of each variable and we can prevent levels with less data from being too sensitive to the few observed values. This also helps with the levels we don’t observe at all: the model will use information from the levels that we do observe. For more on the benefits of this type of model, see Gelman and others (2005), and see Ghitza and Gelman (2013) and Si et al. (2017) for more complicated extensions that involve deep interactions and structured prior distributions.

+

Here is the model specified using the stan_glmer() function in rstanarm, which uses the same formula syntax as the glmer() function from the lme4 package:

+ + +
stan_glmer
+ family:       binomial [logit]
+ formula:      cat_pref ~ factor(male) + factor(male) * factor(age) + (1 | state) + 
+       (1 | age) + (1 | eth) + (1 | income)
+ observations: 1200
+------
+                           Median MAD_SD
+(Intercept)                 0.9    0.8  
+factor(male)1              -0.5    0.5  
+factor(age)2               -0.2    0.8  
+factor(age)3               -0.5    0.6  
+factor(age)4                0.6    0.6  
+factor(age)5                0.3    0.6  
+factor(age)6                1.0    0.6  
+factor(age)7                0.8    0.5  
+factor(male)1:factor(age)2  0.3    1.4  
+factor(male)1:factor(age)3 -0.6    0.7  
+factor(male)1:factor(age)4 -1.1    0.6  
+factor(male)1:factor(age)5 -0.8    0.6  
+factor(male)1:factor(age)6 -0.4    0.6  
+factor(male)1:factor(age)7 -1.0    0.5  
+
+Error terms:
+ Groups Name        Std.Dev.
+ state  (Intercept) 1.11    
+ age    (Intercept) 0.69    
+ eth    (Intercept) 0.90    
+ income (Intercept) 0.77    
+Num. levels: state 50, age 7, eth 3, income 3 
+
+------
+* For help interpreting the printed output see ?print.stanreg
+* For info on the priors used see ?prior_summary.stanreg

As a first pass to check whether the model is performing well, note that there are no warnings about divergences, failure to converge or tree depth. If these errors do occur, more information on how to alleviate them is provided here.

Population Estimate

-

From this we get a summary of the baseline log odds of cat preference at the first element of each factor (i.e., male = 0, age = 1) for each state, plus estimates on variability of the intercept for state, ethnicity, age and income. Whilst this is interesting, currently all we have achieved is a model that predicts cat preference given a number of factor-type predictors in a sample. What we would like to do is estimate cat preference in the population by accounting for differences between our sample and the population. We use the posterior_linpred() function to obtain posterior estimates for cat preference given the proportion of people in the population in each level of the factors included in the model.

- +

From this we get a summary of the baseline log odds of cat preference at the first element of each factor (i.e., male = 0, age = 1) for each state, plus estimates on variability of the intercept for state, ethnicity, age and income. While this is interesting, currently all we have achieved is a model that predicts cat preference given a number of factor-type predictors in a sample. What we would like to do is estimate cat preference in the population by accounting for differences between our sample and the population. We use the posterior_linpred() function to obtain posterior estimates for cat preference given the proportion of people in the population in each level of the factors included in the model.

+ +
 mean    sd 
+0.566 0.023 

We can compare this to the estimate we would have made if we had just used the sample:

- + +
[1] 0.681

We can also add it to the last figure to graphically represent the difference between the sample and population estimate.

- + +

As this is simulated data, we can look directly at the preference for cats that we simulated from to consider how good our estimate is.

- -

Which we will also add to the figure.

+ +
[1] 0.561
+

Which we will also add to the figure.

Our MRP estimate is barely off, while our sample estimate is off by more than 10 percentage points. This indicates that using MRP helps to make estimates for the population from our sample that are more accurate.

Estimates for states

One of the nice benefits of using MRP to make inference about the population is that we can change the population of interest. In the previous paragraph we inferred the preference for cats in the whole population. We can also infer the preference for cats in a single state. In the following code we post-stratify for each state in turn. Note that we can reuse the predictive model from the previous step and update for different population demographics. This is particularly useful for complicated cases or large data sets where the model takes some time to fit.

As before, first we use the proportion of the population in each combination of the post-stratification groups to estimate the proportion of people who preferred cats in the population, only in this case the population of interest is the state.

- + +
   State model_state_pref sample_state_pref true_state_pref  N
+1      1           0.5097            0.5000          0.5966  2
+2      2           0.6586            0.8571          0.5315  7
+3      3           0.4442            0.5385          0.3803 13
+4      4           0.7637            0.8846          0.6590 26
+5      5           0.4231            0.5200          0.4439 50
+6      6           0.6824            0.8947          0.6982 19
+7      7           0.7083            0.8250          0.6386 40
+8      8           0.7019            0.9167          0.7850 12
+9      9           0.7166            0.8571          0.6788 14
+10    10           0.6162            0.7727          0.5966 44
+11    11           0.7359            0.8824          0.7850 17
+12    12           0.3855            0.4737          0.4012 57
+13    13           0.7239            1.0000          0.7690  5
+14    14           0.6057            0.6818          0.5966 22
+15    15           0.1998            0.2000          0.1181 15
+16    16           0.3865            0.4706          0.3599 34
+17    17           0.2159            0.1538          0.2169 13
+18    18           0.3473            0.0000          0.2169  2
+19    19           0.4659            0.6000          0.4656  5
+20    20           0.7918            0.8974          0.7850 39
+21    21           0.3819            0.5250          0.3599 40
+22    22           0.7535            0.9167          0.8755 12
+23    23           0.6010            0.7333          0.5751 30
+24    24           0.6303            1.0000          0.7690  2
+25    25           0.7340            0.8261          0.8002 23
+26    26           0.7664            0.8704          0.7524 54
+27    27           0.2551            0.3500          0.2657 20
+28    28           0.5671            0.6667          0.4656  3
+29    29           0.5351            0.6757          0.5095 37
+30    30           0.5202            0.5946          0.5315 37
+31    31           0.5996            0.7143          0.7350  7
+32    32           0.4168            0.5000          0.3399 42
+33    33           0.6088            0.7353          0.6178 68
+34    34           0.5773            0.7200          0.4656 25
+35    35           0.5922            0.7200          0.6590 25
+36    36           0.5315            0.6250          0.5751  8
+37    37           0.6951            0.8571          0.8855  7
+38    38           0.2923            0.3750          0.3205 48
+39    39           0.7240            0.8824          0.8533 17
+40    40           0.3207            0.3684          0.2657 19
+41    41           0.7664            0.9286          0.6982 14
+42    42           0.6728            0.8235          0.5751 34
+43    43           0.5265            0.6667          0.5315  6
+44    44           0.6604            0.8000          0.4439 15
+45    45           0.6595            0.8077          0.5315 26
+46    46           0.6461            0.7750          0.6788 40
+47    47           0.7225            0.8621          0.8002 29
+48    48           0.5100            0.6545          0.4224 55
+49    49           0.2330            0.2500          0.2487  8
+50    50           0.8163            1.0000          0.8146 13
+

Here we see similar findings to when we considered the population as a whole. While estimates for cat preference (in percent) using the sample are off by

- + +
mean  max 
+  14   36 

the MRP based estimates are much closer to the actual percentage,

- + +
mean  max 
+   6   22 

and especially when the sample size for that population is relatively small. This is easier to see graphically, so we will continue to add additional layers to the previous figure. Here we add model estimates, represented by triangles, and the true population cat preference, represented as transparent circles.

+

@@ -447,11 +657,11 @@

Other formats

Alternate methods of modelling

Previously we used a binary outcome variable. An alternative form of this model is to aggregate the data to the poststrat cell level and model the number of successes (or endorsement of cat preference in this case) out of the total number of people in that cell. To do this we need to create two n x 1 outcome variables, N_cat_pref (number in cell who prefer cats) and N (number in the poststrat cell).

- + @@ -461,29 +671,169 @@

Alternate methods of modelling

We then can use these two outcome variables to model the data using the binomial distribution.

- - + + +
stan_glmer
+ family:       binomial [logit]
+ formula:      cbind(N_cat_pref, N - N_cat_pref) ~ factor(male) + factor(male) * 
+       factor(age) + (1 | state) + (1 | age) + (1 | eth) + (1 | 
+       income)
+ observations: 940
+------
+                           Median MAD_SD
+(Intercept)                 0.9    0.8  
+factor(male)1              -0.5    0.5  
+factor(age)2               -0.2    0.8  
+factor(age)3               -0.5    0.6  
+factor(age)4                0.6    0.6  
+factor(age)5                0.3    0.6  
+factor(age)6                1.0    0.6  
+factor(age)7                0.8    0.6  
+factor(male)1:factor(age)2  0.3    1.4  
+factor(male)1:factor(age)3 -0.6    0.7  
+factor(male)1:factor(age)4 -1.0    0.6  
+factor(male)1:factor(age)5 -0.8    0.6  
+factor(male)1:factor(age)6 -0.4    0.6  
+factor(male)1:factor(age)7 -0.9    0.5  
+
+Error terms:
+ Groups Name        Std.Dev.
+ state  (Intercept) 1.11    
+ age    (Intercept) 0.69    
+ eth    (Intercept) 0.91    
+ income (Intercept) 0.75    
+Num. levels: state 50, age 7, eth 3, income 3 
+
+------
+* For help interpreting the printed output see ?print.stanreg
+* For info on the priors used see ?prior_summary.stanreg

Like before, we can use the posterior_linpred() function to obtain an estimate of the preference for cats in the population.

- + +
 mean    sd 
+0.566 0.024 

As we should, we get the same answer as when we fit the model using the binary outcome. The two ways are equivalent, so we can use whichever form is most convenient for the data at hand. More details on these two forms of binomial models are available here.

-
-

Examples of other formulas

+
+
+

Appendix

+
+

Examples of other formulas

The formulas for fitting so-called “mixed-effects” models in rstanarm are the same as those in the lme4 package. A table of examples can be found in Table 2 of the vignette for the lme4 package, available here.

+
+

Code to simulate the data

+

Here is the source code for the simulate_mrp_function(), which is based on some code provided by Aki Vehtari.

+ +
function(n) {
+  J <- c(2, 3, 7, 3, 50) # male or not, eth, age, income level, state
+  poststrat <- as.data.frame(array(NA, c(prod(J), length(J)+1))) # Columns of post-strat matrix, plus one for size
+  colnames(poststrat) <- c("male", "eth", "age","income", "state",'N')
+  count <- 0
+  for (i1 in 1:J[1]){
+    for (i2 in 1:J[2]){
+      for (i3 in 1:J[3]){
+        for (i4 in 1:J[4]){
+          for (i5 in 1:J[5]){
+              count <- count + 1
+              # Fill them in so we know what category we are referring to
+              poststrat[count, 1:5] <- c(i1-1, i2, i3,i4,i5) 
+          }
+        }
+      }
+    }
+  }
+  # Proportion in each sample in the population
+  p_male <- c(0.52, 0.48)
+  p_eth <- c(0.5, 0.2, 0.3)
+  p_age <- c(0.2,.1,0.2,0.2, 0.10, 0.1, 0.1)
+  p_income<-c(.50,.35,.15)
+  p_state_tmp<-runif(50,10,20)
+  p_state<-p_state_tmp/sum(p_state_tmp)
+  poststrat$N<-0
+  for (j in 1:prod(J)){
+    poststrat$N[j] <- round(250e6 * p_male[poststrat[j,1]+1] * p_eth[poststrat[j,2]] *
+      p_age[poststrat[j,3]]*p_income[poststrat[j,4]]*p_state[poststrat[j,5]]) #Adjust the N to be the number observed in each category in each group
+  }
+  
+  # Now let's adjust for the probability of response
+  p_response_baseline <- 0.01
+  p_response_male <- c(2, 0.8) / 2.8
+  p_response_eth <- c(1, 1.2, 2.5) / 4.7
+  p_response_age <- c(1, 0.4, 1, 1.5,  3, 5, 7) / 18.9
+  p_response_inc <- c(1, 0.9, 0.8) / 2.7
+  p_response_state <- rbeta(50, 1, 1)
+  p_response_state <- p_response_state / sum(p_response_state)
+  p_response <- rep(NA, prod(J))
+  for (j in 1:prod(J)) {
+    p_response[j] <-
+      p_response_baseline * p_response_male[poststrat[j, 1] + 1] *
+      p_response_eth[poststrat[j, 2]] * p_response_age[poststrat[j, 3]] *
+      p_response_inc[poststrat[j, 4]] * p_response_state[poststrat[j, 5]]
+  }
+  people <- sample(prod(J), n, replace = TRUE, prob = poststrat$N * p_response)
+  
+  ## For respondent i, people[i] is that person's poststrat cell,
+  ## some number between 1 and prod(J) = 6300
+  n_cell <- rep(NA, prod(J))
+  for (j in 1:prod(J)) {
+    n_cell[j] <- sum(people == j)
+  }
+  
+  coef_male <- c(0,-0.3)
+  coef_eth <- c(0, 0.6, 0.9)
+  coef_age <- c(0,-0.2,-0.3, 0.4, 0.5, 0.7, 0.8, 0.9)
+  coef_income <- c(0,-0.2, 0.6)
+  coef_state <- c(0, round(rnorm(49, 0, 1), 1))
+  coef_age_male <- t(cbind(c(0, .1, .23, .3, .43, .5, .6),
+                           c(0, -.1, -.23, -.5, -.43, -.5, -.6)))
+  true_popn <- data.frame(poststrat[, 1:5], cat_pref = rep(NA, prod(J)))
+  for (j in 1:prod(J)) {
+    true_popn$cat_pref[j] <- plogis(
+      coef_male[poststrat[j, 1] + 1] +
+        coef_eth[poststrat[j, 2]] + coef_age[poststrat[j, 3]] +
+        coef_income[poststrat[j, 4]] + coef_state[poststrat[j, 5]] +
+        coef_age_male[poststrat[j, 1] + 1, poststrat[j, 3]]
+      )
+  }
+  
+  #male or not, eth, age, income level, state, city
+  y <- rbinom(n, 1, true_popn$cat_pref[people])
+  male <- poststrat[people, 1]
+  eth <- poststrat[people, 2]
+  age <- poststrat[people, 3]
+  income <- poststrat[people, 4]
+  state <- poststrat[people, 5]
+  
+  sample <- data.frame(cat_pref = y, 
+                       male, age, eth, income, state, 
+                       id = 1:length(people))
+  
+  #Make all numeric:
+  for (i in 1:ncol(poststrat)) {
+    poststrat[, i] <- as.numeric(poststrat[, i])
+  }
+  for (i in 1:ncol(true_popn)) {
+    true_popn[, i] <- as.numeric(true_popn[, i])
+  }
+  for (i in 1:ncol(sample)) {
+    sample[, i] <- as.numeric(sample[, i])
+  }
+  list(
+    sample = sample,
+    poststrat = poststrat,
+    true_popn = true_popn
+  )
+}
-
-

Appendix: Code to simulate the data

-

References

@@ -518,6 +868,9 @@

References

+ + +