diff --git a/.github/.gitignore b/.github/.gitignore new file mode 100644 index 0000000..2d19fc7 --- /dev/null +++ b/.github/.gitignore @@ -0,0 +1 @@ +*.html diff --git a/.gitignore b/.gitignore index 45b8993..8506aa6 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ _book *_cache *.log site_libs +!mathjax.html diff --git a/DESCRIPTION b/DESCRIPTION index 4d79863..3eca903 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,23 +3,58 @@ Type: Website Title: "Analysis of mechanisms" Version: 1.0 URL: https://github.com/steno-aarhus/mediation-analysis-course -Depends: +Depends: R (>= 4.0.0), tidyverse Encoding: UTF-8 LazyData: true License: CC BY 4.0 -Imports: +Imports: CMAverse (>= 0.1.0), DiagrammeR, downlit, furrr, here, knitr, + quarto, + bbotk, mediation, NetCoupler, rmarkdown, survival, - xml2 -Remotes: - BS1125/CMAverse + xml2, + magick, + pdftools, + svglite, + qpdf, + repr, + data.table, + mvtnorm, + skimr, + sessioninfo, + checkmate, + origami, + sl3, + medshift, + medoutcon, + lmtp, + lcmmtp +Suggests: + nnls, + Rsolnp, + nloptr, + speedglm, + glmnet, + earth, + ranger, + hal9001, + SuperLearner, + mlr3extralearners +Remotes: + github::BS1125/CMAverse, + github::tlverse/sl3, + github::nhejazi/medshift, + github::nhejazi/medoutcon, + github::nt-williams/mlr3superlearner@Screener, + github::nt-williams/lcmmtp, + github::mlr-org/mlr3extralearners@*release diff --git a/_quarto.yml b/_quarto.yml index d079611..636aeac 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -8,6 +8,8 @@ book: - "Daniel B. 
Ibsen" - "Omar Silverman" - "Jie Zhang" + - "Nima Hejazi" + - "Iván Díaz" site-url: "https://analysis-of-mechanisms.netlify.app" repo-url: "https://github.com/steno-aarhus/mediation-analysis-course" chapters: @@ -25,10 +27,19 @@ book: - sessions/causal-mediation-analysis-survival-outcomes.qmd - sessions/causal-mediation-analysis-sensitivity-analysis.qmd - sessions/reporting-guidelines.qmd + - sessions/mcma_preface.qmd + - sessions/mcma_effects_defn.qmd - sessions/group_work_day2.qmd + - sessions/mcma_choosing_effects.qmd + - sessions/mcma_estim_prelims.qmd + - sessions/mcma_estim_natural_interv.qmd + - sessions/mcma_estim_walkthru.qmd + - sessions/mcma_longit_effects.qmd - sessions/NetCoupler-example.qmd appendices: - appendices/references.qmd + - appendices/mcma_stoch_effects.qmd + - appendices/mcma_addl_readings.qmd - LICENSE.md page-footer: center: @@ -45,7 +56,9 @@ editor: canonical: true format: - r3-theme-html + r3-theme-html: + include-in-header: + - file: headers/mathjax.html execute: freeze: false diff --git a/appendices/mcma_addl_readings.qmd b/appendices/mcma_addl_readings.qmd new file mode 100644 index 0000000..049c90f --- /dev/null +++ b/appendices/mcma_addl_readings.qmd @@ -0,0 +1,63 @@ +# Appendix: Additional topics of interest + +The literature on mediation analysis has grown considerably in the last few +decades, and there are now many novel methods to tackle important questions with +complex data structures. While we are unable to cover all of these methods in +this workshop, here we provide a few references for further reading. + +This list is not meant to be comprehensive; it collects some of our own work, +along with work by others that we know and consider interesting. + +## Mediation with multiple mediators and multiple intermediate confounders + +Note that the [`medoutcon` `R` package](https://github.com/nhejazi/medoutcon) +supports multiple mediators but is limited to settings with only a single, +binary intermediate confounder. 
If your data scenario includes multiple +mediators *and* multiple intermediate confounders, you should consider using +the [`HDmediation` `R` package](https://github.com/nt-williams/HDmediation) +instead. + +- [Practical causal mediation analysis: extending nonparametric estimators to + estimate the mediated effects of housing voucher receipt on adolescent risk + behavior](https://arxiv.org/abs/2212.08164) by Kara E. Rudolph, Nicholas + Williams, and Iván Díaz + +## Mediation with monotonicity of A-Z relationship + +- [On identification of natural direct effects when a confounder of the mediator + is directly affected by + exposure](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4230499/) by Eric J. + Tchetgen Tchetgen and Tyler J. VanderWeele +- [Efficient and flexible estimation of natural mediation effects under + intermediate confounding and monotonicity + constraints](https://onlinelibrary.wiley.com/doi/abs/10.1111/biom.13850) by + Kara E. Rudolph and Iván Díaz + +## Mediation with instrumental variables + +- [Direct and indirect treatment effects–causal chains and mediation analysis + with instrumental + variables](https://rss.onlinelibrary.wiley.com/doi/full/10.1111/rssb.12232) + by Markus Frolich and Martin Huber +- [Causal mediation with instrumental + variables](https://arxiv.org/abs/2112.13898) by Kara E. Rudolph, Nicholas + Williams, and Iván Díaz + +## Mediation with separable effects + +* [An Interventionist Approach to Mediation + Analysis](https://dl.acm.org/doi/abs/10.1145/3501714.3501754) by James M. + Robins, Thomas S. Richardson, and Ilya Shpitser +* [Conditional Separable + Effects](https://www.tandfonline.com/doi/abs/10.1080/01621459.2022.2071276) + by Mats J. Stensrud, James M. Robins, Aaron Sarvet, Eric J. Tchetgen + Tchetgen, and Jessica G. Young +* [Separable Effects for Causal Inference in the Presence of Competing + Events](https://www.tandfonline.com/doi/abs/10.1080/01621459.2020.1765783) by + Mats J. Stensrud, Jessica G. 
Young, Vanessa Didelez, James M. Robins, and + Miguel A. Hernán +* [A Generalized Theory of Separable Effects in Competing Event + Settings](https://link.springer.com/article/10.1007/s10985-021-09530-8) by + Mats J. Stensrud, Miguel A. Hernán, Eric J. Tchetgen Tchetgen, James M. + Robins, Vanessa Didelez, and Jessica G. Young diff --git a/appendices/mcma_stoch_effects.qmd b/appendices/mcma_stoch_effects.qmd new file mode 100644 index 0000000..c5c4f9b --- /dev/null +++ b/appendices/mcma_stoch_effects.qmd @@ -0,0 +1,291 @@ +# Appendix: Stochastic direct and indirect effects + +## Definition of the effects + +Consider the following directed acyclic graph. + +```{tikz} +#| echo: false +#| fig-cap: Directed acyclic graph under no intermediate confounders of the mediator-outcome relation affected by treatment +\dimendef\prevdepth=0 +\pgfdeclarelayer{background} +\pgfsetlayers{background,main} +\usetikzlibrary{arrows,positioning} +\tikzset{ +>=stealth', +punkt/.style={ +rectangle, +rounded corners, +draw=black, very thick, +text width=6.5em, +minimum height=2em, +text centered}, +pil/.style={ +->, +thick, +shorten <=2pt, +shorten >=2pt,} +} +\newcommand{\Vertex}[2] +{\node[minimum width=0.6cm,inner sep=0.05cm] (#2) at (#1) {$#2$}; +} +\newcommand{\VertexR}[2] +{\node[rectangle, draw, minimum width=0.6cm,inner sep=0.05cm] (#2) at (#1) {$#2$}; +} +\newcommand{\ArrowR}[3] +{ \begin{pgfonlayer}{background} +\draw[->,#3] (#1) to[bend right=30] (#2); +\end{pgfonlayer} +} +\newcommand{\ArrowL}[3] +{ \begin{pgfonlayer}{background} +\draw[->,#3] (#1) to[bend left=45] (#2); +\end{pgfonlayer} +} +\newcommand{\EdgeL}[3] +{ \begin{pgfonlayer}{background} +\draw[dashed,#3] (#1) to[bend right=-45] (#2); +\end{pgfonlayer} +} +\newcommand{\Arrow}[3] +{ \begin{pgfonlayer}{background} +\draw[->,#3] (#1) -- +(#2); +\end{pgfonlayer} +} +\begin{tikzpicture} + \Vertex{-4, 0}{W} + \Vertex{0, 0}{M} + \Vertex{-2, 0}{A} + \Vertex{2, 0}{Y} + \Arrow{W}{A}{black} + \Arrow{A}{M}{black} + 
\Arrow{M}{Y}{black} + \ArrowL{W}{Y}{black} + \ArrowL{A}{Y}{black} + \ArrowL{W}{M}{black} +\end{tikzpicture} +``` + +## Motivation for stochastic interventions + +- So far we have discussed controlled, natural, and interventional (in)direct + effects +- These effects require that $0 < \P(A=1\mid W) < 1$ +- They are defined only for binary exposures +- _What can we do when the positivity assumption does not hold or the exposure + is continuous?_ +- Solution: We can use stochastic effects + +## Definition of stochastic effects + +There are two possible ways of defining stochastic effects: + +- Consider the effect of an intervention where the exposure is drawn from a + distribution + - For example, incremental propensity score interventions +- Consider the effect of an intervention where the post-intervention exposure is + a function of the actually received exposure + - For example, modified treatment policies +- In both cases the post-intervention exposure is non-deterministic given $W$, + hence the name _stochastic intervention_ + +### Example: incremental propensity score interventions (IPSI) [@kennedy2018nonparametric] {#ipsi} + +#### Definition of the intervention {.unnumbered} + +- Assume $A$ is binary, and $\P(A=1\mid W=w) = g(1\mid w)$ is the propensity score +- Consider an intervention in which each individual receives the exposure + with probability $g_\delta(1\mid w)$, equal to + \begin{equation*} + g_\delta(1\mid w)=\frac{\delta g(1\mid w)}{\delta g(1\mid w) + + 1 - g(1\mid w)} + \end{equation*} +- e.g., draw the post-intervention exposure from a Bernoulli variable with + probability $g_\delta(1\mid w)$ +- The value $\delta$ is specified by the user +- Let $A_\delta$ denote the post-intervention exposure +- Some algebra shows that $\delta$ is an odds ratio comparing the pre- and + post-intervention exposure distributions + \begin{equation*} + \delta = \frac{\text{odds}(A_\delta = 1\mid W=w)} + {\text{odds}(A = 1\mid W=w)} + \end{equation*} +- Interpretation: _what 
would happen in a + world where the odds of receiving treatment are increased by $\delta$_ +- Let $Y_{A_\delta}$ denote the outcome in this hypothetical world + +#### Illustrative application for IPSIs + +- Consider the effect of participation in sports on children's BMI +- Mediation through snacking, exercising, etc. +- Intervention: for each individual, increase the odds of participating in + sports by $\delta=2$ +- The post-intervention exposure is a draw $A_\delta$ from a Bernoulli + distribution with probability $g_\delta(1\mid w)$ + +### Example: modified treatment policies (MTP) [@diaz2020causal] {.unnumbered} + +#### Definition of the intervention {.unnumbered} + +- Consider a continuous exposure $A$ taking values in the real numbers +- Consider an intervention that assigns exposure as $A_\delta = A - \delta$ +- Example: $A$ is pollution measured as $PM_{2.5}$ and you are interested in an + intervention that reduces $PM_{2.5}$ concentration by some amount $\delta$ + +### Mediation analysis for stochastic interventions + +- The total effect of an IPSI can be computed as a contrast of the outcome under + intervention vs no intervention: + \begin{equation*} + \psi = \E[Y_{A_\delta} - Y] + \end{equation*} +- Recall the NPSEM: + \begin{align*} + W & = f_W(U_W)\\ + A & = f_A(W, U_A)\\ + M & = f_M(W, A, U_M)\\ + Y & = f_Y(W, A, M, U_Y) + \end{align*} +- From this we have + \begin{align*} + M_{A_\delta} & = f_M(W, A_\delta, U_M)\\ + Y_{A_\delta} & = f_Y(W, A_\delta, M_{A_\delta}, U_Y) + \end{align*} + +- Thus, we have $Y_{A_\delta} = Y_{A_\delta, M_{A_\delta}}$ and $Y = + Y_{A,M_{A}}$ +- Let us introduce the counterfactual $Y_{A_\delta, M}$, interpreted as the + outcome observed in a world where the intervention on $A$ is performed but the + mediator is fixed at the value it would have taken under no intervention: + \[Y_{A_\delta, M} = f_Y(W, A_\delta, M, U_Y)\] +- Then we can decompose the total effect into: + \begin{align*} + \E[Y&_{A_\delta,M_{A_\delta}} - 
Y_{A,M_A}] = \\ + &\underbrace{\E[Y_{\color{red}{A_\delta},\color{blue}{M_{A_\delta}}} - + Y_{\color{red}{A_\delta},\color{blue}{M}}]}_{\text{stochastic natural + indirect effect}} + + \underbrace{\E[Y_{\color{blue}{A_\delta},\color{red}{M}} - + Y_{\color{blue}{A},\color{red}{M}}]}_{\text{stochastic natural direct + effect}} + \end{align*} + +## Identification assumptions + +- Confounder assumptions: + + $A \indep Y_{a,m} \mid W$ + + $M \indep Y_{a,m} \mid W, A$ +- No confounder of $M\rightarrow Y$ affected by $A$ +- Positivity assumptions: + + If $g_\delta(a \mid w)>0$ then $g(a \mid w)>0$ + + If $\P(M=m\mid W=w)>0$ then $\P(M=m\mid A=a,W=w)>0$ + +Under these assumptions, stochastic effects are identified as follows: + +- The indirect effect can be identified as follows + \begin{align*} + \E&(Y_{A_\delta} - Y_{A_\delta, M}) =\\ + &\E\left[\color{Goldenrod}{\sum_{a}\color{ForestGreen}{\{\E(Y\mid A=a, W) + -\E(Y\mid A=a, M, W)\}}g_\delta(a\mid W)}\right] + \end{align*} + +- The direct effect can be identified as follows + \begin{align*} + \E&(Y_{A_\delta, M} - Y) =\\ + &\E\left[\color{Goldenrod}{\sum_{a}\color{ForestGreen}{\{\E(Y\mid A=a, M, W) + - Y\}}g_\delta(a\mid W)}\right] + \end{align*} + +- Let's dissect the formula for the indirect effect in R: + ```{r} + n <- 1e6 + w <- rnorm(n) + a <- rbinom(n, 1, plogis(1 + w)) + m <- rnorm(n, w + a) + y <- rnorm(n, w + a + m) + ``` + +- First, fit regressions of the outcome on $(M,A,W)$ and on $(A,W)$: + ```{r} + fit_y1 <- lm(y ~ m + a + w) + fit_y2 <- lm(y ~ a + w) + ``` + +- Get predictions fixing $A=a$ for all possible values $a$ + ```{r} + pred_y1_a1 <- predict(fit_y1, newdata = data.frame(a = 1, m, w)) + pred_y1_a0 <- predict(fit_y1, newdata = data.frame(a = 0, m, w)) + pred_y2_a1 <- predict(fit_y2, newdata = data.frame(a = 1, w)) + pred_y2_a0 <- predict(fit_y2, newdata = data.frame(a = 0, w)) + ``` +- Compute \[\color{ForestGreen}{\{\E(Y\mid A=a, W)-\E(Y\mid A=a, M, W)\}}\] for + each value $a$ + ```{r} + 
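## Added annotation: the next two lines form the unit-level pseudo-outcome + ## {E(Y | A = a, W) - E(Y | A = a, M, W)} at a = 1 and a = 0, by differencing + ## predictions from fit_y2 (outcome on A, W) and fit_y1 (outcome on M, A, W); + ## in this simulation the true value of this difference is w + a - m + 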
pseudo_a1 <- pred_y2_a1 - pred_y1_a1 + pseudo_a0 <- pred_y2_a0 - pred_y1_a0 + ``` + +- Estimate the propensity score $g(1\mid w)$ and evaluate the post-intervention + propensity score $g_\delta(1\mid w)$ + ```{r} + pscore_fit <- glm(a ~ w, family = binomial()) + pscore <- predict(pscore_fit, type = 'response') + ## compare the post-intervention vs observed propensity scores + pscore_delta <- 2 * pscore / (2 * pscore + 1 - pscore) + ``` +- What do the post-intervention propensity scores look like? + ```{r} + plot(pscore, pscore_delta, xlab = 'Observed prop. score', + ylab = 'Prop. score under intervention') + abline(0, 1) + ``` + +## What are the odds of exposure under intervention vs the real world? + +```{r} +odds <- (pscore_delta / (1 - pscore_delta)) / (pscore / (1 - pscore)) +summary(odds) +``` + +- Compute the sum + \begin{equation*} + \color{Goldenrod}{\sum_{a}\color{ForestGreen}{\{\E(Y\mid A=a, W) - + \E(Y\mid A=a, M, W)\}}g_\delta(a\mid W)} + \end{equation*} + + ```{r} + indirect <- pseudo_a1 * pscore_delta + pseudo_a0 * (1 - pscore_delta) + ``` + +- The average of this value is the indirect effect + ```{r} + ## E[Y(Adelta) - Y(Adelta, M)] + mean(indirect) + ``` + +- The direct effect is + \begin{align*} + \E&(Y_{A_\delta, M} - Y) =\\ + &\E\left[\color{Goldenrod}{\sum_{a}\color{ForestGreen}{\{\E(Y\mid A=a, M, + W) - Y\}}g_\delta(a\mid W)}\right] + \end{align*} + +- This can be computed as + ```{r} + direct <- (pred_y1_a1 - y) * pscore_delta + + (pred_y1_a0 - y) * (1 - pscore_delta) + mean(direct) + ``` + +## Summary + +- Stochastic (in)direct effects + - Relax the positivity assumption + - Can be defined for non-binary exposures + - Do not require a cross-world assumption + +- Still require the absence of intermediate confounders + - But, compared to the NDE and NIE, we can design a randomized study where + identifiability assumptions hold, at least in principle + - There is a version of these effects that can accommodate intermediate + confounders 
[@hejazi2020nonparametric] + - `R` implementation to be released soon...stay tuned! diff --git a/appendices/references.qmd b/appendices/references.qmd index e69de29..8e6e2e7 100644 --- a/appendices/references.qmd +++ b/appendices/references.qmd @@ -0,0 +1,9 @@ +--- +nocite: | + @* +--- + +# References {.unnumbered} + +::: {#refs} +::: diff --git a/headers/mathjax.html b/headers/mathjax.html new file mode 100644 index 0000000..e965a03 --- /dev/null +++ b/headers/mathjax.html @@ -0,0 +1,37 @@ + diff --git a/headers/preamble.tex b/headers/preamble.tex new file mode 100644 index 0000000..6e2dee3 --- /dev/null +++ b/headers/preamble.tex @@ -0,0 +1,43 @@ +% packages +\usepackage[english]{babel} +\usepackage{listings} +\usepackage{enumitem} +\usepackage{float} +\usepackage{fontenc} +\usepackage{inputenc} +\usepackage{mathtools} +\usepackage{amsmath} +\usepackage{amsfonts} +\usepackage{amssymb} +\usepackage{amsthm} +\usepackage{bm} +\usepackage{centernot} + +% math macros +\newcommand{\D}{\mathcal{D}} +\newcommand{\E}{\mathbb{E}} +\newcommand{\V}{\mathbb{V}} +\newcommand{\F}{\mathcal{F}} +\renewcommand{\P}{\mathsf{P}} +\newcommand{\M}{\mathcal{M}} +\newcommand{\C}{\mathcal{C}} +\newcommand{\G}{\mathcal{G}} +\newcommand{\R}{\mathbb{R}} +\newcommand{\X}{\mathcal{X}} +\newcommand{\Z}{\mathcal{Z}} +\newcommand{\Hold}{\mathcal{H}} +\newcommand{\logit}{\text{logit}} +\newcommand{\expit}{\text{expit}} +\newcommand{\like}{\mathcal{L}} +\renewcommand{\Pr}{\mathbb{P}} +\newcommand{\I}{\mathbb{I}} +\newcommand{\1}{\mathbb{1}} +\newcommand{\indep}{\perp\!\!\!\perp} +\newcommand{\notindep}{\centernot{\perp\!\!\!\perp}} +\DeclareMathOperator*{\argmin}{\arg\!\min} +\DeclareMathOperator*{\argmax}{\arg\!\max} +\newcommand{\iid}{\,\, \mathbin{\overset{\text{iid}}{\sim}} \,\, } +\newcommand{\asto}{\mathbin{\underset{a.s.}{\to}}} +\newcommand{\pto}{\mathbin{\underset{p}{\to}}} +\newcommand{\dto}{\mathbin{\underset{d}{\to}}} + diff --git a/images/ctndag.pdf b/images/ctndag.pdf new file mode 
100644 index 0000000..ecedc5f Binary files /dev/null and b/images/ctndag.pdf differ diff --git a/images/ctndag.png b/images/ctndag.png new file mode 100644 index 0000000..cbe2ced Binary files /dev/null and b/images/ctndag.png differ diff --git a/images/graphic4a.pdf b/images/graphic4a.pdf new file mode 100644 index 0000000..b1aff33 Binary files /dev/null and b/images/graphic4a.pdf differ diff --git a/images/graphic4a.png b/images/graphic4a.png new file mode 100644 index 0000000..5886ab3 Binary files /dev/null and b/images/graphic4a.png differ diff --git a/images/graphic4a3.pdf b/images/graphic4a3.pdf new file mode 100644 index 0000000..71e5bef Binary files /dev/null and b/images/graphic4a3.pdf differ diff --git a/images/graphic4a3.png b/images/graphic4a3.png new file mode 100644 index 0000000..1fb758e Binary files /dev/null and b/images/graphic4a3.png differ diff --git a/images/graphic4b.pdf b/images/graphic4b.pdf new file mode 100644 index 0000000..1b80727 Binary files /dev/null and b/images/graphic4b.pdf differ diff --git a/images/graphic4b.png b/images/graphic4b.png new file mode 100644 index 0000000..bfd7f6c Binary files /dev/null and b/images/graphic4b.png differ diff --git a/images/medDAG2.pdf b/images/medDAG2.pdf new file mode 100644 index 0000000..e5f6c2d Binary files /dev/null and b/images/medDAG2.pdf differ diff --git a/images/medDAG2.png b/images/medDAG2.png new file mode 100644 index 0000000..7b1d942 Binary files /dev/null and b/images/medDAG2.png differ diff --git a/images/meddagnoz.pdf b/images/meddagnoz.pdf new file mode 100644 index 0000000..4f8d291 Binary files /dev/null and b/images/meddagnoz.pdf differ diff --git a/images/meddagnoz.png b/images/meddagnoz.png new file mode 100644 index 0000000..30d4209 Binary files /dev/null and b/images/meddagnoz.png differ diff --git a/images/table1.pdf b/images/table1.pdf new file mode 100644 index 0000000..c2ac229 Binary files /dev/null and b/images/table1.pdf differ diff --git a/images/table1.png 
b/images/table1.png new file mode 100644 index 0000000..791a32b Binary files /dev/null and b/images/table1.png differ diff --git a/images/tmleesttotal.pdf b/images/tmleesttotal.pdf new file mode 100644 index 0000000..4a5ff5c Binary files /dev/null and b/images/tmleesttotal.pdf differ diff --git a/images/tmleesttotal.png b/images/tmleesttotal.png new file mode 100644 index 0000000..0a3c832 Binary files /dev/null and b/images/tmleesttotal.png differ diff --git a/includes/papers.bib b/includes/papers.bib index 3eecab5..57f9c28 100644 --- a/includes/papers.bib +++ b/includes/papers.bib @@ -553,3 +553,1029 @@ @article{vanderweele_sensitivity_2017 pages = {268--274}, file = {Submitted Version:C\:\\Users\\au302898\\Zotero\\storage\\5RG8AH7Q\\VanderWeele and Ding - 2017 - Sensitivity Analysis in Observational Research In.pdf:application/pdf}, } + +@article{tchetgen2014identification, + title={On identification of natural direct effects when a confounder of the + mediator is directly affected by exposure}, + author={{Tchetgen Tchetgen}, Eric J and VanderWeele, Tyler J}, + journal={Epidemiology}, + volume={25}, + number={2}, + pages={282}, + year={2014}, + publisher={NIH Public Access} +} + +@article{rudolph2020explaining, + title={Explaining differential effects of medication for opioid use disorder + using a novel approach incorporating mediating variables}, + author={Rudolph, Kara and Diaz, Ivan and Hejazi, Nima and {van der Laan}, + Mark and Luo, Sean and Shulman, Matisyahu and Campbell, Aimee and Rotrosen, + John and Nunes, Edward}, + journal={Addiction}, + year={2020}, + publisher={Wiley Online Library} +} + +@article{rudolph2019causal, + title={Causal mediation analysis with observational data: considerations and + illustration examining mechanisms linking neighborhood poverty to + adolescent substance use}, + author={Rudolph, Kara E and Goin, Dana E and Paksarian, Diana and Crowder, + Rebecca and Merikangas, Kathleen R and Stuart, Elizabeth A}, + journal={American 
journal of epidemiology}, + volume={188}, + number={3}, + pages={598--608}, + year={2019}, + publisher={Oxford University Press} +} + +@book{xie2015, + title = {Dynamic Documents with {R} and knitr}, + author = {Yihui Xie}, + publisher = {Chapman and Hall/CRC}, + address = {Boca Raton, Florida}, + year = {2015}, + edition = {2nd}, + note = {ISBN 978-1498716963}, + url = {http://yihui.name/knitr/}, +} + +@incollection{diaz2018stochastic, + title={Stochastic Treatment Regimes}, + author={D{\'\i}az, Iv{\'a}n and {van der Laan}, Mark J}, + booktitle={Targeted Learning in Data Science: Causal Inference for Complex + Longitudinal Studies}, + pages={167--180}, + year={2018}, + publisher={Springer Science \& Business Media} +} + +@article{diaz2012population, + title={Population intervention causal effects based on stochastic + interventions}, + author={D{\'\i}az, Iv{\'a}n and {van der Laan}, Mark J}, + journal={Biometrics}, + volume={68}, + number={2}, + pages={541--549}, + year={2012}, + publisher={Wiley Online Library} +} + +@book{vdl2018targeted, + title={Targeted Learning in Data Science: Causal Inference for Complex + Longitudinal Studies}, + author={{van der Laan}, Mark J and Rose, Sherri}, + year={2018}, + publisher={Springer Science \& Business Media} +} + +@book{vdl2011targeted, + title={Targeted learning: causal inference for observational and experimental + data}, + author={{van der Laan}, Mark J and Rose, Sherri}, + year={2011}, + publisher={Springer Science \& Business Media} +} +@article{diaz2020causal, + title={Causal mediation analysis for stochastic interventions}, + author={D{\'\i}az, Iv{\'a}n and Hejazi, Nima S}, + journal={Journal of the Royal Statistical Society: Series B (Statistical + Methodology)}, + volume={82}, + number={3}, + pages={661--683}, + year={2020}, + publisher={Wiley Online Library} +} + +@article{diaz2011super, + title={Super learner based conditional density estimation with application to + marginal structural models}, + 
author={D{\'\i}az, Iv{\'a}n and {van der Laan}, Mark J}, + journal={The International Journal of Biostatistics}, + volume={7}, + number={1}, + pages={1--20}, + year={2011}, + publisher={De Gruyter} +} + +@article{vdl2006targeted, + title={Targeted maximum likelihood learning}, + author={{van der Laan}, Mark J and Rubin, Daniel}, + journal={The International Journal of Biostatistics}, + volume={2}, + number={1}, + year={2006} +} + +@article{vdl2004asymptotic, + title={Asymptotic optimality of likelihood-based cross-validation}, + author={{van der Laan}, Mark J and Dudoit, Sandrine and Keles, Sunduz}, + journal={Statistical Applications in Genetics and Molecular Biology}, + volume={3}, + number={1}, + pages={1--23}, + year={2004} +} + +@article{dudoit2005asymptotics, + title={Asymptotics of cross-validated risk estimation in estimator selection + and performance assessment}, + author={Dudoit, Sandrine and {van der Laan}, Mark J}, + journal={Statistical Methodology}, + volume={2}, + number={2}, + pages={131--154}, + year={2005}, + publisher={Elsevier} +} + +@article{vdl2007super, + title={{Super Learner}}, + author={{van der Laan}, Mark J and Polley, Eric C and Hubbard, Alan E}, + journal={Statistical Applications in Genetics and Molecular Biology}, + volume={6}, + number={1}, + year={2007} +} + +@article{breiman1996stacked, + title={Stacked 
regressions}, + author={Breiman, Leo}, + journal={Machine Learning}, + volume={24}, + number={1}, + pages={49--64}, + year={1996}, + publisher={Springer} +} + +@book{pearl2009causality, + title={Causality: Models, Reasoning, and Inference}, + author={Pearl, Judea}, + year={2009}, + publisher={Cambridge University Press} +} + +@article{holland1986statistics, + title={Statistics and causal inference}, + author={Holland, Paul W}, + journal={Journal of the American statistical Association}, + volume={81}, + number={396}, + pages={945--960}, + year={1986}, + publisher={Taylor \& Francis} +} + +@article{rosenbaum1983central, + title={The central role of the propensity score in observational studies for + causal effects}, + author={Rosenbaum, Paul R and Rubin, Donald B}, + journal={Biometrika}, + volume={70}, + number={1}, + pages={41--55}, + year={1983}, + publisher={Biometrika Trust} +} + +@article{haneuse2013estimation, + title={Estimation of the effect of interventions that modify the received + treatment}, + author={Haneuse, Sebastian and Rotnitzky, Andrea}, + journal={Statistics in medicine}, + volume={32}, + number={30}, + pages={5260--5277}, + year={2013}, + publisher={Wiley Online Library} +} + +@article{young2014identification, + title={Identification, estimation and approximation of risk under + interventions that depend on the natural value of treatment using + observational data}, + author={Young, Jessica G and Hern{\'a}n, Miguel A and Robins, James M}, + journal={Epidemiologic methods}, + volume={3}, + number={1}, + pages={1--19}, + year={2014}, + publisher={De Gruyter} +} + +@article{rubin2005causal, + title={Causal inference using potential outcomes: Design, modeling, + decisions}, + author={Rubin, Donald B}, + journal={Journal of the American Statistical Association}, + volume={100}, + number={469}, + pages={322--331}, + year={2005}, + publisher={Taylor \& Francis} +} + +@article{rubin1978bayesian, + title={Bayesian inference for causal effects: The role 
of randomization}, + author={Rubin, Donald B}, + journal={The Annals of statistics}, + pages={34--58}, + year={1978}, + publisher={JSTOR} +} + +@article{rubin1980randomization, + title={Randomization analysis of experimental data: The Fisher randomization + test comment}, + author={Rubin, Donald B}, + journal={Journal of the American Statistical Association}, + volume={75}, + number={371}, + pages={591--593}, + year={1980}, + publisher={JSTOR} +} + +@article{kennedy2017non, + title={Non-parametric methods for doubly robust estimation of continuous + treatment effects}, + author={Kennedy, Edward H and Ma, Zongming and McHugh, Matthew D and Small, + Dylan S}, + journal={Journal of the Royal Statistical Society: Series B (Statistical + Methodology)}, + volume={79}, + number={4}, + pages={1229--1245}, + year={2017}, + publisher={Wiley Online Library} +} + +@article{wolpert1992stacked, + title={Stacked generalization}, + author={Wolpert, David H}, + journal={Neural networks}, + volume={5}, + number={2}, + pages={241--259}, + year={1992}, + publisher={Elsevier} +} + +@article{kennedy2018nonparametric, + title={Nonparametric causal effects based on incremental propensity score + interventions}, + author={Kennedy, Edward H}, + journal={Journal of the American Statistical Association}, + number={just-accepted}, + year={2018}, + publisher={Taylor \& Francis} +} + +@article{cvtmle2010, + author = {Zheng, W. and {van der Laan}, M.~J}, + title = "{Asymptotic Theory for Cross-validated Targeted Maximum Likelihood + Estimation}", + journal = {U.C. Berkeley Division of Biostatistics Working Paper Series.}, + year = 2010, + adsurl = {https://biostats.bepress.com/ucbbiostat/paper273}, +} + +@article{luedtke2016super, + Author = {Luedtke, A. 
and {van der Laan}, M.~J}, + Journal = {International Journal of Biostatistics}, + Note = {PMID: 27227726}, + Number = {1}, + Pages = {305--332}, + Title = {Super-learning of an optimal dynamic treatment rule}, + Volume = {12}, + Year = {2016} +} + +@article{vanderLaanLuedtke15, + Author = {{van der Laan}, M.~J and Luedtke, A.}, + Journal = {Journal of Causal Inference}, + Note = {PMCID: PMC4517487}, + Number = {1}, + Pages = {61--95}, + Title = {Targeted learning of the mean outcome under an optimal dynamic + treatment rule}, + Volume = {3}, + Year = {2015} +} + +@book{Sutton1998, + title={Introduction to reinforcement learning}, + author={Sutton, Richard S and Barto, Andrew G and others}, + volume={135}, + year={1998}, + publisher={MIT press Cambridge} +} + +@article{murphy2003, + title={Optimal dynamic treatment regimes}, + author={Murphy, Susan A}, + journal={Journal of the Royal Statistical Society: Series B (Statistical + Methodology)}, + volume={65}, + number={2}, + pages={331--355}, + year={2003}, + publisher={Wiley Online Library} +} + +@Inbook{robins2004, + author="Robins, James M.", + editor="Lin, D. Y. and Heagerty, P. 
J.", + title="Optimal Structural Nested Models for Optimal Sequential Decisions", + bookTitle="Proceedings of the Second Seattle Symposium in Biostatistics: + Analysis of Correlated Data", + year="2004", + publisher="Springer New York", + address="New York, NY", + pages="189--326", + doi="10.1007/978-1-4419-9076-1_11", + url="https://doi.org/10.1007/978-1-4419-9076-1_11" +} + +@article{vdl2014entering, + title={Entering the era of data science: Targeted learning and the + integration of statistics and computational data analysis}, + author={{van der Laan}, Mark J and Starmans, Richard JCM}, + journal={Advances in Statistics}, + volume={2014}, + year={2014}, + publisher={Hindawi} +} + +@article{vdl2003unified, + title={Unified cross-validation methodology for selection among estimators + and a general cross-validated adaptive epsilon-net estimator: Finite sample + oracle inequalities and examples}, + author={{van der Laan}, Mark J and Dudoit, Sandrine}, + year={2003}, + publisher={bepress} +} + +@article{naturenews_2015, + author = {Nature Editorial}, + title={Let’s think about cognitive bias}, + journal={Nature}, publisher={Springer Nature}, year={2015}, month={Oct}, + volume={526}, + number={7572} +} + +@article{peng2015reproducibility, + title={The reproducibility crisis in science: A statistical counterattack}, + author={Peng, Roger}, + journal={Significance}, + volume={12}, + number={3}, + pages={30--32}, + year={2015}, + publisher={Wiley Online Library} +} + +@article{munafo2017manifesto, + title={A manifesto for reproducible science}, + author={Munaf{\`o}, Marcus R and Nosek, Brian A and Bishop, Dorothy VM and + Button, Katherine S and Chambers, Christopher D and Du Sert, Nathalie + Percie and Simonsohn, Uri and Wagenmakers, Eric-Jan and Ware, 
Jennifer J + and Ioannidis, John PA}, + journal={Nature Human Behaviour}, + volume={1}, + number={1}, + pages={0021}, + year={2017}, + publisher={Nature Publishing Group} +} + +@article{naturenews2_2015, + author = {Nature Editorial}, + title={How scientists fool themselves --- and how they can stop}, + journal={Nature}, + publisher={Springer Nature}, + year={2015}, + month={Oct}, + volume={526}, + number={7572} +} + +@article{forstmeier2017detecting, + title={Detecting and avoiding likely false-positive findings--a practical + guide}, + author={Forstmeier, Wolfgang and Wagenmakers, Eric-Jan and Parker, Timothy + H}, + journal={Biological Reviews}, + volume={92}, + number={4}, + pages={1941--1968}, + year={2017}, + publisher={Wiley Online Library} +} + +@article{nosek2018preregistration, + title={The preregistration revolution}, + author={Nosek, Brian A and Ebersole, Charles R and DeHaven, Alexander C and + Mellor, David T}, + journal={Proceedings of the National Academy of Sciences}, + volume={115}, + number={11}, + pages={2600--2606}, + year={2018}, + publisher={National Acad Sciences} +} + +@article{baker2016there, + title={Is there a reproducibility crisis? 
A Nature survey lifts the lid on + how researchers view the crisis rocking science and what they think will + help}, + author={Baker, Monya}, + journal={Nature}, + volume={533}, + number={7604}, + pages={452--455}, + year={2016}, + publisher={Nature Publishing Group} +} + +@article{szucs2017null, + title={When null hypothesis significance testing is unsuitable for research: + a reassessment}, + author={Szucs, Denes and Ioannidis, John}, + journal={Frontiers in human neuroscience}, + volume={11}, + pages={390}, + year={2017}, + publisher={Frontiers} +} + +@article{pullenayegum2016knowledge, + title={Knowledge translation in biostatistics: a survey of current practices, + preferences, and barriers to the dissemination and uptake of new + statistical methods}, + author={Pullenayegum, Eleanor M and Platt, Robert W and Barwick, Melanie and + Feldman, Brian M and Offringa, Martin and Thabane, Lehana}, + journal={Statistics in medicine}, + volume={35}, + number={6}, + pages={805--818}, + year={2016}, + publisher={Wiley Online Library} +} + +@article{stromberg2004write, + title={Why write statistical software? 
The case of robust statistical + methods}, + author={Stromberg, Arnold and others}, + journal={Journal of Statistical Software}, + volume={10}, + number={5}, + pages={1--8}, + year={2004} +} + +@incollection{buckheit1995wavelab, + title={Wavelab and reproducible research}, + author={Buckheit, Jonathan B and Donoho, David L}, + booktitle={Wavelets and statistics}, + pages={55--81}, + year={1995}, + publisher={Springer} +} + +@book{kitzes2017practice, + title={The practice of reproducible research: case studies and lessons from + the data-intensive sciences}, + author={Kitzes, Justin and Turek, Daniel and Deniz, Fatma}, + year={2017}, + publisher={Univ of California Press} +} + +@article{luby2018effect, + title={Effect of water quality, sanitation, hand washing, and nutritional + interventions on child development in rural Bangladesh (WASH Benefits + Bangladesh): a cluster-randomised controlled trial}, + author={Tofail, Fahmida and Fernald, Lia CH and Das, Kishor K and Rahman, + Mahbubur and Ahmed, Tahmeed and Jannat, Kaniz K and Unicomb, Leanne and + Arnold, Benjamin F and Ashraf, Sania and Winch, Peter J and others}, + journal={The Lancet Child \& Adolescent Health}, + volume={2}, + number={4}, + pages={255--268}, + year={2018}, + publisher={Elsevier} +} + +@article{neyman1990, + ISSN = {08834237}, + URL = {http://www.jstor.org/stable/2245382}, + journal = {Statistical Science}, + number = {4}, + pages = {465--472}, + publisher = {Institute of Mathematical Statistics}, + title = {On the Application of Probability Theory to Agricultural + Experiments. Essay on Principles. 
Section 9.},
+ volume = {5},
+ year = {1990}
+}
+
+@article{robins1986,
+ title = "A new approach to causal inference in mortality studies with a
+ sustained exposure period—application to control of the healthy worker
+ survivor effect",
+ journal = "Mathematical Modelling",
+ volume = "7",
+ number = "9",
+ pages = "1393--1512",
+ year = "1986",
+ issn = "0270-0255",
+ doi = "10.1016/0270-0255(86)90088-6",
+ url = "http://www.sciencedirect.com/science/article/pii/0270025586900886",
+ author = "James Robins"
+}
+
+@book{pearl2009,
+ author = {Pearl, Judea},
+ title = {Causality: Models, Reasoning and Inference},
+ year = {2009},
+ isbn = {052189560X, 9780521895606},
+ edition = {2nd},
+ publisher = {Cambridge University Press},
+ address = {New York, NY, USA},
+}
+
+@book{moodie2013,
+ Author = {Chakraborty, Bibhas and Moodie, Erica EM},
+ Title = {Statistical Methods for Dynamic Treatment Regimes: Reinforcement
+ Learning, Causal Inference, and Personalized Medicine (Statistics for
+ Biology and Health)},
+ Publisher = {Springer},
+ Year = {2013},
+ ISBN = {1461474272},
+}
+
+@article{laber2012,
+ author = {Zhang, Baqun and A Tsiatis, Anastasios and Davidian, Marie and
+ Zhang, Min and Laber, Eric},
+ title = {Estimating optimal treatment regimes from a classification
+ perspective},
+ journal = {Stat},
+ volume = {5},
+ number = {1},
+ pages = {278-278},
+ doi = {10.1002/sta4.124},
+ url = {https://onlinelibrary.wiley.com/doi/abs/10.1002/sta4.124},
+ eprint = {https://onlinelibrary.wiley.com/doi/pdf/10.1002/sta4.124},
+ year = {2016}
+}
+
+@article{kosorok2012,
+ author = {Yingqi Zhao and Donglin Zeng and A John Rush and Michael R
+ Kosorok},
+ title = {Estimating Individualized Treatment Rules Using Outcome Weighted
+ Learning},
+ journal = {Journal of the American Statistical Association},
+ volume = {107},
+ number = {499},
+ pages = {1106--1118},
+ year = {2012},
+ publisher = {Taylor \& Francis},
+ doi = {10.1080/01621459.2012.695674},
+ note
={PMID: 23630406}, + URL = {https://doi.org/10.1080/01621459.2012.695674}, + eprint = {https://doi.org/10.1080/01621459.2012.695674} +} + +@article{robins2014, + author = "Robins, James and Rotnitzky, Andrea", + doi = "10.1214/14-EJS908", + fjournal = "Electronic Journal of Statistics", + journal = "Electron. J. Statist.", + number = "1", + pages = "1273--1289", + publisher = "The Institute of Mathematical Statistics and the Bernoulli + Society", + title = "Discussion of “Dynamic treatment regimes: Technical challenges and + applications”", + url = "https://doi.org/10.1214/14-EJS908", + volume = "8", + year = "2014" +} + +@article{breiman1996stacked, + title={Stacked regressions}, + author={Breiman, Leo}, + journal={Machine learning}, + volume={24}, + number={1}, + pages={49--64}, + year={1996}, + publisher={Springer} +} + +@article{van2006oracle, + title={Oracle inequalities for multi-fold cross validation}, + author={{van der Vaart}, Aad W and Dudoit, Sandrine and {van der Laan}, Mark + J}, + journal={Statistics \& Decisions}, + volume={24}, + number={3}, + pages={351--371}, + year={2006}, + publisher={Oldenbourg Wissenschaftsverlag} +} + +@article{stark2018cargo, + title={Cargo-cult statistics and scientific crisis}, + author={Stark, Philip B and Saltelli, Andrea}, + journal={Significance}, + volume={15}, + number={4}, + pages={40--43}, + year={2018}, + publisher={Wiley Online Library} +} + +@software{hejazi2020haldensify, + author = {Hejazi, Nima S and Benkeser, David C and {van der Laan}, Mark J}, + title = {{haldensify}: Highly adaptive lasso conditional density estimation}, + year = {2020}, + howpublished = {\url{https://github.com/nhejazi/haldensify}}, + note = {{R} package version 0.0.5}, + url = {https://doi.org/10.5281/zenodo.3698329}, + doi = {10.5281/zenodo.3698329} +} + +@software{coyle2020hal9001, + author = {Coyle, Jeremy R and Hejazi, Nima S and {van der Laan}, Mark J}, + title = {{hal9001}: The scalable highly adaptive lasso}, + year = {2020}, + 
url = {https://doi.org/10.5281/zenodo.3558313}, + doi = {10.5281/zenodo.3558313}, + note = {{R} package version 0.2.7} +} + +@article{hejazi2020hal9001, + author = {Hejazi, Nima S and Coyle, Jeremy R and {van der Laan}, Mark J}, + title = {{hal9001}: Scalable highly adaptive lasso regression in {R}}, + year = {2020}, + url = {https://doi.org/10.21105/joss.02526}, + doi = {10.21105/joss.02526}, + journal = {Journal of Open Source Software}, + publisher = {The Open Journal} +} + +@article{hejazi2020efficient, + doi = {10.1111/biom.13375}, + url = {https://arxiv.org/abs/2003.13771}, + year = {2020}, + publisher = {Wiley Online Library}, + journal = {Biometrics}, + volume = {}, + number = {}, + pages = {}, + author = {Hejazi, Nima S and {van der Laan}, Mark J and Janes, Holly E and + Gilbert, Peter B and Benkeser, David C}, + title = {Efficient nonparametric inference on the effects of stochastic + interventions under two-phase sampling, with applications to vaccine + efficacy trials} +} + +@inproceedings{benkeser2016hal, + doi = {10.1109/dsaa.2016.93}, + url = {https://doi.org/10.1109/dsaa.2016.93}, + year = {2016}, + publisher = {{IEEE}}, + author = {Benkeser, David and {van der Laan}, Mark J}, + title = {The Highly Adaptive Lasso Estimator}, + booktitle = {2016 {IEEE} International Conference on Data Science and + Advanced Analytics ({DSAA})} +} + +@article{polley2010super, + title={Super learner in prediction}, + author={Polley, Eric C and {van der Laan}, Mark J}, + year={2010}, + publisher={bepress} +} + +@book{wickham2014advanced, + title={Advanced r}, + author={Wickham, Hadley}, + year={2014}, + publisher={Chapman and Hall/CRC} +} + +@article{sandercock1997international, + title={The International Stroke Trial (IST): a randomized trial of aspirin, + subcutaneous heparin, both, or neither among 19,435 patients with acute + ischemic stroke}, + author={Sandercock, P and Collins, R and Counsell, C and Farrell, B and Peto, + R and Slattery, J and Warlow, C}, + 
journal={Lancet}, + volume={349}, + number={9065}, + pages={1569--1581}, + year={1997} +} + +@article{sandercock2011international, + title={The international stroke trial database}, + author={Sandercock, Peter AG and Niewada, Maciej and Cz{\l}onkowska, Anna}, + journal={Trials}, + volume={12}, + number={1}, + pages={101}, + year={2011}, + publisher={BioMed Central} +} + +@book{kalbfleisch2011statistical, + title={The statistical analysis of failure time data}, + author={Kalbfleisch, John D and Prentice, Ross L}, + volume={360}, + year={2011}, + publisher={John Wiley \& Sons} +} + +@article{vaart2006oracle, + title={Oracle inequalities for multi-fold cross validation}, + author={Van der Vaart, Aad W and Dudoit, Sandrine and van der Laan, Mark J}, + journal={Statistics \& Decisions}, + volume={24}, + number={3}, + pages={351--371}, + year={2006}, + publisher={Oldenbourg Wissenschaftsverlag} +} + +@article{textor2011dagitty, + title={DAGitty: a graphical tool for analyzing causal diagrams}, + author={Textor, Johannes and Hardt, Juliane and Kn{\"u}ppel, Sven}, + journal={Epidemiology}, + volume={22}, + number={5}, + pages={745}, + year={2011}, + publisher={LWW} +} + +@article{coyle2018origami, + author = {Coyle, Jeremy R and Hejazi, Nima S}, + title = {origami: A Generalized Framework for Cross-Validation in R}, + journal = {The Journal of Open Source Software}, + volume = {3}, + number = {21}, + month = {January}, + year = {2018}, + publisher = {The Open Journal}, + doi = {10.21105/joss.00512}, + url = {https://doi.org/10.21105/joss.00512} +} + +@article{hejazi2020nonparametric, + title={Nonparametric causal mediation analysis for stochastic interventional + (in) direct effects}, + author={Hejazi, Nima S and Rudolph, Kara E and van der Laan, Mark J and + D{\'\i}az, Iv{\'a}n}, + journal={Biostatistics}, + doi = {10.1093/biostatistics/kxac002}, + url = {https://arxiv.org/abs/2009.06203}, + year = {2022}, + publisher = {Oxford University Press}, + journal = 
{Biostatistics},
+ volume = {(in press)}
+}
+
+@article{coyle2021targeted,
+ url = {https://arxiv.org/abs/2006.07333},
+ year = {2021},
+ journal = {arXiv preprint arXiv:2006.07333},
+ author = {Coyle, Jeremy R and Hejazi, Nima S and Malenica, Ivana and
+ Phillips, Rachael V and Arnold, Benjamin F and Mertens, Andrew and
+ Benjamin-Chung, Jade and Cai, Weixin and Dayal, Sonali and {Colford Jr.},
+ John M and Hubbard, Alan E and {van der Laan}, Mark J},
+ title = {{Targeting Learning}: Robust statistics for reproducible research}
+}
+
+@misc{bengtsson2020unifying,
+ author = {Bengtsson, Henrik},
+ title = {A Unifying Framework for Parallel and Distributed Processing in R
+ using Futures},
+ year = {2020},
+ month = {8},
+ eprint = {2008.00553},
+ archiveprefix = {arXiv},
+ primaryclass = {cs.DC},
+ url = {https://arxiv.org/abs/2008.00553},
+}
+
+@article{breiman2001random,
+ title={Random forests},
+ author={Breiman, Leo},
+ journal={Machine learning},
+ volume={45},
+ number={1},
+ pages={5--32},
+ year={2001},
+ publisher={Springer}
+}
+
+@article{vanderweele2016mediation,
+ title={Mediation analysis: a practitioner's guide},
+ author={VanderWeele, Tyler J},
+ journal={Annual review of public health},
+ volume={37},
+ pages={17--32},
+ year={2016},
+ publisher={Annual Reviews}
+}
+
+@manual{hejazi2020medshift,
+ author = {Hejazi, Nima S and D{\'\i}az, Iv{\'a}n},
+ title = {{medshift}: Causal mediation analysis for stochastic
+ interventions},
+ year = {2020},
+ url = {https://github.com/nhejazi/medshift},
+ note = {R package version 0.1.4}
+}
+
+@article{hejazi2020txshift-joss,
+ author = {Hejazi, Nima S and Benkeser, David C},
+ title = {{txshift}: Efficient estimation of the causal effects of
+ stochastic interventions in {R}},
+ year = {2020},
+ doi = {10.21105/joss.02447},
+ url = {https://doi.org/10.21105/joss.02447},
+ journal = {Journal of Open Source Software},
+ publisher = {The Open Journal}
+}
+
+@manual{hejazi2020txshift-rpkg, + author = {Hejazi, Nima S and Benkeser, David C}, + title = {{txshift}: Efficient Estimation of the Causal Effects of + Stochastic Interventions}, + year = {2020}, + doi = {10.5281/zenodo.4070042}, + url = {https://CRAN.R-project.org/package=txshift}, + note = {R package version 0.3.4} +} + +@manual{coyle2022sl3, + author = {Coyle, Jeremy R and Hejazi, Nima S and Malenica, Ivana and + Phillips, Rachael V and Sofrygin, Oleg}, + title = {{sl3}: Modern Pipelines for Machine Learning and {Super + Learning}}, + year = {2022}, + howpublished = {\url{https://github.com/tlverse/sl3}}, + note = {{R} package version 1.4.5}, + url = {https://doi.org/10.5281/zenodo.1342293}, + doi = {10.5281/zenodo.1342293} +} + +@book{vdl2022targeted, + doi = {}, + url = {https://tlverse.org/tlverse-handbook}, + year = {2022}, + publisher = {CRC Press}, + author = {{van der Laan}, Mark J and Coyle, Jeremy R and Hejazi, Nima S and + Malenica, Ivana and Phillips, Rachael V and Hubbard, Alan E}, + title = {{Targeted Learning in \texttt{R}: Causal Data Science with the + \texttt{tlverse} Software Ecosystem}}, + note = {in preparation} +} + +@incollection{phillips2022super, + title={Super (machine) learning}, + author={Phillips, Rachael V}, + booktitle = {{Targeted Learning in \texttt{R}: Causal Data Science with the + \texttt{tlverse} Software Ecosystem}}, + url = {https://tlverse.org/tlverse-handbook/sl3.html}, + pages={}, + year={2022}, + publisher={Springer} +} + + +@article{diaz2020nonparametric, + title={Non-parametric efficient causal mediation with intermediate + confounders}, + author={D{\'\i}az, Iv{\'a}n and Hejazi, Nima S and Rudolph, Kara E and {van + der Laan}, Mark J}, + year={2020}, + url = {https://arxiv.org/abs/1912.09936}, + doi = {10.1093/biomet/asaa085}, + journal={Biometrika}, + volume={}, + number={}, + pages={}, + publisher={Oxford University Press} +} + +@article{klaassen1987consistent, + title={Consistent estimation of the influence 
function of locally + asymptotically linear estimators}, + author={Klaassen, Chris AJ}, + journal={The Annals of Statistics}, + pages={1548--1562}, + year={1987}, + publisher={JSTOR} +} + +@incollection{zheng2011cross, + title={Cross-validated targeted minimum-loss-based estimation}, + author={Zheng, Wenjing and {van der Laan}, Mark J}, + booktitle={Targeted Learning}, + pages={459--474}, + year={2011}, + publisher={Springer} +} + +@article{chernozhukov2018double, + author = {Chernozhukov, Victor and Chetverikov, Denis and Demirer, Mert and + Duflo, Esther and Hansen, Christian and Newey, Whitney and Robins, James}, + title = {Double/debiased machine learning for treatment and structural + parameters}, + journal = {The Econometrics Journal}, + volume = {21}, + number = {1}, + year = {2018}, + month = {01}, + doi = {10.1111/ectj.12097}, + url = {https://doi.org/10.1111/ectj.12097} +} + +@article{miles2022causal, + title = {On the Causal Interpretation of Randomized Interventional Indirect + Effects}, + author = {Miles, Caleb H}, + journal = {arXiv preprint arXiv:2203.00245}, + year = {2022}, + url = {https://arxiv.org/abs/2203.00245} +} + +@article{hejazi2022medoutcon-joss, + author = {Hejazi, Nima S and Rudolph, Kara E and D{\'\i}az, + Iv{\'a}n}, + title = {{medoutcon}: Nonparametric efficient causal mediation + analysis with machine learning in {R}}, + year = {2022}, + doi = {10.21105/joss.03979}, + url = {https://doi.org/10.21105/joss.03979}, + journal = {Journal of Open Source Software}, + publisher = {The Open Journal} +} + +@software{hejazi2022medoutcon-rpkg, + author={Hejazi, Nima S and D{\'\i}az, Iv{\'a}n and Rudolph, Kara E}, + title = {{medoutcon}: Efficient natural and interventional causal + mediation analysis}, + year = {2022}, + doi = {10.5281/zenodo.5809519}, + url = {https://github.com/nhejazi/medoutcon}, + note = {R package version 0.1.6} +} diff --git a/sessions/mcma_choosing_effects.qmd b/sessions/mcma_choosing_effects.qmd new file mode 100644 
index 0000000..88cc45d --- /dev/null +++ b/sessions/mcma_choosing_effects.qmd @@ -0,0 +1,64 @@ +# How to choose an estimand: Real-world example + +## Comparative effectiveness of two medications for opioid use disorder (OUD) + +![](../images/ctndag.png) + +_Motivation_: Opposite overall treatment effects for homeless versus nonhomeless +participants. This application was explored in detail by @rudolph2020explaining. + +![](../images/tmleesttotal.png) + +### Getting specific about the question + +To what extent does the indirect effect through mediators of adherence, pain, +and depressive symptoms explain the differences in treatment effects on OUD +relapse for homeless and nonhomeless individuals? + +#### What estimand do we want? {.unnumbered} + +- Can we set $M=m$ (i.e., same value) for everyone? +- Are we interested in estimating indirect effects? + +$\rightarrow$ So, *not* controlled direct effect. + +- Do we have an intermediate confounder? + + Yes, and it's important. + +- Do we have a binary treatment assignment variable and a binary intermediate + confounder? + + Yes. + +- Can we assume monotonicity? + + Yes. + +$\rightarrow$ So, could estimate natural (in)direct effects under monotonicity. + +What if we don't want to assume monotonicity, or if we do not have a binary +treatment assignment variable and binary intermediate confounder? + +$\rightarrow$ Interventional direct and indirect effects. + +- Do we want to estimate the path through treatment initiation ($Z$)? + + Yes, so, _not_ the conditional versions of these effects. + + Estimands: + - Direct effect: $\E(Y_{1,G_0} - Y_{0,G_0})$ + - Indirect effect: $\E(Y_{1,G_1} - Y_{1,G_0})$ + + Here $G_a$ is a draw from the distribution of $M_a\mid W$. +- Need to incorporate multiple and continuous mediators + +#### What if positivity assumption $\P(A=a\mid W)>0$ is violated? 
{.unnumbered} + +$\rightarrow$ Can't identify or estimate any of the above effects + +- But we can estimate the effect of some stochastic interventions, e.g., IPSIs +- Trade-off between feasibility and interpretation + +#### What if the exposure variable is continuous? {.unnumbered} + +$\rightarrow$ All the above effects are defined for binary exposures + +- But we can estimate the effect of some stochastic interventions +- Work in progress (including upcoming R software) + +#### What if the exposure is actually time-varying? What if the mediators and/or intermediate confounders are time-varying? {.unnumbered} diff --git a/sessions/mcma_effects_defn.qmd b/sessions/mcma_effects_defn.qmd new file mode 100644 index 0000000..c091e1d --- /dev/null +++ b/sessions/mcma_effects_defn.qmd @@ -0,0 +1,654 @@ +# Types of path-specific causal mediation effects + +- Controlled direct effects +- Natural direct and indirect effects +- Interventional direct and indirect effects + +```{tikz} +#| echo: false +#| fig-cap: Directed acyclic graph under *no intermediate confounders* of the mediator-outcome relation affected by treatment +\dimendef\prevdepth=0 +\pgfdeclarelayer{background} +\pgfsetlayers{background,main} +\usetikzlibrary{arrows,positioning} +\tikzset{ +>=stealth', +punkt/.style={ +rectangle, +rounded corners, +draw=black, very thick, +text width=6.5em, +minimum height=2em, +text centered}, +pil/.style={ +->, +thick, +shorten <=2pt, +shorten >=2pt,} +} +\newcommand{\Vertex}[2] +{\node[minimum width=0.6cm,inner sep=0.05cm] (#2) at (#1) {$#2$}; +} +\newcommand{\VertexR}[2] +{\node[rectangle, draw, minimum width=0.6cm,inner sep=0.05cm] (#2) at (#1) {$#2$}; +} +\newcommand{\ArrowR}[3] +{ \begin{pgfonlayer}{background} +\draw[->,#3] (#1) to[bend right=30] (#2); +\end{pgfonlayer} +} +\newcommand{\ArrowL}[3] +{ \begin{pgfonlayer}{background} +\draw[->,#3] (#1) to[bend left=45] (#2); +\end{pgfonlayer} +} +\newcommand{\EdgeL}[3] +{ \begin{pgfonlayer}{background} +\draw[dashed,#3] 
(#1) to[bend right=-45] (#2); +\end{pgfonlayer} +} +\newcommand{\Arrow}[3] +{ \begin{pgfonlayer}{background} +\draw[->,#3] (#1) -- +(#2); +\end{pgfonlayer} +} +\begin{tikzpicture} + \Vertex{-4, 0}{W} + \Vertex{0, 0}{M} + \Vertex{-2, 0}{A} + \Vertex{2, 0}{Y} + \Arrow{W}{A}{black} + \Arrow{A}{M}{black} + \Arrow{M}{Y}{black} + \ArrowL{W}{Y}{black} + \ArrowL{A}{Y}{black} + \ArrowL{W}{M}{black} +\end{tikzpicture} +``` + +## Controlled direct effects + +$$\psi_{\text{CDE}} = \E(Y_{1,m} - Y_{0,m})$$ + +![](../images/graphic4a3.png){width=40%} + +- Set the mediator to a reference value $M=m$ uniformly for everyone in the + population +- Compare $A=1$ vs $A=0$ with $M=m$ fixed + +### Identification assumptions: +- Confounder assumptions: + + $A \indep Y_{a,m} \mid W$ + + $M \indep Y_{a,m} \mid W, A$ +- Positivity assumptions: + + $\P(M = m \mid A=a, W) > 0 \text{ } a.e.$ + + $\P(A=a \mid W) > 0 \text{ } a.e.$ + +Under the above identification assumptions, the controlled direct effect can be +identified: +$$ + \E(Y_{1,m} - Y_{0,m}) = \E\{ \color{ForestGreen}{\E(Y \mid A=1, M=m, W) - + \E(Y \mid A=0, M=m, W)} \} +$$ + +- For intuition about this formula in R, let's continue with a toy example: + ```{r} + n <- 1e6 + w <- rnorm(n) + a <- rbinom(n, 1, 0.5) + m <- rnorm(n, w + a) + y <- rnorm(n, w + a + m) + ``` +- First we fit a correct model for the outcome + ```{r} + lm_y <- lm(y ~ m + a + w) + ``` +- Assume we would like the CDE at $m=0$ +- Then we generate predictions + $$ + \color{ForestGreen}{\E(Y \mid A=1, M=m, W)} \text{ and } + \color{ForestGreen}{\E(Y \mid A=0, M=m, W)} \ : + $$ + ```{r} + pred_y1 <- predict(lm_y, newdata = data.frame(a = 1, m = 0, w = w)) + pred_y0 <- predict(lm_y, newdata = data.frame(a = 0, m = 0, w = w)) + ``` +- Then we compute the difference between the predicted values + $\color{ForestGreen}{\E(Y \mid A=1, M=m, W) - \E(Y \mid A=0, M=m, W)}$, and + average across values of $W$ + ```{r} + ## CDE at m = 0 + mean(pred_y1 - pred_y0) + ``` + +### Is 
this the estimand I want?
+
++ Makes the most sense if we can intervene directly on $M$
+  + And we can think of a policy that would set everyone to a constant level
+    $m \in \mathcal{M}$.
+  + Judea Pearl calls this _prescriptive_.
+  + Can you think of an example? (Air pollution, rescue inhaler dosage,
+    hospital visits...)
+  + Does not provide a decomposition of the average treatment effect into
+    direct and indirect effects.
+
+_What if our research question doesn't involve intervening directly on the
+mediator?_
+
+_What if we want to decompose the average treatment effect into its direct and
+indirect counterparts?_
+
+## Natural direct and indirect effects
+
+Still using the same DAG as above,
+
+- Recall the definition of the nested counterfactual:
+  \begin{equation*}
+    Y_{1, M_0} = f_Y(W, 1, M_0, U_Y)
+  \end{equation*}
+
+- Interpreted as _the outcome for an individual in a hypothetical world where
+  treatment was given but the mediator was held at the value it would have
+  taken under no treatment_
+
+  ![](../images/graphic4a.png){width=40%}
+
+- Recall that, because of the definition of counterfactuals,
+  \begin{equation*}
+    Y_{1, M_1} = Y_1
+  \end{equation*}
+
+Then we can decompose the _average treatment effect_ $\E(Y_1-Y_0)$ as follows:
+
+\begin{equation*}
+\E[Y_{1,M_1} - Y_{0,M_0}] = \underbrace{\E[Y_{\color{red}{1},\color{blue}{M_1}}
+  - Y_{\color{red}{1},\color{blue}{M_0}}]}_{\text{natural indirect effect}} +
+  \underbrace{\E[Y_{\color{blue}{1},\color{red}{M_0}} -
+  Y_{\color{blue}{0},\color{red}{M_0}}]}_{\text{natural direct effect}}
+\end{equation*}
+
+- Natural direct effect (NDE): Varying treatment while keeping the mediator
+  fixed at the value it would have taken under no treatment
+- Natural indirect effect (NIE): Varying the mediator from the value it would
+  have taken under treatment to the value it would have taken under control,
+  while keeping treatment fixed
+
+### Identification assumptions:
+
+- $A \indep Y_{a,m} \mid W$
+- $M \indep Y_{a,m}
\mid W, A$
+- $A \indep M_a \mid W$
+- $M_0 \indep Y_{1,m} \mid W$
+- and positivity assumptions
+
+### Cross-world independence assumption
+
+What does $M_0 \indep Y_{1,m} \mid W$ mean?
+
+- Conditional on $W$, knowledge of the mediator value in the absence of
+  treatment, $M_0$, provides no information about the outcome under treatment,
+  $Y_{1,m}$.
+- Can you think of a data-generating mechanism that would violate this
+  assumption?
+- Example: in a randomized study, whenever we believe that treatment assignment
+  works through adherence (i.e., almost always), we are violating this
+  assumption (more on this later).
+- Cross-world assumptions are problematic for other reasons, including:
+  - You can never design a randomized study where the assumption holds by
+    design.
+
+**If the cross-world assumption holds, we can write the NDE as a weighted
+average of controlled direct effects at each level of $M=m$:**
+
+$$
+  \E \sum_m \{\E(Y_{1,m} \mid W) - \E(Y_{0,m} \mid W)\} \P(M_{0}=m \mid W)
+$$
+
+- If CDE($m$) is constant across $m$, then CDE = NDE.
+
+### Identification formula:
+
+- Under the above identification assumptions, the natural direct effect can be
+  identified:
+  \begin{equation*}
+    \E(Y_{1,M_0} - Y_{0,M_0}) =
+    \E[\color{Goldenrod}{\E\{}\color{ForestGreen}{\E(Y \mid A=1, M, W) -
+    \E(Y \mid A=0, M, W)}\color{Goldenrod}{\mid A=0,W\}}]
+  \end{equation*}
+
+- The natural indirect effect can be identified similarly.
+- Let's dissect this formula in `R`: + ```{r} + n <- 1e6 + w <- rnorm(n) + a <- rbinom(n, 1, 0.5) + m <- rnorm(n, w + a) + y <- rnorm(n, w + a + m) + ``` +- First we fit a correct model for the outcome + ```{r} + lm_y <- lm(y ~ m + a + w) + ``` + +- Then we generate predictions $\color{ForestGreen}{\E(Y \mid A=1, M, W)} + \text{ and }\color{ForestGreen}{\E(Y \mid A=0, M, W)}$ with $A$ fixed but + letting $M$ and $W$ take their observed values + ```{r} + pred_y1 <- predict(lm_y, newdata = data.frame(a = 1, m = m, w = w)) + pred_y0 <- predict(lm_y, newdata = data.frame(a = 0, m = m, w = w)) + ``` +- Then we compute the difference between the predicted values + $\color{ForestGreen}{\E(Y \mid A=1, M, W) - \E(Y \mid A=0, M, W)},$ +- and use this difference as a pseudo-outcome in a regression on $A$ and $W$: + $\color{Goldenrod}{\E\{}\color{ForestGreen}{\E(Y \mid A=1, M, W) - \E(Y \mid + A=0, M, W)}\color{Goldenrod}{\mid A=0,W\}}$ + ```{r} + pseudo <- pred_y1 - pred_y0 + lm_pseudo <- lm(pseudo ~ a + w) + ``` + +- Now we predict the value of this pseudo-outcome under $A=0$, and average the + result + ```{r} + pred_pseudo <- predict(lm_pseudo, newdata = data.frame(a = 0, w = w)) + ## NDE: + mean(pred_pseudo) + ``` + +### Is this the estimand I want? + +- Makes sense to intervene on $A$ but not directly on $M$. +- Want to understand a natural mechanism underlying an association / total + effect. J. Pearl calls this *descriptive*. +- NDE + NIE = total effect (ATE). +- Okay with the assumptions. 
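The NDE walkthrough above has a direct counterpart for the NIE: under the same
assumptions, the NIE contrasts $\E\{\E(Y \mid A=1, M, W) \mid A=a', W\}$ at
$a'=1$ versus $a'=0$. Below is a minimal sketch repeating the same simulation
(the names `pseudo_nie`, `lm_pseudo_nie`, and `nie` are new here, introduced
only for illustration); in this data-generating mechanism the true NIE is 1, so
NDE + NIE recovers the ATE of 2.

```{r}
n <- 1e6
w <- rnorm(n)
a <- rbinom(n, 1, 0.5)
m <- rnorm(n, w + a)
y <- rnorm(n, w + a + m)
## fit a correct model for the outcome
lm_y <- lm(y ~ m + a + w)
## pseudo-outcome: E(Y | A = 1, M, W) at the observed mediator values
pseudo_nie <- predict(lm_y, newdata = data.frame(a = 1, m = m, w = w))
## regress the pseudo-outcome on A and W
lm_pseudo_nie <- lm(pseudo_nie ~ a + w)
## contrast its predictions under A = 1 versus A = 0, averaged over W
nie <- mean(predict(lm_pseudo_nie, newdata = data.frame(a = 1, w = w))) -
  mean(predict(lm_pseudo_nie, newdata = data.frame(a = 0, w = w)))
## NIE (approximately 1 in this simulation):
nie
```

Relative to the NDE computation, there is no inner difference of predictions:
both nested counterfactuals fix $A=1$, and only the mediator's distribution
(drawn under $A=1$ versus $A=0$) changes.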
+ +_What if our data structure involves a post-treatment confounder of the +mediator-outcome relationship (e.g., adherence)?_ + +```{tikz} +#| echo: false +#| fig-cap: Directed acyclic graph under intermediate confounders of the mediator-outcome relation affected by treatment +\dimendef\prevdepth=0 +\pgfdeclarelayer{background} +\pgfsetlayers{background,main} +\usetikzlibrary{arrows,positioning} +\tikzset{ +>=stealth', +punkt/.style={ +rectangle, +rounded corners, +draw=black, very thick, +text width=6.5em, +minimum height=2em, +text centered}, +pil/.style={ +->, +thick, +shorten <=2pt, +shorten >=2pt,} +} +\newcommand{\Vertex}[2] +{\node[minimum width=0.6cm,inner sep=0.05cm] (#2) at (#1) {$#2$}; +} +\newcommand{\VertexR}[2] +{\node[rectangle, draw, minimum width=0.6cm,inner sep=0.05cm] (#2) at (#1) {$#2$}; +} +\newcommand{\ArrowR}[3] +{ \begin{pgfonlayer}{background} +\draw[->,#3] (#1) to[bend right=30] (#2); +\end{pgfonlayer} +} +\newcommand{\ArrowL}[3] +{ \begin{pgfonlayer}{background} +\draw[->,#3] (#1) to[bend left=45] (#2); +\end{pgfonlayer} +} +\newcommand{\EdgeL}[3] +{ \begin{pgfonlayer}{background} +\draw[dashed,#3] (#1) to[bend right=-45] (#2); +\end{pgfonlayer} +} +\newcommand{\Arrow}[3] +{ \begin{pgfonlayer}{background} +\draw[->,#3] (#1) -- +(#2); +\end{pgfonlayer} +} +\begin{tikzpicture} + \Vertex{0, -1}{Z} + \Vertex{-4, 0}{W} + \Vertex{0, 0}{M} + \Vertex{-2, 0}{A} + \Vertex{2, 0}{Y} + \ArrowR{W}{Z}{black} + \Arrow{Z}{M}{black} + \Arrow{W}{A}{black} + \Arrow{A}{M}{black} + \Arrow{M}{Y}{black} + \Arrow{A}{Z}{black} + \Arrow{Z}{Y}{black} + \ArrowL{W}{Y}{black} + \ArrowL{A}{Y}{black} + \ArrowL{W}{M}{black} +\end{tikzpicture} +``` + +![](../images/ctndag.png) + +### Unidentifiability of the NDE and NIE in this setting + +- In this example, natural direct and indirect effects are not generally point + identified from observed data $O=(W,A,Z,M,Y)$. 
+- The reason for this is that the cross-world counterfactual assumption
+  $$
+    Y_{1,m} \indep M_0 \mid W
+  $$
+  does not hold in the above directed acyclic graph.
+
+- To give intuition, we focus on the counterfactual outcome $Y_{A=1, M_{A=0}}$.
+  - This counterfactual outcome involves two counterfactual worlds
+    simultaneously: one in which $A=1$ for the first portion of the
+    counterfactual outcome, and one in which $A=0$ for the nested portion of
+    the counterfactual outcome.
+  - Setting $A=1$ induces a counterfactual treatment-induced confounder,
+    denoted $Z_{A=1}$. Setting $A=0$ induces another counterfactual
+    treatment-induced confounder, denoted $Z_{A=0}$.
+  - The two treatment-induced counterfactual confounders, $Z_{A=1}$ and
+    $Z_{A=0}$, share unmeasured common causes, $U_Z$, which creates a spurious
+    association.
+  - Because $Z_{A=1}$ is causally related to $Y_{A=1, m}$, and $Z_{A=0}$ is
+    also causally related to $M_{A=0}$, the path through $U_Z$ means that the
+    backdoor criterion is not met for identification of $Y_{A=1, M_{A=0}}$,
+    i.e., $M_{0} \notindep Y_{A=1, m} \mid W$, where $W$ denotes baseline
+    covariates.
+
+
+```{tikz}
+#| echo: false
+#| fig-cap: Parallel worlds model of the scenario considered
+\dimendef\prevdepth=0
+\pgfdeclarelayer{background}
+\pgfsetlayers{background,main}
+\usetikzlibrary{arrows,positioning}
+\tikzset{
+>=stealth',
+punkt/.style={
+rectangle,
+rounded corners,
+draw=black, very thick,
+text width=6.5em,
+minimum height=2em,
+text centered},
+pil/.style={
+->,
+thick,
+shorten <=2pt,
+shorten >=2pt,}
+}
+\newcommand{\Vertex}[2]
+{\node[minimum width=0.6cm,inner sep=0.05cm] (#2) at (#1) {$#2$};
+}
+\newcommand{\VertexR}[2]
+{\node[rectangle, draw, minimum width=0.6cm,inner sep=0.05cm] (#2) at (#1) {$#2$};
+}
+\newcommand{\ArrowR}[3]
+{ \begin{pgfonlayer}{background}
+\draw[->,#3] (#1) to[bend right=30] (#2);
+\end{pgfonlayer}
+}
+\newcommand{\ArrowL}[3]
+{ \begin{pgfonlayer}{background}
+\draw[->,#3] (#1) to[bend left=45] (#2);
+\end{pgfonlayer}
+}
+\newcommand{\EdgeL}[3]
+{ \begin{pgfonlayer}{background}
+\draw[dashed,#3] (#1) to[bend left=45] (#2);
+\end{pgfonlayer}
+}
+\newcommand{\EdgeR}[3]
+{ \begin{pgfonlayer}{background}
+\draw[dashed,#3] (#1) to[bend right=-45] (#2);
+\end{pgfonlayer}
+}
+\newcommand{\Arrow}[3]
+{ \begin{pgfonlayer}{background}
+\draw[->,#3] (#1) --
+(#2);
+\end{pgfonlayer}
+}
+
+\begin{tikzpicture}
+  \Vertex{0, 1}{Z_0}
+  \Vertex{0, -1}{Z_1}
+  \Vertex{-2, 0}{U_Z}
+  \Vertex{2, 1}{M_0}
+  \Vertex{4, -1}{Y_{1, m}}
+  \Arrow{Z_0}{M_0}{black}
+  \Arrow{Z_1}{Y_{1, m}}{black}
+  \Arrow{U_Z}{Z_0}{black}
+  \Arrow{U_Z}{Z_1}{black}
+\end{tikzpicture}
+```
+
+However:
+
+- We can actually identify the NIE/NDE in the above setting if we are willing
+  to invoke monotonicity of the effect of treatment on one or more binary
+  treatment-induced confounders [@tchetgen2014identification].
+- Assuming monotonicity is also sometimes referred to as assuming "no defiers"
+  -- in other words, assuming that there are no individuals who would do the
+  opposite of the encouragement.
- Monotonicity may seem like a restrictive assumption, but it may be reasonable
  in some common scenarios (e.g., in trials where the intervention is randomized
  treatment assignment and the treatment-induced confounder is whether or not
  treatment was actually taken -- in this setting, we may feel comfortable
  assuming that there are no "defiers", an assumption frequently made when
  using instrumental variables to identify causal effects)

Note: CDEs are still identified in this setting. They can be identified and
estimated as in a longitudinal data structure with a two-time-point
intervention.

## Interventional (in)direct effects

- Let $G_a$ denote a random draw from the distribution of $M_a \mid W$
- Define the counterfactual $Y_{1,G_0}$ as the counterfactual
  variable in a hypothetical world where $A$ is set to $1$ and $M$ is
  set to $G_0$ with probability one.

![](../images/graphic4b.png){width=60%}

- Define $Y_{0,G_0}$ and $Y_{1,G_1}$ similarly
- Then we can define:
  $$
  \E[Y_{1,G_1} - Y_{0,G_0}] = \underbrace{\E[Y_{\color{red}{1},
  \color{blue}{G_1}} - Y_{\color{red}{1},
  \color{blue}{G_0}}]}_{\text{interventional indirect effect}} +
  \underbrace{\E[Y_{\color{blue}{1},\color{red}{G_0}} -
  Y_{\color{blue}{0},
  \color{red}{G_0}}]}_{\text{interventional direct effect}}
  $$

- Note that $\E[Y_{1,G_1} - Y_{0,G_0}]$ is still a _total effect_ of treatment,
  even if it is different from the ATE $\E[Y_{1} - Y_{0}]$
- We gain the ability to solve an identification problem, but lose in terms of
  interpretation of the causal effect (we can no longer decompose the ATE)

### An alternative definition of the effects

- Above we defined $G_a$ as a random draw from the distribution of $M_a \mid W$
- What if instead we define $G_a$ as a random draw from the distribution of $M_a
  \mid (Z_a,W)$?
- It turns out the indirect effect defined in this way only measures the path
  $A\rightarrow M \rightarrow Y$, and not the path $A\rightarrow Z\rightarrow M
  \rightarrow Y$
- There may
be important reasons to choose one over another (e.g., survival
  analyses, where we want the distribution conditional on $Z$, or instrumental
  variable designs, where it doesn't make sense to condition on $Z$)

### Identification assumptions

- $A \indep Y_{a,m} \mid W$
- $M \indep Y_{a,m} \mid W, A, Z$
- $A \indep M_a \mid W$
- and positivity assumptions.

Under these assumptions, the population interventional direct and indirect
effects are identified via:
\begin{align*}
  \E &(Y_{a, G_{a'}}) = \\
  & \E\left[\color{Purple}{\E\left\{\color{Goldenrod}{\sum_z}
  \color{ForestGreen}{\E(Y \mid A=a, Z=z, M, W)}
  \color{Goldenrod}{\P(Z=z \mid A=a, W)}\mid A=a', W\right\}} \right]
\end{align*}

- Let's dissect this formula in `R`:
  ```{r}
  n <- 1e6
  w <- rnorm(n)
  a <- rbinom(n, 1, 0.5)
  z <- rbinom(n, 1, 0.5 + 0.2 * a)
  m <- rnorm(n, w + a - z)
  y <- rnorm(n, w + a + z + m)
  ```

- Let us compute $\E(Y_{1, G_0})$ (so that $a = 1$ and $a'=0$).
- First, fit a regression model for the outcome, and compute
  $\color{ForestGreen}{\E(Y \mid A=a, Z=z, M, W)}$ for all values of $z$
  ```{r}
  lm_y <- lm(y ~ m + a + z + w)
  pred_a1z0 <- predict(lm_y, newdata = data.frame(m = m, a = 1, z = 0, w = w))
  pred_a1z1 <- predict(lm_y, newdata = data.frame(m = m, a = 1, z = 1, w = w))
  ```

- Now we fit a model for $Z \mid A$ (in this simulation $Z$ does not depend on
  $W$) and get the conditional probability that $Z=1$, fixing $A=1$
  ```{r}
  prob_z <- lm(z ~ a)
  pred_z <- predict(prob_z, newdata = data.frame(a = 1))
  ```

- Now we compute the following pseudo-outcome:
  $\color{Goldenrod}{\sum_z}\color{ForestGreen}{\E(Y \mid A=a, Z=z, M, W)}
  \color{Goldenrod}{\P(Z=z \mid A=a, W)}$
  ```{r}
  pseudo_out <- pred_a1z0 * (1 - pred_z) + pred_a1z1 * pred_z
  ```

- Now we regress this pseudo-outcome on $A,W$, and compute the predictions
  setting $A=0$, that is, \[\color{Purple}{\E\left\{\color{Goldenrod}{\sum_z}
  \color{ForestGreen}{\E(Y \mid A=a, Z=z, M, W)}
  \color{Goldenrod}{\P(Z=z
\mid A=a, W)}\mid A=a', W\right\}}\]
  ```{r}
  fit_pseudo <- lm(pseudo_out ~ a + w)
  pred_pseudo <- predict(fit_pseudo, data.frame(a = 0, w = w))
  ```

- And finally, just average those predictions!
  ```{r}
  ## Mean(Y(1, G(0)))
  mean(pred_pseudo)
  ```

- This was for $(a,a')=(1,0)$. We can do the same with $(a,a')=(1,1)$ and
  $(a,a')=(0,0)$ to obtain the effect decomposition

  \begin{equation*}
  \E[Y_{1,G_1} - Y_{0,G_0}] = \underbrace{\E[Y_{\color{red}{1},
  \color{blue}{G_1}} -
  Y_{\color{red}{1},
  \color{blue}{G_0}}]}_{\text{interventional indirect effect}} +
  \underbrace{\E[Y_{\color{blue}{1},\color{red}{G_0}} -
  Y_{\color{blue}{0},
  \color{red}{G_0}}]}_{\text{interventional direct effect}}
  \end{equation*}

### Is this the estimand I want?

- It makes sense to intervene on $A$ but not directly on $M$.
- The goal is to understand a descriptive type of mediation.
- We are okay with the assumptions!

### But, there is an important limitation of interventional effects

@miles2022causal recently uncovered an important limitation of these effects,
which can be described as follows. The sharp mediational null hypothesis can be
defined as

$$
  H_0: Y_{a, M_{a'}} = Y_{a, M_{a^\star}} \text{ for all } a, a', a^\star \ .
$$

The problem is that interventional effects are not guaranteed to be null when
this sharp mediational null hypothesis is true.

This could present a problem in practice if, say, one subgroup of the
population has a relationship between $A$ and $M$ but not between $M$ and $Y$,
while another, distinct subgroup has a relationship between $M$ and $Y$ but not
between $A$ and $M$. In such a scenario, the interventional indirect effect
would be nonzero, even though there is not a single person in the population
whose effect of $A$ on $Y$ is mediated by $M$.

More details can be found in the original paper.
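
Before moving on, the numerical walkthrough above can be assembled into the
full effect decomposition. Below is a minimal sketch that repeats the same
sequential-regression steps for each $(a, a')$ pair; the helper name
`interv_mean()` and the seed are ours, and the simple linear models are
correctly specified only because we simulated the data ourselves.

```r
set.seed(27182)
n <- 1e5
w <- rnorm(n)
a <- rbinom(n, 1, 0.5)
z <- rbinom(n, 1, 0.5 + 0.2 * a)
m <- rnorm(n, w + a - z)
y <- rnorm(n, w + a + z + m)

# E(Y_{a, G_{a'}}) via the sequential-regression steps shown above
interv_mean <- function(a_val, a_prime) {
  lm_y <- lm(y ~ m + a + z + w)
  pred_z0 <- predict(lm_y, newdata = data.frame(m = m, a = a_val, z = 0, w = w))
  pred_z1 <- predict(lm_y, newdata = data.frame(m = m, a = a_val, z = 1, w = w))
  # P(Z = 1 | A = a); in this simulation Z does not depend on W
  pred_z <- predict(lm(z ~ a), newdata = data.frame(a = a_val))
  pseudo_out <- pred_z0 * (1 - pred_z) + pred_z1 * pred_z
  fit_pseudo <- lm(pseudo_out ~ a + w)
  mean(predict(fit_pseudo, newdata = data.frame(a = a_prime, w = w)))
}

## interventional indirect effect: E(Y_{1, G_1}) - E(Y_{1, G_0})
iie <- interv_mean(1, 1) - interv_mean(1, 0)
## interventional direct effect: E(Y_{1, G_0}) - E(Y_{0, G_0})
ide <- interv_mean(1, 0) - interv_mean(0, 0)
c(indirect = iie, direct = ide, total = iie + ide)
```

In this data-generating process one can check analytically that
$\E(Y_{a, G_{a'}}) = 1.2a + 0.8a'$, so the indirect and direct effects should
come out close to 0.8 and 1.2, and their sum recovers the total effect
$\E[Y_{1,G_1} - Y_{0,G_0}] = 2$.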
+ +## Estimand Summary + +![](../images/table1.png) diff --git a/sessions/mcma_estim_natural_interv.qmd b/sessions/mcma_estim_natural_interv.qmd new file mode 100644 index 0000000..4c99d69 --- /dev/null +++ b/sessions/mcma_estim_natural_interv.qmd @@ -0,0 +1,636 @@ +# Construction of estimators: Example with the NDE + +## Recap of definition and identification of the natural direct effect + +Recall: + +```{tikz} +#| echo: false +#| fig-cap: Directed acyclic graph under *no intermediate confounders* of the mediator-outcome relation affected by treatment +\dimendef\prevdepth=0 +\pgfdeclarelayer{background} +\pgfsetlayers{background,main} +\usetikzlibrary{arrows,positioning} +\tikzset{ +>=stealth', +punkt/.style={ +rectangle, +rounded corners, +draw=black, very thick, +text width=6.5em, +minimum height=2em, +text centered}, +pil/.style={ +->, +thick, +shorten <=2pt, +shorten >=2pt,} +} +\newcommand{\Vertex}[2] +{\node[minimum width=0.6cm,inner sep=0.05cm] (#2) at (#1) {$#2$}; +} +\newcommand{\VertexR}[2] +{\node[rectangle, draw, minimum width=0.6cm,inner sep=0.05cm] (#2) at (#1) {$#2$}; +} +\newcommand{\ArrowR}[3] +{ \begin{pgfonlayer}{background} +\draw[->,#3] (#1) to[bend right=30] (#2); +\end{pgfonlayer} +} +\newcommand{\ArrowL}[3] +{ \begin{pgfonlayer}{background} +\draw[->,#3] (#1) to[bend left=45] (#2); +\end{pgfonlayer} +} +\newcommand{\EdgeL}[3] +{ \begin{pgfonlayer}{background} +\draw[dashed,#3] (#1) to[bend right=-45] (#2); +\end{pgfonlayer} +} +\newcommand{\Arrow}[3] +{ \begin{pgfonlayer}{background} +\draw[->,#3] (#1) -- +(#2); +\end{pgfonlayer} +} +\begin{tikzpicture} + \Vertex{-4, 0}{W} + \Vertex{0, 0}{M} + \Vertex{-2, 0}{A} + \Vertex{2, 0}{Y} + \Arrow{W}{A}{black} + \Arrow{A}{M}{black} + \Arrow{M}{Y}{black} + \ArrowL{W}{Y}{black} + \ArrowL{A}{Y}{black} + \ArrowL{W}{M}{black} +\end{tikzpicture} +``` + +- Assuming a binary $A$, we define the natural direct effect as: + $\text{NDE} = \E(Y_{1,M_{0}} - Y_{0,M_{0}})$. 
- and the natural indirect effect as:
  $\text{NIE} = \E(Y_{1,M_{1}} - Y_{1,M_{0}})$.
- The observed data are $O = (W, A, M, Y)$.

This setting is represented by the above DAG and by the following structural
causal model (SCM):
\begin{align*}
  W & = f_W(U_W) \\
  A & = f_A(W, U_A) \\
  M & = f_M(W, A, U_M) \\
  Y & = f_Y(W, A, M, U_Y),
\end{align*}
where $(U_W, U_A, U_M, U_Y)$ are exogenous random errors.

Recall that we need to assume the following to identify the above causal
effects from our observed data:

- $A \indep Y_{a,m} \mid W$
- $M \indep Y_{a,m} \mid W, A$
- $A \indep M_a \mid W$
- $M_0 \indep Y_{1,m} \mid W$
- and positivity assumptions

Then, the NDE is identified as
  $$
  \psi(\P) = \E[\E\{\E(Y \mid A=1, M, W) - \E(Y \mid A=0, M, W) \mid A=0,W\}]
  $$

## From causal to statistical quantities

- We have arrived at identification formulas that express the causal quantities
  we care about in terms of observable quantities
- That is, these formulas express what would have happened in hypothetical
  worlds in terms of quantities observable in this world.
- This required **causal assumptions**
  - Many of these assumptions are empirically unverifiable
  - We saw an example where we could relax the cross-world assumption, at the
    cost of changing the parameter interpretation (when we introduced randomized
    interventional direct and indirect effects).
  - We also include an extra section at the end about **stochastic** randomized
    interventional direct and indirect effects, which allow us to relax the
    positivity assumption, also at the cost of changing the parameter
    interpretation.
- We are now ready to tackle the estimation problem, i.e., how do we best learn
  the value of quantities that are observable?
- The resulting estimation problem can be tackled using **statistical
  assumptions** of various degrees of strength
  - Most of these assumptions are verifiable (e.g., a linear model)
  - Thus, most are unnecessary (except for convenience)
  - We have worked hard to try to satisfy the required causal assumptions
  - This is not the time to introduce unnecessary statistical assumptions
  - The estimation approach we will use minimizes reliance on these statistical
    assumptions.

## Computing identification formulas if you know the true distribution

- The mediation parameters that we consider can be seen as functions of the
  joint probability distribution of the observed data $O=(W,A,M,Y)$
- For example, under the identifiability assumptions, the natural direct effect
  is equal to
  $$
  \psi(\P) = \E[\color{Goldenrod}{\E\{\color{ForestGreen}
  {\E(Y \mid A=1, M, W) - \E(Y \mid A=0, M, W)} \mid A=0, W \}}]
  $$
- The notation $\psi(\P)$ means that the parameter is a function of $\P$ -- in
  other words, that it is a function of this joint probability distribution
- This means that we can compute it for any distribution $\P$
- For example, if we know the true $\P(W,A,M,Y)$, we can compute the true value
  of the parameter by:
  - Computing the conditional expectation $\E(Y\mid A=1,M=m,W=w)$ for all values
    $(m,w)$
  - Computing the conditional expectation $\E(Y\mid A=0,M=m,W=w)$ for all values
    $(m,w)$
  - Computing the probability $\P(M=m\mid A=0,W=w)$ for all values $(m,w)$
  - Computing
    \begin{align*}
    \color{Goldenrod}{\E\{}&\color{ForestGreen}{\E(Y \mid A=1, M, W) -
    \E(Y \mid A=0, M, W)}\color{Goldenrod}{\mid A=0,W\}} =\\
    &\color{Goldenrod}{\sum_m\color{ForestGreen}{\{\E(Y \mid A=1, m, w) -
    \E(Y \mid A=0, m, w)\}} \P(M=m\mid A=0, W=w)}
    \end{align*}
  - Computing the probability $\P(W=w)$ for all values $w$
  - Computing the mean over all values $w$

## Plug-in (G-computation) estimator

The above is how you would compute the *true value*
**if you know** the true
distribution $\P$

- This is exactly what we did in our `R` examples before
- But we can use the same logic for estimation:
  - Fit a regression to estimate, say, $\hat\E(Y\mid A=1,M=m,W=w)$
  - Fit a regression to estimate $\hat\E(Y\mid A=0,M=m,W=w)$
  - Fit a regression to estimate $\hat\P(M=m\mid A=0,W=w)$
  - Estimate $\P(W=w)$ with the empirical distribution
  - Evaluate
    $$
    \psi(\hat\P) = \hat{\E}[\color{RoyalBlue}{
    \hat{\E}\{\color{Goldenrod}{\hat\E(Y \mid A=1, M, W) -
    \hat{\E}(Y \mid A=0, M, W)}\mid A=0,W\}}]
    $$
- This is known as the G-computation estimator.

## First weighted estimator (akin to IPW)

- An alternative expression of the parameter functional (for the NDE) is given
  by
  $$
  \E \bigg[\color{RoyalBlue}{\bigg\{ \frac{\I(A=1)}{\P(A=1\mid W)}
  \frac{\P(M\mid A=0,W)}{\P(M\mid A=1,W)} -
  \frac{\I(A=0)}{\P(A=0\mid W)}\bigg\}} \times \color{Goldenrod}{Y}\bigg]
  $$
- Thus, you can also construct a weighted estimator as
  $$
  \frac{1}{n} \sum_{i=1}^n \bigg[\color{RoyalBlue}{\bigg\{
  \frac{\I(A_i=1)}{\hat{\P}(A_i=1\mid W_i)}
  \frac{\hat{\P}(M_i\mid A_i=0,W_i)}{\hat{\P}(M_i\mid A_i=1, W_i)} -
  \frac{\I(A_i=0)}{\hat{\P}(A_i=0\mid W_i)}\bigg\}} \times
  \color{Goldenrod}{Y_i} \bigg]
  $$

## Second weighted estimator

- The parameter functional for the NDE can also be expressed as a combination of
  regression and weighting:
  $$
  \E\bigg[\color{RoyalBlue}{\frac{\I(A=0)}{\P(A=0\mid W)}}
  \times \color{Goldenrod}{\big\{\E(Y \mid A=1, M, W) -
  \E(Y \mid A=0, M, W)\big\}}\bigg]
  $$

- Thus, you can also construct a weighted estimator as
  $$
  \frac{1}{n} \sum_{i=1}^n \bigg[\color{RoyalBlue}{
  \frac{\I(A_i=0)}{\hat{\P}(A_i=0\mid W_i)}} \times
  \color{Goldenrod}{\big\{\hat{\E}(Y \mid A=1, M_i, W_i) -
  \hat{\E}(Y \mid A=0, M_i, W_i)\big\}}\bigg]
  $$

## How can G-computation and weighted estimation be implemented in practice?

- There are two possible ways to do G-computation or weighted estimation:
  - Using parametric models for the above regressions
  - Using flexible data-adaptive regression (a.k.a. machine learning)

## Pros and cons of G-computation and weighting with parametric models

- Pros:
  - Easy to understand
  - Ease of implementation (standard regression software)
  - Can use the Delta method or the bootstrap for computation of standard errors

- Cons:
  - Unless $W$ and $M$ contain very few categorical variables, it is very easy
    to misspecify the models
  - This can introduce sizable bias in the estimators
  - These modelling assumptions have become less necessary with the advent of
    data-adaptive regression tools (a.k.a. machine learning)

## An example of the bias of a G-computation estimator of the natural direct effect

- The following `R` chunk provides simulation code to exemplify the bias of a
  G-computation parametric estimator in a simple situation

  ```{r}
  mean_y <- function(m, a, w) abs(w) + a * m
  mean_m <- function(a, w) plogis(w^2 - a)
  pscore <- function(w) plogis(1 - abs(w))
  ```

  ```{r}
  #| echo: false
  #| results: hide
  w_big <- runif(1e6, -1, 1)
  trueval <- mean((mean_y(1, 1, w_big) - mean_y(1, 0, w_big)) *
    mean_m(0, w_big) + (mean_y(0, 1, w_big) - mean_y(0, 0, w_big)) *
    (1 - mean_m(0, w_big)))
  print(trueval)
  ```

- This yields a true NDE value of `r round(trueval, 4)`

- Let's perform a simulation where we draw 1000 datasets from the above
  distribution, and compute a G-computation estimator based on the following
  (misspecified) linear working models:

  ```{r}
  gcomp <- function(y, m, a, w) {
    lm_y <- lm(y ~ m + a + w)
    pred_y1 <- predict(lm_y, newdata = data.frame(a = 1, m = m, w = w))
    pred_y0 <- predict(lm_y, newdata = data.frame(a = 0, m = m, w = w))
    pseudo <- pred_y1 - pred_y0
    lm_pseudo <- lm(pseudo ~ a + w)
    pred_pseudo <- predict(lm_pseudo, newdata = data.frame(a = 0, w = w))
    estimate <- mean(pred_pseudo)
    return(estimate)
  }
  ```

  ```{r}
  estimate <- lapply(seq_len(1000), function(iter) {
    n <- 1000
    w <- runif(n, -1, 1)
    a <- rbinom(n, 1, pscore(w))
    m <- rbinom(n, 1, mean_m(a, w))
    y <- rnorm(n, mean_y(m, a, w))
    est <- gcomp(y, m, a, w)
    return(est)
  })
  estimate <- do.call(c, estimate)

  hist(estimate)
  abline(v = trueval, col = "red", lwd = 4)
  ```

- The bias also affects the confidence intervals (constructed here using the
  standard deviation across simulations as an oracle standard error):

  ```{r}
  #| fig-width: 8
  cis <- cbind(
    estimate - qnorm(0.975) * sd(estimate),
    estimate + qnorm(0.975) * sd(estimate)
  )

  ord <- order(rowSums(cis))
  lower <- cis[ord, 1]
  upper <- cis[ord, 2]
  curve(trueval + 0 * x,
    ylim = c(0, 1), xlim = c(0, 1001), lwd = 2, lty = 3, xaxt = "n",
    xlab = "", ylab = "Confidence interval", cex.axis = 1.2, cex.lab = 1.2
  )
  for (i in 1:1000) {
    clr <- rgb(0.5, 0, 0.75, 0.5)
    if (upper[i] < trueval || lower[i] > trueval) clr <- rgb(1, 0, 0, 1)
    points(rep(i, 2), c(lower[i], upper[i]), type = "l", lty = 1, col = clr)
  }
  text(450, 0.10, "n = 1000, repetitions = 1000", cex = 1.2)
  text(450, 0.01, paste0(
    "Coverage probability = ",
    100 * mean(lower < trueval & trueval < upper), "%"
  ), cex = 1.2)
  ```

## Pros and cons of G-computation or weighting with data-adaptive regression

- Pros:
  - Easy to understand.
  - Alleviates model-misspecification bias.

- Cons:
  - Might be harder to implement depending on the regression procedures used.
  - No general approaches for computation of standard errors and confidence
    intervals.
  - For example, the bootstrap is not guaranteed to work, and it is known to
    fail in some cases.
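
As a concrete complement, here is a sketch of the second weighted estimator
from earlier in this chapter, applied to a single data set drawn from the same
simulation. To isolate the weighting logic we use correctly specified nuisance
models -- we know the true functional forms only because we wrote the
simulation ourselves, which is exactly what cannot be guaranteed in practice:

```r
set.seed(5150)
# same data-generating process as in the simulation above
mean_y <- function(m, a, w) abs(w) + a * m
mean_m <- function(a, w) plogis(w^2 - a)
pscore <- function(w) plogis(1 - abs(w))

n <- 1e5
w <- runif(n, -1, 1)
a <- rbinom(n, 1, pscore(w))
m <- rbinom(n, 1, mean_m(a, w))
y <- rnorm(n, mean_y(m, a, w))

# correctly specified models for E(Y | A, M, W) and P(A = 1 | W)
fit_y <- lm(y ~ a:m + abs(w))
fit_a <- glm(a ~ abs(w), family = binomial)

g0 <- 1 - predict(fit_a, type = "response") # P(A = 0 | W)
b1 <- predict(fit_y, newdata = data.frame(a = 1, m = m, w = w))
b0 <- predict(fit_y, newdata = data.frame(a = 0, m = m, w = w))

# second weighted estimator of the NDE
nde_wt <- mean((1 - a) / g0 * (b1 - b0))
nde_wt
```

With correct models this recovers the true NDE (about 0.58 in this simulation);
swapping in the misspecified linear models used above would reproduce the kind
of bias seen in the histogram.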

## Solution to these problems: robust semiparametric efficient estimation

- Intuitively, it combines the three estimators above to obtain an
  estimator with improved robustness properties
- It offers a way to use data-adaptive regression to
  - avoid model misspecification bias,
  - endow the estimators with additional robustness (e.g., multiple robustness),
    while
  - allowing the computation of correct standard errors and confidence
    intervals using Gaussian approximations

# Construction of a semiparametric efficient estimator for the NDE (a.k.a. the one-step estimator)

- Here we show in detail how to construct an estimator of the NDE for
  illustration; the construction is somewhat involved and may be too complex
  for daily research practice
- For practical use, we will teach you how to use our packages _medoutcon_ (and
  _medshift_, as detailed in the extra material) for automatic implementation of
  these estimators of the NDE and other parameters

First, we need to introduce some notation to describe the efficient influence
function (EIF) for the NDE:

- Let $Q(M, W)$ denote $\E(Y\mid A=1, M, W) - \E(Y\mid A=0, M, W)$
- We can now introduce the semiparametric efficient estimator:

\begin{align*}
  \hat{\psi} &= \frac{1}{n} \sum_{i=1}^n \color{RoyalBlue}{\bigg\{
  \frac{\I(A_i=1)}{\hat{\P}(A_i=1 \mid W_i)}
  \frac{\hat{\P}(M_i \mid A_i=0, W_i)}{\hat{\P}(M_i \mid A_i=1, W_i)} -
  \frac{\I(A_i=0)}{\hat{\P}(A_i=0 \mid W_i)}\bigg\}}
  \color{Goldenrod}{[Y_i - \hat{\E}(Y\mid A_i,M_i,W_i)]} \\
  &+ \frac{1}{n} \sum_{i=1}^n \color{RoyalBlue}{\frac{\I(A_i=0)}{\hat{\P}(A_i=0
  \mid W_i)}} \color{Goldenrod}{\big\{ \hat{Q}(M_i,W_i) -
  \hat{\E}[\hat{Q}(M_i,W_i) \mid W_i, A_i = 0] \big\}} \\
  &+ \frac{1}{n} \sum_{i=1}^n \color{Goldenrod}{
  \hat{\E}[\hat{Q}(M_i,W_i) \mid W_i,A_i=0]}
\end{align*}

- In this estimator, you can recognize elements from the G-computation estimator
  and the weighted estimators:
  - The third line is the G-computation estimator
  - The
second line is a centered version of the second weighted estimator
  - The first line is a centered version of the first weighted estimator

- Estimating $\P(M\mid A, W)$ is a very challenging problem when $M$ is
  high-dimensional. But, since only the ratio of these conditional densities
  appears, we can re-parameterize using Bayes' rule to get something that is
  easier to compute:
  \begin{equation*}
    \frac{\P(M \mid A=0, W)}{\P(M \mid A=1,W)} = \frac{\P(A = 0 \mid M, W)
    \P(A=1 \mid W)}{\P(A = 1 \mid M, W) \P(A=0 \mid W)} \ .
  \end{equation*}

Thus we can change the expression of the estimator a bit as follows. First, some
more notation that will be useful later:

- Let $g(a\mid w)$ denote $\P(A=a\mid W=w)$
- Let $e(a\mid m, w)$ denote $\P(A=a\mid M=m, W=w)$
- Let $b(a, m, w)$ denote $\E(Y\mid A=a, M=m, W=w)$
- The quantity being averaged can be re-expressed as follows

\begin{align*}
  & \color{RoyalBlue}{\bigg\{ \frac{\I(A=1)}{g(0\mid W)}
  \frac{e(0\mid M,W)}{e(1\mid M,W)} - \frac{\I(A=0)}{g(0\mid W)}\bigg\}}
  \times \color{Goldenrod}{[Y - b(A,M,W)]} \\
  &+ \color{RoyalBlue}{\frac{\I(A=0)}{g(0\mid W)}}
  \color{Goldenrod}{\big\{Q(M,W) - \E[Q(M,W) \mid W, A=0] \big\}} \\
  &+ \color{Goldenrod}{\E[Q(M,W) \mid W, A=0]}
\end{align*}

## How to compute the one-step estimator (akin to Augmented IPW)

First, we will generate some data:

```{r}
mean_y <- function(m, a, w) abs(w) + a * m
mean_m <- function(a, w) plogis(w^2 - a)
pscore <- function(w) plogis(1 - abs(w))

w_big <- runif(1e6, -1, 1)
trueval <- mean((mean_y(1, 1, w_big) - mean_y(1, 0, w_big)) * mean_m(0, w_big) +
  (mean_y(0, 1, w_big) - mean_y(0, 0, w_big)) *
    (1 - mean_m(0, w_big)))

n <- 1000
w <- runif(n, -1, 1)
a <- rbinom(n, 1, pscore(w))
m <- rbinom(n, 1, mean_m(a, w))
y <- rnorm(n, mean_y(m, a, w))
```

Recall that the semiparametric efficient estimator can be computed in the
following steps:

1.
Fit models for $g(a\mid w)$, $e(a\mid m, w)$, and $b(a, m, w)$
   - In this example we will use generalized additive models (GAMs) for
     tractability
   - In applied settings we recommend using an ensemble of data-adaptive
     regression algorithms, such as the Super Learner [@vdl2007super]

   ```{r}
   library(mgcv)
   ## fit model for E(Y | A, M, W)
   b_fit <- gam(y ~ m:a + s(w, by = a))
   ## fit model for P(A = 1 | M, W)
   e_fit <- gam(a ~ m + w + s(w, by = m), family = binomial)
   ## fit model for P(A = 1 | W)
   g_fit <- gam(a ~ w, family = binomial)
   ```

2. Compute the predictions $g(1\mid w)$, $g(0\mid w)$, $e(1\mid m, w)$,
   $e(0\mid m, w)$, $b(1, m, w)$, $b(0, m, w)$, and $b(a, m, w)$

   ```{r}
   ## Compute P(A = 1 | W)
   g1_pred <- predict(g_fit, type = "response")
   ## Compute P(A = 0 | W)
   g0_pred <- 1 - g1_pred
   ## Compute P(A = 1 | M, W)
   e1_pred <- predict(e_fit, type = "response")
   ## Compute P(A = 0 | M, W)
   e0_pred <- 1 - e1_pred
   ## Compute E(Y | A = 1, M, W)
   b1_pred <- predict(b_fit, newdata = data.frame(a = 1, m, w))
   ## Compute E(Y | A = 0, M, W)
   b0_pred <- predict(b_fit, newdata = data.frame(a = 0, m, w))
   ## Compute E(Y | A, M, W)
   b_pred <- predict(b_fit)
   ```

3. Compute $Q(M, W)$, fit a model for $\E[Q(M,W) \mid W,A]$, and predict at
   $A=0$

   ```{r}
   ## Compute Q(M, W)
   pseudo <- b1_pred - b0_pred
   ## Fit model for E[Q(M, W) | A, W]
   q_fit <- gam(pseudo ~ a + w + s(w, by = a))
   ## Compute E[Q(M, W) | A = 0, W]
   q_pred <- predict(q_fit, newdata = data.frame(a = 0, w = w))
   ```

4. Estimate the weights

   \begin{equation*}
   \color{RoyalBlue}{\bigg\{
   \frac{\I(A=1)}{g(0\mid W)} \frac{e(0 \mid M,W)}{e(1 \mid M,W)} -
   \frac{\I(A=0)}{g(0\mid W)} \bigg\}}
   \end{equation*}

   using the above predictions:

   ```{r}
   ip_weights <- a / g0_pred * e0_pred / e1_pred - (1 - a) / g0_pred
   ```

5. Compute the uncentered EIF:

   ```{r}
   eif <- ip_weights * (y - b_pred) + (1 - a) / g0_pred * (pseudo - q_pred) +
     q_pred
   ```

6.
The one-step estimator is the mean of the uncentered EIF:

   ```{r}
   ## One-step estimator
   mean(eif)
   ```

## Performance of the one-step estimator in a small simulation study

First, we create a wrapper around the estimator:

```{r}
one_step <- function(y, m, a, w) {
  b_fit <- gam(y ~ m:a + s(w, by = a))
  e_fit <- gam(a ~ m + w + s(w, by = m), family = binomial)
  g_fit <- gam(a ~ w, family = binomial)
  g1_pred <- predict(g_fit, type = "response")
  g0_pred <- 1 - g1_pred
  e1_pred <- predict(e_fit, type = "response")
  e0_pred <- 1 - e1_pred
  b1_pred <- predict(
    b_fit, newdata = data.frame(a = 1, m, w), type = "response"
  )
  b0_pred <- predict(
    b_fit, newdata = data.frame(a = 0, m, w), type = "response"
  )
  b_pred <- predict(b_fit, type = "response")
  pseudo <- b1_pred - b0_pred
  q_fit <- gam(pseudo ~ a + w + s(w, by = a))
  q_pred <- predict(q_fit, newdata = data.frame(a = 0, w = w))
  ip_weights <- a / g0_pred * e0_pred / e1_pred - (1 - a) / g0_pred
  eif <- ip_weights * (y - b_pred) + (1 - a) / g0_pred *
    (pseudo - q_pred) + q_pred
  return(mean(eif))
}
```

Let us first examine the bias.

- The true value is:

```{r}
w_big <- runif(1e6, -1, 1)
trueval <- mean((mean_y(1, 1, w_big) - mean_y(1, 0, w_big)) * mean_m(0, w_big) +
  (mean_y(0, 1, w_big) - mean_y(0, 0, w_big)) * (1 - mean_m(0, w_big)))
print(trueval)
```

- Bias simulation:

```{r}
estimate <- lapply(seq_len(1000), function(iter) {
  n <- 1000
  w <- runif(n, -1, 1)
  a <- rbinom(n, 1, pscore(w))
  m <- rbinom(n, 1, mean_m(a, w))
  y <- rnorm(n, mean_y(m, a, w))
  estimate <- one_step(y, m, a, w)
  return(estimate)
})
estimate <- do.call(c, estimate)

hist(estimate)
abline(v = trueval, col = "red", lwd = 4)
```

- And now the confidence intervals:

```{r}
#| fig-width: 8
cis <- cbind(
  estimate - qnorm(0.975) * sd(estimate),
  estimate + qnorm(0.975) * sd(estimate)
)

ord <- order(rowSums(cis))
lower <- cis[ord, 1]
upper <- cis[ord, 2]
curve(trueval + 0 * x,
  ylim = c(0, 1), xlim = c(0, 1001), lwd = 2, lty = 3, xaxt = "n",
  xlab = "", ylab = "Confidence interval", cex.axis = 1.2, cex.lab = 1.2
)
for (i in 1:1000) {
  clr <- rgb(0.5, 0, 0.75, 0.5)
  if (upper[i] < trueval || lower[i] > trueval) clr <- rgb(1, 0, 0, 1)
  points(rep(i, 2), c(lower[i], upper[i]), type = "l", lty = 1, col = clr)
}
text(450, 0.10, "n = 1000, repetitions = 1000", cex = 1.2)
text(450, 0.01, paste0(
  "Coverage probability = ",
  100 * mean(lower < trueval & trueval < upper), "%"
), cex = 1.2)
```

## A note about targeted minimum loss-based estimation (TMLE)

- The above estimator is great because it allows us to use data-adaptive
  regression to avoid bias, while allowing the computation of correct standard
  errors
- This estimator has a problem, though:
  - It can yield answers outside of the bounds of the parameter space
  - E.g., if $Y$ is binary, it could yield direct and indirect effects outside
    of $[-1,1]$
  - To solve this, you can compute a TMLE instead (implemented in the `R`
    packages, coming up)

## A note about cross-fitting

- When using data-adaptive regression estimators, it is recommended to use
  cross-fitted estimators
- Cross-fitting is similar to cross-validation:
  - Randomly split the sample into $K$ (e.g., $K=10$) subsets of equal size
  - For each subset, fit the regression models on the remaining $K-1$ subsets
    (e.g., on 9/10ths of the sample)
  - Use this out-of-sample fit to predict in the held-out subset (the remaining
    1/10th of the sample)
- Cross-fitting further reduces the bias of the estimators
- Cross-fitting aids in guaranteeing the correctness of the standard errors and
  confidence intervals
- Cross-fitting is implemented by default in the `R` packages that you will see
  next
diff --git a/sessions/mcma_estim_prelims.qmd b/sessions/mcma_estim_prelims.qmd
new file mode 100644
index 0000000..953f2bf
--- /dev/null
+++ b/sessions/mcma_estim_prelims.qmd
@@ -0,0 +1,115 @@
# Overview of doubly robust estimators: Example with the ATE

Recall
our motivation for doing mediation analysis -- that is, we would like
to decompose the total effect of a treatment $A$ on an outcome $Y$ into two
distinct components: an indirect effect operating through the mediator(s) $M$
and a direct effect operating independently of $M$.

Recall that we define the *average treatment effect* (ATE) as $\E(Y_1-Y_0)$,
and that it decomposes as follows:
$$
\E[Y_{1,M_1} - Y_{0,M_0}] = \underbrace{\E[Y_{\color{red}{1},\color{blue}{M_1}}
  - Y_{\color{red}{1},\color{blue}{M_0}}]}_{\text{natural indirect effect}} +
  \underbrace{\E[Y_{\color{blue}{1},\color{red}{M_0}} -
  Y_{\color{blue}{0},\color{red}{M_0}}]}_{\text{natural direct effect}}
$$

To introduce some of the ideas that we will use for estimation of the NDE, let
us first briefly discuss estimation of $\E(Y_1)$ (estimation of $\E(Y_0)$ can be
performed analogously).

First, notice that under the assumption of no unmeasured confounders
($Y_1\indep A\mid W$), we have

\begin{align*}
  \E(Y_1) &= \E[ \E(Y_1 \mid W) ] \\
  &= \E[ \E(Y_1 \mid A=1, W) ] \\
  &= \E[ \E(Y \mid A=1, W) ] \ \ ,
\end{align*}
where the first step uses the law of iterated expectation (marginalizing over
$W$), the second step uses no unmeasured confounding, also known as
exchangeability ($A \indep Y_a \mid W$), and the last step applies consistency
($Y_a = Y$ when $A=a$).

## Plug-in (G-computation) estimator

A first estimator of $\E[ \E(Y \mid A=1, W) ]$ can be obtained by a
three-step procedure:

1. Fit a regression for $Y$ on $A$ and $W$, then
2. use the above regression to predict the outcome mean if everyone's
   $A$ is set to $A=1$, and then
3. average these predictions.

The resulting estimator can be expressed as
$$
  \frac{1}{n} \sum_{i=1}^n \hat{\E}(Y \mid A_i=1, W_i) \ .
$$

- Note that this plug-in estimator directly uses the above identification
  formula (called a g-formula, arrived at via g-computation):
  $\E[\E(Y \mid A=1, W)]$
- This estimator requires that the (outcome) regression model for
  $\hat{\E}(Y \mid A_i=1, W_i)$ is correctly specified.
- Downside: If we use arbitrary machine learning for this model, general theory
  for computing standard errors and confidence intervals (i.e., statistical
  inference) is not available.

## Inverse probability weighted (IPW) estimator

An alternative method of estimation can be constructed after noticing the
following equivalence:
$$
  \E[ \E(Y \mid A=1, W) ] = \E\left[ \frac{A}{\P(A=1\mid W)} Y \right] \ ,
$$
which may be carried out by way of the following procedure:

1. Fit a regression for $A$ on $W$, then
2. use the above regression to predict the probability of treatment $A=1$,
   then
3. compute the inverse probability weights $A_i / \hat{\P}(A_i =1 \mid W_i)$.
   This weight will be zero for untreated units, and the inverse of the
   probability of treatment for treated units.
4. Finally, compute the weighted average of the outcome:
   $$
   \frac{1}{n} \sum_{i=1}^n \frac{A_i}{\hat{\P}(A_i=1 \mid W_i)} Y_i \ .
   $$

- This estimator requires that the regression model for $\hat{\P}(A=1 \mid W_i)$
  is correctly specified.
- Downside: If we use arbitrary machine learning for this model, general theory
  for computing standard errors and confidence intervals (i.e., statistical
  inference) is not available.

## Augmented inverse probability weighted (AIPW) estimator

Fortunately, we can combine these two estimators to get an estimator with
enhanced properties.

The improved estimator can be seen as a *corrected* (or augmented) IPW
estimator:
$$
  \underbrace{\frac{1}{n} \sum_{i=1}^n \frac{A_i}{\hat{\P}(A_i=1 \mid W_i)}
  Y_i}_{\text{IPW estimator}} -
  \underbrace{\frac{1}{n} \sum_{i=1}^n \frac{\hat{\E}(Y \mid A_i=1, W_i)}
  {\hat{\P}(A_i=1 \mid W_i)}[A_i - \hat{\P}(A_i=1 \mid
  W_i)]}_{\text{Correction term}} \ ,
$$
or, equivalently, as a *corrected* G-computation estimator:
$$
  \underbrace{\frac{1}{n} \sum_{i=1}^n \hat{\E}(Y \mid A_i=1,
  W_i)}_{\text{G-comp estimator}} +
  \underbrace{\frac{1}{n} \sum_{i=1}^n \frac{A_i}{\hat{\P}(A_i=1\mid W_i)}
  [Y_i - \hat{\E}(Y \mid A_i=1, W_i)]}_{\text{Correction term}} \ .
$$

This estimator has some desirable properties:

- It is robust to misspecification of one of the two models (outcome or
  treatment), as long as the other is correctly specified. (Can you see why?)
- It is approximately normally distributed as the sample size grows, which
  allows us to easily compute confidence intervals and perform hypothesis
  tests.
- It allows us to use machine learning to estimate the treatment and outcome
  regressions to alleviate model misspecification bias.

Next, we will work towards constructing estimators with these same properties
for the mediation parameters that we have introduced.
diff --git a/sessions/mcma_estim_walkthru.qmd b/sessions/mcma_estim_walkthru.qmd
new file mode 100644
index 0000000..29cb460
--- /dev/null
+++ b/sessions/mcma_estim_walkthru.qmd
@@ -0,0 +1,521 @@
# `R` packages for estimation of the causal (in)direct effects

We'll now turn to working through a few examples of estimating the natural and
interventional direct and indirect effects. Note that we will be using the
[`medoutcon` `R` package](https://github.com/nhejazi/medoutcon), which supports
multiple mediators but only a single binary intermediate confounder; if your
data scenario includes multiple mediators *and* multiple intermediate
confounders, you should consider using the [`HDmediation` `R`
package](https://github.com/nt-williams/HDmediation) instead.
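
Note that several of these packages are not distributed on CRAN and must be
installed from GitHub. A minimal installation sketch (assuming the `remotes`
package is available) might look like:

```r
# install development versions from GitHub (not on CRAN)
# install.packages("remotes")  # if remotes is not yet installed
remotes::install_github("tlverse/sl3")
remotes::install_github("nhejazi/medoutcon")
remotes::install_github("nhejazi/medshift")
```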
+
+As our running example, we'll use a simple data set from an observational study
+of the relationship between BMI and kids' behavior, freely distributed with the
+[`mma` `R` package on CRAN](https://CRAN.R-project.org/package=mma). First,
+let's load the packages we'll be using and set a seed; then, load this data set
+and take a quick look.
+
+```{r}
+#| label: mcma-pkg-setup
+library(tidyverse)
+library(sl3)
+library(medoutcon)
+library(medshift)
+library(mma)
+set.seed(429153)
+```
+
+```{r}
+#| label: load-data
+# load and examine data
+data(weight_behavior)
+dim(weight_behavior)
+
+# drop missing values
+weight_behavior <- weight_behavior %>%
+  drop_na() %>%
+  as_tibble()
+weight_behavior
+```
+
+The documentation for the data set describes it as a "database obtained from the
+Louisiana State University Health Sciences Center, New Orleans, by Dr. Richard
+Scribner. He explored the relationship between BMI and kids' behavior through a
+survey at children, teachers and parents in Grenada in 2014. This data set
+includes 691 observations and 15 variables." Note that the data set contained
+several observations with missing values, which we removed above to simplify the
+demonstration of our analytic methods. In practice, we recommend instead using
+appropriate corrections (e.g., imputation, inverse weighting) to fully take
+advantage of the observed data.
+
+Following the motivation of the original study, we focus on the causal effects
+of participating in a sports team (`sports`) on the BMI of children (`bmi`),
+taking into consideration several mediators (`snack`, `exercises`, `overweigh`);
+all other measured covariates are taken to be potential baseline confounders.
+
+## `medoutcon`: Natural and interventional (in)direct effects
+
+The data on a single observational unit can be represented as
+$O = (W, A, M, Y)$, with the data pooled across all participants denoted
+$O_1, \ldots, O_n$, for a sample of $n$ i.i.d. observations of $O$. 
Recall the DAG [from an earlier +chapter](#estimands), which represents the data-generating process: + +```{tikz} +#| echo: false +#| fig-cap: Directed acyclic graph under *no intermediate confounders* of the mediator-outcome relation affected by treatment +\dimendef\prevdepth=0 +\pgfdeclarelayer{background} +\pgfsetlayers{background,main} +\usetikzlibrary{arrows,positioning} +\tikzset{ +>=stealth', +punkt/.style={ +rectangle, +rounded corners, +draw=black, very thick, +text width=6.5em, +minimum height=2em, +text centered}, +pil/.style={ +->, +thick, +shorten <=2pt, +shorten >=2pt,} +} +\newcommand{\Vertex}[2] +{\node[minimum width=0.6cm,inner sep=0.05cm] (#2) at (#1) {$#2$}; +} +\newcommand{\VertexR}[2] +{\node[rectangle, draw, minimum width=0.6cm,inner sep=0.05cm] (#2) at (#1) {$#2$}; +} +\newcommand{\ArrowR}[3] +{ \begin{pgfonlayer}{background} +\draw[->,#3] (#1) to[bend right=30] (#2); +\end{pgfonlayer} +} +\newcommand{\ArrowL}[3] +{ \begin{pgfonlayer}{background} +\draw[->,#3] (#1) to[bend left=45] (#2); +\end{pgfonlayer} +} +\newcommand{\EdgeL}[3] +{ \begin{pgfonlayer}{background} +\draw[dashed,#3] (#1) to[bend right=-45] (#2); +\end{pgfonlayer} +} +\newcommand{\Arrow}[3] +{ \begin{pgfonlayer}{background} +\draw[->,#3] (#1) -- +(#2); +\end{pgfonlayer} +} +\begin{tikzpicture} + \Vertex{-4, 0}{W} + \Vertex{0, 0}{M} + \Vertex{-2, 0}{A} + \Vertex{2, 0}{Y} + \Arrow{W}{A}{black} + \Arrow{A}{M}{black} + \Arrow{M}{Y}{black} + \ArrowL{W}{Y}{black} + \ArrowL{A}{Y}{black} + \ArrowL{W}{M}{black} +\end{tikzpicture} +``` + +### Natural (in)direct effects + +To start, we will consider estimation of the _natural_ direct and indirect effects, +which, we recall, are defined as follows + +$$ + \E[Y_{1,M_1} - Y_{0,M_0}] = + \underbrace{\E[Y_{\color{red}{1},\color{blue}{M_1}} - + Y_{\color{red}{1},\color{blue}{M_0}}]}_{\text{natural indirect effect}} + + \underbrace{\E[Y_{\color{blue}{1},\color{red}{M_0}} - + Y_{\color{blue}{0},\color{red}{M_0}}]}_{\text{natural direct effect}}. 
+$$
+
+* Our [`medoutcon` `R` package](https://github.com/nhejazi/medoutcon)
+  [@hejazi2022medoutcon-rpkg; @hejazi2022medoutcon-joss], which accompanies
+  @diaz2020nonparametric, implements one-step and TML estimators of both the
+  natural and interventional (in)direct effects.
+* Both types of estimators are capable of accommodating flexible modeling
+  strategies (e.g., ensemble machine learning) for the initial estimation of
+  nuisance parameters.
+* The `medoutcon` `R` package uses cross-validation in initial estimation: this
+  results in cross-validated (or "cross-fitted") one-step and TML estimators
+  [@klaassen1987consistent; @zheng2011cross; @chernozhukov2018double], which
+  exhibit greater robustness than their non-sample-splitting analogs.
+* To this end, `medoutcon` integrates with the `sl3` `R` package [@coyle2022sl3],
+  which is extensively documented in this [book
+  chapter](https://tlverse.org/tlverse-handbook/sl3) [@phillips2022super;
+  @vdl2022targeted].
+
+### Interlude: `sl3` for nuisance parameter estimation
+
+* To fully take advantage of the one-step and TML estimators, we'd like to rely
+  on flexible, data-adaptive strategies for nuisance parameter estimation.
+* Doing so minimizes opportunities for model misspecification to compromise our
+  analytic conclusions.
+* Choosing among the diversity of available machine learning algorithms can be
+  challenging, so we recommend using the Super Learner algorithm for ensemble
+  machine learning [@vdl2007super], which is implemented in the [`sl3` R
+  package](https://github.com/tlverse/sl3) [@coyle2022sl3].
+* Below, we demonstrate the construction of an ensemble learner based on a
+  limited library of algorithms: an intercept model, a main terms GLM, Lasso
+  ($\ell_1$-penalized) regression, multivariate adaptive regression splines
+  (`earth`), and a random forest (`ranger`). 
+ ```{r} + #| label: mcma_setup_sl + # instantiate learners + mean_lrnr <- Lrnr_mean$new() + fglm_lrnr <- Lrnr_glm_fast$new() + lasso_lrnr <- Lrnr_glmnet$new(alpha = 1, nfolds = 3) + mars_lrnr <- Lrnr_earth$new() + rf_lrnr <- Lrnr_ranger$new(num.trees = 200) + + # create learner library and instantiate super learner ensemble + lrnr_lib <- Stack$new(mean_lrnr, fglm_lrnr, lasso_lrnr, mars_lrnr, rf_lrnr) + sl_lrnr <- Lrnr_sl$new(learners = lrnr_lib) + ``` + +* Of course, there are many alternatives for learning algorithms to be included + in such a modeling library. Feel free to explore! + +### Efficient estimation of the natural (in)direct effects + +* Estimation of the natural direct and indirect effects requires estimation of a + few nuisance parameters. Recall that these are + - $g(a\mid w)$, which denotes $\P(A=a \mid W=w)$ + - $h(a\mid m, w)$, which denotes $\P(A=a \mid M=m, W=w)$ + - $b(a, m, w)$, which denotes $\E(Y \mid A=a, M=m, W=w)$ +* While we recommend the use of Super Learning, we opt to instead estimate all + nuisance parameters with Lasso regression below (to save computational time). 
+
+* Now, let's use the `medoutcon()` function to estimate the _natural direct
+  effect_:
+  ```{r}
+  #| label: mcma-natural-de-os
+  # compute one-step estimate of the natural direct effect
+  nde_onestep <- medoutcon(
+    W = weight_behavior[, c("age", "sex", "race", "tvhours")],
+    A = (as.numeric(weight_behavior$sports) - 1),
+    Z = NULL,
+    M = weight_behavior[, c("snack", "exercises", "overweigh")],
+    Y = weight_behavior$bmi,
+    g_learners = lasso_lrnr,
+    h_learners = lasso_lrnr,
+    b_learners = lasso_lrnr,
+    effect = "direct",
+    estimator = "onestep",
+    estimator_args = list(cv_folds = 5)
+  )
+  summary(nde_onestep)
+  ```
+
+* We can similarly call `medoutcon()` to estimate the _natural indirect effect_:
+  ```{r}
+  #| label: mcma-natural-ie-os
+  # compute one-step estimate of the natural indirect effect
+  nie_onestep <- medoutcon(
+    W = weight_behavior[, c("age", "sex", "race", "tvhours")],
+    A = (as.numeric(weight_behavior$sports) - 1),
+    Z = NULL,
+    M = weight_behavior[, c("snack", "exercises", "overweigh")],
+    Y = weight_behavior$bmi,
+    g_learners = lasso_lrnr,
+    h_learners = lasso_lrnr,
+    b_learners = lasso_lrnr,
+    effect = "indirect",
+    estimator = "onestep",
+    estimator_args = list(cv_folds = 5)
+  )
+  summary(nie_onestep)
+  ```
+
+* From the above, we can conclude that the effect of participation on a sports
+  team on BMI is primarily mediated by the variables `snack`, `exercises`, and
+  `overweigh`, as the natural indirect effect is several times larger than the
+  natural direct effect.
+* Note that we could have used the TML estimators, which have improved
+  finite-sample performance, instead of the one-step estimators. Doing so is as
+  simple as setting `estimator = "tmle"` in the call above.
+
+### Interventional (in)direct effects
+
+Since our knowledge of the system under study is incomplete, we might worry that
+some of the measured variables are not mediators but are, in fact, intermediate
+confounders affected by treatment. 
While the natural (in)direct +effects are not identified in this setting, their interventional (in)direct +counterparts are, as we saw in an earlier section. Recall that both types of +effects are defined by static interventions on the treatment. The interventional +effects are distinguished by their use of a stochastic intervention on the +mediator to aid in their identification. + +```{tikz} +#| echo: false +#| fig-cap: Directed acyclic graph under intermediate confounders of the mediator-outcome relation affected by treatment +\dimendef\prevdepth=0 +\pgfdeclarelayer{background} +\pgfsetlayers{background,main} +\usetikzlibrary{arrows,positioning} +\tikzset{ +>=stealth', +punkt/.style={ +rectangle, +rounded corners, +draw=black, very thick, +text width=6.5em, +minimum height=2em, +text centered}, +pil/.style={ +->, +thick, +shorten <=2pt, +shorten >=2pt,} +} +\newcommand{\Vertex}[2] +{\node[minimum width=0.6cm,inner sep=0.05cm] (#2) at (#1) {$#2$}; +} +\newcommand{\VertexR}[2] +{\node[rectangle, draw, minimum width=0.6cm,inner sep=0.05cm] (#2) at (#1) {$#2$}; +} +\newcommand{\ArrowR}[3] +{ \begin{pgfonlayer}{background} +\draw[->,#3] (#1) to[bend right=30] (#2); +\end{pgfonlayer} +} +\newcommand{\ArrowL}[3] +{ \begin{pgfonlayer}{background} +\draw[->,#3] (#1) to[bend left=45] (#2); +\end{pgfonlayer} +} +\newcommand{\EdgeL}[3] +{ \begin{pgfonlayer}{background} +\draw[dashed,#3] (#1) to[bend right=-45] (#2); +\end{pgfonlayer} +} +\newcommand{\Arrow}[3] +{ \begin{pgfonlayer}{background} +\draw[->,#3] (#1) -- +(#2); +\end{pgfonlayer} +} +\begin{tikzpicture} + \Vertex{0, -1}{Z} + \Vertex{-4, 0}{W} + \Vertex{0, 0}{M} + \Vertex{-2, 0}{A} + \Vertex{2, 0}{Y} + \ArrowR{W}{Z}{black} + \Arrow{Z}{M}{black} + \Arrow{W}{A}{black} + \Arrow{A}{M}{black} + \Arrow{M}{Y}{black} + \Arrow{A}{Z}{black} + \Arrow{Z}{Y}{black} + \ArrowL{W}{Y}{black} + \ArrowL{A}{Y}{black} + \ArrowL{W}{M}{black} +\end{tikzpicture} +``` + +Recall that the interventional (in)direct effects are defined via the 
+decomposition:
+
+$$
+  \E[Y_{1,G_1} - Y_{0,G_0}] =
+  \underbrace{\E[Y_{\color{red}{1},\color{blue}{G_1}} -
+  Y_{\color{red}{1},\color{blue}{G_0}}]}_{\text{interventional indirect effect}}
+  +
+  \underbrace{\E[Y_{\color{blue}{1},\color{red}{G_0}} -
+  Y_{\color{blue}{0},\color{red}{G_0}}]}_{\text{interventional direct effect}}
+$$
+
+* In our data example, we'll consider snacking (`snack`) as a potential
+  intermediate confounder, since one might reasonably hypothesize that
+  participation on a sports team could subsequently affect snacking, which could
+  in turn affect mediators like exercise habits (`exercises`) and overweight
+  status (`overweigh`).
+* The interventional direct and indirect effects may also be easily estimated
+  with the [`medoutcon` `R` package](https://github.com/nhejazi/medoutcon)
+  [@hejazi2022medoutcon-rpkg; @hejazi2022medoutcon-joss].
+* Just as for the natural (in)direct effects, `medoutcon` implements
+  cross-validated one-step and TML estimators of the interventional effects.
+
+### Efficient estimation of the interventional (in)direct effects
+
+* Estimation of these effects is more complex: a few additional nuisance
+  parameters arise when expressing the (more general) efficient influence
+  function (EIF) for these effects:
+  * $q(z \mid a, w)$, the conditional density of the intermediate confounders,
+    conditional only on treatment and baseline covariates;
+  * $r(z \mid a, m, w)$, the conditional density of the intermediate
+    confounders, conditional on mediators, treatment, and baseline covariates.
+* To estimate the interventional effects, we only need to set the argument `Z`
+  of `medoutcon` to a value other than `NULL`.
+* Note that the implementation in `medoutcon` is currently limited to settings
+  with only binary intermediate confounders, i.e., $Z \in \{0, 1\}$. 
+
+* Let's use `medoutcon()` to estimate the _interventional direct effect_:
+  ```{r}
+  #| label: mcma-interv-de-os
+  # compute one-step estimate of the interventional direct effect
+  interv_de_onestep <- medoutcon(
+    W = weight_behavior[, c("age", "sex", "race", "tvhours")],
+    A = (as.numeric(weight_behavior$sports) - 1),
+    Z = (as.numeric(weight_behavior$snack) - 1),
+    M = weight_behavior[, c("exercises", "overweigh")],
+    Y = weight_behavior$bmi,
+    g_learners = lasso_lrnr,
+    h_learners = lasso_lrnr,
+    b_learners = lasso_lrnr,
+    effect = "direct",
+    estimator = "onestep",
+    estimator_args = list(cv_folds = 5)
+  )
+  summary(interv_de_onestep)
+  ```
+
+* We can similarly estimate the _interventional indirect effect_:
+  ```{r}
+  #| label: mcma-interv-ie-os
+  # compute one-step estimate of the interventional indirect effect
+  interv_ie_onestep <- medoutcon(
+    W = weight_behavior[, c("age", "sex", "race", "tvhours")],
+    A = (as.numeric(weight_behavior$sports) - 1),
+    Z = (as.numeric(weight_behavior$snack) - 1),
+    M = weight_behavior[, c("exercises", "overweigh")],
+    Y = weight_behavior$bmi,
+    g_learners = lasso_lrnr,
+    h_learners = lasso_lrnr,
+    b_learners = lasso_lrnr,
+    effect = "indirect",
+    estimator = "onestep",
+    estimator_args = list(cv_folds = 5)
+  )
+  summary(interv_ie_onestep)
+  ```
+
+* From the above, we can conclude that the effect of participation on a sports
+  team on BMI operates largely through the interventional indirect effect (i.e.,
+  through the pathways involving the mediating variables) rather than via its
+  direct effect.
+* Just as before, we could have used the TML estimators instead of the one-step
+  estimators; doing so is as simple as setting `estimator = "tmle"`. 
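+
+As noted above, switching to the TML estimator only requires changing the
+`estimator` argument; a sketch for the interventional direct effect, with all
+other arguments exactly as in the one-step call:
+
+```{r}
+#| label: mcma-interv-de-tmle
+# compute TML estimate of the interventional direct effect
+interv_de_tmle <- medoutcon(
+  W = weight_behavior[, c("age", "sex", "race", "tvhours")],
+  A = (as.numeric(weight_behavior$sports) - 1),
+  Z = (as.numeric(weight_behavior$snack) - 1),
+  M = weight_behavior[, c("exercises", "overweigh")],
+  Y = weight_behavior$bmi,
+  g_learners = lasso_lrnr,
+  h_learners = lasso_lrnr,
+  b_learners = lasso_lrnr,
+  effect = "direct",
+  estimator = "tmle",
+  estimator_args = list(cv_folds = 5)
+)
+summary(interv_de_tmle)
+```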
+
+
diff --git a/sessions/mcma_longit_effects.qmd b/sessions/mcma_longit_effects.qmd
new file mode 100644
index 0000000..f62783b
--- /dev/null
+++ b/sessions/mcma_longit_effects.qmd
@@ -0,0 +1,504 @@
+# Time-varying treatments, mediators, and covariates
+
+We'll now turn to thinking about how to define and estimate effects
+for cases where treatment, mediators, and covariates are time-varying.
+
+Before we delve into the specifics, here are a few papers on this
+topic that are interesting and useful:
+
+- [Mediation analysis with time varying exposures and
+  mediators](https://rss.onlinelibrary.wiley.com/doi/full/10.1111/rssb.12194)
+  by Tyler J. VanderWeele and Eric J. Tchetgen Tchetgen
+- [Longitudinal Mediation Analysis with Time-varying Mediators and
+  Exposures, with Application to Survival
+  Outcomes](https://www.degruyter.com/document/doi/10.1515/jci-2016-0006/html)
+  by Wenjing Zheng and Mark J. van der Laan
+- [Efficient and flexible causal mediation with time-varying
+  mediators, treatments, and
+  confounders](https://www.degruyter.com/document/doi/10.1515/jci-2022-0077/html)
+  by Iván Díaz, Nicholas Williams, and Kara E. Rudolph
+- [Identification and estimation of mediational effects of
+  longitudinal modified treatment
+  policies](https://arxiv.org/abs/2403.09928) by Brian Gilbert,
+  Katherine L. Hoffman, Nicholas Williams, Kara E. Rudolph, Edward
+  J. Schenck, Iván Díaz.
+
+In this chapter, we will be using the [`lcm` `R`
+package](https://github.com/nt-williams/lcm), which supports binary
+time-varying treatments and categorical mediators, and the [`lcmmtp`
+`R` package](https://github.com/nt-williams/lcmmtp), which supports
+continuous, multivariate, and binary time-varying treatments and
+categorical mediators. 
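+
+Both `lcm` and `lcmmtp` are hosted on GitHub rather than CRAN; a minimal
+installation sketch (assuming the `remotes` package is available; check each
+repository's README in case the layout has changed):
+
+```{r}
+#| label: longit-pkg-install
+#| eval: false
+# one-time setup: install the development versions from GitHub
+remotes::install_github("nt-williams/lcm")
+remotes::install_github("nt-williams/lcmmtp")
+```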
+
+## Illustrative example
+
+As an illustrative example, we will use a dataset from an
+observational study looking at the effect of invasive mechanical
+ventilation (IMV) on the survival of COVID-19 patients, considering
+acute kidney injury (AKI) as a mediating factor.
+
+Briefly, IMV is a treatment for acute respiratory distress syndrome
+(ARDS). While IMV is a potentially life-saving therapy in patients
+with ARDS, its use has been associated with several iatrogenic
+risks. Of interest to our illustrative study is AKI,
+a critical condition that complicates ICU stays and is associated with
+increased mortality. The causal model underlying this problem is as
+follows:
+
+The data on a single observational unit can be represented by the
+vector $O=(L_1,
+A_1, Z_1, M_1, L_2, \ldots, A_\tau, Z_\tau, M_\tau, Y)$, with the data
+pooled across all participants denoted $O_1, \ldots, O_n$, for a sample of
+$n$ i.i.d. observations of $O$. The associated DAG is
+
+```{tikz}
+#| fig-cap: Directed acyclic graph for time-varying treatments, mediators, and confounders
+\dimendef\prevdepth=0
+\pgfdeclarelayer{background}
+\pgfsetlayers{background,main}
+\usetikzlibrary{arrows,positioning}
+\tikzset{
+>=stealth',
+punkt/.style={
+rectangle,
+rounded corners,
+draw=black, very thick,
+text width=6.5em,
+minimum height=2em,
+text centered},
+pil/.style={
+->,
+thick,
+shorten <=2pt,
+shorten >=2pt,}
+}
+\newcommand{\Vertex}[3]
+{\node[minimum width=0.6cm,inner sep=0.05cm] (#2) at (#1) {#3};
+}
+\pgfarrowsdeclare{arcs}{arcs}{...}
+{
+  \pgfsetdash{}{0pt} % do not dash
+  \pgfsetroundjoin % fix join
+  \pgfsetroundcap % fix cap
+  \pgfpathmoveto{\pgfpoint{-5pt}{5pt}}
+  \pgfpatharc{180}{270}{5pt}
+  \pgfpatharc{90}{180}{5pt}
+  \pgfusepathqstroke
+}
+\newcommand{\VertexR}[2]
+{\node[rectangle, draw, minimum width=0.6cm,inner sep=0.05cm] (#2) at (#1) {$#2$};
+}
+\newcommand{\ArrowR}[3]
+{ \begin{pgfonlayer}{background}
+\draw[->,#3] (#1) to[bend right=30] (#2); 
+\end{pgfonlayer} +} +\newcommand{\ArrowB}[3]% +{ \begin{pgfonlayer}{background} + \draw[|-arcs,line width=0.4mm,shorten <= 0.3cm,shorten >= 0.3cm,#3] (#1) -- +(#2); + \end{pgfonlayer} +} +\newcommand{\ArrowL}[3] +{ \begin{pgfonlayer}{background} +\draw[->,#3] (#1) to[bend left=45] (#2); +\end{pgfonlayer} +} +\newcommand{\EdgeL}[3] +{ \begin{pgfonlayer}{background} +\draw[dashed,#3] (#1) to[bend right=-45] (#2); +\end{pgfonlayer} +} +\newcommand{\Arrow}[3] +{ \begin{pgfonlayer}{background} +\draw[->,#3] (#1) -- +(#2); +\end{pgfonlayer} +} +\begin{tikzpicture} + \Vertex{-1, 0}{W}{$L_1$} + \Vertex{1, 0}{A1}{$A_1$} + \Vertex{2, 0}{M1}{$M_1$} + \Vertex{3, 0}{Z1}{$L_2$} + \Vertex{2, -1}{L1}{$Z_1$} + + \ArrowB{W}{A1}{black} + + \Arrow{A1}{M1}{black} + \Arrow{A1}{L1}{black} + \Arrow{M1}{Z1}{black} + \Arrow{L1}{M1}{black} + \Arrow{L1}{Z1}{black} + \ArrowL{A1}{Z1}{black} + + \Vertex{5, 0}{A2}{$A_2$} + \Vertex{6, 0}{M2}{$M_2$} + \Vertex{7, 0}{Z2}{$L_3$} + \Vertex{6, -1}{L2}{$Z_2$} + \node (dots) at (8, 0) {$\cdots$}; + \Arrow{A2}{M2}{black} + \Arrow{A2}{L2}{black} + \Arrow{M2}{Z2}{black} + \Arrow{L2}{M2}{black} + \Arrow{L2}{Z2}{black} + \ArrowL{A2}{Z2}{black} + + \ArrowB{Z1}{A2}{black} + + \Vertex{9, 0}{At}{$A_\tau$} + \Vertex{10, 0}{Mt}{$M_\tau$} + \Vertex{11, 0}{Zt}{$Y$} + \Vertex{10, -1}{Lt}{$Z_\tau$} + \Arrow{At}{Mt}{black} + \Arrow{At}{Lt}{black} + \Arrow{Mt}{Zt}{black} + \Arrow{Lt}{Mt}{black} + \Arrow{Lt}{Zt}{black} + \ArrowL{At}{Zt}{black} +\end{tikzpicture} +``` + +Where we are using the following notation: + +- $W$: baseline variables such as comorbidities, demographics, etc. +- $L_t$ and $Z_t$: time-varying covariates such as lab results, + vitals, treatments, etc. 
+- $A_t$: type of oxygen support at time $t$ (0: no oxygen support, 1:
+  oxygen support excluding IMV, 2: IMV)
+- $M_t$: indicator of AKI at time $t$
+- $Y$: mortality at end of study
+- We will use $H_t$ as a shorthand to denote all the data measured up
+  until right before $A_t$ occurs
+
+## Defining causal effects in this example
+
+How can we define (total) causal effects in this example?
+
+- Main challenge: we cannot consider static treatment regimes (e.g., do
+  not intubate)
+- Such regimes would not be supported in the data (doctors would always
+  intubate a person whose blood oxygen is too low)
+- We use *modified treatment policies* to address this
+- Main idea: consider a slight modification to the treatment a patient
+  actually received
+- For example, we can consider the effect of a small delay in receiving
+  IMV
+- Specifically, we will consider a delay of one day in receiving IMV
+- In notation, the treatment regime would be as follows:
+
+\begin{equation}
+  d_t(a_t,h_t) =
+  \begin{cases}
+    1 &\text{ if } a_t=2 \text{ and } a_s \leq 1 \text{ for all } s < t,\\
+    a_t & \text{ otherwise.}
+  \end{cases}
+\end{equation}
+
+- We could then define the total effect as $\E[Y(d) - Y]$, where $Y(d)$
+  is the counterfactual mortality if the above rule had been
+  implemented every day, i.e., the intervention is $d=(d_1,d_2,\ldots,
+  d_\tau)$.
+- This is a contrast of the mortality rate under a treatment rule that would
+  delay intubation by one day vs. the mortality rate that was actually observed.
+
+## How do we define mediation causal effects with time-varying data? 
+ +The above causal effect could be decomposed into natural direct and +indirect effects as follows + +\begin{align*} +E[Y(d) - Y] & = E[Y(d, M(d)) - Y(A, M)]\\ +&=\underbrace{\E[Y(\color{red}{d},\color{blue}{M(d)}) - + Y(\color{red}{d},\color{blue}{M})]}_{\text{natural indirect effect}} + + \underbrace{\E[Y(\color{blue}{d},\color{red}{M}) - + Y(\color{blue}{A},\color{red}{M})]}_{\text{natural direct effect}} +\end{align*} + +- However, as before, these natural mediation effects are not identified. + +- The reason is that time-varying mediators exacerbate the issue of intermediate +confounding. To see why, let us look at the DAG again: + +```{tikz} +#| fig-cap: Directed acyclic graph with intermediate confounding +\dimendef\prevdepth=0 +\pgfdeclarelayer{background} +\pgfsetlayers{background,main} +\usetikzlibrary{arrows,positioning} +\tikzset{ +>=stealth', +punkt/.style={ +rectangle, +rounded corners, +draw=black, very thick, +text width=6.5em, +minimum height=2em, +text centered}, +pil/.style={ +->, +thick, +shorten <=2pt, +shorten >=2pt,} +} +\newcommand{\Vertex}[3] +{\node[minimum width=0.6cm,inner sep=0.05cm] (#2) at (#1) {#3}; +} +\pgfarrowsdeclare{arcs}{arcs}{...} +{ + \pgfsetdash{}{0pt} % do not dash + \pgfsetroundjoin % fix join + \pgfsetroundcap % fix cap + \pgfpathmoveto{\pgfpoint{-5pt}{5pt}} + \pgfpatharc{180}{270}{5pt} + \pgfpatharc{90}{180}{5pt} + \pgfusepathqstroke +} +\newcommand{\VertexR}[2] +{\node[rectangle, draw, minimum width=0.6cm,inner sep=0.05cm] (#2) at (#1) {$#2$}; +} +\newcommand{\ArrowR}[3] +{ \begin{pgfonlayer}{background} +\draw[->,#3] (#1) to[bend right=30] (#2); +\end{pgfonlayer} +} +\newcommand{\ArrowB}[3]% +{ \begin{pgfonlayer}{background} + \draw[|-arcs,line width=0.4mm,shorten <= 0.3cm,shorten >= 0.3cm,#3] (#1) -- +(#2); + \end{pgfonlayer} +} +\newcommand{\ArrowL}[3] +{ \begin{pgfonlayer}{background} +\draw[->,#3] (#1) to[bend left=45] (#2); +\end{pgfonlayer} +} +\newcommand{\EdgeL}[3] +{ \begin{pgfonlayer}{background} 
+\draw[dashed,#3] (#1) to[bend right=-45] (#2); +\end{pgfonlayer} +} +\newcommand{\Arrow}[3] +{ \begin{pgfonlayer}{background} +\draw[->,#3] (#1) -- +(#2); +\end{pgfonlayer} +} +\begin{tikzpicture} + \Vertex{-1, 0}{W}{$L_1$} + \Vertex{1, 0}{A1}{$A_1$} + \Vertex{2, 0}{M1}{$M_1$} + \Vertex{3, 0}{Z1}{$L_2$} + \Vertex{2, -1}{L1}{$Z_1$} + + \ArrowB{W}{A1}{black} + + \Arrow{A1}{M1}{black} + \Arrow{A1}{L1}{black} + \Arrow{M1}{Z1}{black} + \Arrow{L1}{M1}{black} + \Arrow{L1}{Z1}{black} + \ArrowL{A1}{Z1}{black} + + \Vertex{5, 0}{A2}{\textcolor{teal}{$A_2$}} + \Vertex{6, 0}{M2}{\textcolor{orange}{$M_2$}} + \Vertex{7, 0}{Z2}{\textcolor{orange}{$L_3$}} + \Vertex{6, -1}{L2}{\textcolor{orange}{$Z_2$}} + \node (dots) at (8, 0) {$\cdots$}; + \Arrow{A2}{M2}{black} + \Arrow{A2}{L2}{black} + \Arrow{M2}{Z2}{black} + \Arrow{L2}{M2}{black} + \Arrow{L2}{Z2}{black} + \ArrowL{A2}{Z2}{black} + + \ArrowB{Z1}{A2}{black} + + \Vertex{9, 0}{At}{\textcolor{orange}{$A_\tau$}} + \Vertex{10, 0}{Mt}{\textcolor{violet}{$M_\tau$}} + \Vertex{11, 0}{Zt}{\textcolor{violet}{$Y$}} + \Vertex{10, -1}{Lt}{\textcolor{orange}{$Z_\tau$}} + \Arrow{At}{Mt}{black} + \Arrow{At}{Lt}{black} + \Arrow{Mt}{Zt}{black} + \Arrow{Lt}{Mt}{black} + \Arrow{Lt}{Zt}{black} + \ArrowL{At}{Zt}{black} +\end{tikzpicture} +``` +Note that all the variables in orange are confounders of the + mediator $M_\tau$ and the outcome, and are + also affected by treatment at time $t=2$. + +One possible solution to the above issues involves considering +randomized versions of the above effects. Specifically: + +- Define $G(d)$ to be a random draw from the distribution of $M(d)$ + conditional on baseline variables $W$. 
+- We can then obtain the decomposition
+  $$
+  \E[Y(d, G(d)) - Y(A, G(A))]=\underbrace{\E[Y(\color{red}{d},\color{blue}{G(d)}) -
+  Y(\color{red}{d},\color{blue}{G(A)})]}_{\text{randomized interventional indirect effect}}
+  + \underbrace{\E[Y(\color{blue}{d},\color{red}{G(A)}) -
+  Y(\color{blue}{A},\color{red}{G(A)})]}_{\text{randomized interventional
+  direct effect}}
+  $$
+- As an example, consider the counterfactual $Y(d, G(d))$.
+- $M(d)$ is the counterfactual AKI status of patients under a delay in
+  intubation. If $W$ is age, and we are deciding how to intervene on a patient
+  who is 45 years old, we take all the AKI statuses of 45-year-olds and draw
+  one of these AKI values at random. Call this random draw $G(d)$.
+- For a 45-year-old patient, $Y(d, G(d))$ is the counterfactual mortality of
+  a patient if intubation had been delayed and their AKI status had been set
+  to a random draw from the AKI statuses of 45-year-old patients.
+
+## Identification assumptions and formula
+
+The above effects are identified under the following assumptions:
+
+- All the common causes of $A_t$ and $(Z_s, M_s, A_{s+1}, L_{s+1})$ are
+  measured for $s\geq t$
+- All the common causes of $M_t$ and $(Z_{s+1}, A_{s+1}, L_{s+1})$ are measured
+  for $s\geq t$
+- The intervention $d$ is supported in the data, meaning that for every patient
+  with covariates $h_t$ who had treatment status $a_t$, it is possible to find
+  a patient with covariates $h_t$ who had treatment status $d(a_t, h_t)$
+  - In our example, this translates roughly as: for every patient with
+    covariate history $h_t$ who was intubated at time $t$, it is possible to
+    find a patient with covariate history $h_t$ who was intubated at time $t+1$.
+- Each value of the mediator $M_t$ has positive probability under all feasible
+  covariate histories.
+
+The identification formula is complex, but we will explain it in the case of
+two time points. That is, assume the data are $O=(W, A_1, Z_1, M_1, L_1, A_2,
+Z_2, M_2, Y)$. 
+
+Identification can be based on the following procedure. First, for each value
+$m_1$ and $m_2$ of the mediator, compute outcome regressions as follows:
+
+1. Regress $Y$ on $W, A_1, Z_1, M_1, L_1, A_2, Z_2, M_2$. Use this
+   regression to predict the outcome had the mediator been set to
+   $M_2=m_2$. Let $\tilde Y_2(m_2)$ denote this prediction.
+2. Regress $\tilde Y_2(m_2)$ on $W, A_1, Z_1, M_1, L_1, A_2$. Use this
+   regression to predict the outcome had the treatment $A_2$ been set to
+   $d(A_2, H_2)$. Let $\tilde Y_2^d(m_2)$ denote this prediction.
+3. Regress $\tilde Y_2^d(m_2)$ on $W, A_1, Z_1, M_1$. Use this
+   regression to predict the outcome had the mediator been set to
+   $M_1=m_1$. Let $\tilde Y_1(m_1, m_2)$ denote this prediction.
+4. Regress $\tilde Y_1(m_1, m_2)$ on $W, A_1$. Use this
+   regression to predict the outcome had the treatment $A_1$ been set to
+   $d(A_1, H_1)$. Let $\tilde Y_1^d(m_1, m_2)$ denote this prediction.
+
+Then, for each value $m_1$ and $m_2$ of the mediator, compute the mediator
+distribution as follows:
+
+1. Regress the binary variable $I(M_2=m_2)$ on $W, A_1, Z_1, M_1, L_1, A_2$.
+   Use this model to predict the probability of $M_2=m_2$ under an intervention
+   that sets $A_2$ to $d(A_2, H_2)$. Let this predicted probability be denoted
+   with $P(m_2)$.
+2. Regress the product $I(M_1=m_1)P(m_2)$ on $W, A_1$. Use this model
+   to predict under an intervention that sets $A_1$ to $d(A_1, H_1)$. Let this
+   prediction be denoted with $P(m_1, m_2)$.
+
+At the end of these two sequential regression procedures, we have
+values $\tilde Y_1^d(m_1, m_2)$ and $P(m_1, m_2)$ for each value of
+the mediator $(m_1, m_2)$. Then, under identification assumptions, we
+have:
+
+$$
+  \E[Y(d, G(d))] = \sum_{m_1, m_2}\tilde Y_1^d(m_1, m_2)P(m_1, m_2) \ . 
+$$
+
+## Estimators and `R` package
+
+As before, we can develop inverse probability weighted estimators, as well as
+substitution estimators based on the g-computation formula and doubly robust
+(DR) estimators.
+
+All of these estimators get significantly more complex. For instance, a
+g-computation estimator may be developed by running the regressions indicated
+in the above sequential regression procedures.
+
+Fortunately, the doubly robust estimators are coded in a package that
+can be used off-the-shelf without having to code any complicated
+sequential regression strategies on your own. Let us look at an
+example from the `lcmmtp` R package. First, let's take a look at a
+simulated dataset available in the package:
+
+```{r}
+#| label: longit-data
+library(lcmmtp)
+data <- as.data.frame(apply(lcmmtp_foo, 2, as.numeric))
+head(data)
+dim(lcmmtp_foo)
+```
+
+Now, let us perform an analysis where we assume our intent is to estimate
+$\E[Y(1, G(0))]$:
+
+```{r}
+#| label: longit-analysis1
+library(mlr3extralearners)
+vars <- lcmmtp:::lcmmtp_variables$new(
+  L = list(c("L_1"), c("L_2")),
+  A = c("A_1", "A_2"),
+  Z = list(c("Z_1"), c("Z_2")),
+  M = c("M_1", "M_2"),
+  Y = "Y",
+  cens = c("c1", "c2")
+)
+lrnrs <- c("mean", "earth", "glm")
+d_ap <- function(data, trt) rep(1, length(data[[trt]]))
+d_as <- function(data, trt) rep(0, length(data[[trt]]))
+
+EY10 <- lcmmtp(
+  data, vars, d_ap, d_as,
+  control = .lcmmtp_control(folds = 2,
+                            learners_trt = lrnrs,
+                            learners_mediator = lrnrs,
+                            learners_QL = lrnrs,
+                            learners_QZ = lrnrs,
+                            learners_QM = lrnrs)
+)
+```
+
+Now, assume that we want to estimate the direct effect by contrasting
+$\E[Y(1, G(0))] - \E[Y(0, G(0))]$:
+
+```{r}
+#| label: longit-analysis2
+EY00 <- lcmmtp(
+  data, vars, d_as, d_as,
+  control = .lcmmtp_control(folds = 2,
+                            learners_trt = lrnrs,
+                            learners_mediator = lrnrs,
+                            learners_QL = lrnrs,
+                            learners_QZ = lrnrs,
+                            learners_QM = lrnrs)
+)
+```
+
+And we can contrast the two using a 
convenient function from the `lmtp`
+R package:
+
+```{r}
+#| label: contrast
+library(lmtp)
+# coerce the lcmmtp fits so that lmtp's contrast machinery can be reused
+class(EY00) <- class(EY10) <- 'lmtp'
+EY00$estimator <- EY10$estimator <- 'SDR'
+names(EY00)[3] <- names(EY10)[3] <- 'eif'
+lmtp_contrast(EY10, ref = EY00)
+```
+
+## Pros and cons of this methodology
+
+#### Pros {.unnumbered}
+
+- Allows the non-parametric definition, identification, and estimation of
+  mediational causal effects for general longitudinal data structures
+- Allows for the use of machine learning to alleviate model
+  misspecification bias, and is equipped with formulas for the
+  computation of correct standard errors and confidence intervals
+- Easy-to-use software
+
+#### Cons {.unnumbered}
+
+- Some limitations remain: mediators $M$ need to be discrete random variables
+- As before, interventional effects do not satisfy the mediational sharp null
+  criterion, meaning that they may be different from zero even when no
+  individual in the population experiences a mediational effect
+  - This is probably not a big worry in practice, but it is something
+    we are keeping in mind as we develop novel estimators
diff --git a/sessions/mcma_preface.qmd b/sessions/mcma_preface.qmd
new file mode 100644
index 0000000..7907895
--- /dev/null
+++ b/sessions/mcma_preface.qmd
@@ -0,0 +1,308 @@
+# "Modern" causal mediation analysis
+
+## Motivating study
+
+- A recent, large, multi-site trial (X:BOT) compared the effectiveness of
+  extended-release naltrexone (XR-NTX) to buprenorphine–naloxone (BUP-NX) in
+  preventing relapse among those with opioid use disorder (OUD) starting
+  medication in inpatient treatment settings.
+- An analysis of potential moderators of medication effectiveness found that
+  homeless individuals had a lower risk of relapse on XR-NTX, whereas
+  non-homeless individuals had a lower risk of relapse on BUP-NX.
+- The effect sizes were similarly large for these groups but in opposite
+  directions.
+- The underlying mechanisms for these differences have not yet been explored. 
We
  can use mediation analysis to identify these mechanisms.

- Key questions:
  - Do differences in the effects of treatment (comparing two medications for
    opioid use disorder, naltrexone vs. buprenorphine) on risk of relapse
    operate through the mediators of adherence, opioid use, pain, and depressive
    symptoms? [@rudolph2020explaining]
  - Are those mediated effects different for homeless vs. non-homeless
    individuals?

![](../images/ctndag.png)

## What is causal mediation analysis?

- Statistical mediation analyses assess associations between the
  variables. They can help you establish, for example, whether the
  _association_ between treatment and outcome can be mostly explained
  by an _association_ between treatment and mediator
- Causal mediation analyses, on the other hand, seek to assess causal
  relations. For example, they help you establish whether treatment
  _causes_ the outcome because it _causes_ the mediator. To do this,
  causal mediation analyses seek to understand how the causal
  paths would behave under circumstances different from the observed
  circumstances (e.g., interventions)

### Why are the causal methods that we will discuss today important?

- Assume you are interested in the effect of treatment assignment $A$
  (naltrexone vs. 
buprenorphine) on an outcome $Y$ (risk of relapse) through
  mediators $M$ (e.g., opioid use, pain, depressive symptoms)
- We have pre-treatment confounders $W$
- There is a confounder $Z$ of $M \rightarrow Y$ affected by treatment
  assignment (here, adherence)
- We could fit the following models:
  $$ \begin{align*}
  \E(M \mid A=a, W=w, Z=z) & = \gamma_0 + \gamma_1 a + \gamma_2 w + \gamma_3 z \\
  \E(Y \mid M=m, A=a, W=w, Z=z) & = \beta_0 + \beta_1 m + \beta_2 a +
  \beta_3 w + \beta_4 z
  \end{align*} $$
- The product $\gamma_1 \beta_1$ has been proposed as a measure of the effect
  of $A$ on $Y$ through $M$
- This method has causal interpretation problems: we will see that this
  parameter cannot, in general, be interpreted as a causal effect

### `R` example
- Assume we have a pre-treatment confounder of $Y$ and $M$, denote it with $W$
- For simplicity, assume $A$ is randomized
- We'll generate a very large sample from a known data-generating mechanism so
  that sampling error is negligible
  ```{r}
  n <- 1e6
  w <- rnorm(n)
  a <- rbinom(n, 1, 0.5)
  z <- rbinom(n, 1, 0.2 * a + 0.3)
  m <- rnorm(n, w + z)
  y <- rnorm(n, m + w - a + z)
  ```

- Note that the indirect effect (i.e., the effect through $M$) in this example
  is nonzero (there is a pathway $A \rightarrow Z \rightarrow M \rightarrow Y$)
- Let's see what the product of coefficients method would say:
  ```{r}
  lm_y <- lm(y ~ m + a + w + z)
  lm_m <- lm(m ~ a + w + z)
  ## product of coefficients
  coef(lm_y)[2] * coef(lm_m)[2]
  ```

Among other things, in this workshop:

- We will provide some understanding of why the above method fails in this
  example
- We will study estimators that are robust to misspecification of the above
  models

## Causal mediation models

In this workshop we will use directed acyclic graphs. 
We will focus on the two +types of graph: + +### No intermediate confounders + +```{tikz} +#| echo: false +#| fig-cap: Directed acyclic graph under *no intermediate confounders* of the mediator-outcome relation affected by treatment +\dimendef\prevdepth=0 +\pgfdeclarelayer{background} +\pgfsetlayers{background,main} +\usetikzlibrary{arrows,positioning} +\tikzset{ +>=stealth', +punkt/.style={ +rectangle, +rounded corners, +draw=black, very thick, +text width=6.5em, +minimum height=2em, +text centered}, +pil/.style={ +->, +thick, +shorten <=2pt, +shorten >=2pt,} +} +\newcommand{\Vertex}[2] +{\node[minimum width=0.6cm,inner sep=0.05cm] (#2) at (#1) {$#2$}; +} +\newcommand{\VertexR}[2] +{\node[rectangle, draw, minimum width=0.6cm,inner sep=0.05cm] (#2) at (#1) {$#2$}; +} +\newcommand{\ArrowR}[3] +{ \begin{pgfonlayer}{background} +\draw[->,#3] (#1) to[bend right=30] (#2); +\end{pgfonlayer} +} +\newcommand{\ArrowL}[3] +{ \begin{pgfonlayer}{background} +\draw[->,#3] (#1) to[bend left=45] (#2); +\end{pgfonlayer} +} +\newcommand{\EdgeL}[3] +{ \begin{pgfonlayer}{background} +\draw[dashed,#3] (#1) to[bend right=-45] (#2); +\end{pgfonlayer} +} +\newcommand{\Arrow}[3] +{ \begin{pgfonlayer}{background} +\draw[->,#3] (#1) -- +(#2); +\end{pgfonlayer} +} +\begin{tikzpicture} + \Vertex{-4, 0}{W} + \Vertex{0, 0}{M} + \Vertex{-2, 0}{A} + \Vertex{2, 0}{Y} + \Arrow{W}{A}{black} + \Arrow{A}{M}{black} + \Arrow{M}{Y}{black} + \ArrowL{W}{Y}{black} + \ArrowL{A}{Y}{black} + \ArrowL{W}{M}{black} +\end{tikzpicture} +``` + +### Intermediate confounders + +```{tikz} +#| echo: false +#| fig-cap: Directed acyclic graph under intermediate confounders of the mediator-outcome relation affected by treatment +\dimendef\prevdepth=0 +\pgfdeclarelayer{background} +\pgfsetlayers{background,main} +\usetikzlibrary{arrows,positioning} +\tikzset{ +>=stealth', +punkt/.style={ +rectangle, +rounded corners, +draw=black, very thick, +text width=6.5em, +minimum height=2em, +text centered}, +pil/.style={ +->, +thick, 
+shorten <=2pt, +shorten >=2pt,} +} +\newcommand{\Vertex}[2] +{\node[minimum width=0.6cm,inner sep=0.05cm] (#2) at (#1) {$#2$}; +} +\newcommand{\VertexR}[2] +{\node[rectangle, draw, minimum width=0.6cm,inner sep=0.05cm] (#2) at (#1) {$#2$}; +} +\newcommand{\ArrowR}[3] +{ \begin{pgfonlayer}{background} +\draw[->,#3] (#1) to[bend right=30] (#2); +\end{pgfonlayer} +} +\newcommand{\ArrowL}[3] +{ \begin{pgfonlayer}{background} +\draw[->,#3] (#1) to[bend left=45] (#2); +\end{pgfonlayer} +} +\newcommand{\EdgeL}[3] +{ \begin{pgfonlayer}{background} +\draw[dashed,#3] (#1) to[bend right=-45] (#2); +\end{pgfonlayer} +} +\newcommand{\Arrow}[3] +{ \begin{pgfonlayer}{background} +\draw[->,#3] (#1) -- +(#2); +\end{pgfonlayer} +} +\begin{tikzpicture} + \Vertex{0, -1}{Z} + \Vertex{-4, 0}{W} + \Vertex{0, 0}{M} + \Vertex{-2, 0}{A} + \Vertex{2, 0}{Y} + \ArrowR{W}{Z}{black} + \Arrow{Z}{M}{black} + \Arrow{W}{A}{black} + \Arrow{A}{M}{black} + \Arrow{M}{Y}{black} + \Arrow{A}{Z}{black} + \Arrow{Z}{Y}{black} + \ArrowL{W}{Y}{black} + \ArrowL{A}{Y}{black} + \ArrowL{W}{M}{black} +\end{tikzpicture} +``` + +The above graphs can be interpreted as a _non-parametric structural equation +model_ (NPSEM), also known as _structural causal model_ (SCM): + +$$ \begin{align} + W & = f_W(U_W) \nonumber \\ + A & = f_A(W, U_A) \nonumber \\ + Z & = f_Z(W, A, U_Z) \nonumber \\ + M & = f_M(W, A, Z, U_M) \nonumber \\ + Y & = f_Y(W, A, Z, M, U_Y) +\end{align} $$ + +- Here $U=(U_W, U_A, U_Z, U_M, U_Y)$ is a vector of all unmeasured exogenous + factors affecting the system +- The functions $f$ are assumed fixed but unknown +- We posit this model as a system of equations that nature uses to generate the + data +- Therefore we leave the functions $f$ unspecified (i.e., we do not know the + true nature mechanisms) +- Sometimes we know something: e.g., if $A$ is randomized we know $A=f_A(U_A)$ + where $U_A$ is the flip of a coin (i.e., independent of everything). 
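To make the NPSEM concrete, here is a small simulation sketch that encodes each structural equation as `R` code. It reuses the illustrative data-generating mechanism from the earlier product-of-coefficients example; the specific functional forms are our own assumptions, not known truths:

```{r}
#| label: npsem-simulation
# A sketch of the NPSEM: each variable is an (assumed) fixed function of its
# parents and an exogenous error U, matching the earlier simulated example.
set.seed(57)
n <- 1e5
u_w <- rnorm(n); u_a <- runif(n); u_z <- runif(n)
u_m <- rnorm(n); u_y <- rnorm(n)
w <- u_w                              # W = f_W(U_W)
a <- as.numeric(u_a < 0.5)            # A = f_A(U_A): randomized, a coin flip
z <- as.numeric(u_z < 0.2 * a + 0.3)  # Z = f_Z(W, A, U_Z)
m <- w + z + u_m                      # M = f_M(W, A, Z, U_M); A acts on M only through Z here
y <- m + w - a + z + u_y              # Y = f_Y(W, A, Z, M, U_Y)
```

Writing the mechanism this way makes explicit what an intervention means: we replace one of these equations while keeping the exogenous errors $U$ fixed.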

## Counterfactuals

- Recall that we are interested in assessing how the pathways would behave under
  circumstances different from the observed circumstances
- We operationalize this idea using _counterfactual_ random variables
- Counterfactuals are hypothetical random variables that would have been
  observed in an alternative world where something had happened, possibly
  contrary to fact

### We will use the following counterfactual variables: {.unnumbered}

- $Y_a$ is the counterfactual outcome in a hypothetical world where
  $\P(A=a)=1$ for some value $a$
- $Y_{a,m}$ is the counterfactual outcome in a world where $\P(A=a, M=m)=1$
- $M_a$ is the counterfactual variable representing the mediator in a world
  where $\P(A=a)=1$.

### How are counterfactuals defined?

- In the NPSEM framework, counterfactuals are quantities _derived_ from the
  model.
- Once you define a change to the causal system, that change needs to be
  propagated downstream.
  - Example: modifying the system to make everyone receive XR-NTX yields
    counterfactual adherence, mediators, and outcomes. 
+- Take as example the DAG in Figure 1.2: + $$ \begin{align} + A &= a \nonumber \\ + Z_a &= f_Z(W, a, U_Z) \nonumber \\ + M_a &= f_M(W, a, Z_a, U_M) \nonumber \\ + Y_a &= f_Y(W, a, Z_a, M_a, U_Y) + \end{align} $$ +- We will also be interested in _joint changes_ to the system: + $$ \begin{align} + A &= a \nonumber \\ + Z_a &= f_Z(W, a, U_Z) \nonumber \\ + M &= m \nonumber \\ + Y_{a,m} &= f_Y(W, a, Z_a, m, U_Y) + \end{align} $$ +- And, perhaps more importantly, we will use _nested counterfactuals_ +- For example, if $A$ is binary, you can think of the following counterfactual + $$ \begin{align} + A &= 1 \nonumber \\ + Z_1 &= f_Z(W, 1, U_Z) \nonumber \\ + M &= M_0 \nonumber \\ + Y_{1, M_0} &= f_Y(W, 1, Z_1, M_0, U_Y) + \end{align} $$ +- $Y_{1, M_0}$ is interpreted as *the outcome for an individual in a + hypothetical world where treatment was given but the mediator was held at the + value it would have taken under no treatment*. +- Causal mediation effects are often defined in terms of the distribution of + these nested counterfactuals. +- That is, causal effects give you information about what would have happened + *in some hypothetical world* where the mediator and treatment mechanisms + changed.
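
Continuing the illustrative simulation from the earlier `R` example (an assumed mechanism, not the study data), the nested counterfactual $Y_{1, M_0}$ can be computed by holding the exogenous errors fixed and propagating each intervention through the structural equations:

```{r}
#| label: nested-counterfactual
# Sketch: compute Y_{1, M_0} from the (assumed) structural equations by
# fixing the exogenous errors and propagating the interventions downstream.
set.seed(57)
n <- 1e5
u_w <- rnorm(n); u_z <- runif(n); u_m <- rnorm(n); u_y <- rnorm(n)
w <- u_w
z1 <- as.numeric(u_z < 0.2 * 1 + 0.3)  # Z_1 = f_Z(W, 1, U_Z)
z0 <- as.numeric(u_z < 0.2 * 0 + 0.3)  # Z_0 = f_Z(W, 0, U_Z)
m0 <- w + z0 + u_m                     # M_0 = f_M(W, 0, Z_0, U_M)
y_1_m0 <- m0 + w - 1 + z1 + u_y        # Y_{1, M_0} = f_Y(W, 1, Z_1, M_0, U_Y)
mean(y_1_m0)                           # approximates E[Y_{1, M_0}]
```

Note that `z1`, `z0`, and `m0` are all built from the same exogenous draws, reflecting that the counterfactual worlds differ only in the interventions, not in the errors $U$.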