feat: Enable public version of app with courser data (#111)

- Add option to summarise at quarterly level in preprocessing - Handle quarterly summarised data in the app (kept it simple and just converted the quarters to the first month of that quarter and use that as input to the existing plotting function) - Made the Shiny port configurable so we can run two instances simultaneously - Added a `public.env.sample` file with placeholder configuration for the public version Closes #105
SAFEHR-data · Nov 21, 2024 · f6c9091 · f6c9091
1 parent 500fbbc
commit f6c9091
Show file tree

Hide file tree

Showing 23 changed files with 254 additions and 73 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -1,2 +1,3 @@
 **/renv
 **/.Rprofile
+**/.Renviron
diff --git a/.env.sample b/.env.sample
@@ -1,13 +1,16 @@
 ENV=test
-DATA_VOLUME_PATH=./data/test_data
+DATA_VOLUME_PATH=./data/test_data/internal
+
+SHINY_PORT: 3838
 
 # For preprocessing
-PREPROCESS_DB_NAME=         # name of the source database
-PREPROCESS_HOST=            # host address for the source database
-PREPROCESS_PORT=            # port on which to connect to the source database
-PREPROCESS_DB_USERNAME=     # username for the source database
-PREPROCESS_DB_PASSWORD=     # password for the source database
-PREPROCESS_DB_CDM_SCHEMA=   # Schema name in the database to connect the OMOP CDM to 
+PREPROCESS_DB_NAME=                 # name of the source database
+PREPROCESS_HOST=                    # host address for the source database
+PREPROCESS_PORT=                    # port on which to connect to the source database
+PREPROCESS_DB_USERNAME=             # username for the source database
+PREPROCESS_DB_PASSWORD=             # password for the source database
+PREPROCESS_DB_CDM_SCHEMA=           # Schema name in the database to connect the OMOP CDM to
+PREPROCESS_SUMMARISE_LEVEL=monthly  # Level to summarise record counts at (monthly or quarterly)
 
 # Low-frequency replacement
 LOW_FREQUENCY_THRESHOLD=5

diff --git a/.gitignore b/.gitignore
@@ -49,4 +49,4 @@ po/*~
 rsconnect/
 
 !dev
-.env
+*.env
diff --git a/README.md b/README.md
@@ -100,4 +100,18 @@ docker compose up -d omopcat
 
 By default, the app will be hosted at `http://localhost:3838`.
 
-See the [deployment docs](./deploy/README.md) for more details.
+### Public version
+
+Copy the `public.env.sample` to `public.env` and fill out the necessary environment variables.
+
+Then run the pre-processing pipeline with
+
+```sh
+docker compose --env-file public.env run preprocess
+```
+
+and deploy the app with
+
+```sh
+docker compose --env-file public.env -p omopcat-public up -d omopcat
+```
diff --git a/app/R/fct_monthly_count_plot.R b/app/R/fct_monthly_count_plot.R
@@ -16,7 +16,10 @@ monthly_count_plot <- function(monthly_counts) {
   stopifnot(is.data.frame(monthly_counts))
   stopifnot(all(c("date_year", "date_month", "person_count") %in% colnames(monthly_counts)))
 
-  monthly_counts$date <- .convert_to_date(monthly_counts$date_year, monthly_counts$date_month)
+  monthly_counts$date <- lubridate::make_date(
+    year = monthly_counts$date_year,
+    month = monthly_counts$date_month
+  )
 
   ggplot(monthly_counts, aes(x = .data$date, y = .data$record_count)) +
     geom_bar(aes(fill = .data$concept_name), stat = "identity") +
@@ -26,7 +29,3 @@ monthly_count_plot <- function(monthly_counts) {
     scale_x_date(date_labels = "%b %Y", date_breaks = "3 months") +
     theme(legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 1))
 }
-
-.convert_to_date <- function(date_year, date_month) {
-  as.Date(paste0(date_year, "-", date_month, "-01"))
-}
diff --git a/app/R/utils_get_data.R b/app/R/utils_get_data.R
@@ -44,9 +44,19 @@ get_monthly_counts <- function() {
   } else {
     data <- .read_parquet_table("omopcat_monthly_counts")
   }
+
+  if ("date_quarter" %in% colnames(data)) {
+    ## Set `date_month` to the first month of the quarter
+    data$date_month <- .quarter_to_month(data$date_quarter)
+  }
+
   return(data)
 }
 
+.quarter_to_month <- function(quarter) {
+  return((quarter - 1) * 3 + 1)
+}
+
 get_summary_stats <- function() {
   if (should_use_dev_data()) {
     out <- readr::read_csv(

diff --git a/data/test_data/internal/omopcat_concepts.parquet b/data/test_data/internal/omopcat_concepts.parquet
diff --git a/data/test_data/internal/omopcat_monthly_counts.parquet b/data/test_data/internal/omopcat_monthly_counts.parquet
diff --git a/data/test_data/internal/omopcat_summary_stats.parquet b/data/test_data/internal/omopcat_summary_stats.parquet
diff --git a/data/test_data/omopcat_concepts.parquet b/data/test_data/omopcat_concepts.parquet
diff --git a/data/test_data/omopcat_monthly_counts.parquet b/data/test_data/omopcat_monthly_counts.parquet
diff --git a/data/test_data/omopcat_summary_stats.parquet b/data/test_data/omopcat_summary_stats.parquet
diff --git a/data/test_data/public/omopcat_concepts.parquet b/data/test_data/public/omopcat_concepts.parquet
diff --git a/data/test_data/public/omopcat_monthly_counts.parquet b/data/test_data/public/omopcat_monthly_counts.parquet
diff --git a/data/test_data/public/omopcat_summary_stats.parquet b/data/test_data/public/omopcat_summary_stats.parquet
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -21,6 +21,7 @@ services:
       EUNOMIA_DATA_FOLDER: /mnt/preprocessing/data-raw/test_db
       LOW_FREQUENCY_THRESHOLD: ${LOW_FREQUENCY_THRESHOLD}
       LOW_FREQUENCY_REPLACEMENT: ${LOW_FREQUENCY_REPLACEMENT}
+      SUMMARISE_LEVEL: ${PREPROCESS_SUMMARISE_LEVEL}
     command: ["R", "-e", "omopcat.preprocessing::preprocess()"]
     volumes:
       - ${DATA_VOLUME_PATH}:/mnt/preprocessing/data
@@ -49,4 +50,4 @@ services:
     volumes:
       - ${DATA_VOLUME_PATH}:/etc/omopcat/data
     ports:
-      - 3838:3838
+      - ${SHINY_PORT}:3838
diff --git a/preprocessing/R/monthly_counts.R b/preprocessing/R/monthly_counts.R
@@ -3,29 +3,57 @@
 #' @param cdm A [`CDMConnector`] object, e.g. from [`CDMConnector::cdm_from_con()`]
 #' @param threshold Threshold value below which values will be replaced by `replacement`
 #' @param replacement Value with which values below `threshold` will be replaced
+#' @param level At which resolution the counts should be summarised.
+#'    Currently supports `"monthly"` or `"quarterly"`.
 #'
 #' @return A `data.frame` with the monthly counts
 #' @keywords internal
-generate_monthly_counts <- function(cdm, threshold, replacement) {
+generate_monthly_counts <- function(cdm, threshold, replacement,
+                                    level = c("monthly", "quarterly")) {
+  level <- match.arg(level)
+  .summarise <- function(...) summarise_counts(..., level = level)
+
   # Combine results for all tables
-  out <- dplyr::bind_rows(
-    cdm$condition_occurrence |> calculate_monthly_counts(
-      .data$condition_concept_id, .data$condition_start_date
+  arg_list <- list(
+    list(
+      omop_table = cdm[["measurement"]],
+      concept_col = "measurement_concept_id",
+      date_col = "measurement_date"
+    ),
+    list(
+      omop_table = cdm[["observation"]],
+      concept_col = "observation_concept_id",
+      date_col = "observation_date"
+    ),
+    list(
+      omop_table = cdm[["condition_occurrence"]],
+      concept_col = "condition_concept_id",
+      date_col = "condition_start_date"
+    ),
+    list(
+      omop_table = cdm[["drug_exposure"]],
+      concept_col = "drug_concept_id",
+      date_col = "drug_exposure_start_date"
+    ),
+    list(
+      omop_table = cdm[["procedure_occurrence"]],
+      concept_col = "procedure_concept_id",
+      date_col = "procedure_date"
+    ),
+    list(
+      omop_table = cdm[["device_exposure"]],
+      concept_col = "device_concept_id",
+      date_col = "device_exposure_start_date"
     ),
-    cdm$drug_exposure |>
-      calculate_monthly_counts(.data$drug_concept_id, .data$drug_exposure_start_date),
-    cdm$procedure_occurrence |>
-      calculate_monthly_counts(.data$procedure_concept_id, .data$procedure_date),
-    cdm$device_exposure |>
-      calculate_monthly_counts(.data$device_concept_id, .data$device_exposure_start_date),
-    cdm$measurement |>
-      calculate_monthly_counts(.data$measurement_concept_id, .data$measurement_date),
-    cdm$observation |>
-      calculate_monthly_counts(.data$observation_concept_id, .data$observation_date),
-    cdm$specimen |>
-      calculate_monthly_counts(.data$specimen_concept_id, .data$specimen_date)
+    list(
+      omop_table = cdm[["specimen"]],
+      concept_col = "specimen_concept_id",
+      date_col = "specimen_date"
+    )
   )
 
+  out <- purrr::map_dfr(arg_list, function(args) do.call(.summarise, args))
+
   # Map concept names to the concept IDs
   concept_names <- dplyr::select(cdm$concept, "concept_id", "concept_name") |>
     dplyr::filter(.data$concept_id %in% out$concept_id) |>
@@ -39,31 +67,44 @@ generate_monthly_counts <- function(cdm, threshold, replacement) {
     )
 }
 
-
-#' Calculate monthly statistics for an OMOP concept
+#' Summarise record counts
 #'
 #' @param omop_table A table from the OMOP CDM
-#' @param concept The name of the concept column to calculate statistics for
-#' @param date The name of the date column to calculate statistics for
+#' @param concept_col The name of the concept column to calculate statistics for
+#' @param date_col The name of the date column to calculate statistics for
+#' @param level The resolution at which to summarise the record counts.
+#'    Currently supports `"monthly"` or `"quarterly"`
 #'
 #' @return A `data.frame` with the following columns:
 #'   - `concept_id`: The concept ID
 #'   - `concept_name`: The concept name
 #'   - `date_year`: The year of the date
-#'   - `date_month`: The month of the date
+#'   - `date_month` or `date_quarter`: The month or quarter of the date, depending on `level`
 #'   - `person_count`: The number of unique patients per concept for each month
 #'   - `records_per_person`: The average number of records per person per concept for each month
 #' @keywords internal
-calculate_monthly_counts <- function(omop_table, concept, date) {
-  # Extract year and month from date column
+summarise_counts <- function(omop_table, concept_col, date_col, level) {
+  group_by_var <- switch(level,
+    monthly = "date_month",
+    quarterly = "date_quarter",
+    stop(sprintf("Summary level `%s` not supported", level))
+  )
+
+  # Extract year, month and quarter from date column
   omop_table <- dplyr::mutate(omop_table,
-    concept_id = {{ concept }},
-    date_year = as.integer(lubridate::year({{ date }})),
-    date_month = as.integer(lubridate::month({{ date }}))
+    concept_id = .data[[concept_col]],
+    date_year = as.integer(lubridate::year(.data[[date_col]])),
+    date_month = as.integer(lubridate::month(.data[[date_col]]))
   )
 
+  if (level == "quarterly") {
+    # NOTE: lubridate::quarter is not supported for all SQL back-ends
+    omop_table <- omop_table |>
+      dplyr::mutate(date_quarter = as.integer(lubridate::quarter(.data[[date_col]])))
+  }
+
   omop_table |>
-    dplyr::group_by(.data$date_year, .data$date_month, .data$concept_id) |>
+    dplyr::group_by(.data$date_year, .data[[group_by_var]], .data$concept_id) |>
     dplyr::summarise(
       record_count = dplyr::n(),
       person_count = dplyr::n_distinct(.data$person_id),
@@ -79,7 +120,7 @@ calculate_monthly_counts <- function(omop_table, concept, date) {
     dplyr::select(
       "concept_id",
       "date_year",
-      "date_month",
+      dplyr::all_of(group_by_var),
       "record_count",
       "person_count",
       "records_per_person"

diff --git a/preprocessing/R/preprocess.R b/preprocessing/R/preprocess.R
@@ -48,9 +48,15 @@ preprocess <- function(out_path = Sys.getenv("PREPROCESS_OUT_PATH")) {
 
   threshold <- Sys.getenv("LOW_FREQUENCY_THRESHOLD")
   replacement <- Sys.getenv("LOW_FREQUENCY_REPLACEMENT")
+  summarise_level <- Sys.getenv("SUMMARISE_LEVEL")
+
+  cli::cli_alert_info("Summarising record counts at the '{summarise_level}' level")
 
   cli::cli_progress_message("Generating monthly_counts table")
-  monthly_counts <- generate_monthly_counts(cdm, threshold = threshold, replacement = replacement)
+  monthly_counts <- generate_monthly_counts(cdm,
+    threshold = threshold, replacement = replacement,
+    level = summarise_level
+  )
 
   cli::cli_progress_message("Generating summary_stats table")
   summary_stats <- generate_summary_stats(cdm, threshold = threshold, replacement = replacement)
@@ -79,7 +85,8 @@ preprocess <- function(out_path = Sys.getenv("PREPROCESS_OUT_PATH")) {
     "DB_USERNAME",
     "DB_PASSWORD",
     "LOW_FREQUENCY_THRESHOLD",
-    "LOW_FREQUENCY_REPLACEMENT"
+    "LOW_FREQUENCY_REPLACEMENT",
+    "SUMMARISE_LEVEL"
   )
 
   missing <- Sys.getenv(required_envvars) == ""

diff --git a/preprocessing/man/generate_monthly_counts.Rd b/preprocessing/man/generate_monthly_counts.Rd
diff --git a/...rocessing/man/calculate_monthly_counts.Rd → preprocessing/man/summarise_counts.Rd b/...rocessing/man/calculate_monthly_counts.Rd → preprocessing/man/summarise_counts.Rd
-Original file line number
+Diff line change
@@ Expand Up / @@ -49,4 +49,4 @@ po/*~ @@
     rsconnect/
     !dev
-    .env
+    *.env