Skip to content

Commit

Permalink
feat: Enable public version of app with courser data (#111)
Browse files Browse the repository at this point in the history
- Add option to summarise at quarterly level in preprocessing
- Handle quarterly summarised data in the app (kept it simple and just converted the quarters to the first month of that quarter and use that as input to the existing plotting function)
- Made the Shiny port configurable so we can run two instances simultaneously
- Added a `public.env.sample` file with placeholder configuration for the public version

Closes #105
  • Loading branch information
milanmlft authored Nov 21, 2024
1 parent 500fbbc commit f6c9091
Show file tree
Hide file tree
Showing 23 changed files with 254 additions and 73 deletions.
1 change: 1 addition & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
**/renv
**/.Rprofile
**/.Renviron
17 changes: 10 additions & 7 deletions .env.sample
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
ENV=test
DATA_VOLUME_PATH=./data/test_data
DATA_VOLUME_PATH=./data/test_data/internal

SHINY_PORT: 3838

# For preprocessing
PREPROCESS_DB_NAME= # name of the source database
PREPROCESS_HOST= # host address for the source database
PREPROCESS_PORT= # port on which to connect to the source database
PREPROCESS_DB_USERNAME= # username for the source database
PREPROCESS_DB_PASSWORD= # password for the source database
PREPROCESS_DB_CDM_SCHEMA= # Schema name in the database to connect the OMOP CDM to
PREPROCESS_DB_NAME= # name of the source database
PREPROCESS_HOST= # host address for the source database
PREPROCESS_PORT= # port on which to connect to the source database
PREPROCESS_DB_USERNAME= # username for the source database
PREPROCESS_DB_PASSWORD= # password for the source database
PREPROCESS_DB_CDM_SCHEMA= # Schema name in the database to connect the OMOP CDM to
PREPROCESS_SUMMARISE_LEVEL=monthly # Level to summarise record counts at (monthly or quarterly)

# Low-frequency replacement
LOW_FREQUENCY_THRESHOLD=5
Expand Down
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -49,4 +49,4 @@ po/*~
rsconnect/

!dev
.env
*.env
16 changes: 15 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,4 +100,18 @@ docker compose up -d omopcat

By default, the app will be hosted at `http://localhost:3838`.

See the [deployment docs](./deploy/README.md) for more details.
### Public version

Copy the `public.env.sample` to `public.env` and fill out the necessary environment variables.

Then run the pre-processing pipeline with

```sh
docker compose --env-file public.env run preprocess
```

and deploy the app with

```sh
docker compose --env-file public.env -p omopcat-public up -d omopcat
```
9 changes: 4 additions & 5 deletions app/R/fct_monthly_count_plot.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@ monthly_count_plot <- function(monthly_counts) {
stopifnot(is.data.frame(monthly_counts))
stopifnot(all(c("date_year", "date_month", "person_count") %in% colnames(monthly_counts)))

monthly_counts$date <- .convert_to_date(monthly_counts$date_year, monthly_counts$date_month)
monthly_counts$date <- lubridate::make_date(
year = monthly_counts$date_year,
month = monthly_counts$date_month
)

ggplot(monthly_counts, aes(x = .data$date, y = .data$record_count)) +
geom_bar(aes(fill = .data$concept_name), stat = "identity") +
Expand All @@ -26,7 +29,3 @@ monthly_count_plot <- function(monthly_counts) {
scale_x_date(date_labels = "%b %Y", date_breaks = "3 months") +
theme(legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 1))
}

.convert_to_date <- function(date_year, date_month) {
as.Date(paste0(date_year, "-", date_month, "-01"))
}
10 changes: 10 additions & 0 deletions app/R/utils_get_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,19 @@ get_monthly_counts <- function() {
} else {
data <- .read_parquet_table("omopcat_monthly_counts")
}

if ("date_quarter" %in% colnames(data)) {
## Set `date_month` to the first month of the quarter
data$date_month <- .quarter_to_month(data$date_quarter)
}

return(data)
}

.quarter_to_month <- function(quarter) {
return((quarter - 1) * 3 + 1)
}

get_summary_stats <- function() {
if (should_use_dev_data()) {
out <- readr::read_csv(
Expand Down
Binary file added data/test_data/internal/omopcat_concepts.parquet
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file removed data/test_data/omopcat_concepts.parquet
Binary file not shown.
Binary file removed data/test_data/omopcat_monthly_counts.parquet
Binary file not shown.
Binary file removed data/test_data/omopcat_summary_stats.parquet
Binary file not shown.
Binary file added data/test_data/public/omopcat_concepts.parquet
Binary file not shown.
Binary file not shown.
Binary file not shown.
3 changes: 2 additions & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ services:
EUNOMIA_DATA_FOLDER: /mnt/preprocessing/data-raw/test_db
LOW_FREQUENCY_THRESHOLD: ${LOW_FREQUENCY_THRESHOLD}
LOW_FREQUENCY_REPLACEMENT: ${LOW_FREQUENCY_REPLACEMENT}
SUMMARISE_LEVEL: ${PREPROCESS_SUMMARISE_LEVEL}
command: ["R", "-e", "omopcat.preprocessing::preprocess()"]
volumes:
- ${DATA_VOLUME_PATH}:/mnt/preprocessing/data
Expand Down Expand Up @@ -49,4 +50,4 @@ services:
volumes:
- ${DATA_VOLUME_PATH}:/etc/omopcat/data
ports:
- 3838:3838
- ${SHINY_PORT}:3838
97 changes: 69 additions & 28 deletions preprocessing/R/monthly_counts.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,29 +3,57 @@
#' @param cdm A [`CDMConnector`] object, e.g. from [`CDMConnector::cdm_from_con()`]
#' @param threshold Threshold value below which values will be replaced by `replacement`
#' @param replacement Value with which values below `threshold` will be replaced
#' @param level At which resolution the counts should be summarised.
#' Currently supports `"monthly"` or `"quarterly"`.
#'
#' @return A `data.frame` with the monthly counts
#' @keywords internal
generate_monthly_counts <- function(cdm, threshold, replacement) {
generate_monthly_counts <- function(cdm, threshold, replacement,
level = c("monthly", "quarterly")) {
level <- match.arg(level)
.summarise <- function(...) summarise_counts(..., level = level)

# Combine results for all tables
out <- dplyr::bind_rows(
cdm$condition_occurrence |> calculate_monthly_counts(
.data$condition_concept_id, .data$condition_start_date
arg_list <- list(
list(
omop_table = cdm[["measurement"]],
concept_col = "measurement_concept_id",
date_col = "measurement_date"
),
list(
omop_table = cdm[["observation"]],
concept_col = "observation_concept_id",
date_col = "observation_date"
),
list(
omop_table = cdm[["condition_occurrence"]],
concept_col = "condition_concept_id",
date_col = "condition_start_date"
),
list(
omop_table = cdm[["drug_exposure"]],
concept_col = "drug_concept_id",
date_col = "drug_exposure_start_date"
),
list(
omop_table = cdm[["procedure_occurrence"]],
concept_col = "procedure_concept_id",
date_col = "procedure_date"
),
list(
omop_table = cdm[["device_exposure"]],
concept_col = "device_concept_id",
date_col = "device_exposure_start_date"
),
cdm$drug_exposure |>
calculate_monthly_counts(.data$drug_concept_id, .data$drug_exposure_start_date),
cdm$procedure_occurrence |>
calculate_monthly_counts(.data$procedure_concept_id, .data$procedure_date),
cdm$device_exposure |>
calculate_monthly_counts(.data$device_concept_id, .data$device_exposure_start_date),
cdm$measurement |>
calculate_monthly_counts(.data$measurement_concept_id, .data$measurement_date),
cdm$observation |>
calculate_monthly_counts(.data$observation_concept_id, .data$observation_date),
cdm$specimen |>
calculate_monthly_counts(.data$specimen_concept_id, .data$specimen_date)
list(
omop_table = cdm[["specimen"]],
concept_col = "specimen_concept_id",
date_col = "specimen_date"
)
)

out <- purrr::map_dfr(arg_list, function(args) do.call(.summarise, args))

# Map concept names to the concept IDs
concept_names <- dplyr::select(cdm$concept, "concept_id", "concept_name") |>
dplyr::filter(.data$concept_id %in% out$concept_id) |>
Expand All @@ -39,31 +67,44 @@ generate_monthly_counts <- function(cdm, threshold, replacement) {
)
}


#' Calculate monthly statistics for an OMOP concept
#' Summarise record counts
#'
#' @param omop_table A table from the OMOP CDM
#' @param concept The name of the concept column to calculate statistics for
#' @param date The name of the date column to calculate statistics for
#' @param concept_col The name of the concept column to calculate statistics for
#' @param date_col The name of the date column to calculate statistics for
#' @param level The resolution at which to summarise the record counts.
#' Currently supports `"monthly"` or `"quarterly"`
#'
#' @return A `data.frame` with the following columns:
#' - `concept_id`: The concept ID
#' - `concept_name`: The concept name
#' - `date_year`: The year of the date
#' - `date_month`: The month of the date
#' - `date_month` or `date_quarter`: The month or quarter of the date, depending on `level`
#' - `person_count`: The number of unique patients per concept for each month
#' - `records_per_person`: The average number of records per person per concept for each month
#' @keywords internal
calculate_monthly_counts <- function(omop_table, concept, date) {
# Extract year and month from date column
summarise_counts <- function(omop_table, concept_col, date_col, level) {
group_by_var <- switch(level,
monthly = "date_month",
quarterly = "date_quarter",
stop(sprintf("Summary level `%s` not supported", level))
)

# Extract year, month and quarter from date column
omop_table <- dplyr::mutate(omop_table,
concept_id = {{ concept }},
date_year = as.integer(lubridate::year({{ date }})),
date_month = as.integer(lubridate::month({{ date }}))
concept_id = .data[[concept_col]],
date_year = as.integer(lubridate::year(.data[[date_col]])),
date_month = as.integer(lubridate::month(.data[[date_col]]))
)

if (level == "quarterly") {
# NOTE: lubridate::quarter is not supported for all SQL back-ends
omop_table <- omop_table |>
dplyr::mutate(date_quarter = as.integer(lubridate::quarter(.data[[date_col]])))
}

omop_table |>
dplyr::group_by(.data$date_year, .data$date_month, .data$concept_id) |>
dplyr::group_by(.data$date_year, .data[[group_by_var]], .data$concept_id) |>
dplyr::summarise(
record_count = dplyr::n(),
person_count = dplyr::n_distinct(.data$person_id),
Expand All @@ -79,7 +120,7 @@ calculate_monthly_counts <- function(omop_table, concept, date) {
dplyr::select(
"concept_id",
"date_year",
"date_month",
dplyr::all_of(group_by_var),
"record_count",
"person_count",
"records_per_person"
Expand Down
11 changes: 9 additions & 2 deletions preprocessing/R/preprocess.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,15 @@ preprocess <- function(out_path = Sys.getenv("PREPROCESS_OUT_PATH")) {

threshold <- Sys.getenv("LOW_FREQUENCY_THRESHOLD")
replacement <- Sys.getenv("LOW_FREQUENCY_REPLACEMENT")
summarise_level <- Sys.getenv("SUMMARISE_LEVEL")

cli::cli_alert_info("Summarising record counts at the '{summarise_level}' level")

cli::cli_progress_message("Generating monthly_counts table")
monthly_counts <- generate_monthly_counts(cdm, threshold = threshold, replacement = replacement)
monthly_counts <- generate_monthly_counts(cdm,
threshold = threshold, replacement = replacement,
level = summarise_level
)

cli::cli_progress_message("Generating summary_stats table")
summary_stats <- generate_summary_stats(cdm, threshold = threshold, replacement = replacement)
Expand Down Expand Up @@ -79,7 +85,8 @@ preprocess <- function(out_path = Sys.getenv("PREPROCESS_OUT_PATH")) {
"DB_USERNAME",
"DB_PASSWORD",
"LOW_FREQUENCY_THRESHOLD",
"LOW_FREQUENCY_REPLACEMENT"
"LOW_FREQUENCY_REPLACEMENT",
"SUMMARISE_LEVEL"
)

missing <- Sys.getenv(required_envvars) == ""
Expand Down
10 changes: 9 additions & 1 deletion preprocessing/man/generate_monthly_counts.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit f6c9091

Please sign in to comment.