From d718a9731ef3708b80d2470d2ea392fe82e31407 Mon Sep 17 00:00:00 2001 From: Collin Schwantes Date: Wed, 17 Jul 2024 10:12:35 -0600 Subject: [PATCH] updated docs to use markdown and refactored the validation_checks code --- DESCRIPTION | 3 +- R/class_to_col_type.R | 4 +- R/validation_checks.R | 78 ++++++++++--------- R/zzz.R | 4 +- inst/{test_pk.xlsx => test_primary_key.xlsx} | Bin inst/test_val_log_data.csv | 10 +++ inst/test_validation_log.csv | 12 +++ man/check_id_existence.Rd | 4 +- man/class_to_col_type.Rd | 16 ++-- man/create_freetext_log.Rd | 14 ++-- man/detect_language.Rd | 2 +- man/download_dropbox.Rd | 2 +- man/download_googledrive_files.Rd | 4 +- man/dropbox_upload.Rd | 2 +- man/get_odk_form_schema.Rd | 4 +- man/get_odk_responses.Rd | 6 +- man/guess_col_type.Rd | 2 +- man/othertext_lookup.Rd | 4 +- man/read_excel_all_sheets.Rd | 2 +- man/read_googlesheets.Rd | 4 +- man/remove_deletions.Rd | 6 +- man/validation_checks.Rd | 44 ++++++----- 22 files changed, 132 insertions(+), 95 deletions(-) rename inst/{test_pk.xlsx => test_primary_key.xlsx} (100%) create mode 100644 inst/test_val_log_data.csv create mode 100644 inst/test_validation_log.csv diff --git a/DESCRIPTION b/DESCRIPTION index deee6df..cb4e32a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: ohcleandat Type: Package Title: One Health Data Cleaning and Quality Checking Package -Version: 0.2.4 +Version: 0.2.5 Authors@R: c( person("Collin", "Schwantes", email = "schwantes@ecohealthalliance.org", role = c("cre", "aut"), comment = c(ORCID = "0000-0003-4014-4896")), person("Johana", "Teigen", email = "teigen@ecohealthalliance.org", role = "aut", comment = c(ORCID = "0000-0002-6209-2321")), @@ -46,3 +46,4 @@ Remotes: URL: https://ecohealthalliance.github.io/ohcleandat/ Depends: R (>= 2.10) +Roxygen: list(markdown = TRUE) diff --git a/R/class_to_col_type.R b/R/class_to_col_type.R index dee8b56..64164e1 100644 --- a/R/class_to_col_type.R +++ b/R/class_to_col_type.R @@ -11,8 +11,8 @@ #' 
\describe{ #' \item{col_type}{Type of column as described in `readr`} #' \item{col_class}{Class of R object that matches that column type} -#' \item{col_abv}{Abbreviation for that column type from `reader`} +#' \item{col_abv}{Abbreviation for that column type from `readr`} #' ... #' } -#' @seealso [reader::cols()] +#' @seealso [readr::cols()] "class_to_col_type" diff --git a/R/validation_checks.R b/R/validation_checks.R index 5d21e5b..b09675a 100644 --- a/R/validation_checks.R +++ b/R/validation_checks.R @@ -9,23 +9,20 @@ #' corrections were made as expected, some checks are performed in this function. #' #' 1. If no existing log exists > no changes are make to data -#' - Same variables -#' - same Rows -#' - No unequal values -#' +#' * Same variables +#' * same Rows +#' * No unequal values #' 2. If log exists but no changes are recommended > no changes to data. -#' - Same variables -#' - same Rows -#' - No unequal values -#' +#' * Same variables +#' * same Rows +#' * No unequal values #' 3. Log exists and changes recommended > number of changes are same as log -#' - Same variables -#' - same Rows -#' - Number of changing records in data match records in log -#' +#' * Same variables +#' * same Rows +#' * Number of changing records in data match records in log #' 4. Correct fields and records are being updated -#' - Checks before and after variables and rows are the same -#' - Checks the variable names and row indexes are the same in the logs and the changed data. +#' * Checks before and after variables and rows are the same +#' * Checks the variable names and row indexes are the same in the logs and the changed data. 
#' #' @param validation_log tibble Validation log #' @param before_data tibble Data before corrections @@ -50,8 +47,8 @@ validation_checks <- after_data, idcol) { if (!is.null(validation_log)) { - # calculate number of assumed changes from log - changes <- validation_log |> + # preprocess the log + preprocess_log <- validation_log |> dplyr::filter( is_valid == "FALSE" | is_valid == "F", !is.na(field), @@ -60,30 +57,37 @@ validation_checks <- entry != "", new_val != "" ) |> + dplyr::mutate("entry_field" = paste(entry,field,sep = "_")) |> + dplyr::mutate("entry_field_dupe" = duplicated(entry_field, + fromLast = TRUE)) # TRUE for all but the last occurrence of each entry-field combo. + + ## warn only when at least one entry-field combination is repeated + if(any(preprocess_log$entry_field_dupe)){ + rlang::warn("Detected duplicate entry-field combination. The same item has been corrected at least twice in the log") + } + + ## message about reversions + if(any(preprocess_log$old_value == preprocess_log$new_val)){ + rlang::inform("Reversion to the an original value detected in the log.") + } + + # drop duplicate or reversion changes + validation_log_filtered <- preprocess_log|> + dplyr::filter( + # keep the last entry-field item for any repeated entry-field combos + !entry_field_dupe, + ## remove any changes that are reversions in the original value + new_val != old_value) + + expected_changes <- validation_log_filtered |> dplyr::summarise(n = dplyr::n()) |> dplyr::pull(n) - val_fields <- validation_log |> - dplyr::filter( - is_valid == "FALSE" | is_valid == "F", - !is.na(field), - field != "", - !is.na(entry), - entry != "", - new_val != "" - ) |> + val_fields <- validation_log_filtered |> dplyr::pull(field) |> unique() - val_recs <- validation_log |> - dplyr::filter( - is_valid == "FALSE" | is_valid == "F", - !is.na(field), - field != "", - !is.na(entry), - entry != "", - new_val != "" - ) |> + val_recs <- validation_log_filtered |> dplyr::pull(entry) |> unique() @@ -91,12 +95,12 @@ validation_checks <- 
which(dplyr::pull(before_data[idcol]) %in% val_recs) } - # perform dataframe comparison + # perform dataframe comparison ---- cd <- arsenal::comparedf(before_data, after_data) s <- summary(cd) - ### TESTS + ### TESTS ---- # TEST: If no existing log exists > no changes are make to data # same vars @@ -133,7 +137,7 @@ validation_checks <- # number of changing records in data match records in log test3 <- if (!is.null(validation_log) & NROW(validation_log) > 0) { all( - s$comparison.summary.table[s$comparison.summary.table$statistic == "Number of values unequal", "value"] == changes, + s$comparison.summary.table[s$comparison.summary.table$statistic == "Number of values unequal", "value"] == expected_changes, s$frame.summary.table$ncol[1] == s$frame.summary.table$ncol[2], s$frame.summary.table$nrow[1] == s$frame.summary.table$nrow[2] ) diff --git a/R/zzz.R b/R/zzz.R index 6d9fb4c..dafdbcd 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -28,6 +28,8 @@ utils::globalVariables( "log_response_id", "n", "rowid", - "set_diff" + "set_diff", + "entry_field", + "entry_field_dupe" ) ) diff --git a/inst/test_pk.xlsx b/inst/test_primary_key.xlsx similarity index 100% rename from inst/test_pk.xlsx rename to inst/test_primary_key.xlsx diff --git a/inst/test_val_log_data.csv b/inst/test_val_log_data.csv new file mode 100644 index 0000000..f3bd0c3 --- /dev/null +++ b/inst/test_val_log_data.csv @@ -0,0 +1,10 @@ +ID,animal_id +id_1,abc-123 +id_2,abc-130 +id_3,abc-125 +id_4,abc-126 +id_5,abc-127 +id_6,abc-128 +id_7,abc-129 +id_9,abc-131 +id_10,ABC-132 \ No newline at end of file diff --git a/inst/test_validation_log.csv b/inst/test_validation_log.csv new file mode 100644 index 0000000..2d9907a --- /dev/null +++ b/inst/test_validation_log.csv @@ -0,0 +1,12 @@ +rowid,log_response_id,entry,field,issue,old_value,is_valid,new_val,user_initials,comments +1,23,id_1,animal_id,Results: animal_id (ID Number) improperly formatted,abc-123,F,ABC-123,,For testing duplicate entries +2,51,id_2,animal_id,Results: 
animal_id (ID Number) improperly formatted,abc-130,F,ABC-130,, +3,11,id_3,animal_id,Results: animal_id (ID Number) improperly formatted,abc-125,F,ABC-125,, +4,39,id_4,animal_id,Results: animal_id (ID Number) improperly formatted,abc-126,F,ABC-126,, +5,12,id_5,animal_id,Results: animal_id (ID Number) improperly formatted,abc-127,F,ABC-127,, +6,40,id_6,animal_id,Results: animal_id (ID Number) improperly formatted,abc-128,F,ABC-128,, +7,13,id_7,animal_id,Results: animal_id (ID Number) improperly formatted,abc-129,F,ABC-129,, +8,41,id_1,animal_id,Results: animal_id (ID Number) improperly formatted,abc-123,F,ABC-123,,For testing duplicate entries +9,5,id_9,animal_id,Results: animal_id (ID Number) improperly formatted,abc-131,F,ABC-131,, +10,33,id_10,animal_id,Results: animal_id (ID Number) improperly formatted,ABC-132,F,abc-132,,For testing reversions +11,34,id_10,animal_id,Results: animal_id (ID Number) improperly formatted,ABC-132,F,ABC-132,,For testing reversions \ No newline at end of file diff --git a/man/check_id_existence.Rd b/man/check_id_existence.Rd index 16908f3..59b3f57 100644 --- a/man/check_id_existence.Rd +++ b/man/check_id_existence.Rd @@ -22,7 +22,7 @@ tibble rows from x without a match in y } \description{ This returns rows in x without a match in y. Returning selected columns only. It -is a this wrapper around `dplyr::anti_join`. +is a this wrapper around \code{dplyr::anti_join}. 
} \examples{ \dontrun{ @@ -33,5 +33,5 @@ check_id_existence(x, } } \seealso{ -`dplyr::anti_join` +\code{dplyr::anti_join} } diff --git a/man/class_to_col_type.Rd b/man/class_to_col_type.Rd index c93e0dc..04b4307 100644 --- a/man/class_to_col_type.Rd +++ b/man/class_to_col_type.Rd @@ -5,20 +5,22 @@ \alias{class_to_col_type} \title{Class to Column Type lookup table} \format{ -## `class_to_col_type` +\subsection{\code{class_to_col_type}}{ + A data frame with 9 rows and 3 columns: \describe{ - \item{col_type}{Type of column as described in `readr`} - \item{col_class}{Class of R object that matches that column type} - \item{col_abv}{Abbreviation for that column type from `reader`} - ... +\item{col_type}{Type of column as described in \code{readr}} +\item{col_class}{Class of R object that matches that column type} +\item{col_abv}{Abbreviation for that column type from \code{readr}} +... +} } } \usage{ class_to_col_type } \description{ -A table that links classes to `readr` column types. +A table that links classes to \code{readr} column types. Created from csv file of the same name in inst/ } \details{ @@ -26,6 +28,6 @@ class_to_col_type <- read.csv(file = "inst/class_to_col_type.csv") usethis::use_data(class_to_col_type,overwrite = TRUE) } \seealso{ -[reader::cols()] +\code{\link[readr:cols]{readr::cols()}} } \keyword{datasets} diff --git a/man/create_freetext_log.Rd b/man/create_freetext_log.Rd index 28355da..1cfda34 100644 --- a/man/create_freetext_log.Rd +++ b/man/create_freetext_log.Rd @@ -14,7 +14,7 @@ create_freetext_log(response_data, form_schema, url, lookup) \item{url}{The ODK submission URL excluding the uuid identifier} \item{lookup}{a tibble formatted as a lookup to match questions with their free text responses. The format must match -the output of `othertext_lookup()`. This function can be passed to this function argument as a convenient handler for this value.} +the output of \code{othertext_lookup()}. 
This function can be passed to this function argument as a convenient handler for this value.} } \value{ data.frame validation log @@ -25,11 +25,11 @@ multi-choice options. } \details{ This function needs to link a survey question with its corresponding free text response. Users can use the -`othertext_lookup()` function to handle this, or provide their own tibble in the same format. See below: - tibble::tribble( - ~name, ~other_name, - question_1, question_1_other - ) +\code{othertext_lookup()} function to handle this, or provide their own tibble in the same format. See below: +tibble::tribble( +~name, ~other_name, +question_1, question_1_other +) } \examples{ \dontrun{ @@ -55,5 +55,5 @@ mylookup <- tibble::tribble( } \seealso{ -[ohcleandat::othertext_lookup()] +\code{\link[=othertext_lookup]{othertext_lookup()}} } diff --git a/man/detect_language.Rd b/man/detect_language.Rd index 48466e9..b3c362e 100644 --- a/man/detect_language.Rd +++ b/man/detect_language.Rd @@ -23,5 +23,5 @@ detect_language(text = "buongiorno") } \seealso{ -[stringi::stri_enc_detect()] +\code{\link[stringi:stri_enc_detect]{stringi::stri_enc_detect()}} } diff --git a/man/download_dropbox.Rd b/man/download_dropbox.Rd index a5346bc..c2ad9b1 100644 --- a/man/download_dropbox.Rd +++ b/man/download_dropbox.Rd @@ -31,5 +31,5 @@ Downloads files from dropbox into a given directory } \seealso{ -[rdrop2::drop_download()] +\code{\link[rdrop2:drop_download]{rdrop2::drop_download()}} } diff --git a/man/download_googledrive_files.Rd b/man/download_googledrive_files.Rd index e9b4bb4..3ae687a 100644 --- a/man/download_googledrive_files.Rd +++ b/man/download_googledrive_files.Rd @@ -31,7 +31,7 @@ For a given Google Drive folder this function will find and download all files matching a given pattern. 
} \details{ -Note: This relies on the `googledrive::drive_ls()` function which uses a search function +Note: This relies on the \code{googledrive::drive_ls()} function which uses a search function and is not deterministic when recursively searching. Please pay attention to what is returned. } \examples{ @@ -46,5 +46,5 @@ and is not deterministic when recursively searching. Please pay attention to wha } \seealso{ -[googledrive::drive_ls()] +\code{\link[googledrive:drive_ls]{googledrive::drive_ls()}} } diff --git a/man/dropbox_upload.Rd b/man/dropbox_upload.Rd index 3dc94d5..d824c90 100644 --- a/man/dropbox_upload.Rd +++ b/man/dropbox_upload.Rd @@ -20,7 +20,7 @@ performs drop box upload Upload a local file to dropbox and handle authentication. } \details{ -This is a wrapper of `rdrop2::drop_upload()` which first reads in a local +This is a wrapper of \code{rdrop2::drop_upload()} which first reads in a local CSV file and then uploads to a DropBox path. } \examples{ diff --git a/man/get_odk_form_schema.Rd b/man/get_odk_form_schema.Rd index 5af7bb3..3444291 100644 --- a/man/get_odk_form_schema.Rd +++ b/man/get_odk_form_schema.Rd @@ -28,7 +28,7 @@ This function handles the authentication and pulling of questionnaire form schema information. } \details{ -This is a wrapper around the `ruODK` package. It handles the setup and +This is a wrapper around the \code{ruODK} package. It handles the setup and authentication. See \url{https://github.com/ropensci/ruODK} } \examples{ @@ -41,5 +41,5 @@ authentication. See \url{https://github.com/ropensci/ruODK} } \seealso{ -[ruODK::form_schema_ext()] +\code{\link[ruODK:form_schema_ext]{ruODK::form_schema_ext()}} } diff --git a/man/get_odk_responses.Rd b/man/get_odk_responses.Rd index a554382..a490289 100644 --- a/man/get_odk_responses.Rd +++ b/man/get_odk_responses.Rd @@ -26,10 +26,10 @@ data.frame of flattened survey responses \description{ This function handles the authentication and pulling of responses data for ODK Questionnaires. 
The raw return list is 'rectangularized' into -a data frame first. See the `ruODK` package for more info on how this happens. +a data frame first. See the \code{ruODK} package for more info on how this happens. } \details{ -This is a wrapper around the `ruODK` package. It handles the setup and +This is a wrapper around the \code{ruODK} package. It handles the setup and authentication. See \url{https://github.com/ropensci/ruODK} } \examples{ @@ -41,5 +41,5 @@ authentication. See \url{https://github.com/ropensci/ruODK} } } \seealso{ -[ruODK::form_schema_ext()] +\code{\link[ruODK:form_schema_ext]{ruODK::form_schema_ext()}} } diff --git a/man/guess_col_type.Rd b/man/guess_col_type.Rd index e87ad72..7c571d8 100644 --- a/man/guess_col_type.Rd +++ b/man/guess_col_type.Rd @@ -9,7 +9,7 @@ guess_col_type(data, default_col_abv = "c") \arguments{ \item{data}{data.frame Data who column types you would like to guess} -\item{default_col_abv}{string. Column type abbreviation from [readr::cols()]. +\item{default_col_abv}{string. Column type abbreviation from \code{\link[readr:cols]{readr::cols()}}. Use "g" to guess the column type.} } \value{ diff --git a/man/othertext_lookup.Rd b/man/othertext_lookup.Rd index 656be20..9ca03d6 100644 --- a/man/othertext_lookup.Rd +++ b/man/othertext_lookup.Rd @@ -23,8 +23,8 @@ link in the response data to match the captured responses and the other free-tex collected. This function provides a manual look up reference so free text responses can be compared to the original questions in the validation workflow. -This function can be expanded by providing a tibble with two columns: `name` and -`other_name` which maps the question name in ODK to the question name containing +This function can be expanded by providing a tibble with two columns: \code{name} and +\code{other_name} which maps the question name in ODK to the question name containing 'other' or 'free text'. 
} \examples{ diff --git a/man/read_excel_all_sheets.Rd b/man/read_excel_all_sheets.Rd index 1c4928e..d944e4b 100644 --- a/man/read_excel_all_sheets.Rd +++ b/man/read_excel_all_sheets.Rd @@ -32,7 +32,7 @@ with data. } \note{ The primary key method is possible because Excel forces sheet names - to be unique. +to be unique. } \examples{ \dontrun{ diff --git a/man/read_googlesheets.Rd b/man/read_googlesheets.Rd index 81f57a6..99d38bc 100644 --- a/man/read_googlesheets.Rd +++ b/man/read_googlesheets.Rd @@ -24,7 +24,7 @@ read_googlesheets( \item{primary_key}{character. The column name for the unique identifier to be added to the data.} -\item{...}{other arguments passed to `googlesheets4::range_read()`} +\item{...}{other arguments passed to \code{googlesheets4::range_read()}} } \value{ tibble @@ -39,5 +39,5 @@ read_googlesheets(ss = kzn_animal_ship_sheets, sheet = "all",) } \seealso{ -[googlesheets4::range_read()] +\code{\link[googlesheets4:range_read]{googlesheets4::range_read()}} } diff --git a/man/remove_deletions.Rd b/man/remove_deletions.Rd index ce2d7db..2f15b29 100644 --- a/man/remove_deletions.Rd +++ b/man/remove_deletions.Rd @@ -18,11 +18,11 @@ logical vector Filters for records matching a given string. } \details{ -To be used within `dplyr::filter()`. The function returns a logical vector -with TRUE resulting from values that are not equal to the `val` argument. Also +To be used within \code{dplyr::filter()}. The function returns a logical vector +with TRUE resulting from values that are not equal to the \code{val} argument. Also protects from NA values. -Used within verbs such as `tidyselect::all_of()` this can work effectively across all +Used within verbs such as \code{tidyselect::all_of()} this can work effectively across all columns in a data frame. 
See examples } \examples{ diff --git a/man/validation_checks.Rd b/man/validation_checks.Rd index 7c29576..ec69116 100644 --- a/man/validation_checks.Rd +++ b/man/validation_checks.Rd @@ -25,25 +25,31 @@ Validation correction tests to be run on data before and after validation to tes As part of the OH cleaning pipelines, raw data is converted to 'semi-clean' data through a process of upserting records from an external Validation Log. To ensure these corrections were made as expected, some checks are performed in this function. - -1. If no existing log exists > no changes are make to data - - Same variables - - same Rows - - No unequal values - -2. If log exists but no changes are recommended > no changes to data. - - Same variables - - same Rows - - No unequal values - -3. Log exists and changes recommended > number of changes are same as log - - Same variables - - same Rows - - Number of changing records in data match records in log - -4. Correct fields and records are being updated - - Checks before and after variables and rows are the same - - Checks the variable names and row indexes are the same in the logs and the changed data. +\enumerate{ +\item If no existing log exists > no changes are make to data +\itemize{ +\item Same variables +\item same Rows +\item No unequal values +} +\item If log exists but no changes are recommended > no changes to data. +\itemize{ +\item Same variables +\item same Rows +\item No unequal values +} +\item Log exists and changes recommended > number of changes are same as log +\itemize{ +\item Same variables +\item same Rows +\item Number of changing records in data match records in log +} +\item Correct fields and records are being updated +\itemize{ +\item Checks before and after variables and rows are the same +\item Checks the variable names and row indexes are the same in the logs and the changed data. +} +} } \examples{ \dontrun{