From d718a9731ef3708b80d2470d2ea392fe82e31407 Mon Sep 17 00:00:00 2001
From: Collin Schwantes <schwantes@ecohealthalliance.org>
Date: Wed, 17 Jul 2024 10:12:35 -0600
Subject: [PATCH] updated docs to use markdown and refactored the
 validation_checks code

---
 DESCRIPTION                                  |   3 +-
 R/class_to_col_type.R                        |   4 +-
 R/validation_checks.R                        |  78 ++++++++++---------
 R/zzz.R                                      |   4 +-
 inst/{test_pk.xlsx => test_primary_key.xlsx} | Bin
 inst/test_val_log_data.csv                   |  10 +++
 inst/test_validation_log.csv                 |  12 +++
 man/check_id_existence.Rd                    |   4 +-
 man/class_to_col_type.Rd                     |  16 ++--
 man/create_freetext_log.Rd                   |  14 ++--
 man/detect_language.Rd                       |   2 +-
 man/download_dropbox.Rd                      |   2 +-
 man/download_googledrive_files.Rd            |   4 +-
 man/dropbox_upload.Rd                        |   2 +-
 man/get_odk_form_schema.Rd                   |   4 +-
 man/get_odk_responses.Rd                     |   6 +-
 man/guess_col_type.Rd                        |   2 +-
 man/othertext_lookup.Rd                      |   4 +-
 man/read_excel_all_sheets.Rd                 |   2 +-
 man/read_googlesheets.Rd                     |   4 +-
 man/remove_deletions.Rd                      |   6 +-
 man/validation_checks.Rd                     |  44 ++++++-----
 22 files changed, 132 insertions(+), 95 deletions(-)
 rename inst/{test_pk.xlsx => test_primary_key.xlsx} (100%)
 create mode 100644 inst/test_val_log_data.csv
 create mode 100644 inst/test_validation_log.csv

diff --git a/DESCRIPTION b/DESCRIPTION
index deee6df..cb4e32a 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: ohcleandat
 Type: Package
 Title: One Health Data Cleaning and Quality Checking Package
-Version: 0.2.4
+Version: 0.2.5
 Authors@R: c(
     person("Collin", "Schwantes", email = "schwantes@ecohealthalliance.org", role = c("cre", "aut"), comment = c(ORCID = "0000-0003-4014-4896")),
     person("Johana", "Teigen", email = "teigen@ecohealthalliance.org", role = "aut", comment = c(ORCID = "0000-0002-6209-2321")),
@@ -46,3 +46,4 @@ Remotes:
 URL: https://ecohealthalliance.github.io/ohcleandat/
 Depends: 
     R (>= 2.10)
+Roxygen: list(markdown = TRUE)
diff --git a/R/class_to_col_type.R b/R/class_to_col_type.R
index dee8b56..64164e1 100644
--- a/R/class_to_col_type.R
+++ b/R/class_to_col_type.R
@@ -11,8 +11,8 @@
 #' \describe{
 #'   \item{col_type}{Type of column as described in `readr`}
 #'   \item{col_class}{Class of R object that matches that column type}
-#'   \item{col_abv}{Abbreviation for that column type from `reader`}
+#'   \item{col_abv}{Abbreviation for that column type from `readr`}
 #'   ...
 #' }
-#' @seealso [reader::cols()]
+#' @seealso [readr::cols()]
 "class_to_col_type"
diff --git a/R/validation_checks.R b/R/validation_checks.R
index 5d21e5b..b09675a 100644
--- a/R/validation_checks.R
+++ b/R/validation_checks.R
@@ -9,23 +9,20 @@
 #' corrections were made as expected, some checks are performed in this function.
 #'
 #' 1. If no existing log exists > no changes are make to data
-#'   - Same variables
-#'   - same Rows
-#'   - No unequal values
-#'
+#'    * Same variables
+#'    * same Rows
+#'    * No unequal values
 #' 2. If log exists but no changes are recommended > no changes to data.
-#'   - Same variables
-#'   - same Rows
-#'   - No unequal values
-#'
+#'    * Same variables
+#'    * same Rows
+#'    * No unequal values
 #' 3. Log exists and changes recommended > number of changes are same as log
-#'   - Same variables
-#'   - same Rows
-#'   - Number of changing records in data match records in log
-#'
+#'    * Same variables
+#'    * same Rows
+#'    * Number of changing records in data match records in log
 #' 4. Correct fields and records are being updated
-#'   - Checks before and after variables and rows are the same
-#'   - Checks the variable names and row indexes are the same in the logs and the changed data.
+#'    * Checks before and after variables and rows are the same
+#'    * Checks the variable names and row indexes are the same in the logs and the changed data.
 #'
 #' @param validation_log tibble Validation log
 #' @param before_data tibble Data before corrections
@@ -50,8 +47,8 @@ validation_checks <-
            after_data,
            idcol) {
     if (!is.null(validation_log)) {
-      # calculate number of assumed changes from log
-      changes <- validation_log |>
+      # preprocess the log
+      preprocess_log <- validation_log |>
         dplyr::filter(
           is_valid == "FALSE" | is_valid == "F",
           !is.na(field),
@@ -60,30 +57,37 @@ validation_checks <-
           entry != "",
           new_val != ""
         ) |>
+        dplyr::mutate("entry_field" = paste(entry, field, sep = "_")) |>
+        # flag all but the last occurrence of each repeated entry-field combo
+        dplyr::mutate("entry_field_dupe" = duplicated(entry_field, fromLast = TRUE))
+
+      ## warn only when at least one entry-field combo really is repeated
+      if (any(preprocess_log$entry_field_dupe)) {
+        rlang::warn("Detected duplicate entry-field combination. The same item has been corrected at least twice in the log")
+      }
+
+      ## message about reversions; na.rm stops an NA old_value making the condition NA
+      if (any(preprocess_log$old_value == preprocess_log$new_val, na.rm = TRUE)) {
+        rlang::inform("Reversion to the an original value detected in the log.")
+      }
+
+      # drop duplicate or reversion changes
+      validation_log_filtered <- preprocess_log |>
+        dplyr::filter(
+          # keep the last entry-field item for any repeated entry-field combos
+          !entry_field_dupe,
+          # drop reversions, but keep corrections that fill a missing (NA) old_value
+          is.na(old_value) | new_val != old_value)
+
+      expected_changes <- validation_log_filtered |>
         dplyr::summarise(n = dplyr::n()) |>
         dplyr::pull(n)
 
-      val_fields <- validation_log |>
-        dplyr::filter(
-          is_valid == "FALSE" | is_valid == "F",
-          !is.na(field),
-          field != "",
-          !is.na(entry),
-          entry != "",
-          new_val != ""
-        ) |>
+      val_fields <- validation_log_filtered |>
         dplyr::pull(field) |>
         unique()
 
-      val_recs <- validation_log |>
-        dplyr::filter(
-          is_valid == "FALSE" | is_valid == "F",
-          !is.na(field),
-          field != "",
-          !is.na(entry),
-          entry != "",
-          new_val != ""
-        ) |>
+      val_recs <- validation_log_filtered |>
         dplyr::pull(entry) |>
         unique()
 
@@ -91,12 +95,12 @@ validation_checks <-
         which(dplyr::pull(before_data[idcol]) %in% val_recs)
     }
 
-    # perform dataframe comparison
+    # perform dataframe comparison ----
 
     cd <- arsenal::comparedf(before_data, after_data)
     s <- summary(cd)
 
-    ### TESTS
+    ### TESTS ----
 
     # TEST: If no existing log exists > no changes are make to data
     # same vars
@@ -133,7 +137,7 @@ validation_checks <-
     # number of changing records in data match records in log
     test3 <- if (!is.null(validation_log) & NROW(validation_log) > 0) {
       all(
-        s$comparison.summary.table[s$comparison.summary.table$statistic == "Number of values unequal", "value"] == changes,
+        s$comparison.summary.table[s$comparison.summary.table$statistic == "Number of values unequal", "value"] == expected_changes,
         s$frame.summary.table$ncol[1] ==  s$frame.summary.table$ncol[2],
         s$frame.summary.table$nrow[1] ==  s$frame.summary.table$nrow[2]
       )
diff --git a/R/zzz.R b/R/zzz.R
index 6d9fb4c..dafdbcd 100644
--- a/R/zzz.R
+++ b/R/zzz.R
@@ -28,6 +28,8 @@ utils::globalVariables(
   "log_response_id",
   "n",
   "rowid",
-  "set_diff"
+  "set_diff",
+  "entry_field",
+  "entry_field_dupe"
   )
 )
diff --git a/inst/test_pk.xlsx b/inst/test_primary_key.xlsx
similarity index 100%
rename from inst/test_pk.xlsx
rename to inst/test_primary_key.xlsx
diff --git a/inst/test_val_log_data.csv b/inst/test_val_log_data.csv
new file mode 100644
index 0000000..f3bd0c3
--- /dev/null
+++ b/inst/test_val_log_data.csv
@@ -0,0 +1,10 @@
+﻿ID,animal_id
+id_1,abc-123
+id_2,abc-130
+id_3,abc-125
+id_4,abc-126
+id_5,abc-127
+id_6,abc-128
+id_7,abc-129
+id_9,abc-131
+id_10,ABC-132
\ No newline at end of file
diff --git a/inst/test_validation_log.csv b/inst/test_validation_log.csv
new file mode 100644
index 0000000..2d9907a
--- /dev/null
+++ b/inst/test_validation_log.csv
@@ -0,0 +1,12 @@
+rowid,log_response_id,entry,field,issue,old_value,is_valid,new_val,user_initials,comments
+1,23,id_1,animal_id,Results: animal_id (ID Number) improperly formatted,abc-123,F,ABC-123,,For testing duplicate entries
+2,51,id_2,animal_id,Results: animal_id (ID Number) improperly formatted,abc-130,F,ABC-130,,
+3,11,id_3,animal_id,Results: animal_id (ID Number) improperly formatted,abc-125,F,ABC-125,,
+4,39,id_4,animal_id,Results: animal_id (ID Number) improperly formatted,abc-126,F,ABC-126,,
+5,12,id_5,animal_id,Results: animal_id (ID Number) improperly formatted,abc-127,F,ABC-127,,
+6,40,id_6,animal_id,Results: animal_id (ID Number) improperly formatted,abc-128,F,ABC-128,,
+7,13,id_7,animal_id,Results: animal_id (ID Number) improperly formatted,abc-129,F,ABC-129,,
+8,41,id_1,animal_id,Results: animal_id (ID Number) improperly formatted,abc-123,F,ABC-123,,For testing duplicate entries
+9,5,id_9,animal_id,Results: animal_id (ID Number) improperly formatted,abc-131,F,ABC-131,,
+10,33,id_10,animal_id,Results: animal_id (ID Number) improperly formatted,ABC-132,F,abc-132,,For testing reversions
+11,34,id_10,animal_id,Results: animal_id (ID Number) improperly formatted,ABC-132,F,ABC-132,,For testing reversions
\ No newline at end of file
diff --git a/man/check_id_existence.Rd b/man/check_id_existence.Rd
index 16908f3..59b3f57 100644
--- a/man/check_id_existence.Rd
+++ b/man/check_id_existence.Rd
@@ -22,7 +22,7 @@ tibble rows from x without a match in y
 }
 \description{
 This returns rows in x without a match in y. Returning selected columns only. It
-is a this wrapper around `dplyr::anti_join`.
+is a this wrapper around \code{dplyr::anti_join}.
 }
 \examples{
 \dontrun{
@@ -33,5 +33,5 @@ check_id_existence(x,
 }
 }
 \seealso{
-`dplyr::anti_join`
+\code{dplyr::anti_join}
 }
diff --git a/man/class_to_col_type.Rd b/man/class_to_col_type.Rd
index c93e0dc..04b4307 100644
--- a/man/class_to_col_type.Rd
+++ b/man/class_to_col_type.Rd
@@ -5,20 +5,22 @@
 \alias{class_to_col_type}
 \title{Class to Column Type lookup table}
 \format{
-## `class_to_col_type`
+\subsection{\code{class_to_col_type}}{
+
 A data frame with 9 rows and 3 columns:
 \describe{
-  \item{col_type}{Type of column as described in `readr`}
-  \item{col_class}{Class of R object that matches that column type}
-  \item{col_abv}{Abbreviation for that column type from `reader`}
-  ...
+\item{col_type}{Type of column as described in \code{readr}}
+\item{col_class}{Class of R object that matches that column type}
+\item{col_abv}{Abbreviation for that column type from \code{readr}}
+...
+}
 }
 }
 \usage{
 class_to_col_type
 }
 \description{
-A table that links classes to `readr` column types.
+A table that links classes to \code{readr} column types.
 Created from csv file of the same name in inst/
 }
 \details{
@@ -26,6 +28,6 @@ class_to_col_type <- read.csv(file = "inst/class_to_col_type.csv")
 usethis::use_data(class_to_col_type,overwrite = TRUE)
 }
 \seealso{
-[reader::cols()]
+\code{\link[readr:cols]{readr::cols()}}
 }
 \keyword{datasets}
diff --git a/man/create_freetext_log.Rd b/man/create_freetext_log.Rd
index 28355da..1cfda34 100644
--- a/man/create_freetext_log.Rd
+++ b/man/create_freetext_log.Rd
@@ -14,7 +14,7 @@ create_freetext_log(response_data, form_schema, url, lookup)
 \item{url}{The ODK submission URL excluding the uuid identifier}
 
 \item{lookup}{a tibble formatted as a lookup to match questions with their free text responses. The format must match
-the output of `othertext_lookup()`. This function can be passed to this function argument as a convenient handler for this value.}
+the output of \code{othertext_lookup()}. This function can be passed to this function argument as a convenient handler for this value.}
 }
 \value{
 data.frame validation log
@@ -25,11 +25,11 @@ multi-choice options.
 }
 \details{
 This function needs to link a survey question with its corresponding free text response. Users can use the
-`othertext_lookup()` function to handle this, or provide their own tibble in the same format. See below:
- tibble::tribble(
- ~name, ~other_name,
- question_1, question_1_other
- )
+\code{othertext_lookup()} function to handle this, or provide their own tibble in the same format. See below:
+tibble::tribble(
+~name, ~other_name,
+question_1, question_1_other
+)
 }
 \examples{
 \dontrun{
@@ -55,5 +55,5 @@ mylookup <- tibble::tribble(
 
 }
 \seealso{
-[ohcleandat::othertext_lookup()]
+\code{\link[=othertext_lookup]{othertext_lookup()}}
 }
diff --git a/man/detect_language.Rd b/man/detect_language.Rd
index 48466e9..b3c362e 100644
--- a/man/detect_language.Rd
+++ b/man/detect_language.Rd
@@ -23,5 +23,5 @@ detect_language(text = "buongiorno")
 
 }
 \seealso{
-[stringi::stri_enc_detect()]
+\code{\link[stringi:stri_enc_detect]{stringi::stri_enc_detect()}}
 }
diff --git a/man/download_dropbox.Rd b/man/download_dropbox.Rd
index a5346bc..c2ad9b1 100644
--- a/man/download_dropbox.Rd
+++ b/man/download_dropbox.Rd
@@ -31,5 +31,5 @@ Downloads files from dropbox into a given directory
 
 }
 \seealso{
-[rdrop2::drop_download()]
+\code{\link[rdrop2:drop_download]{rdrop2::drop_download()}}
 }
diff --git a/man/download_googledrive_files.Rd b/man/download_googledrive_files.Rd
index e9b4bb4..3ae687a 100644
--- a/man/download_googledrive_files.Rd
+++ b/man/download_googledrive_files.Rd
@@ -31,7 +31,7 @@ For a given Google Drive folder this function will find and download all files
 matching a given pattern.
 }
 \details{
-Note: This relies on the `googledrive::drive_ls()` function which uses a search function
+Note: This relies on the \code{googledrive::drive_ls()} function which uses a search function
 and is not deterministic when recursively searching. Please pay attention to what is returned.
 }
 \examples{
@@ -46,5 +46,5 @@ and is not deterministic when recursively searching. Please pay attention to wha
 
 }
 \seealso{
-[googledrive::drive_ls()]
+\code{\link[googledrive:drive_ls]{googledrive::drive_ls()}}
 }
diff --git a/man/dropbox_upload.Rd b/man/dropbox_upload.Rd
index 3dc94d5..d824c90 100644
--- a/man/dropbox_upload.Rd
+++ b/man/dropbox_upload.Rd
@@ -20,7 +20,7 @@ performs drop box upload
 Upload a local file to dropbox and handle authentication.
 }
 \details{
-This is a wrapper of `rdrop2::drop_upload()` which first reads in a local
+This is a wrapper of \code{rdrop2::drop_upload()} which first reads in a local
 CSV file and then uploads to a DropBox path.
 }
 \examples{
diff --git a/man/get_odk_form_schema.Rd b/man/get_odk_form_schema.Rd
index 5af7bb3..3444291 100644
--- a/man/get_odk_form_schema.Rd
+++ b/man/get_odk_form_schema.Rd
@@ -28,7 +28,7 @@ This function handles the authentication and pulling of questionnaire
 form schema information.
 }
 \details{
-This is a wrapper around the `ruODK` package. It handles the setup and
+This is a wrapper around the \code{ruODK} package. It handles the setup and
 authentication. See \url{https://github.com/ropensci/ruODK}
 }
 \examples{
@@ -41,5 +41,5 @@ authentication. See \url{https://github.com/ropensci/ruODK}
 
 }
 \seealso{
-[ruODK::form_schema_ext()]
+\code{\link[ruODK:form_schema_ext]{ruODK::form_schema_ext()}}
 }
diff --git a/man/get_odk_responses.Rd b/man/get_odk_responses.Rd
index a554382..a490289 100644
--- a/man/get_odk_responses.Rd
+++ b/man/get_odk_responses.Rd
@@ -26,10 +26,10 @@ data.frame of flattened survey responses
 \description{
 This function handles the authentication and pulling of responses
 data for ODK Questionnaires. The raw return list is 'rectangularized' into
-a data frame first. See the `ruODK` package for more info on how this happens.
+a data frame first. See the \code{ruODK} package for more info on how this happens.
 }
 \details{
-This is a wrapper around the `ruODK` package. It handles the setup and
+This is a wrapper around the \code{ruODK} package. It handles the setup and
 authentication. See \url{https://github.com/ropensci/ruODK}
 }
 \examples{
@@ -41,5 +41,5 @@ authentication. See \url{https://github.com/ropensci/ruODK}
 }
 }
 \seealso{
-[ruODK::form_schema_ext()]
+\code{\link[ruODK:form_schema_ext]{ruODK::form_schema_ext()}}
 }
diff --git a/man/guess_col_type.Rd b/man/guess_col_type.Rd
index e87ad72..7c571d8 100644
--- a/man/guess_col_type.Rd
+++ b/man/guess_col_type.Rd
@@ -9,7 +9,7 @@ guess_col_type(data, default_col_abv = "c")
 \arguments{
 \item{data}{data.frame Data who column types you would like to guess}
 
-\item{default_col_abv}{string. Column type abbreviation from [readr::cols()].
+\item{default_col_abv}{string. Column type abbreviation from \code{\link[readr:cols]{readr::cols()}}.
 Use "g" to guess the column type.}
 }
 \value{
diff --git a/man/othertext_lookup.Rd b/man/othertext_lookup.Rd
index 656be20..9ca03d6 100644
--- a/man/othertext_lookup.Rd
+++ b/man/othertext_lookup.Rd
@@ -23,8 +23,8 @@ link in the response data to match the captured responses and the other free-tex
 collected. This function provides a manual look up reference so free text responses
 can be compared to the original questions in the validation workflow.
 
-This function can be expanded by providing a tibble with two columns: `name` and
-`other_name` which maps the question name in ODK to the question name containing
+This function can be expanded by providing a tibble with two columns: \code{name} and
+\code{other_name} which maps the question name in ODK to the question name containing
 'other' or 'free text'.
 }
 \examples{
diff --git a/man/read_excel_all_sheets.Rd b/man/read_excel_all_sheets.Rd
index 1c4928e..d944e4b 100644
--- a/man/read_excel_all_sheets.Rd
+++ b/man/read_excel_all_sheets.Rd
@@ -32,7 +32,7 @@ with data.
 }
 \note{
 The primary key method is possible because Excel forces sheet names
- to be unique.
+to be unique.
 }
 \examples{
  \dontrun{
diff --git a/man/read_googlesheets.Rd b/man/read_googlesheets.Rd
index 81f57a6..99d38bc 100644
--- a/man/read_googlesheets.Rd
+++ b/man/read_googlesheets.Rd
@@ -24,7 +24,7 @@ read_googlesheets(
 
 \item{primary_key}{character. The column name for the unique identifier to be added to the data.}
 
-\item{...}{other arguments passed to `googlesheets4::range_read()`}
+\item{...}{other arguments passed to \code{googlesheets4::range_read()}}
 }
 \value{
 tibble
@@ -39,5 +39,5 @@ read_googlesheets(ss = kzn_animal_ship_sheets, sheet = "all",)
 
 }
 \seealso{
-[googlesheets4::range_read()]
+\code{\link[googlesheets4:range_read]{googlesheets4::range_read()}}
 }
diff --git a/man/remove_deletions.Rd b/man/remove_deletions.Rd
index ce2d7db..2f15b29 100644
--- a/man/remove_deletions.Rd
+++ b/man/remove_deletions.Rd
@@ -18,11 +18,11 @@ logical vector
 Filters for records matching a given string.
 }
 \details{
-To be used within `dplyr::filter()`. The function returns a logical vector
-with TRUE resulting from values that are not equal to the `val` argument. Also
+To be used within \code{dplyr::filter()}. The function returns a logical vector
+with TRUE resulting from values that are not equal to the \code{val} argument. Also
 protects from NA values.
 
-Used within verbs such as `tidyselect::all_of()` this can work effectively across all
+Used within verbs such as \code{tidyselect::all_of()} this can work effectively across all
 columns in a data frame. See examples
 }
 \examples{
diff --git a/man/validation_checks.Rd b/man/validation_checks.Rd
index 7c29576..ec69116 100644
--- a/man/validation_checks.Rd
+++ b/man/validation_checks.Rd
@@ -25,25 +25,31 @@ Validation correction tests to be run on data before and after validation to tes
 As part of the OH cleaning pipelines, raw data is converted to 'semi-clean' data
 through a process of upserting records from an external Validation Log. To ensure these
 corrections were made as expected, some checks are performed in this function.
-
-1. If no existing log exists > no changes are make to data
-  - Same variables
-  - same Rows
-  - No unequal values
-
-2. If log exists but no changes are recommended > no changes to data.
-  - Same variables
-  - same Rows
-  - No unequal values
-
-3. Log exists and changes recommended > number of changes are same as log
-  - Same variables
-  - same Rows
-  - Number of changing records in data match records in log
-
-4. Correct fields and records are being updated
-  - Checks before and after variables and rows are the same
-  - Checks the variable names and row indexes are the same in the logs and the changed data.
+\enumerate{
+\item If no existing log exists > no changes are make to data
+\itemize{
+\item Same variables
+\item same Rows
+\item No unequal values
+}
+\item If log exists but no changes are recommended > no changes to data.
+\itemize{
+\item Same variables
+\item same Rows
+\item No unequal values
+}
+\item Log exists and changes recommended > number of changes are same as log
+\itemize{
+\item Same variables
+\item same Rows
+\item Number of changing records in data match records in log
+}
+\item Correct fields and records are being updated
+\itemize{
+\item Checks before and after variables and rows are the same
+\item Checks the variable names and row indexes are the same in the logs and the changed data.
+}
+}
 }
 \examples{
 \dontrun{