Skip to content

Commit

Permalink
updated docs to use markdown and refactored the validation_checks code
Browse files Browse the repository at this point in the history
  • Loading branch information
collinschwantes committed Jul 17, 2024
1 parent 91c2e62 commit d718a97
Show file tree
Hide file tree
Showing 22 changed files with 132 additions and 95 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: ohcleandat
Type: Package
Title: One Health Data Cleaning and Quality Checking Package
Version: 0.2.4
Version: 0.2.5
Authors@R: c(
person("Collin", "Schwantes", email = "[email protected]", role = c("cre", "aut"), comment = c(ORCID = "0000-0003-4014-4896")),
person("Johana", "Teigen", email = "[email protected]", role = "aut", comment = c(ORCID = "0000-0002-6209-2321")),
Expand Down Expand Up @@ -46,3 +46,4 @@ Remotes:
URL: https://ecohealthalliance.github.io/ohcleandat/
Depends:
R (>= 2.10)
Roxygen: list(markdown = TRUE)
4 changes: 2 additions & 2 deletions R/class_to_col_type.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
#' \describe{
#' \item{col_type}{Type of column as described in `readr`}
#' \item{col_class}{Class of R object that matches that column type}
#' \item{col_abv}{Abbreviation for that column type from `reader`}
#' \item{col_abv}{Abbreviation for that column type from `readr`}
#' ...
#' }
#' @seealso [reader::cols()]
#' @seealso [readr::cols()]
"class_to_col_type"
78 changes: 41 additions & 37 deletions R/validation_checks.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,23 +9,20 @@
#' corrections were made as expected, some checks are performed in this function.
#'
#' 1. If no existing log exists > no changes are made to data
#' - Same variables
#' - same Rows
#' - No unequal values
#'
#' * Same variables
#' * same Rows
#' * No unequal values
#' 2. If log exists but no changes are recommended > no changes to data.
#' - Same variables
#' - same Rows
#' - No unequal values
#'
#' * Same variables
#' * same Rows
#' * No unequal values
#' 3. Log exists and changes recommended > number of changes are same as log
#' - Same variables
#' - same Rows
#' - Number of changing records in data match records in log
#'
#' * Same variables
#' * same Rows
#' * Number of changing records in data match records in log
#' 4. Correct fields and records are being updated
#' - Checks before and after variables and rows are the same
#' - Checks the variable names and row indexes are the same in the logs and the changed data.
#' * Checks before and after variables and rows are the same
#' * Checks the variable names and row indexes are the same in the logs and the changed data.
#'
#' @param validation_log tibble Validation log
#' @param before_data tibble Data before corrections
Expand All @@ -50,8 +47,8 @@ validation_checks <-
after_data,
idcol) {
if (!is.null(validation_log)) {
# calculate number of assumed changes from log
changes <- validation_log |>
# preprocess the log
preprocess_log <- validation_log |>
dplyr::filter(
is_valid == "FALSE" | is_valid == "F",
!is.na(field),
Expand All @@ -60,43 +57,50 @@ validation_checks <-
entry != "",
new_val != ""
) |>
dplyr::mutate("entry_field" = paste(entry,field,sep = "_")) |>
dplyr::mutate("entry_field_dupe" = duplicated(entry_field,
fromLast = TRUE)) # check for duplicate entry-field combos.

## warning message for duplicate field and entry items
if(any(!preprocess_log$entry_field_dupe)){
rlang::warn("Detected duplicate entry-field combination. The same item has been corrected at least twice in the log")
}

## message about reversions
if(any(preprocess_log$old_value == preprocess_log$new_val)){
rlang::inform("Reversion to the an original value detected in the log.")
}

# drop duplicate or reversion changes
validation_log_filtered <- preprocess_log|>
dplyr::filter(
# keep the last entry-field item for any repeated entry-field combos
!entry_field_dupe,
## remove any changes that are reversions to the original value
new_val != old_value)

expected_changes <- validation_log_filtered |>
dplyr::summarise(n = dplyr::n()) |>
dplyr::pull(n)

val_fields <- validation_log |>
dplyr::filter(
is_valid == "FALSE" | is_valid == "F",
!is.na(field),
field != "",
!is.na(entry),
entry != "",
new_val != ""
) |>
val_fields <- validation_log_filtered |>
dplyr::pull(field) |>
unique()

val_recs <- validation_log |>
dplyr::filter(
is_valid == "FALSE" | is_valid == "F",
!is.na(field),
field != "",
!is.na(entry),
entry != "",
new_val != ""
) |>
val_recs <- validation_log_filtered |>
dplyr::pull(entry) |>
unique()

val_recs_idx <-
which(dplyr::pull(before_data[idcol]) %in% val_recs)
}

# perform dataframe comparison
# perform dataframe comparison ----

cd <- arsenal::comparedf(before_data, after_data)
s <- summary(cd)

### TESTS
### TESTS ----

# TEST: If no existing log exists > no changes are made to data
# same vars
Expand Down Expand Up @@ -133,7 +137,7 @@ validation_checks <-
# number of changing records in data match records in log
test3 <- if (!is.null(validation_log) & NROW(validation_log) > 0) {
all(
s$comparison.summary.table[s$comparison.summary.table$statistic == "Number of values unequal", "value"] == changes,
s$comparison.summary.table[s$comparison.summary.table$statistic == "Number of values unequal", "value"] == expected_changes,
s$frame.summary.table$ncol[1] == s$frame.summary.table$ncol[2],
s$frame.summary.table$nrow[1] == s$frame.summary.table$nrow[2]
)
Expand Down
4 changes: 3 additions & 1 deletion R/zzz.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ utils::globalVariables(
"log_response_id",
"n",
"rowid",
"set_diff"
"set_diff",
"entry_field",
"entry_field_dupe"
)
)
File renamed without changes.
10 changes: 10 additions & 0 deletions inst/test_val_log_data.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
ID,animal_id
id_1,abc-123
id_2,abc-130
id_3,abc-125
id_4,abc-126
id_5,abc-127
id_6,abc-128
id_7,abc-129
id_9,abc-131
id_10,ABC-132
12 changes: 12 additions & 0 deletions inst/test_validation_log.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
rowid,log_response_id,entry,field,issue,old_value,is_valid,new_val,user_initials,comments
1,23,id_1,animal_id,Results: animal_id (ID Number) improperly formatted,abc-123,F,ABC-123,,For testing duplicate entries
2,51,id_2,animal_id,Results: animal_id (ID Number) improperly formatted,abc-130,F,ABC-130,,
3,11,id_3,animal_id,Results: animal_id (ID Number) improperly formatted,abc-125,F,ABC-125,,
4,39,id_4,animal_id,Results: animal_id (ID Number) improperly formatted,abc-126,F,ABC-126,,
5,12,id_5,animal_id,Results: animal_id (ID Number) improperly formatted,abc-127,F,ABC-127,,
6,40,id_6,animal_id,Results: animal_id (ID Number) improperly formatted,abc-128,F,ABC-128,,
7,13,id_7,animal_id,Results: animal_id (ID Number) improperly formatted,abc-129,F,ABC-129,,
8,41,id_1,animal_id,Results: animal_id (ID Number) improperly formatted,abc-123,F,ABC-123,,For testing duplicate entries
9,5,id_9,animal_id,Results: animal_id (ID Number) improperly formatted,abc-131,F,ABC-131,,
10,33,id_10,animal_id,Results: animal_id (ID Number) improperly formatted,ABC-132,F,abc-132,,For testing reversions
11,34,id_10,animal_id,Results: animal_id (ID Number) improperly formatted,ABC-132,F,ABC-132,,For testing reversions
4 changes: 2 additions & 2 deletions man/check_id_existence.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 9 additions & 7 deletions man/class_to_col_type.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 7 additions & 7 deletions man/create_freetext_log.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/detect_language.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/download_dropbox.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions man/download_googledrive_files.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/dropbox_upload.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions man/get_odk_form_schema.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions man/get_odk_responses.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/guess_col_type.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions man/othertext_lookup.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/read_excel_all_sheets.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions man/read_googlesheets.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit d718a97

Please sign in to comment.