Merge pull request #557 from USEPA/TADA_CheckRequiredFields

update TADA_CheckRequiredFields and TADA_AutoClean
USEPA · Dec 23, 2024 · 06aded1 · 06aded1
2 parents f28f76a + f776a29
commit 06aded1
Show file tree

Hide file tree

Showing 4 changed files with 102 additions and 33 deletions.
diff --git a/R/RequiredCols.R b/R/RequiredCols.R
@@ -341,34 +341,55 @@ TADA_GetTemplate <- function() {
 
 
 
-#' TADA Module 1 Required Fields Check
+#' TADA Required Fields Check
 #'
-#' This function checks if all required fields for TADA Module 1 are
-#' included in the input dataframe.
+#' This function checks if all fields required to run TADA functions are included in the input 
+#' dataframe. It is used in the TADA Shiny application to test user supplied files for compatibility 
+#' with the application.
 #'
 #' @param .data A dataframe
 #'
-#' @return Boolean result indicating whether or not the input dataframe contains all of the TADA profile fields.
+#' @return TRUE if the input dataframe contains all of the required fields. If any required
+#' fields are missing, the function stops with an error that lists the names of all
+#' missing columns.
 #'
 #' @export
 #'
 #' @examples
 #' \dontrun{
 #' # Find web service URLs for each Profile using WQP User Interface (https://www.waterqualitydata.us/)
 #' # Example WQP URL: https://www.waterqualitydata.us/#statecode=US%3A09&characteristicType=Nutrient&startDateLo=04-01-2023&startDateHi=11-01-2023&mimeType=csv&providers=NWIS&providers=STEWARDS&providers=STORET
-#'
+#' 
 #' # Use TADA_ReadWQPWebServices to load the Station, Project, and Phys-Chem Result profiles
 #' stationProfile <- TADA_ReadWQPWebServices("https://www.waterqualitydata.us/data/Station/search?statecode=US%3A09&characteristicType=Nutrient&startDateLo=04-01-2023&startDateHi=11-01-2023&mimeType=csv&zip=yes&providers=NWIS&providers=STEWARDS&providers=STORET")
 #' physchemProfile <- TADA_ReadWQPWebServices("https://www.waterqualitydata.us/data/Result/search?statecode=US%3A09&characteristicType=Nutrient&startDateLo=04-01-2023&startDateHi=11-01-2023&mimeType=csv&zip=yes&dataProfile=resultPhysChem&providers=NWIS&providers=STEWARDS&providers=STORET")
 #' projectProfile <- TADA_ReadWQPWebServices("https://www.waterqualitydata.us/data/Project/search?statecode=US%3A09&characteristicType=Nutrient&startDateLo=04-01-2023&startDateHi=11-01-2023&mimeType=csv&zip=yes&providers=NWIS&providers=STEWARDS&providers=STORET")
-#'
+#' 
 #' # Join all three profiles using TADA_JoinWQPProfiles
-#' TADAProfile <- TADA_JoinWQPProfiles(FullPhysChem = physchemProfile, Sites = stationProfile, Projects = projectProfile)
-#'
-#' # Run TADA_CheckRequiredFields
-#' CheckRequirements_TADAProfile <- TADA_CheckRequiredFields(TADAProfile)
+#' TADAProfile <- TADA_JoinWQPProfiles(FullPhysChem = physchemProfile, Sites = stationProfile,
+#'                                     Projects = projectProfile)
+#' 
+#' # Run TADA_CheckRequiredFields; it stops with the error message
+#' # 'TADA_CheckRequiredFields: the dataframe does not contain the required fields: ActivityStartDateTime'
+#' TADA_CheckRequiredFields(TADAProfile)
+#' 
+#' # Add missing col
+#' TADAProfile1 <- dataRetrieval:::create_dateTime(df = TADAProfile,
+#'                                                 date_col = "ActivityStartDate",
+#'                                                 time_col = "ActivityStartTime.Time",
+#'                                                 tz_col = "ActivityStartTime.TimeZoneCode",
+#'                                                 tz = "UTC")
+#' 
+#' review_TADAProfile1 = TADAProfile1 %>% dplyr::select(c("ActivityStartDate",
+#'                                                        "ActivityStartTime.Time",
+#'                                                        "ActivityStartTime.TimeZoneCode",
+#'                                                        "ActivityStartDateTime",
+#'                                                        "ActivityStartTime.TimeZoneCode_offset"))
+#' 
+#' # re-run TADA_CheckRequiredFields, returns TRUE
+#' TADA_CheckRequiredFields(TADAProfile1)
 #' }
-#'
+#' 
 TADA_CheckRequiredFields <- function(.data) {
   # remove names with TADA. string from require.cols
   require.originals <- Filter(function(x) !any(grepl("TADA.", x)), require.cols)
@@ -380,8 +401,10 @@ TADA_CheckRequiredFields <- function(.data) {
   if (all(require.originals %in% colnames(.data)) == TRUE) {
     TRUE
   } else {
-    stop("The dataframe does not contain the required fields.")
-  }
+    missingcols <- base::setdiff(require.originals, colnames(.data))
+    stop("TADA_CheckRequiredFields: the dataframe does not contain the required fields: ", 
+               paste(as.character(missingcols), 
+                     collapse = ", "))  }
 }
 
 

diff --git a/R/Utilities.R b/R/Utilities.R
@@ -224,7 +224,6 @@ TADA_AutoClean <- function(.data) {
   # execute function after checks are passed
 
 
-
   # check to make sure columns do not already exist and capitalize fields with known synonyms that
   # only differ in caps
   print("TADA_Autoclean: creating TADA-specific columns.")
@@ -272,6 +271,18 @@ TADA_AutoClean <- function(.data) {
     .data$TADA.ResultMeasure.MeasureUnitCode <- toupper(.data$ResultMeasure.MeasureUnitCode)
   }
 
+  if ("ActivityStartDateTime" %in% colnames(.data)) {
+    .data <- .data
+  } else {
+    # creates ActivityStartDateTime and ActivityStartTime.TimeZoneCode_offset
+    # this is only needed when dataRetrieval is not used to get WQP data
+    .data <- dataRetrieval:::create_dateTime(df = .data, 
+                                             date_col = "ActivityStartDate", 
+                                             time_col = "ActivityStartTime.Time",
+                                             tz_col = "ActivityStartTime.TimeZoneCode", 
+                                             tz = "UTC")
+  }
+
   # Transform "Dissolved oxygen (DO)" characteristic name to "DISSOLVED OXYGEN SATURATION" IF
   # result unit is "%" or "% SATURATN".
 
@@ -915,7 +926,9 @@ TADA_GetUniqueNearbySites <- function(.data) {
 #'
 #' Retrieves data for a period of time in the past 20 years using
 #' TADA_DataRetrieval. This function can be used for testing functions on
-#' random datasets.
+#' random datasets. Only random datasets with 10 or more results will be returned.
+#' If a random dataset has fewer than 10 results, the function will automatically
+#' run another random WQP query until a dataframe with 10 or more results is returned.
 #'
 #' @param number_of_days Numeric. The default is 1, which will query and retrieve
 #' data for a random two-day period (e.g.startDate = "2015-04-21",
@@ -943,20 +956,23 @@ TADA_GetUniqueNearbySites <- function(.data) {
 #' df <- TADA_RandomTestingData(number_of_days = 5, choose_random_state = TRUE, autoclean = FALSE)
 #' }
 #'
-TADA_RandomTestingData <- function(number_of_days = 1, choose_random_state = FALSE, autoclean = TRUE) {
-  while (TRUE) {
+TADA_RandomTestingData <- function(number_of_days = 1, choose_random_state = FALSE, 
+                                   autoclean = TRUE) {
+
+  get_random_data <-  function(ndays = number_of_days, state_choice = choose_random_state, 
+                               ac = autoclean) {
     # choose a random day within the last 20 years
     twenty_yrs_ago <- Sys.Date() - 20 * 365
     random_start_date <- twenty_yrs_ago + sample(20 * 365, 1)
     # choose a random start date and add any number_of_days (set that as the end date)
-    end_date <- random_start_date + number_of_days
+    end_date <- random_start_date + ndays
 
-    if (choose_random_state == TRUE) {
+    if (state_choice == TRUE) {
       load(system.file("extdata", "statecodes_df.Rdata", package = "EPATADA"))
       state <- sample(statecodes_df$STUSAB, 1)
     }
 
-    if (choose_random_state == FALSE) {
+    if (state_choice == FALSE) {
       state <- "null"
     }
 
@@ -966,7 +982,7 @@ TADA_RandomTestingData <- function(number_of_days = 1, choose_random_state = FAL
       statecode = state
     ))
 
-    if (autoclean == TRUE) {
+    if (ac == TRUE) {
       dat <- TADA_DataRetrieval(
         startDate = as.character(random_start_date),
         endDate = as.character(end_date),
@@ -975,19 +991,26 @@ TADA_RandomTestingData <- function(number_of_days = 1, choose_random_state = FAL
       )
     }
 
-    if (autoclean == FALSE) {
+    if (ac == FALSE) {
       dat <- TADA_DataRetrieval(
         startDate = as.character(random_start_date),
         endDate = as.character(end_date),
         statecode = state,
         applyautoclean = FALSE
       )
     }
-
-    if (nrow(dat) > 0) {
-      return(dat)
+    return(dat)
+  }
+
+  verify_random_data <- function() {
+    df <- get_random_data()
+    while(nrow(df) < 10) {
+      df <- get_random_data()
     }
+    return(df)
   }
+
+  verify_random_data()
 }
 
 #' Aggregate multiple result values to a min, max, or mean

diff --git a/man/TADA_CheckRequiredFields.Rd b/man/TADA_CheckRequiredFields.Rd
diff --git a/man/TADA_RandomTestingData.Rd b/man/TADA_RandomTestingData.Rd