From 67934d2f2cded5af53e0388dd359ebba3421f910 Mon Sep 17 00:00:00 2001
From: carriewright11 <cwright2@fredhutch.org>
Date: Mon, 23 Dec 2024 15:01:03 -0700
Subject: [PATCH 1/8] fixing packages

---
 modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd b/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd
index 8fed610d2..541003f8e 100644
--- a/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd
+++ b/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd
@@ -15,13 +15,8 @@ The data is from http://data.baltimorecity.gov/Transportation/Bike-Lanes/xzfj-gy
 You can Download as a CSV in your current working directory.  Note its also available at: 	http://jhudatascience.org/intro_to_r/data/Bike_Lanes.csv 
 
 ```{r}
-library(readr)
 library(tidyverse)
-library(dplyr)
-library(lubridate)
 library(jhur)
-library(tidyverse)
-library(broom)
 # install.packages("naniar")
 library(naniar)
 ```

From bfdf6a9d0ac0ccd9a2365c36d8454da441964856 Mon Sep 17 00:00:00 2001
From: carriewright11 <cwright2@fredhutch.org>
Date: Mon, 23 Dec 2024 15:04:00 -0700
Subject: [PATCH 2/8] fixing cleaning lab variable to exposure

---
 .../lab/Data_Cleaning_Lab_Key.Rmd             | 26 +++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd b/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd
index 541003f8e..1fdfb7ac3 100644
--- a/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd
+++ b/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd
@@ -91,7 +91,7 @@ Let's say we the data like so:
 
 ```{r}
 BloodType <- tibble(
-  weight_loss =
+  exposure =
     c(
       "Y", "No", "Yes", "y", "no",
       "n", "No", "N", "yes", "Yes",
@@ -116,20 +116,20 @@ There are some issues with this data that we need to figure out!
 
 ### 2.1
 
-Determine how many `NA` values there are for `weight_loss` (assume you know that`N` and `n` is for no).
+Determine how many `NA` values there are for `exposure` (assume you know that `N` and `n` are for no).
 
 ```{r 2.1response}
-count(BloodType, weight_loss) # the simple way
-sum(is.na(pull(BloodType, weight_loss))) # another way
+count(BloodType, exposure) # the simple way
+sum(is.na(pull(BloodType, exposure))) # another way
 BloodType %>% # another way
-  pull(weight_loss) %>%
+  pull(exposure) %>%
   is.na() %>%
   sum()
 ```
 
 ### 2.2
 
-Recode the `weight_loss` variable of the `BloodType` data so that it is consistent. Use `case_when()`. Keep "Other" as "Other". Don't forget to use quotes!
+Recode the `exposure` variable of the `BloodType` data so that it is consistent. Use `case_when()`. Keep "Other" as "Other". Don't forget to use quotes!
 
 ```
 # General format
@@ -144,21 +144,21 @@ NEW_TIBBLE <- OLD_TIBBLE %>%
 ```{r 2.2response}
 
 BloodType <- BloodType %>%
-  mutate(weight_loss = case_when(
-    weight_loss %in% c("N", "n", "No", "no") ~ "No",
-    weight_loss %in% c("Y", "y", "Yes", "yes") ~ "Yes",
-    TRUE ~ weight_loss # the only other value is an NA so we could include this or we don't need to (it's generally good practice unless we want to create NAs)
+  mutate(exposure = case_when(
+    exposure %in% c("N", "n", "No", "no") ~ "No",
+    exposure %in% c("Y", "y", "Yes", "yes") ~ "Yes",
+    TRUE ~ exposure # the only other value is an NA so we could include this or we don't need to (it's generally good practice unless we want to create NAs)
   ))
 
-count(BloodType, weight_loss)
+count(BloodType, exposure)
 ```
 
 ### 2.3
 
-Check to see how many values `weight_loss` has for each category (hint: use `count`). It's good practice to regularly check your data throughout the data wrangling process.
+Check to see how many values `exposure` has for each category (hint: use `count`). It's good practice to regularly check your data throughout the data wrangling process.
 
 ```{r 2.3response}
-BloodType %>% count(weight_loss)
+BloodType %>% count(exposure)
 ```
 
 ### 2.4

From fa22ba1fcd7799d3b22e00de8e8befaa0f730732 Mon Sep 17 00:00:00 2001
From: carriewright11 <cwright2@fredhutch.org>
Date: Mon, 23 Dec 2024 15:15:06 -0700
Subject: [PATCH 3/8] adding info about naming object bike to start

---
 modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd b/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd
index 1fdfb7ac3..e0f052482 100644
--- a/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd
+++ b/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd
@@ -21,7 +21,7 @@ library(jhur)
 library(naniar)
 ```
 
-Read in the bike data, you can use the URL or download the data.
+Read in the bike data and save it as an object called `bike`. You can use the URL or download the data.
 
 Bike Lanes Dataset: BikeBaltimore is the Department of Transportation's bike program. 
 The data is from http://data.baltimorecity.gov/Transportation/Bike-Lanes/xzfj-gyms

From 231126cc8657521717eeb00d17eb1d4552de77af Mon Sep 17 00:00:00 2001
From: carriewright11 <cwright2@fredhutch.org>
Date: Thu, 26 Dec 2024 13:32:34 -0700
Subject: [PATCH 4/8] adding show pct to gg miss plot

---
 modules/Data_Cleaning/Data_Cleaning.Rmd             |  4 ++--
 modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd | 12 ++++++++++--
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/modules/Data_Cleaning/Data_Cleaning.Rmd b/modules/Data_Cleaning/Data_Cleaning.Rmd
index 69c27b9b1..d18578690 100644
--- a/modules/Data_Cleaning/Data_Cleaning.Rmd
+++ b/modules/Data_Cleaning/Data_Cleaning.Rmd
@@ -155,10 +155,10 @@ miss_var_summary(bike)
 ## `naniar` plots
 
 The `gg_miss_var()` function creates a nice plot about the number of
-missing values for each variable, (need a data frame).
+missing values for each variable, (need a data frame). Using `show_pct = TRUE` shows the percent missing. 
 
 ```{r, fig.height=4, warning=FALSE, fig.align='center'}
-gg_miss_var(bike)
+gg_miss_var(bike, show_pct = TRUE)
 ```
 
 
diff --git a/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd b/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd
index e0f052482..98f5e6505 100644
--- a/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd
+++ b/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd
@@ -62,10 +62,10 @@ have_rout <- bike %>% drop_na(route)
 
 ### 1.3
 
-Use `naniar` to make a visual of the amount of data missing for each variable of `bike` (use `gg_miss_var()`). Check out more about this package here: https://www.njtierney.com/post/2018/06/12/naniar-on-cran/
+Use `naniar` to make a visual of the amount of data missing for each variable of `bike` (use `gg_miss_var()` and use `show_pct = TRUE` as an argument). Check out more about this package here: https://www.njtierney.com/post/2018/06/12/naniar-on-cran/
 
 ```{r 1.3response}
-gg_miss_var(bike)
+gg_miss_var(bike, show_pct = TRUE)
 ```
 
 
@@ -80,6 +80,14 @@ pull(bike, subType) %>% pct_complete() # this
 miss_var_summary(bike) # or this
 ```
 
+## P.2
+
+Use the `na_if` function to replace values of
+
+```{r}
+
+```
+
 
 # Part 2
 

From bbc8871dfee2159b8b8866359a706a10d6928925 Mon Sep 17 00:00:00 2001
From: carriewright11 <cwright2@fredhutch.org>
Date: Thu, 26 Dec 2024 13:35:00 -0700
Subject: [PATCH 5/8] adding na_if question

---
 modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd b/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd
index 98f5e6505..6cd514b96 100644
--- a/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd
+++ b/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd
@@ -82,13 +82,17 @@ miss_var_summary(bike) # or this
 
 ## P.2
 
-Use the `na_if` function to replace values of
+Use the `na_if` function to replace values of 0 in the `dateInstalled` variable with `NA`. Check your work using the `count` function.
 
 ```{r}
-
+bike <- bike %>% 
+  mutate(dateInstalled = na_if(dateInstalled, 0))
+count(bike, dateInstalled)
 ```
 
 
+
+
 # Part 2
 
 **New Data set**

From 5a22443a53e5211946f762075b2f2afdcfc97488 Mon Sep 17 00:00:00 2001
From: carriewright11 <cwright2@fredhutch.org>
Date: Thu, 26 Dec 2024 14:07:52 -0700
Subject: [PATCH 6/8] adding Gut Checks

---
 modules/Data_Cleaning/Data_Cleaning.Rmd | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/modules/Data_Cleaning/Data_Cleaning.Rmd b/modules/Data_Cleaning/Data_Cleaning.Rmd
index d18578690..b07ffdeaf 100644
--- a/modules/Data_Cleaning/Data_Cleaning.Rmd
+++ b/modules/Data_Cleaning/Data_Cleaning.Rmd
@@ -361,6 +361,20 @@ Pay attention to your data and your `NA` values!
 knitr::include_graphics("images/debug.png")
 ```
 
+## GUT CHECK: What function can be used to remove NA values from a full dataframe or for an individual column?
+
+A. `drop_nulls()`
+
+B. `drop_na()`
+
+C. `rem_na()`
+
+## GUT CHECK: How can you keep NA values when using `filter`?
+
+A. include `| is.na()` 
+
+B. include `& is.na()`
+
 ## Summary
 
 -   `is.na()`,`any(is.na())`, `all(is.na())`,`count()`, and functions from `naniar` like `gg_miss_var()` and `miss_var_summary` can help determine if we have `NA` values

From b49effd2439bb8f74ba2c7470da9aee26fcec4a2 Mon Sep 17 00:00:00 2001
From: carriewright11 <cwright2@fredhutch.org>
Date: Thu, 26 Dec 2024 14:12:11 -0700
Subject: [PATCH 7/8] adding cheatsheets

---
 modules/Data_Cleaning/Data_Cleaning.Rmd | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/modules/Data_Cleaning/Data_Cleaning.Rmd b/modules/Data_Cleaning/Data_Cleaning.Rmd
index b07ffdeaf..2745e5d2b 100644
--- a/modules/Data_Cleaning/Data_Cleaning.Rmd
+++ b/modules/Data_Cleaning/Data_Cleaning.Rmd
@@ -390,9 +390,11 @@ B. include `& is.na()`
 
 ## Lab Part 1
 
-🏠 [Class Website](https://jhudatascience.org/intro_to_r/)    
+🏠 [Class Website](https://jhudatascience.org/intro_to_r/)  
+
 💻[Lab](https://jhudatascience.org/intro_to_r/modules/Data_Cleaning/lab/Data_Cleaning_Lab.Rmd)
 
+📃 [Day 5 Cheatsheet](https://jhudatascience.org/intro_to_r/modules/cheatsheets/Day-5.pdf)
 
 
 # Recoding Variables
@@ -798,9 +800,12 @@ knitr::include_graphics("images/case_when.png")
 
 ## Lab Part 2
 
-🏠 [Class Website](https://jhudatascience.org/intro_to_r/)    
+🏠 [Class Website](https://jhudatascience.org/intro_to_r/)  
+
 💻[Lab](https://jhudatascience.org/intro_to_r/modules/Data_Cleaning/lab/Data_Cleaning_Lab.Rmd)
 
+📃 [Day 5 Cheatsheet](https://jhudatascience.org/intro_to_r/modules/cheatsheets/Day-5.pdf)
+
 ```{r, fig.alt="The End", out.width = "50%", echo = FALSE, fig.align='center'}
 knitr::include_graphics(here::here("images/the-end-g23b994289_1280.jpg"))
 ```

From be75ba60362c49d46c4de256a560ede5370d60db Mon Sep 17 00:00:00 2001
From: carriewright11 <cwright2@fredhutch.org>
Date: Thu, 26 Dec 2024 14:14:17 -0700
Subject: [PATCH 8/8] adding cheatsheets and gut checks

---
 modules/Data_Cleaning/Data_Cleaning.Rmd | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/modules/Data_Cleaning/Data_Cleaning.Rmd b/modules/Data_Cleaning/Data_Cleaning.Rmd
index 2745e5d2b..d3c407416 100644
--- a/modules/Data_Cleaning/Data_Cleaning.Rmd
+++ b/modules/Data_Cleaning/Data_Cleaning.Rmd
@@ -618,6 +618,13 @@ data_ginger_mint %>%
   count(Group, Effect)
 ```
 
+## GUT CHECK: If we want all unspecified values to remain the same with `case_when()`, how should we complete the `TRUE ~` statement?
+
+A. With the name of the variable we are modifying or using as the source
+
+B. With the word "same"
+
+
 # Working with strings
 
 ## Strings in R
@@ -734,7 +741,7 @@ data_ginger_mint %>%
   count(Treatment, Treatment_recoded)
 ```
 
-This is a more robust solution! It will catch typos as long as first letter is correct or there is part of the word mint.
+This is a more robust solution! It will catch typos as long as the first letter is correct or there is part of the word mint.
 
 ## That's better!
 
@@ -742,6 +749,15 @@ This is a more robust solution! It will catch typos as long as first letter is c
 knitr::include_graphics("https://media1.giphy.com/media/S9ZK4mmi3u3jdc5dek/200w.webp?cid=ecf05e47h7myga959jwvek6s9x1tkog135g7pxu8vvjz2yqb&rid=200w.webp&ct=g")
 ```
 
+
+## GUT CHECK: What `stringr` function helps us find a string pattern?
+
+A. `str_replace()`
+
+B. `str_find()`
+
+C. `str_detect()`
+
 # Separating and uniting data
 
 ## Uniting columns 
@@ -806,6 +822,8 @@ knitr::include_graphics("images/case_when.png")
 
 📃 [Day 5 Cheatsheet](https://jhudatascience.org/intro_to_r/modules/cheatsheets/Day-5.pdf)
 
+📃 [Posit's `stringr` Cheatsheet](https://evoldyn.gitlab.io/evomics-2018/ref-sheets/R_strings.pdf)
+
 ```{r, fig.alt="The End", out.width = "50%", echo = FALSE, fig.align='center'}
 knitr::include_graphics(here::here("images/the-end-g23b994289_1280.jpg"))
 ```