From 67934d2f2cded5af53e0388dd359ebba3421f910 Mon Sep 17 00:00:00 2001 From: carriewright11 Date: Mon, 23 Dec 2024 15:01:03 -0700 Subject: [PATCH 1/8] fixing packages --- modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd | 5 ----- 1 file changed, 5 deletions(-) diff --git a/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd b/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd index 8fed610d2..541003f8e 100644 --- a/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd +++ b/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd @@ -15,13 +15,8 @@ The data is from http://data.baltimorecity.gov/Transportation/Bike-Lanes/xzfj-gy You can Download as a CSV in your current working directory. Note its also available at: http://jhudatascience.org/intro_to_r/data/Bike_Lanes.csv ```{r} -library(readr) library(tidyverse) -library(dplyr) -library(lubridate) library(jhur) -library(tidyverse) -library(broom) # install.packages("naniar") library(naniar) ``` From bfdf6a9d0ac0ccd9a2365c36d8454da441964856 Mon Sep 17 00:00:00 2001 From: carriewright11 Date: Mon, 23 Dec 2024 15:04:00 -0700 Subject: [PATCH 2/8] fixing cleaning lab variable to exposure --- .../lab/Data_Cleaning_Lab_Key.Rmd | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd b/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd index 541003f8e..1fdfb7ac3 100644 --- a/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd +++ b/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd @@ -91,7 +91,7 @@ Let's say we the data like so: ```{r} BloodType <- tibble( - weight_loss = + exposure = c( "Y", "No", "Yes", "y", "no", "n", "No", "N", "yes", "Yes", @@ -116,20 +116,20 @@ There are some issues with this data that we need to figure out! ### 2.1 -Determine how many `NA` values there are for `weight_loss` (assume you know that`N` and `n` is for no). +Determine how many `NA` values there are for `exposure` (assume you know that `N` and `n` are for no). 
```{r 2.1response} -count(BloodType, weight_loss) # the simple way -sum(is.na(pull(BloodType, weight_loss))) # another way +count(BloodType, exposure) # the simple way +sum(is.na(pull(BloodType, exposure))) # another way BloodType %>% # another way - pull(weight_loss) %>% + pull(exposure) %>% is.na() %>% sum() ``` ### 2.2 -Recode the `weight_loss` variable of the `BloodType` data so that it is consistent. Use `case_when()`. Keep "Other" as "Other". Don't forget to use quotes! +Recode the `exposure` variable of the `BloodType` data so that it is consistent. Use `case_when()`. Keep "Other" as "Other". Don't forget to use quotes! ``` # General format @@ -144,21 +144,21 @@ NEW_TIBBLE <- OLD_TIBBLE %>% ```{r 2.2response} BloodType <- BloodType %>% - mutate(weight_loss = case_when( - weight_loss %in% c("N", "n", "No", "no") ~ "No", - weight_loss %in% c("Y", "y", "Yes", "yes") ~ "Yes", - TRUE ~ weight_loss # the only other value is an NA so we could include this or we don't need to (it's generally good practice unless we want to create NAs) + mutate(exposure = case_when( + exposure %in% c("N", "n", "No", "no") ~ "No", + exposure %in% c("Y", "y", "Yes", "yes") ~ "Yes", + TRUE ~ exposure # the only other value is an NA so we could include this or we don't need to (it's generally good practice unless we want to create NAs) )) -count(BloodType, weight_loss) +count(BloodType, exposure) ``` ### 2.3 -Check to see how many values `weight_loss` has for each category (hint: use `count`). It's good practice to regularly check your data throughout the data wrangling process. +Check to see how many values `exposure` has for each category (hint: use `count`). It's good practice to regularly check your data throughout the data wrangling process. 
```{r 2.3response} -BloodType %>% count(weight_loss) +BloodType %>% count(exposure) ``` ### 2.4 From fa22ba1fcd7799d3b22e00de8e8befaa0f730732 Mon Sep 17 00:00:00 2001 From: carriewright11 Date: Mon, 23 Dec 2024 15:15:06 -0700 Subject: [PATCH 3/8] adding info about naming object bike to start --- modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd b/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd index 1fdfb7ac3..e0f052482 100644 --- a/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd +++ b/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd @@ -21,7 +21,7 @@ library(jhur) library(naniar) ``` -Read in the bike data, you can use the URL or download the data. +Read in the bike data and save it as an object called `bike`. You can use the URL or download the data. Bike Lanes Dataset: BikeBaltimore is the Department of Transportation's bike program. The data is from http://data.baltimorecity.gov/Transportation/Bike-Lanes/xzfj-gyms From 231126cc8657521717eeb00d17eb1d4552de77af Mon Sep 17 00:00:00 2001 From: carriewright11 Date: Thu, 26 Dec 2024 13:32:34 -0700 Subject: [PATCH 4/8] adding show pct too gg miss plot --- modules/Data_Cleaning/Data_Cleaning.Rmd | 4 ++-- modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd | 12 ++++++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/modules/Data_Cleaning/Data_Cleaning.Rmd b/modules/Data_Cleaning/Data_Cleaning.Rmd index 69c27b9b1..d18578690 100644 --- a/modules/Data_Cleaning/Data_Cleaning.Rmd +++ b/modules/Data_Cleaning/Data_Cleaning.Rmd @@ -155,10 +155,10 @@ miss_var_summary(bike) ## `naniar` plots The `gg_miss_var()` function creates a nice plot about the number of -missing values for each variable, (need a data frame). +missing values for each variable, (need a data frame). Using `show_pct = TRUE` shows the percent missing. 
```{r, fig.height=4, warning=FALSE, fig.align='center'} -gg_miss_var(bike) +gg_miss_var(bike, show_pct = TRUE) ``` diff --git a/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd b/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd index e0f052482..98f5e6505 100644 --- a/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd +++ b/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd @@ -62,10 +62,10 @@ have_rout <- bike %>% drop_na(route) ### 1.3 -Use `naniar` to make a visual of the amount of data missing for each variable of `bike` (use `gg_miss_var()`). Check out more about this package here: https://www.njtierney.com/post/2018/06/12/naniar-on-cran/ +Use `naniar` to make a visual of the amount of data missing for each variable of `bike` (use `gg_miss_var()` and use `show_pct = TRUE` as an argument). Check out more about this package here: https://www.njtierney.com/post/2018/06/12/naniar-on-cran/ ```{r 1.3response} -gg_miss_var(bike) +gg_miss_var(bike, show_pct = TRUE) ``` @@ -80,6 +80,14 @@ pull(bike, subType) %>% pct_complete() # this miss_var_summary(bike) # or this ``` +## P.2 + +Use the `na_if` function to replace values of + +```{r} + +``` + # Part 2 From bbc8871dfee2159b8b8866359a706a10d6928925 Mon Sep 17 00:00:00 2001 From: carriewright11 Date: Thu, 26 Dec 2024 13:35:00 -0700 Subject: [PATCH 5/8] adding na_if question --- modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd b/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd index 98f5e6505..6cd514b96 100644 --- a/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd +++ b/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd @@ -82,13 +82,17 @@ miss_var_summary(bike) # or this ## P.2 -Use the `na_if` function to replace values of +Use the `na_if` function to replace values of 0 in the `dateInstalled` variable with `NA`. Check your work using the `count` function. 
```{r} - +bike <- bike %>% + mutate(dateInstalled = na_if(dateInstalled, 0)) +count(bike, dateInstalled) ``` + + # Part 2 **New Data set** From 5a22443a53e5211946f762075b2f2afdcfc97488 Mon Sep 17 00:00:00 2001 From: carriewright11 Date: Thu, 26 Dec 2024 14:07:52 -0700 Subject: [PATCH 6/8] adding Gut Checks --- modules/Data_Cleaning/Data_Cleaning.Rmd | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/modules/Data_Cleaning/Data_Cleaning.Rmd b/modules/Data_Cleaning/Data_Cleaning.Rmd index d18578690..b07ffdeaf 100644 --- a/modules/Data_Cleaning/Data_Cleaning.Rmd +++ b/modules/Data_Cleaning/Data_Cleaning.Rmd @@ -361,6 +361,20 @@ Pay attention to your data and your `NA` values! knitr::include_graphics("images/debug.png") ``` +## GUT CHECK: What function can be used to remove NA values from a full dataframe or for an individual column? + +A. `drop_nulls()` + +B. `drop_na()` + +C. `rem_na()` + +## GUT CHECK: How can you keep NA values when using `filter`? + +A. include `| is.na()` + +B. include `& is.na()` + ## Summary - `is.na()`,`any(is.na())`, `all(is.na())`,`count()`, and functions from `naniar` like `gg_miss_var()` and `miss_var_summary` can help determine if we have `NA` values From b49effd2439bb8f74ba2c7470da9aee26fcec4a2 Mon Sep 17 00:00:00 2001 From: carriewright11 Date: Thu, 26 Dec 2024 14:12:11 -0700 Subject: [PATCH 7/8] adding cheatsheets --- modules/Data_Cleaning/Data_Cleaning.Rmd | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/modules/Data_Cleaning/Data_Cleaning.Rmd b/modules/Data_Cleaning/Data_Cleaning.Rmd index b07ffdeaf..2745e5d2b 100644 --- a/modules/Data_Cleaning/Data_Cleaning.Rmd +++ b/modules/Data_Cleaning/Data_Cleaning.Rmd @@ -390,9 +390,11 @@ B. 
include `& is.na()` ## Lab Part 1 -🏠 [Class Website](https://jhudatascience.org/intro_to_r/) +🏠 [Class Website](https://jhudatascience.org/intro_to_r/) + 💻[Lab](https://jhudatascience.org/intro_to_r/modules/Data_Cleaning/lab/Data_Cleaning_Lab.Rmd) +📃 [Day 5 Cheatsheet](https://jhudatascience.org/intro_to_r/modules/cheatsheets/Day-5.pdf) # Recoding Variables @@ -798,9 +800,12 @@ knitr::include_graphics("images/case_when.png") ## Lab Part 2 -🏠 [Class Website](https://jhudatascience.org/intro_to_r/) +🏠 [Class Website](https://jhudatascience.org/intro_to_r/) + 💻[Lab](https://jhudatascience.org/intro_to_r/modules/Data_Cleaning/lab/Data_Cleaning_Lab.Rmd) +📃 [Day 5 Cheatsheet](https://jhudatascience.org/intro_to_r/modules/cheatsheets/Day-5.pdf) + ```{r, fig.alt="The End", out.width = "50%", echo = FALSE, fig.align='center'} knitr::include_graphics(here::here("images/the-end-g23b994289_1280.jpg")) ``` From be75ba60362c49d46c4de256a560ede5370d60db Mon Sep 17 00:00:00 2001 From: carriewright11 Date: Thu, 26 Dec 2024 14:14:17 -0700 Subject: [PATCH 8/8] adding cheatsheets and gut checks --- modules/Data_Cleaning/Data_Cleaning.Rmd | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/modules/Data_Cleaning/Data_Cleaning.Rmd b/modules/Data_Cleaning/Data_Cleaning.Rmd index 2745e5d2b..d3c407416 100644 --- a/modules/Data_Cleaning/Data_Cleaning.Rmd +++ b/modules/Data_Cleaning/Data_Cleaning.Rmd @@ -618,6 +618,13 @@ data_ginger_mint %>% count(Group, Effect) ``` +## GUT CHECK: If we want all unspecified values to remain the same with `case_when()`, how should we complete the `TRUE ~` statement? + +A. With the name of the variable we are modifying or using as source + +B. With the word "same" + + # Working with strings ## Strings in R @@ -734,7 +741,7 @@ data_ginger_mint %>% count(Treatment, Treatment_recoded) ``` -This is a more robust solution! It will catch typos as long as first letter is correct or there is part of the word mint. 
+This is a more robust solution! It will catch typos as long as the first letter is correct or there is part of the word mint. ## That's better! @@ -742,6 +749,15 @@ This is a more robust solution! It will catch typos as long as first letter is c knitr::include_graphics("https://media1.giphy.com/media/S9ZK4mmi3u3jdc5dek/200w.webp?cid=ecf05e47h7myga959jwvek6s9x1tkog135g7pxu8vvjz2yqb&rid=200w.webp&ct=g") ``` + +## GUT CHECK: What `stringr` function helps us find a string pattern? + +A. `str_replace()` + +B. `str_find()` + +C. `str_detect()` + # Separating and uniting data ## Uniting columns @@ -806,6 +822,8 @@ knitr::include_graphics("images/case_when.png") 📃 [Day 5 Cheatsheet](https://jhudatascience.org/intro_to_r/modules/cheatsheets/Day-5.pdf) +📃 [Posit's `stringr` Cheatsheet](https://evoldyn.gitlab.io/evomics-2018/ref-sheets/R_strings.pdf) + ```{r, fig.alt="The End", out.width = "50%", echo = FALSE, fig.align='center'} knitr::include_graphics(here::here("images/the-end-g23b994289_1280.jpg")) ```