From 7eaa06dc283934571ac8fe7cc68e0c63ab130cdb Mon Sep 17 00:00:00 2001 From: Josep Pueyo-Ros <56930644+jospueyo@users.noreply.github.com> Date: Wed, 18 Dec 2024 17:50:28 +0100 Subject: [PATCH] add set_labels to clean_names() (#564) * #563 add set_labels to clean names * #563 add set_labels to clean names * revert change in test-clean-names line 621 * minor corrections after PR #563 review * add second group of requests #564 * adapted labels to issue with 'sf_column' not being last --------- Co-authored-by: jospueyo Co-authored-by: Bill Denney --- DESCRIPTION | 3 ++- NEWS.md | 2 ++ R/clean_names.R | 22 ++++++++++++++--- man/clean_names.Rd | 9 ++++--- man/janitor-package.Rd | 1 + tests/testthat/test-clean-names.R | 40 +++++++++++++++++++++++++++++++ 6 files changed, 70 insertions(+), 7 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index e0d9ac4a..715ff2f7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -8,7 +8,8 @@ Authors@R: c( person("Ryan", "Knight", , "ryangknight@gmail.com", role = "ctb"), person("Malte", "Grosser", , "malte.grosser@gmail.com", role = "ctb"), person("Jonathan", "Zadra", , "jonathan.zadra@sorensonimpact.com", role = "ctb"), - person("Olivier", "Roy", role = "ctb") + person("Olivier", "Roy", role = "ctb"), + person("Josep", family = "Pueyo-Ros", email = "josep.pueyo@udg.edu", role = "ctb") ) Description: The main janitor functions can: perfectly format data.frame column names; provide quick counts of variable combinations (i.e., diff --git a/NEWS.md b/NEWS.md index 3346e46b..d25f40e7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -18,6 +18,8 @@ These are all minor breaking changes resulting from enhancements and are not exp * The new function `excel_time_to_numeric()` converts times from Excel that do not have accompanying dates into a number of seconds. (#245, thanks to **@billdenney** for the feature.) +* A new argument `set_labels` to `clean_names()` stores the old names as labels in each column. Variable labels are visualized in Rstudio's data viewer or used by default by some packages such as `gt` instead of variable names. Labels can also be used in ggplot labels thanks to the function `easy_labs()` in the `ggeasy` package. Read this wonderful [post](https://www.pipinghotdata.com/posts/2022-09-13-the-case-for-variable-labels-in-r/) for more info about column labels. (#563, thanks to **@jospueyo** for the feature). + ## Bug fixes * `adorn_totals("row")` now succeeds if the new `name` of the totals row is already a factor level of the input data.frame (#529, thanks @egozoglu for reporting). diff --git a/R/clean_names.R b/R/clean_names.R index ee8efe6e..388144d8 100644 --- a/R/clean_names.R +++ b/R/clean_names.R @@ -24,6 +24,7 @@ #' (characters) to "u". #' #' @param dat The input `data.frame`. +#' @param set_labels If set to `TRUE`, old names are stored as labels in each column of the returned data.frame. #' @inheritDotParams make_clean_names -string #' @return A `data.frame` with clean names. #' @@ -32,7 +33,8 @@ #' support using `clean_names()` on `sf` and `tbl_graph` (from #' `tidygraph`) objects as well as on database connections through #' `dbplyr`. For cleaning other named objects like named lists -#' and vectors, use `make_clean_names()`. +#' and vectors, use `make_clean_names()`. When `set_labels` is set to `TRUE`, the old names, +#' stored as column labels, can be restored using `sjlabelled::label_to_colnames()`. #' #' @export #' @family Set names @@ -71,7 +73,7 @@ clean_names <- function(dat, ...) { #' @rdname clean_names #' @export -clean_names.default <- function(dat, ...) { +clean_names.default <- function(dat, ..., set_labels = FALSE) { if (is.null(names(dat)) && is.null(dimnames(dat))) { stop( "`clean_names()` requires that either names or dimnames be non-null.", @@ -81,14 +83,21 @@ clean_names.default <- function(dat, ...) { if (is.null(names(dat))) { dimnames(dat) <- lapply(dimnames(dat), make_clean_names, ...) } else { + if (set_labels){ + old_names <- names(dat) + for (i in seq_along(old_names)){ + attr(dat[[i]], "label") <- old_names[[i]] + } + } names(dat) <- make_clean_names(names(dat), ...) + } dat } #' @rdname clean_names #' @export -clean_names.sf <- function(dat, ...) { +clean_names.sf <- function(dat, ..., set_labels = FALSE) { if (!requireNamespace("sf", quietly = TRUE)) { # nocov start stop( "Package 'sf' needed for this function to work. Please install it.", @@ -103,6 +112,12 @@ clean_names.sf <- function(dat, ...) { sf_cleaned <- make_clean_names(sf_names[cols_to_rename], ...) # rename original df names(dat)[cols_to_rename] <- sf_cleaned + + if(set_labels){ + for (i in seq_along(sf_names[cols_to_rename])){ + attr(dat[[i]], "label") <- sf_names[[i]] + } + } dat } @@ -116,6 +131,7 @@ clean_names.tbl_graph <- function(dat, ...) { call. = FALSE ) } # nocov end + dplyr::rename_all(dat, .funs = make_clean_names, ...) } diff --git a/man/clean_names.Rd b/man/clean_names.Rd index 23579ffb..a31681a0 100644 --- a/man/clean_names.Rd +++ b/man/clean_names.Rd @@ -10,9 +10,9 @@ \usage{ clean_names(dat, ...) -\method{clean_names}{default}(dat, ...) +\method{clean_names}{default}(dat, ..., set_labels = FALSE) -\method{clean_names}{sf}(dat, ...) +\method{clean_names}{sf}(dat, ..., set_labels = FALSE) \method{clean_names}{tbl_graph}(dat, ...) @@ -65,6 +65,8 @@ You should use this feature with care in case of \code{case = "parsed"}, \code{c might not always be what is intended. In this case you can make usage of the option to supply named elements and specify the transliterations yourself.} \item{\code{numerals}}{A character specifying the alignment of numerals (\code{"middle"}, \code{left}, \code{right}, \code{asis} or \code{tight}). I.e. \code{numerals = "left"} ensures that no output separator is in front of a digit.} }} + +\item{set_labels}{If set to \code{TRUE}, old names are stored as labels in each column of the returned data.frame.} } \value{ A \code{data.frame} with clean names. @@ -98,7 +100,8 @@ and \code{data.frame}-like objects. For this reason there are methods to support using \code{clean_names()} on \code{sf} and \code{tbl_graph} (from \code{tidygraph}) objects as well as on database connections through \code{dbplyr}. For cleaning other named objects like named lists -and vectors, use \code{make_clean_names()}. +and vectors, use \code{make_clean_names()}. When \code{set_labels} is set to \code{TRUE}, the old names, +stored as column labels, can be restored using \code{sjlabelled::label_to_colnames()}. } \examples{ diff --git a/man/janitor-package.Rd b/man/janitor-package.Rd index f397d9b1..9c3d6ce5 100644 --- a/man/janitor-package.Rd +++ b/man/janitor-package.Rd @@ -34,6 +34,7 @@ Other contributors: \item Malte Grosser \email{malte.grosser@gmail.com} [contributor] \item Jonathan Zadra \email{jonathan.zadra@sorensonimpact.com} [contributor] \item Olivier Roy [contributor] + \item Josep Pueyo-Ros \email{josep.pueyo@udg.edu} [contributor] } } diff --git a/tests/testthat/test-clean-names.R b/tests/testthat/test-clean-names.R index 281226da..15bb942a 100644 --- a/tests/testthat/test-clean-names.R +++ b/tests/testthat/test-clean-names.R @@ -186,6 +186,22 @@ test_that("do not create duplicates (fix #251)", { ) }) +test_that("labels are created in default method (feature request #563)", { + dat_df <- dplyr::tibble(`a a` = c(11, 22), `b b` = c(2, 3)) + dat_df_clean_labels <- clean_names(dat_df, set_labels = TRUE) + dat_df_clean <- clean_names(dat_df) + + for (i in seq_along(names(dat_df))){ + # check that old names are saved as labels when set_labels is TRUE + expect_equal(attr(dat_df_clean_labels[[i]], "label"), names(dat_df)[[i]]) + # check that old names are not stored if set_labels is not TRUE + expect_null(attr(dat_df_clean[[i]], "label")) + } + + # expect names are always cleaned + expect_equal(names(dat_df_clean), c("a_a", "b_b")) + expect_equal(names(dat_df_clean_labels), c("a_a", "b_b")) +}) test_that("allow for duplicates (fix #495)", { expect_equal( @@ -587,6 +603,30 @@ test_that("Tests for cases beyond default snake for sf objects", { ) }) +test_that("labels are created in sf method (feature request #563)", { + skip_if_not_installed("sf") + + dat_df <- dplyr::tibble(`a a` = c(11, 22), `b b` = c(2, 3)) + dat_sf <- dat_df + dat_sf$x <- c(1,2) + dat_sf$y <- c(1,2) + dat_sf <- sf::st_as_sf(dat_sf, coords = c("x", "y")) + dat_sf_clean_labels <- clean_names(dat_sf, set_labels = TRUE) + dat_sf_clean <- clean_names(dat_sf) + + for (i in seq_along(names(dat_df))){ + # check that old names are saved as labels when set_labels is TRUE + expect_equal(attr(dat_sf_clean_labels[[i]], "label"), names(dat_sf)[[i]]) + + # check that old names are not stored if set_labels is not TRUE + expect_null(attr(dat_sf_clean[[i]], "label")) + } + # expect names are always cleaned + expect_equal(names(dat_sf_clean), c("a_a", "b_b", "geometry")) + expect_equal(names(dat_sf_clean_labels), c("a_a", "b_b", "geometry")) +}) + + #------------------------------------------------------------------------------# #------------------------ Tests for tbl_graph method --------------------------##### #------------------------------------------------------------------------------#