From 6596662ae6b7ec8254462a1ecf840de13f3bffeb Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Thu, 25 Apr 2024 15:45:14 +0200 Subject: [PATCH 1/6] init [skip ci] --- R/datatype.R | 53 +++++++++++++++++++++++++++++++++++ R/extendr-wrappers.R | 2 ++ man/DataType_Enum.Rd | 58 +++++++++++++++++++++++++++++++++++++++ man/pl_pl.Rd | 2 +- src/rust/src/rdatatype.rs | 11 ++++++++ 5 files changed, 125 insertions(+), 1 deletion(-) create mode 100644 man/DataType_Enum.Rd diff --git a/R/datatype.R b/R/datatype.R index c90fe24b7..8545ac559 100644 --- a/R/datatype.R +++ b/R/datatype.R @@ -145,6 +145,7 @@ DataType_constructors = function() { list( Array = DataType_Array, Categorical = DataType_Categorical, + Enum = DataType_Enum, Datetime = DataType_Datetime, Duration = DataType_Duration, List = DataType_List, @@ -362,6 +363,58 @@ DataType_Categorical = function(ordering = "physical") { .pr$DataType$new_categorical(ordering) |> unwrap() } +#' Create Enum DataType +#' +#' An `Enum` is a fixed set categorical encoding of a set of strings. It is +#' similar to the [`Categorical`][DataType_Categorical] data type, but the +#' categories are explicitly provided by the user and cannot be modified. +#' +#' This functionality is **unstable**. It is a work-in-progress feature and may +#' not always work as expected. It may be changed at any point without it being +#' considered a breaking change. +#' +#' @param categories A character vector specifying the categories of the variable. +#' +#' +#' @return An Enum DataType +#' @examples +#' pl$DataFrame( +#' x = c("Polar", "Panda", "Brown", "Brown", "Polar"), +#' schema = list(x = pl$Enum(c("Polar", "Panda", "Brown"))) +#' ) +#' +#' # All values of the variable have to be in the categories +#' dtype = pl$Enum(c("Polar", "Panda", "Brown")) +#' pl$DataFrame( +#' x = c("Polar", "Panda", "Brown", "Brown", "Polar", "Black"), +#' schema = list(x = dtype) +#' ) +#' +#' # Comparing two Enum is only valid if they have the same categories +#' df = pl$DataFrame( +#' x = c("Polar", "Panda", "Brown", "Brown", "Polar"), +#' y = c("Polar", "Polar", "Polar", "Brown", "Brown"), +#' z = c("Polar", "Polar", "Polar", "Brown", "Brown"), +#' schema = list( +#' x = pl$Enum(c("Polar", "Panda", "Brown")), +#' y = pl$Enum(c("Polar", "Panda", "Brown")), +#' z = pl$Enum(c("Polar", "Black", "Brown")) +#' ) +#' ) +#' +#' # Same categories +#' df$with_columns(x_eq_y = pl$col("x") == pl$col("y")) +#' +#' # Different categories +#' tryCatch( +#' df$with_columns(x_eq_z = pl$col("x") == pl$col("z")), +#' error = function(e) e +#' ) +DataType_Enum = function(categories) { + .pr$DataType$new_enum(categories) |> unwrap() +} + + #' Check whether the data type is a temporal type #' #' @return A logical value diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R index 3dcf18657..6a74856e3 100644 --- a/R/extendr-wrappers.R +++ b/R/extendr-wrappers.R @@ -276,6 +276,8 @@ RPolarsDataType$new <- function(s) .Call(wrap__RPolarsDataType__new, s) RPolarsDataType$new_categorical <- function(ordering) .Call(wrap__RPolarsDataType__new_categorical, ordering) +RPolarsDataType$new_enum <- function(categories) .Call(wrap__RPolarsDataType__new_enum, categories) + RPolarsDataType$new_datetime <- function(tu, tz) .Call(wrap__RPolarsDataType__new_datetime, tu, tz) RPolarsDataType$new_duration <- function(tu) .Call(wrap__RPolarsDataType__new_duration, tu) diff --git a/man/DataType_Enum.Rd b/man/DataType_Enum.Rd new file mode 100644 index 000000000..9c3fe7d71 --- /dev/null +++ b/man/DataType_Enum.Rd @@ -0,0 +1,58 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/datatype.R +\name{DataType_Enum} +\alias{DataType_Enum} +\title{Create Enum DataType} +\usage{ +DataType_Enum(categories) +} +\arguments{ +\item{categories}{A character vector specifying the categories of the variable.} +} +\value{ +An Enum DataType +} +\description{ +An \code{Enum} is a fixed set categorical encoding of a set of strings. It is +similar to the \code{\link[=DataType_Categorical]{Categorical}} data type, but the +categories are explicitly provided by the user and cannot be modified. +} +\details{ +This functionality is \strong{unstable}. It is a work-in-progress feature and may +not always work as expected. It may be changed at any point without it being +considered a breaking change. +} +\examples{ +pl$DataFrame( + x = c("Polar", "Panda", "Brown", "Brown", "Polar"), + schema = list(x = pl$Enum(c("Polar", "Panda", "Brown"))) +) + +# All values of the variable have to be in the categories +dtype = pl$Enum(c("Polar", "Panda", "Brown")) +pl$DataFrame( + x = c("Polar", "Panda", "Brown", "Brown", "Polar", "Black"), + schema = list(x = dtype) +) + +# Comparing two Enum is only valid if they have the same categories +df = pl$DataFrame( + x = c("Polar", "Panda", "Brown", "Brown", "Polar"), + y = c("Polar", "Polar", "Polar", "Brown", "Brown"), + z = c("Polar", "Polar", "Polar", "Brown", "Brown"), + schema = list( + x = pl$Enum(c("Polar", "Panda", "Brown")), + y = pl$Enum(c("Polar", "Panda", "Brown")), + z = pl$Enum(c("Polar", "Black", "Brown")) + ) +) + +# Same categories +df$with_columns(x_eq_y = pl$col("x") == pl$col("y")) + +# Different categories +tryCatch( + df$with_columns(x_eq_z = pl$col("x") == pl$col("z")), + error = function(e) e +) +} diff --git a/man/pl_pl.Rd b/man/pl_pl.Rd index 52a0396ee..8f4582750 100644 --- a/man/pl_pl.Rd +++ b/man/pl_pl.Rd @@ -6,7 +6,7 @@ \alias{pl} \title{The complete polars public API.} \format{ -An object of class \code{pl_polars_env} (inherits from \code{environment}) of length 106. +An object of class \code{pl_polars_env} (inherits from \code{environment}) of length 107. } \usage{ pl diff --git a/src/rust/src/rdatatype.rs b/src/rust/src/rdatatype.rs index 375c63fa5..bcac5c2de 100644 --- a/src/rust/src/rdatatype.rs +++ b/src/rust/src/rdatatype.rs @@ -79,6 +79,7 @@ impl RPolarsDataType { "Time" | "time" => pl::DataType::Time, "Null" | "null" => pl::DataType::Null, "Categorical" | "factor" => pl::DataType::Categorical(None, Default::default()), + "Enum" => pl::DataType::Enum(None, Default::default()), "Unknown" | "unknown" => pl::DataType::Unknown, _ => panic!("data type not recgnized "), @@ -91,6 +92,16 @@ impl RPolarsDataType { Ok(RPolarsDataType(pl::DataType::Categorical(None, ordering))) } + pub fn new_enum(categories: Robj) -> RResult { + use crate::conversion_r_to_s::robjname2series; + let s = robjname2series(categories, "").unwrap(); + let ca = s.str()?; + let categories = ca.downcast_iter().next().unwrap().clone(); + Ok(RPolarsDataType(pl::datatypes::create_enum_data_type( + categories, + ))) + } + pub fn new_datetime(tu: Robj, tz: Nullable) -> RResult { robj_to!(timeunit, tu) .map(|dt| RPolarsDataType(pl::DataType::Datetime(dt, null_to_opt(tz)))) From a38a858973f1a1e9437218b8c8408d957097462d Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Thu, 25 Apr 2024 15:59:57 +0200 Subject: [PATCH 2/6] minor [skip ci] --- src/rust/src/rdatatype.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/rust/src/rdatatype.rs b/src/rust/src/rdatatype.rs index bcac5c2de..c74890385 100644 --- a/src/rust/src/rdatatype.rs +++ b/src/rust/src/rdatatype.rs @@ -214,6 +214,20 @@ impl RPolarsDataType { self.0.is_temporal() } + // When rust-polars 0.40.0 is released: + + // pub fn is_enum(&self) -> bool { + // self.0.is_enum() + // } + + // pub fn is_categorical(&self) -> bool { + // self.0.is_categorical() + // } + + // pub fn is_string(&self) -> bool { + // self.0.is_string() + // } + pub fn is_logical(&self) -> bool { self.0.is_logical() } From 4c1649f4698c2daf07e0afe43316c4ec49157f8d Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Fri, 26 Apr 2024 16:33:04 +0200 Subject: [PATCH 3/6] docs --- R/datatype.R | 10 ++++++---- man/DataType_Enum.Rd | 9 ++++++--- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/R/datatype.R b/R/datatype.R index 8545ac559..a7b794d4a 100644 --- a/R/datatype.R +++ b/R/datatype.R @@ -375,7 +375,6 @@ DataType_Categorical = function(ordering = "physical") { #' #' @param categories A character vector specifying the categories of the variable. #' -#' #' @return An Enum DataType #' @examples #' pl$DataFrame( @@ -385,9 +384,12 @@ DataType_Categorical = function(ordering = "physical") { #' #' # All values of the variable have to be in the categories #' dtype = pl$Enum(c("Polar", "Panda", "Brown")) -#' pl$DataFrame( -#' x = c("Polar", "Panda", "Brown", "Brown", "Polar", "Black"), -#' schema = list(x = dtype) +#' tryCatch( +#' pl$DataFrame( +#' x = c("Polar", "Panda", "Brown", "Brown", "Polar", "Black"), +#' schema = list(x = dtype) +#' ), +#' error = function(e) e #' ) #' #' # Comparing two Enum is only valid if they have the same categories diff --git a/man/DataType_Enum.Rd b/man/DataType_Enum.Rd index 9c3fe7d71..f30695a4d 100644 --- a/man/DataType_Enum.Rd +++ b/man/DataType_Enum.Rd @@ -30,9 +30,12 @@ pl$DataFrame( # All values of the variable have to be in the categories dtype = pl$Enum(c("Polar", "Panda", "Brown")) -pl$DataFrame( - x = c("Polar", "Panda", "Brown", "Brown", "Polar", "Black"), - schema = list(x = dtype) +tryCatch( + pl$DataFrame( + x = c("Polar", "Panda", "Brown", "Brown", "Polar", "Black"), + schema = list(x = dtype) + ), + error = function(e) e ) # Comparing two Enum is only valid if they have the same categories From 1854b899cee7936961f276565de43117e8ce7b73 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Fri, 26 Apr 2024 16:33:10 +0200 Subject: [PATCH 4/6] test --- src/rust/src/conversion_s_to_r.rs | 3 +++ tests/testthat/test-datatype.R | 39 +++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/src/rust/src/conversion_s_to_r.rs b/src/rust/src/conversion_s_to_r.rs index a0e0f8193..09c936ea8 100644 --- a/src/rust/src/conversion_s_to_r.rs +++ b/src/rust/src/conversion_s_to_r.rs @@ -110,6 +110,9 @@ pub fn pl_series_to_list( .set_class(["rpolars_raw_list", "list"]) .expect("this class label is always valid") }), + Enum(_, _) => s + .categorical() + .map(|ca| extendr_api::call!("factor", ca.iter_str().collect_robj()).unwrap()), Categorical(_, _) => s .categorical() .map(|ca| extendr_api::call!("factor", ca.iter_str().collect_robj()).unwrap()), diff --git a/tests/testthat/test-datatype.R b/tests/testthat/test-datatype.R index 7cd8f879e..d44bf7ca0 100644 --- a/tests/testthat/test-datatype.R +++ b/tests/testthat/test-datatype.R @@ -198,3 +198,42 @@ test_that("is_* functions for datatype work", { expect_false(pl$Struct()$is_primitive()) expect_false(pl$List()$is_primitive()) }) + +test_that("Enum", { + expect_identical( + as_polars_series(c("z", "z", "k", "a"))$ + cast(pl$Enum(c("z", "k", "a")))$ + to_r(), + factor(c("z", "z", "k", "a")) + ) + + expect_grepl_error(pl$Enum(), "missing") + expect_grepl_error(pl$Enum(1), "invalid series dtype") + expect_grepl_error(pl$Enum(TRUE), "invalid series dtype") + expect_grepl_error(pl$Enum(factor("a")), "invalid series dtype") + + expect_error( + as_polars_series(c("z", "z", "k", "a"))$ + cast(pl$Enum(c("foo", "k", "a"))), + "Ensure that all values in the input column are present" + ) + + # Can compare two cols if same Enum categories only + + df = pl$DataFrame(x = "a", y = "b", z = "c")$ + with_columns( + pl$col("x")$cast(pl$Enum(c("a", "b", "c"))), + pl$col("y")$cast(pl$Enum(c("a", "b", "c"))), + pl$col("z")$cast(pl$Enum(c("a", "c"))) + ) + + expect_identical( + df$select(x_eq_y = pl$col("x") == pl$col("y"))$to_list(), + list(x_eq_y = FALSE) + ) + + expect_grepl_error( + df$select(x_eq_z = pl$col("x") == pl$col("z")), + "cannot compare categoricals coming from different sources" + ) +}) From 637697500a2bf23992c11f52a7b176e9dd10b060 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Fri, 26 Apr 2024 16:33:14 +0200 Subject: [PATCH 5/6] news --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index de7fd44b9..cc35b6b7d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,6 +5,7 @@ ### New features - `$cut()` and `$qcut()` to bin continuous values into discrete categories (#1057). +- Add support for the `Enum` datatype via `pl$Enum()` (#1061). ### Bug fixes From 0f39278f7e60171ef3af3b54881248fba3fa4f66 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Fri, 26 Apr 2024 16:35:54 +0200 Subject: [PATCH 6/6] snapshot --- tests/testthat/_snaps/after-wrappers.md | 99 +++++++++++++------------ 1 file changed, 50 insertions(+), 49 deletions(-) diff --git a/tests/testthat/_snaps/after-wrappers.md b/tests/testthat/_snaps/after-wrappers.md index 514e2a805..3c753700e 100644 --- a/tests/testthat/_snaps/after-wrappers.md +++ b/tests/testthat/_snaps/after-wrappers.md @@ -7,55 +7,56 @@ [3] "Boolean" "Categorical" [5] "DataFrame" "Date" [7] "Datetime" "Duration" - [9] "Field" "Float32" - [11] "Float64" "Int16" - [13] "Int32" "Int64" - [15] "Int8" "LazyFrame" - [17] "List" "Null" - [19] "PTime" "SQLContext" - [21] "Series" "String" - [23] "Struct" "Time" - [25] "UInt16" "UInt32" - [27] "UInt64" "UInt8" - [29] "Unknown" "Utf8" - [31] "all" "all_horizontal" - [33] "any_horizontal" "approx_n_unique" - [35] "arg_sort_by" "arg_where" - [37] "class_names" "coalesce" - [39] "col" "concat" - [41] "concat_list" "concat_str" - [43] "corr" "count" - [45] "cov" "date" - [47] "date_range" "date_ranges" - [49] "datetime" "datetime_range" - [51] "datetime_ranges" "disable_string_cache" - [53] "dtypes" "duration" - [55] "element" "enable_string_cache" - [57] "first" "fold" - [59] "from_epoch" "get_global_rpool_cap" - [61] "head" "implode" - [63] "int_range" "int_ranges" - [65] "is_schema" "last" - [67] "len" "lit" - [69] "max" "max_horizontal" - [71] "mean" "mean_horizontal" - [73] "median" "mem_address" - [75] "min" "min_horizontal" - [77] "n_unique" "numeric_dtypes" - [79] "raw_list" "read_csv" - [81] "read_ipc" "read_ndjson" - [83] "read_parquet" "reduce" - [85] "rolling_corr" "rolling_cov" - [87] "same_outer_dt" "scan_csv" - [89] "scan_ipc" "scan_ndjson" - [91] "scan_parquet" "select" - [93] "set_global_rpool_cap" "show_all_public_functions" - [95] "show_all_public_methods" "std" - [97] "struct" "sum" - [99] "sum_horizontal" "tail" - [101] "thread_pool_size" "time" - [103] "using_string_cache" "var" - [105] "when" "with_string_cache" + [9] "Enum" "Field" + [11] "Float32" "Float64" + [13] "Int16" "Int32" + [15] "Int64" "Int8" + [17] "LazyFrame" "List" + [19] "Null" "PTime" + [21] "SQLContext" "Series" + [23] "String" "Struct" + [25] "Time" "UInt16" + [27] "UInt32" "UInt64" + [29] "UInt8" "Unknown" + [31] "Utf8" "all" + [33] "all_horizontal" "any_horizontal" + [35] "approx_n_unique" "arg_sort_by" + [37] "arg_where" "class_names" + [39] "coalesce" "col" + [41] "concat" "concat_list" + [43] "concat_str" "corr" + [45] "count" "cov" + [47] "date" "date_range" + [49] "date_ranges" "datetime" + [51] "datetime_range" "datetime_ranges" + [53] "disable_string_cache" "dtypes" + [55] "duration" "element" + [57] "enable_string_cache" "first" + [59] "fold" "from_epoch" + [61] "get_global_rpool_cap" "head" + [63] "implode" "int_range" + [65] "int_ranges" "is_schema" + [67] "last" "len" + [69] "lit" "max" + [71] "max_horizontal" "mean" + [73] "mean_horizontal" "median" + [75] "mem_address" "min" + [77] "min_horizontal" "n_unique" + [79] "numeric_dtypes" "raw_list" + [81] "read_csv" "read_ipc" + [83] "read_ndjson" "read_parquet" + [85] "reduce" "rolling_corr" + [87] "rolling_cov" "same_outer_dt" + [89] "scan_csv" "scan_ipc" + [91] "scan_ndjson" "scan_parquet" + [93] "select" "set_global_rpool_cap" + [95] "show_all_public_functions" "show_all_public_methods" + [97] "std" "struct" + [99] "sum" "sum_horizontal" + [101] "tail" "thread_pool_size" + [103] "time" "using_string_cache" + [105] "var" "when" + [107] "with_string_cache" ---