From a9c75d23a39e3bfd741c07d295c14ed617dee4fb Mon Sep 17 00:00:00 2001 From: Onur Satici Date: Wed, 29 Jan 2025 13:28:25 +0000 Subject: [PATCH] fix infering arrow lists with nullable elements (#2112) --- vortex-array/src/arrow/dtype.rs | 32 ++++++++++++++++++++++++++++++-- vortex-array/src/canonical.rs | 2 +- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/vortex-array/src/arrow/dtype.rs b/vortex-array/src/arrow/dtype.rs index 7e0a2be3b1..06061d5016 100644 --- a/vortex-array/src/arrow/dtype.rs +++ b/vortex-array/src/arrow/dtype.rs @@ -116,6 +116,8 @@ pub fn infer_schema(dtype: &DType) -> VortexResult { } /// Try to convert a Vortex [`DType`] into an Arrow [`DataType`] +/// Top level nulltability from the DType is dropped, Arrow represents +/// nullability for a DataType in [`Field`] pub fn infer_data_type(dtype: &DType) -> VortexResult { Ok(match dtype { DType::Null => DataType::Null, @@ -150,9 +152,9 @@ pub fn infer_data_type(dtype: &DType) -> VortexResult { // There are four kinds of lists: List (32-bit offsets), Large List (64-bit), List View // (32-bit), Large List View (64-bit). We cannot both guarantee zero-copy and commit to an // Arrow dtype because we do not how large our offsets are. - DType::List(l, null) => DataType::List(FieldRef::new(Field::new_list_field( + DType::List(l, _) => DataType::List(FieldRef::new(Field::new_list_field( infer_data_type(l.as_ref())?, - (*null).into(), + l.nullability().into(), ))), DType::Extension(ext_dtype) => { // Try and match against the known extension DTypes. @@ -214,6 +216,32 @@ mod test { ); } + #[test] + fn infer_nullable_list_element() { + let list_non_nullable = DType::List( + Arc::new(DType::Primitive(PType::I64, Nullability::NonNullable)), + Nullability::Nullable, + ); + + let arrow_list_non_nullable = infer_data_type(&list_non_nullable).unwrap(); + + let list_nullable = DType::List( + Arc::new(DType::Primitive(PType::I64, Nullability::Nullable)), + Nullability::Nullable, + ); + let arrow_list_nullable = infer_data_type(&list_nullable).unwrap(); + + assert_ne!(arrow_list_non_nullable, arrow_list_nullable); + assert_eq!( + arrow_list_nullable, + DataType::new_list(DataType::Int64, true) + ); + assert_eq!( + arrow_list_non_nullable, + DataType::new_list(DataType::Int64, false) + ); + } + #[test] #[should_panic] fn test_dtype_conversion_panics() { diff --git a/vortex-array/src/canonical.rs b/vortex-array/src/canonical.rs index c724a00b32..1e91bddc2f 100644 --- a/vortex-array/src/canonical.rs +++ b/vortex-array/src/canonical.rs @@ -69,7 +69,7 @@ pub enum Canonical { impl Canonical { // Create an empty canonical array of the given dtype. pub fn empty(dtype: &DType) -> Canonical { - Self::try_empty(dtype).vortex_expect("Cannot build an empty array") + Self::try_empty(dtype).vortex_expect("Cannot fail to build an empty array") } pub fn try_empty(dtype: &DType) -> VortexResult {