Allow hints for upcasting parquet to arrow integer types
gruuya committed Dec 17, 2024
1 parent f5b51ff commit 4d39589
Showing 2 changed files with 35 additions and 7 deletions.
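For context, the sketch below shows how a caller could use the new widening hints end to end: it writes a single Int8 column to an in-memory Parquet buffer and reads it back as Int64 by supplying a schema through ArrowReaderOptions::with_schema, mirroring the pattern the updated test exercises. The column name small_int and the round-trip harness are illustrative assumptions, not part of this commit.

use std::sync::Arc;

use arrow_array::{ArrayRef, Int8Array, RecordBatch};
use arrow_schema::{DataType, Field, Schema};
use bytes::Bytes;
use parquet::arrow::arrow_reader::{ArrowReaderOptions, ParquetRecordBatchReaderBuilder};
use parquet::arrow::ArrowWriter;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Write one Int8 column ("small_int" is an illustrative name) to an
    // in-memory Parquet buffer.
    let batch = RecordBatch::try_from_iter([(
        "small_int",
        Arc::new(Int8Array::from(vec![1, 2, 3])) as ArrayRef,
    )])?;
    let mut buf = Vec::new();
    let mut writer = ArrowWriter::try_new(&mut buf, batch.schema(), None)?;
    writer.write(&batch)?;
    writer.close()?;

    // Supply a schema that hints the column should come back as Int64.
    // Before this commit such a widening hint was rejected; now it is applied.
    let hinted = Arc::new(Schema::new(vec![Field::new(
        "small_int",
        DataType::Int64,
        false,
    )]));
    let options = ArrowReaderOptions::new().with_schema(hinted);
    let mut reader =
        ParquetRecordBatchReaderBuilder::try_new_with_options(Bytes::from(buf), options)?
            .build()?;

    let out = reader.next().unwrap()?;
    assert_eq!(out.column(0).data_type(), &DataType::Int64);
    Ok(())
}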
37 changes: 30 additions & 7 deletions parquet/src/arrow/arrow_reader/mod.rs
@@ -937,6 +937,7 @@ mod tests {
};
use arrow_array::*;
use arrow_buffer::{i256, ArrowNativeType, Buffer, IntervalDayTime};
use arrow_cast::pretty::pretty_format_batches;
use arrow_data::ArrayDataBuilder;
use arrow_schema::{
ArrowError, DataType as ArrowDataType, Field, Fields, Schema, SchemaRef, TimeUnit,
@@ -3184,11 +3185,13 @@ mod tests {
let nested_fields = Fields::from(vec![
Field::new("utf8_to_dict", ArrowDataType::Utf8, false),
Field::new("int64_to_ts_nano", ArrowDataType::Int64, false),
Field::new("int16_to_int32", ArrowDataType::Int16, false),
]);

let nested_arrays: Vec<ArrayRef> = vec![
Arc::new(StringArray::from(vec!["a", "a", "a", "b"])) as ArrayRef,
Arc::new(Int64Array::from(vec![1, 2, 3, 4])) as ArrayRef,
Arc::new(Int16Array::from(vec![1, 2, 3, 4])) as ArrayRef,
];

let nested = StructArray::try_new(nested_fields, nested_arrays, None).unwrap();
@@ -3202,6 +3205,10 @@
"date32_to_date64",
Arc::new(Date32Array::from(vec![0, 1, 2, 3])) as ArrayRef,
),
(
"int8_to_int64",
Arc::new(Int8Array::from(vec![0, 1, 2, 3])) as ArrayRef,
),
("nested", Arc::new(nested) as ArrayRef),
]);

@@ -3216,21 +3223,20 @@
),
Field::new(
"int64_to_ts_nano",
ArrowDataType::Timestamp(
arrow::datatypes::TimeUnit::Nanosecond,
Some("+10:00".into()),
),
ArrowDataType::Timestamp(TimeUnit::Nanosecond, Some("+10:00".into())),
false,
),
Field::new("int16_to_int32", ArrowDataType::Int32, false),
]);

let supplied_schema = Arc::new(Schema::new(vec![
Field::new(
"int32_to_ts_second",
ArrowDataType::Timestamp(arrow::datatypes::TimeUnit::Second, Some("+01:00".into())),
ArrowDataType::Timestamp(TimeUnit::Second, Some("+01:00".into())),
false,
),
Field::new("date32_to_date64", ArrowDataType::Date64, false),
Field::new("int8_to_int64", ArrowDataType::Int64, false),
Field::new(
"nested",
ArrowDataType::Struct(supplied_nested_fields),
@@ -3249,7 +3255,7 @@

assert_eq!(arrow_reader.schema(), supplied_schema);
let batch = arrow_reader.next().unwrap().unwrap();
assert_eq!(batch.num_columns(), 3);
assert_eq!(batch.num_columns(), 4);
assert_eq!(batch.num_rows(), 4);
assert_eq!(
batch
@@ -3273,9 +3279,17 @@
.expect("value as date"),
"1970-01-01"
);
assert_eq!(
batch
.column(2)
.as_any()
.downcast_ref::<Int64Array>()
.expect("downcast to int64"),
&Int64Array::from(vec![0, 1, 2, 3]),
);

let nested = batch
.column(2)
.column(3)
.as_any()
.downcast_ref::<StructArray>()
.expect("downcast to struct");
@@ -3313,6 +3327,15 @@ mod tests {
.expect("value as datetime"),
"1970-01-01 10:00:00.000000001 +10:00"
);

assert_eq!(
nested
.column(2)
.as_any()
.downcast_ref::<Int32Array>()
.expect("downcast to int64"),
&Int32Array::from(vec![1, 2, 3, 4]),
);
}

#[test]
5 changes: 5 additions & 0 deletions parquet/src/arrow/schema/primitive.rs
@@ -44,6 +44,11 @@ fn apply_hint(parquet: DataType, hint: DataType) -> DataType {
(DataType::Int32, DataType::Time32(_)) => hint,
(DataType::Int64, DataType::Time64(_)) => hint,

// Allow up-casting integers (i.e. no precision loss)
(DataType::Int8, DataType::Int16) => hint,
(DataType::Int8 | DataType::Int16, DataType::Int32) => hint,
(DataType::Int8 | DataType::Int16 | DataType::Int32, DataType::Int64) => hint,

// Date64 doesn't have a corresponding LogicalType / ConvertedType
(DataType::Int64, DataType::Date64) => hint,

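The new arms only fire for widening combinations; the rest of apply_hint (not shown here) presumably falls back to the Parquet-derived type, so narrowing hints are still ignored. A minimal standalone sketch of that rule, using the hypothetical helper name integer_hint_applies:

use arrow_schema::DataType;

// Mirrors the new match arms: a hint is honoured only when it widens the
// Parquet-derived integer type, so the cast can never lose precision.
// Narrowing (e.g. Int64 -> Int32) does not match and the Parquet-derived
// type would be kept.
fn integer_hint_applies(parquet: &DataType, hint: &DataType) -> bool {
    matches!(
        (parquet, hint),
        (DataType::Int8, DataType::Int16)
            | (DataType::Int8 | DataType::Int16, DataType::Int32)
            | (DataType::Int8 | DataType::Int16 | DataType::Int32, DataType::Int64)
    )
}

fn main() {
    assert!(integer_hint_applies(&DataType::Int8, &DataType::Int64));
    assert!(!integer_hint_applies(&DataType::Int64, &DataType::Int32));
}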
