-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Signature::Coercible with user defined implicit casting #14440
base: main
Are you sure you want to change the base?
Changes from 24 commits
386c7ed
579ffef
104da43
aae48ff
f585136
bad7348
c99e986
07f97d0
da84394
e0a889e
7a78a6d
2f8c2ad
f02bf6e
8054915
62da381
231d75b
44250fc
2f6b5d6
5387e5e
181f2ab
d98e4b6
cb30d91
2cd8f5f
6ce159f
a3120f7
bde60f6
5d47a62
59d06d8
ff5d063
77fe4b8
7dc7365
6ce2c8c
56838b7
6ceb9a2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -19,11 +19,13 @@ | |||||
//! and return types of functions in DataFusion. | ||||||
use std::fmt::Display; | ||||||
use std::hash::Hash; | ||||||
use std::num::NonZeroUsize; | ||||||
|
||||||
use crate::type_coercion::aggregates::NUMERICS; | ||||||
use arrow::datatypes::{DataType, IntervalUnit, TimeUnit}; | ||||||
use datafusion_common::types::{LogicalTypeRef, NativeType}; | ||||||
use indexmap::IndexSet; | ||||||
use itertools::Itertools; | ||||||
|
||||||
/// Constant that is used as a placeholder for any valid timezone. | ||||||
|
@@ -127,12 +129,11 @@ pub enum TypeSignature { | |||||
Exact(Vec<DataType>), | ||||||
/// One or more arguments belonging to the [`TypeSignatureClass`], in order. | ||||||
/// | ||||||
/// For example, `Coercible(vec![logical_float64()])` accepts | ||||||
/// arguments like `vec![Int32]` or `vec![Float32]` | ||||||
/// since i32 and f32 can be cast to f64 | ||||||
/// [`Coercion`] contains not only the desired type but also the allowed casts. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this API makes a lot of sense to me-- in fact I think it is pretty close to being able to express most other signatures. |
||||||
/// For example, if you expect a function has string type, but you also allow it to be casted from binary type. | ||||||
/// | ||||||
/// For functions that take no arguments (e.g. `random()`) see [`TypeSignature::Nullary`]. | ||||||
Coercible(Vec<TypeSignatureClass>), | ||||||
Coercible(Vec<Coercion>), | ||||||
/// One or more arguments coercible to a single, comparable type. | ||||||
/// | ||||||
/// Each argument will be coerced to a single type using the | ||||||
|
@@ -209,14 +210,13 @@ impl TypeSignature { | |||||
#[derive(Debug, Clone, Eq, PartialEq, PartialOrd, Hash)] | ||||||
pub enum TypeSignatureClass { | ||||||
Timestamp, | ||||||
Date, | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can use NativeType::Date instead, not need |
||||||
Time, | ||||||
Interval, | ||||||
Duration, | ||||||
Native(LogicalTypeRef), | ||||||
// TODO: | ||||||
// Numeric | ||||||
// Integer | ||||||
Integer, | ||||||
} | ||||||
|
||||||
impl Display for TypeSignatureClass { | ||||||
|
@@ -310,8 +310,8 @@ impl TypeSignature { | |||||
TypeSignature::Comparable(num) => { | ||||||
vec![format!("Comparable({num})")] | ||||||
} | ||||||
TypeSignature::Coercible(types) => { | ||||||
vec![Self::join_types(types, ", ")] | ||||||
TypeSignature::Coercible(coercions) => { | ||||||
vec![Self::join_types(coercions, ", ")] | ||||||
} | ||||||
TypeSignature::Exact(types) => { | ||||||
vec![Self::join_types(types, ", ")] | ||||||
|
@@ -365,7 +365,12 @@ impl TypeSignature { | |||||
} | ||||||
} | ||||||
|
||||||
/// get all possible types for the given `TypeSignature` | ||||||
/// This function is used specifically internally for `information_schema` | ||||||
/// We suggest not to rely on this function | ||||||
/// | ||||||
/// Get all possible types for `information_schema` from the given `TypeSignature` | ||||||
// | ||||||
// TODO: Make this function private | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we could potentially mark it as deprecated and make the new (non pub) version with a different name Maybe something like this (can do in a follow on PR) pub fn get_example_types(&self) -> Vec<Vec<DataType>> { There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we would need NativeType for |
||||||
pub fn get_possible_types(&self) -> Vec<Vec<DataType>> { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice comment, can we rename the function too? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is public function 😮 |
||||||
match self { | ||||||
TypeSignature::Exact(types) => vec![types.clone()], | ||||||
|
@@ -378,31 +383,24 @@ impl TypeSignature { | |||||
.cloned() | ||||||
.map(|data_type| vec![data_type; *arg_count]) | ||||||
.collect(), | ||||||
TypeSignature::Coercible(types) => types | ||||||
TypeSignature::Coercible(coercions) => coercions | ||||||
.iter() | ||||||
.map(|logical_type| match logical_type { | ||||||
TypeSignatureClass::Native(l) => get_data_types(l.native()), | ||||||
TypeSignatureClass::Timestamp => { | ||||||
vec![ | ||||||
DataType::Timestamp(TimeUnit::Nanosecond, None), | ||||||
DataType::Timestamp( | ||||||
TimeUnit::Nanosecond, | ||||||
Some(TIMEZONE_WILDCARD.into()), | ||||||
), | ||||||
] | ||||||
} | ||||||
TypeSignatureClass::Date => { | ||||||
vec![DataType::Date64] | ||||||
} | ||||||
TypeSignatureClass::Time => { | ||||||
vec![DataType::Time64(TimeUnit::Nanosecond)] | ||||||
} | ||||||
TypeSignatureClass::Interval => { | ||||||
vec![DataType::Interval(IntervalUnit::DayTime)] | ||||||
} | ||||||
TypeSignatureClass::Duration => { | ||||||
vec![DataType::Duration(TimeUnit::Nanosecond)] | ||||||
.map(|c| { | ||||||
let mut all_types: IndexSet<DataType> = | ||||||
get_possible_types_from_signature_classes(&c.desired_type) | ||||||
.into_iter() | ||||||
.collect(); | ||||||
|
||||||
if let Some(implicit_coercion) = &c.implicit_coercion { | ||||||
let allowed_casts: Vec<DataType> = implicit_coercion | ||||||
.allowed_source_types | ||||||
.iter() | ||||||
.flat_map(get_possible_types_from_signature_classes) | ||||||
.collect(); | ||||||
all_types.extend(allowed_casts); | ||||||
} | ||||||
|
||||||
all_types.into_iter().collect::<Vec<_>>() | ||||||
}) | ||||||
.multi_cartesian_product() | ||||||
.collect(), | ||||||
|
@@ -431,6 +429,32 @@ impl TypeSignature { | |||||
} | ||||||
} | ||||||
|
||||||
fn get_possible_types_from_signature_classes( | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this would make more sense to me as a method on There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This function is used in @goldmedal There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
@jayzhan211 I'm moving the conversation to this thread to consolidate. I totally understand your concern, here are my thoughts:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree with @shehabgamin on both points There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Suggested name:
Suggested change
And we can put it on TypeSignatureClass |
||||||
signature_classes: &TypeSignatureClass, | ||||||
) -> Vec<DataType> { | ||||||
match signature_classes { | ||||||
TypeSignatureClass::Native(l) => get_data_types(l.native()), | ||||||
TypeSignatureClass::Timestamp => { | ||||||
vec![ | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The rest of the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it is "simplified" because of some reasons, if we add all possible types, the combination will be huge. This function is used in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I don't think this is helpful to be honest -- just the combination of classes |
||||||
DataType::Timestamp(TimeUnit::Nanosecond, None), | ||||||
DataType::Timestamp(TimeUnit::Nanosecond, Some(TIMEZONE_WILDCARD.into())), | ||||||
] | ||||||
} | ||||||
TypeSignatureClass::Time => { | ||||||
vec![DataType::Time64(TimeUnit::Nanosecond)] | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should encompass all possible There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it is ok to have one example type as this is only used for inforamtin schema -- the naming is super confusing though -- I think renaming the functions would make it much better |
||||||
} | ||||||
TypeSignatureClass::Interval => { | ||||||
vec![DataType::Interval(IntervalUnit::DayTime)] | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should encompass all possible |
||||||
} | ||||||
TypeSignatureClass::Duration => { | ||||||
vec![DataType::Duration(TimeUnit::Nanosecond)] | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should encompass all possible |
||||||
} | ||||||
TypeSignatureClass::Integer => { | ||||||
vec![DataType::Int64] | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should encompass all possible ints. |
||||||
} | ||||||
} | ||||||
} | ||||||
|
||||||
fn get_data_types(native_type: &NativeType) -> Vec<DataType> { | ||||||
match native_type { | ||||||
NativeType::Null => vec![DataType::Null], | ||||||
|
@@ -460,6 +484,110 @@ fn get_data_types(native_type: &NativeType) -> Vec<DataType> { | |||||
} | ||||||
} | ||||||
|
||||||
#[derive(Debug, Clone, Eq, PartialOrd)] | ||||||
pub struct Coercion { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I really like the idea of Coercion. Can this idea be used for user defined coercions or coercions to specific (Arrow) types? Also, is there any way to allow users to provide their own coercion rules? For example, if Sail / @shehabgamin wants to support converting numeric values to strings automatically, would he be express that? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1 this would be really great to have! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Similar to ascii that binary to string is supported, you add numeric types in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jayzhan211 , I believe @alamb 's question (please correct me if I'm wrong) is about creating functionality for a downstream user to override the default signature of a UDF in order to provide their own coercion rules. For example, something like this:
This is the conversation we were having here as well: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is completely another issue 😅 , before that we need to improve existing function signature provide their own coercion rules not own signature There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
This is what I was getting at What do you think about making #[derive(Debug, Clone, Eq, PartialOrd)]
pub enum Coercion {
/// the argument type must match desired_type
Exact {
desired_type: TypeSignatureClass,
}
/// The argument type must be coercable to the desired type using the implicit coercion
Implict {
desired_type: TypeSignatureClass,
implicit_coercion: ImplicitCoercion,
}
} Then we can eventually maybe add a "user defined coercion" variant Let me make a PR to see what this would look like. |
||||||
pub desired_type: TypeSignatureClass, | ||||||
implicit_coercion: Option<ImplicitCoercion>, | ||||||
} | ||||||
|
||||||
impl Coercion { | ||||||
pub fn new(desired_type: TypeSignatureClass) -> Self { | ||||||
Self { | ||||||
desired_type, | ||||||
implicit_coercion: None, | ||||||
} | ||||||
} | ||||||
|
||||||
/// Create a new coercion with implicit coercion rules. | ||||||
/// | ||||||
/// `allowed_source_types` defines the possible types that can be coerced to `desired_type`. | ||||||
/// `default_casted_type` is the default type to be used for coercion if we cast from other types via `allowed_source_types`. | ||||||
pub fn new_with_implicit_coercion( | ||||||
desired_type: TypeSignatureClass, | ||||||
allowed_source_types: Vec<TypeSignatureClass>, | ||||||
default_casted_type: NativeType, | ||||||
) -> Self { | ||||||
Self { | ||||||
desired_type, | ||||||
implicit_coercion: Some(ImplicitCoercion { | ||||||
allowed_source_types, | ||||||
default_casted_type, | ||||||
}), | ||||||
} | ||||||
} | ||||||
|
||||||
pub fn allowed_source_types(&self) -> &[TypeSignatureClass] { | ||||||
self.implicit_coercion | ||||||
.as_ref() | ||||||
.map(|c| c.allowed_source_types.as_slice()) | ||||||
.unwrap_or_default() | ||||||
} | ||||||
|
||||||
pub fn default_casted_type(&self) -> Option<&NativeType> { | ||||||
self.implicit_coercion | ||||||
.as_ref() | ||||||
.map(|c| &c.default_casted_type) | ||||||
} | ||||||
} | ||||||
|
||||||
impl Display for Coercion { | ||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { | ||||||
write!(f, "Coercion({}", self.desired_type)?; | ||||||
if let Some(implicit_coercion) = &self.implicit_coercion { | ||||||
write!(f, ", implicit_coercion={implicit_coercion}",) | ||||||
} else { | ||||||
write!(f, ")") | ||||||
} | ||||||
} | ||||||
} | ||||||
|
||||||
impl PartialEq for Coercion { | ||||||
fn eq(&self, other: &Self) -> bool { | ||||||
self.desired_type == other.desired_type | ||||||
&& self.implicit_coercion == other.implicit_coercion | ||||||
} | ||||||
} | ||||||
|
||||||
impl Hash for Coercion { | ||||||
fn hash<H: std::hash::Hasher>(&self, state: &mut H) { | ||||||
self.desired_type.hash(state); | ||||||
self.implicit_coercion.hash(state); | ||||||
} | ||||||
} | ||||||
|
||||||
#[derive(Debug, Clone, Eq, PartialOrd)] | ||||||
pub struct ImplicitCoercion { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We could also add some doc for it. Maybe adding some examples here is good. I was confused about what it be used for before I did some tests. |
||||||
allowed_source_types: Vec<TypeSignatureClass>, | ||||||
/// For types like Timestamp, there are multiple possible timeunit and timezone from a given TypeSignatureClass | ||||||
/// We need to specify the default type to be used for coercion if we cast from other types via `allowed_source_types` | ||||||
/// Other types like Int64, you don't need to specify this field since there is only one possible type. | ||||||
default_casted_type: NativeType, | ||||||
} | ||||||
|
||||||
impl Display for ImplicitCoercion { | ||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { | ||||||
write!( | ||||||
f, | ||||||
"ImplicitCoercion({:?}, default_type={:?})", | ||||||
self.allowed_source_types, self.default_casted_type | ||||||
) | ||||||
} | ||||||
} | ||||||
|
||||||
impl PartialEq for ImplicitCoercion { | ||||||
fn eq(&self, other: &Self) -> bool { | ||||||
self.allowed_source_types == other.allowed_source_types | ||||||
&& self.default_casted_type == other.default_casted_type | ||||||
} | ||||||
} | ||||||
|
||||||
impl Hash for ImplicitCoercion { | ||||||
fn hash<H: std::hash::Hasher>(&self, state: &mut H) { | ||||||
self.allowed_source_types.hash(state); | ||||||
self.default_casted_type.hash(state); | ||||||
} | ||||||
} | ||||||
|
||||||
/// Defines the supported argument types ([`TypeSignature`]) and [`Volatility`] for a function. | ||||||
/// | ||||||
/// DataFusion will automatically coerce (cast) argument types to one of the supported | ||||||
|
@@ -536,11 +664,9 @@ impl Signature { | |||||
volatility, | ||||||
} | ||||||
} | ||||||
|
||||||
/// Target coerce types in order | ||||||
pub fn coercible( | ||||||
target_types: Vec<TypeSignatureClass>, | ||||||
volatility: Volatility, | ||||||
) -> Self { | ||||||
pub fn coercible(target_types: Vec<Coercion>, volatility: Volatility) -> Self { | ||||||
Self { | ||||||
type_signature: TypeSignature::Coercible(target_types), | ||||||
volatility, | ||||||
|
@@ -739,8 +865,8 @@ mod tests { | |||||
); | ||||||
|
||||||
let type_signature = TypeSignature::Coercible(vec![ | ||||||
TypeSignatureClass::Native(logical_string()), | ||||||
TypeSignatureClass::Native(logical_int64()), | ||||||
Coercion::new(TypeSignatureClass::Native(logical_string())), | ||||||
Coercion::new(TypeSignatureClass::Native(logical_int64())), | ||||||
]); | ||||||
let possible_types = type_signature.get_possible_types(); | ||||||
assert_eq!( | ||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
😍