Skip to content

Commit

Permalink
fix: Data type inference for NaN, inf and -inf in csv files (#7150)
Browse files (browse the repository at this point in the history)
* fix: Data type inference for NaN, inf and -inf in csv files

* Adds tests for NaN, inf and -inf Float64 values

* Adds Python-style NaN
  • Loading branch information
Mottl authored Feb 19, 2025
1 parent 46afdd8 commit 7b057e1
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 3 deletions.
10 changes: 8 additions & 2 deletions arrow-csv/src/reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,8 @@ impl InferredDataType {
} else {
1 << m
}
+ } else if string == "NaN" || string == "nan" || string == "inf" || string == "-inf" {
+ 1 << 2 // Float64
} else {
1 << 8 // Utf8
}
Expand Down Expand Up @@ -1659,7 +1661,7 @@ mod tests {
let mut csv = builder.build(file).unwrap();
let batch = csv.next().unwrap().unwrap();

- assert_eq!(7, batch.num_rows());
+ assert_eq!(10, batch.num_rows());
assert_eq!(6, batch.num_columns());

let schema = batch.schema();
Expand Down Expand Up @@ -1803,6 +1805,10 @@ mod tests {
assert_eq!(infer_field_schema("10.2"), DataType::Float64);
assert_eq!(infer_field_schema(".2"), DataType::Float64);
assert_eq!(infer_field_schema("2."), DataType::Float64);
+ assert_eq!(infer_field_schema("NaN"), DataType::Float64);
+ assert_eq!(infer_field_schema("nan"), DataType::Float64);
+ assert_eq!(infer_field_schema("inf"), DataType::Float64);
+ assert_eq!(infer_field_schema("-inf"), DataType::Float64);
assert_eq!(infer_field_schema("true"), DataType::Boolean);
assert_eq!(infer_field_schema("trUe"), DataType::Boolean);
assert_eq!(infer_field_schema("false"), DataType::Boolean);
Expand Down Expand Up @@ -2372,7 +2378,7 @@ mod tests {
fn test_buffered() {
let tests = [
("test/data/uk_cities.csv", false, 37),
- ("test/data/various_types.csv", true, 7),
+ ("test/data/various_types.csv", true, 10),
("test/data/decimal_test.csv", false, 10),
];

Expand Down
5 changes: 4 additions & 1 deletion arrow-csv/test/data/various_types.csv
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,7 @@ c_int|c_float|c_string|c_bool|c_date|c_datetime
4|4.4||false||
5|6.6|""|false|1990-01-01|1990-01-01T03:00:00
4|4e6||false||
- 4|4.0e-6||false||
+ 4|4.0e-6||false||
+ 6|NaN||false||
+ 7|inf||false||
+ 8|-inf||false||

0 comments on commit 7b057e1

Please sign in to comment.