Skip to content

Commit

Permalink
removed encodings, added statistics
Browse files Browse the repository at this point in the history
  • Loading branch information
nikhilsinhaparseable committed Feb 3, 2025
1 parent ca87b28 commit 53415ca
Show file tree
Hide file tree
Showing 2 changed files with 97 additions and 26 deletions.
96 changes: 96 additions & 0 deletions src/catalog/column.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,24 +26,40 @@ use parquet::file::statistics::Statistics;
/// Boolean statistics: value bounds plus distinct/null counts and exactness flags.
///
/// Populated from `Statistics::Boolean` by the `TryFrom<&Statistics>` impl in this file.
pub struct BoolType {
/// Lower bound; when two values are merged the smaller of the two bounds is kept.
pub min: bool,
/// Upper bound; when two values are merged the larger of the two bounds is kept.
pub max: bool,
/// Distinct-value count read from `distinct_count()`; merged by addition.
/// NOTE(review): adding distinct counts over-counts values present on both
/// sides, so a merged value is at best an upper bound — confirm this is intended.
pub distinct_count: u64,
/// Null count read from `null_count_opt()`; merged by addition.
pub null_count: u64,
/// Copied from `max_is_exact()`; a merged value is exact only if both inputs were.
pub is_max_value_exact: bool,
/// Copied from `min_is_exact()`; a merged value is exact only if both inputs were.
pub is_min_value_exact: bool,
}

/// Floating-point statistics: value bounds plus distinct/null counts and exactness flags.
///
/// Populated by the `TryFrom<&Statistics>` impl in this file from both
/// `Statistics::Float` (values cast to `f64`) and `Statistics::Double`.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct Float64Type {
/// Lower bound; merged with `f64::min`.
pub min: f64,
/// Upper bound; merged with `f64::max`.
pub max: f64,
/// Distinct-value count read from `distinct_count()`; merged by addition.
/// NOTE(review): adding distinct counts over-counts values present on both
/// sides, so a merged value is at best an upper bound — confirm this is intended.
pub distinct_count: u64,
/// Null count read from `null_count_opt()`; merged by addition.
pub null_count: u64,
/// Copied from `max_is_exact()`; a merged value is exact only if both inputs were.
pub is_max_value_exact: bool,
/// Copied from `min_is_exact()`; a merged value is exact only if both inputs were.
pub is_min_value_exact: bool,
}

/// Integer statistics: value bounds plus distinct/null counts and exactness flags.
///
/// Populated by the `TryFrom<&Statistics>` impl in this file from
/// `Statistics::Int32` (cast to `i64`), `Statistics::Int64` and
/// `Statistics::Int96` (converted with `to_i64()`).
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct Int64Type {
/// Lower bound; when two values are merged the smaller of the two bounds is kept.
pub min: i64,
/// Upper bound; when two values are merged the larger of the two bounds is kept.
pub max: i64,
/// Distinct-value count read from `distinct_count()`; merged by addition.
/// NOTE(review): adding distinct counts over-counts values present on both
/// sides, so a merged value is at best an upper bound — confirm this is intended.
pub distinct_count: u64,
/// Null count read from `null_count_opt()`; merged by addition.
pub null_count: u64,
/// Copied from `max_is_exact()`; a merged value is exact only if both inputs were.
pub is_max_value_exact: bool,
/// Copied from `min_is_exact()`; a merged value is exact only if both inputs were.
pub is_min_value_exact: bool,
}

/// String statistics: value bounds plus distinct/null counts and exactness flags.
///
/// Populated by the `TryFrom<&Statistics>` impl in this file from
/// `Statistics::ByteArray` and `Statistics::FixedLenByteArray`, whose bounds
/// are decoded with `as_utf8()`.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct Utf8Type {
/// Lower bound; when two values are merged the smaller string (by `Ord`) is kept.
pub min: String,
/// Upper bound; when two values are merged the larger string (by `Ord`) is kept.
pub max: String,
/// Distinct-value count read from `distinct_count()`; merged by addition.
/// NOTE(review): adding distinct counts over-counts values present on both
/// sides, so a merged value is at best an upper bound — confirm this is intended.
pub distinct_count: u64,
/// Null count read from `null_count_opt()`; merged by addition.
pub null_count: u64,
/// Copied from `max_is_exact()`; a merged value is exact only if both inputs were.
pub is_max_value_exact: bool,
/// Copied from `min_is_exact()`; a merged value is exact only if both inputs were.
pub is_min_value_exact: bool,
}

// Typed statistics are a typed variant of statistics
Expand All @@ -64,24 +80,40 @@ impl TypedStatistics {
TypedStatistics::Bool(BoolType {
min: min(this.min, other.min),
max: max(this.max, other.max),
distinct_count: this.distinct_count + other.distinct_count,
null_count: this.null_count + other.null_count,
is_max_value_exact: this.is_max_value_exact && other.is_max_value_exact,
is_min_value_exact: this.is_min_value_exact && other.is_min_value_exact,
})
}
(TypedStatistics::Float(this), TypedStatistics::Float(other)) => {
TypedStatistics::Float(Float64Type {
min: this.min.min(other.min),
max: this.max.max(other.max),
distinct_count: this.distinct_count + other.distinct_count,
null_count: this.null_count + other.null_count,
is_max_value_exact: this.is_max_value_exact && other.is_max_value_exact,
is_min_value_exact: this.is_min_value_exact && other.is_min_value_exact,
})
}
(TypedStatistics::Int(this), TypedStatistics::Int(other)) => {
TypedStatistics::Int(Int64Type {
min: min(this.min, other.min),
max: max(this.max, other.max),
distinct_count: this.distinct_count + other.distinct_count,
null_count: this.null_count + other.null_count,
is_max_value_exact: this.is_max_value_exact && other.is_max_value_exact,
is_min_value_exact: this.is_min_value_exact && other.is_min_value_exact,
})
}
(TypedStatistics::String(this), TypedStatistics::String(other)) => {
TypedStatistics::String(Utf8Type {
min: min(this.min, other.min),
max: max(this.max, other.max),
distinct_count: this.distinct_count + other.distinct_count,
null_count: this.null_count + other.null_count,
is_max_value_exact: this.is_max_value_exact && other.is_max_value_exact,
is_min_value_exact: this.is_min_value_exact && other.is_min_value_exact,
})
}
_ => panic!("Cannot update wrong types"),
Expand Down Expand Up @@ -146,26 +178,74 @@ impl TryFrom<&Statistics> for TypedStatistics {
Statistics::Boolean(stats) => TypedStatistics::Bool(BoolType {
min: *stats.min_opt().expect("Boolean stats min not set"),
max: *stats.max_opt().expect("Boolean stats max not set"),
distinct_count: stats
.distinct_count()
.expect("Boolean stats distinct count not set"),
null_count: stats
.null_count_opt()
.expect("Boolean stats null count not set"),
is_max_value_exact: stats.max_is_exact(),
is_min_value_exact: stats.min_is_exact(),
}),
// Int32 statistics are widened losslessly into the shared 64-bit integer form.
// Each `expect` panics if the corresponding field is absent from the statistics;
// the messages name the Int32 variant so a panic points at the right column type.
Statistics::Int32(stats) => TypedStatistics::Int(Int64Type {
    min: i64::from(*stats.min_opt().expect("Int32 stats min not set")),
    max: i64::from(*stats.max_opt().expect("Int32 stats max not set")),
    distinct_count: stats
        .distinct_count()
        .expect("Int32 stats distinct count not set"),
    null_count: stats
        .null_count_opt()
        .expect("Int32 stats null count not set"),
    is_max_value_exact: stats.max_is_exact(),
    is_min_value_exact: stats.min_is_exact(),
}),
// Int64 statistics map directly onto `Int64Type`.
// Each `expect` panics if the corresponding field is absent from the statistics;
// the messages name the Int64 variant so a panic points at the right column type.
Statistics::Int64(stats) => TypedStatistics::Int(Int64Type {
    min: *stats.min_opt().expect("Int64 stats min not set"),
    max: *stats.max_opt().expect("Int64 stats max not set"),
    distinct_count: stats
        .distinct_count()
        .expect("Int64 stats distinct count not set"),
    null_count: stats
        .null_count_opt()
        .expect("Int64 stats null count not set"),
    is_max_value_exact: stats.max_is_exact(),
    is_min_value_exact: stats.min_is_exact(),
}),
// Int96 bounds are converted with `to_i64()` into the shared 64-bit integer form.
// Each `expect` panics if the corresponding field is absent from the statistics;
// the messages name the Int96 variant so a panic points at the right column type.
Statistics::Int96(stats) => TypedStatistics::Int(Int64Type {
    min: stats.min_opt().expect("Int96 stats min not set").to_i64(),
    max: stats.max_opt().expect("Int96 stats max not set").to_i64(),
    distinct_count: stats
        .distinct_count()
        .expect("Int96 stats distinct count not set"),
    null_count: stats
        .null_count_opt()
        .expect("Int96 stats null count not set"),
    is_max_value_exact: stats.max_is_exact(),
    is_min_value_exact: stats.min_is_exact(),
}),
// Float32 statistics are widened losslessly into the shared `f64` form.
// Each `expect` panics if the corresponding field is absent from the statistics;
// the messages name the Float32 variant so a panic points at the right column type.
Statistics::Float(stats) => TypedStatistics::Float(Float64Type {
    min: f64::from(*stats.min_opt().expect("Float32 stats min not set")),
    max: f64::from(*stats.max_opt().expect("Float32 stats max not set")),
    distinct_count: stats
        .distinct_count()
        .expect("Float32 stats distinct count not set"),
    null_count: stats
        .null_count_opt()
        .expect("Float32 stats null count not set"),
    is_max_value_exact: stats.max_is_exact(),
    is_min_value_exact: stats.min_is_exact(),
}),
// Float64 statistics map directly onto `Float64Type`.
// Each `expect` panics if the corresponding field is absent from the statistics;
// the messages name the Float64 variant so a panic points at the right column type.
Statistics::Double(stats) => TypedStatistics::Float(Float64Type {
    min: *stats.min_opt().expect("Float64 stats min not set"),
    max: *stats.max_opt().expect("Float64 stats max not set"),
    distinct_count: stats
        .distinct_count()
        .expect("Float64 stats distinct count not set"),
    null_count: stats
        .null_count_opt()
        .expect("Float64 stats null count not set"),
    is_max_value_exact: stats.max_is_exact(),
    is_min_value_exact: stats.min_is_exact(),
}),
Statistics::ByteArray(stats) => TypedStatistics::String(Utf8Type {
min: stats
Expand All @@ -178,6 +258,14 @@ impl TryFrom<&Statistics> for TypedStatistics {
.expect("Utf8 stats max not set")
.as_utf8()?
.to_owned(),
distinct_count: stats
.distinct_count()
.expect("Boolean stats distinct count not set"),
null_count: stats
.null_count_opt()
.expect("Boolean stats null count not set"),
is_max_value_exact: stats.max_is_exact(),
is_min_value_exact: stats.min_is_exact(),
}),
Statistics::FixedLenByteArray(stats) => TypedStatistics::String(Utf8Type {
min: stats
Expand All @@ -190,6 +278,14 @@ impl TryFrom<&Statistics> for TypedStatistics {
.expect("Utf8 stats max not set")
.as_utf8()?
.to_owned(),
distinct_count: stats
.distinct_count()
.expect("Boolean stats distinct count not set"),
null_count: stats
.null_count_opt()
.expect("Boolean stats null count not set"),
is_max_value_exact: stats.max_is_exact(),
is_min_value_exact: stats.min_is_exact(),
}),
};

Expand Down
27 changes: 1 addition & 26 deletions src/storage/staging.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ use parquet::{
arrow::ArrowWriter,
basic::Encoding,
errors::ParquetError,
file::properties::{EnabledStatistics, WriterProperties, WriterPropertiesBuilder},
file::properties::{WriterProperties, WriterPropertiesBuilder},
format::SortingColumn,
schema::types::ColumnPath,
};
Expand Down Expand Up @@ -339,31 +339,6 @@ pub fn parquet_writer_props(
sorting_column_vec.push(sorting_column);
}

props = props
.set_dictionary_enabled(true)
.set_encoding(Encoding::PLAIN)
.set_statistics_enabled(EnabledStatistics::Chunk);

let url_column = ColumnPath::new(vec!["URL".to_string()]);
props = props
.set_column_dictionary_enabled(url_column.clone(), true)
.set_column_encoding(url_column.clone(), Encoding::DELTA_BYTE_ARRAY)
.set_column_statistics_enabled(url_column.clone(), EnabledStatistics::Chunk);

let event_time_column = ColumnPath::new(vec!["EventTime".to_string()]);
props = props
.set_column_encoding(event_time_column.clone(), Encoding::DELTA_BINARY_PACKED)
.set_column_statistics_enabled(event_time_column.clone(), EnabledStatistics::Chunk);

let user_id_column = ColumnPath::new(vec!["UserID".to_string()]);
props = props
.set_column_encoding(user_id_column.clone(), Encoding::DELTA_BINARY_PACKED)
.set_column_statistics_enabled(user_id_column.clone(), EnabledStatistics::Chunk);

let search_phrase_column = ColumnPath::new(vec!["SearchPhrase".to_string()]);
props = props
.set_column_encoding(search_phrase_column.clone(), Encoding::DELTA_BYTE_ARRAY)
.set_column_statistics_enabled(search_phrase_column.clone(), EnabledStatistics::Chunk);
props = props.set_sorting_columns(Some(sorting_column_vec));
props
}
Expand Down

0 comments on commit 53415ca

Please sign in to comment.