Skip to content

Commit

Permalink
parquet: Add row_groups_matched_{statistics,bloom_filter} statistics (#…
Browse files Browse the repository at this point in the history
…9640)

* test_row_group_prune: Display which assertion failed

* Add row_groups_matched_{statistics,bloom_filter} statistics

This helps diagnose whether a Bloom filter mismatches (because of a high
false-positive probability caused by suboptimal tuning) or is not used at all.
  • Loading branch information
progval authored Mar 18, 2024
1 parent 2499245 commit e53eb03
Show file tree
Hide file tree
Showing 5 changed files with 160 additions and 5 deletions.
14 changes: 14 additions & 0 deletions datafusion/core/src/datasource/physical_plan/parquet/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,12 @@ use crate::physical_plan::metrics::{
pub struct ParquetFileMetrics {
/// Number of times the predicate could not be evaluated
pub predicate_evaluation_errors: Count,
/// Number of row groups whose bloom filters were checked and matched
pub row_groups_matched_bloom_filter: Count,
/// Number of row groups pruned by bloom filters
pub row_groups_pruned_bloom_filter: Count,
/// Number of row groups whose statistics were checked and matched
pub row_groups_matched_statistics: Count,
/// Number of row groups pruned by statistics
pub row_groups_pruned_statistics: Count,
/// Total number of bytes scanned
Expand All @@ -56,10 +60,18 @@ impl ParquetFileMetrics {
.with_new_label("filename", filename.to_string())
.counter("predicate_evaluation_errors", partition);

let row_groups_matched_bloom_filter = MetricBuilder::new(metrics)
.with_new_label("filename", filename.to_string())
.counter("row_groups_matched_bloom_filter", partition);

let row_groups_pruned_bloom_filter = MetricBuilder::new(metrics)
.with_new_label("filename", filename.to_string())
.counter("row_groups_pruned_bloom_filter", partition);

let row_groups_matched_statistics = MetricBuilder::new(metrics)
.with_new_label("filename", filename.to_string())
.counter("row_groups_matched_statistics", partition);

let row_groups_pruned_statistics = MetricBuilder::new(metrics)
.with_new_label("filename", filename.to_string())
.counter("row_groups_pruned_statistics", partition);
Expand All @@ -85,7 +97,9 @@ impl ParquetFileMetrics {

Self {
predicate_evaluation_errors,
row_groups_matched_bloom_filter,
row_groups_pruned_bloom_filter,
row_groups_matched_statistics,
row_groups_pruned_statistics,
bytes_scanned,
pushdown_rows_filtered,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ pub(crate) fn prune_row_groups_by_statistics(
metrics.predicate_evaluation_errors.add(1);
}
}
metrics.row_groups_matched_statistics.add(1);
}

filtered.push(idx)
Expand Down Expand Up @@ -166,6 +167,9 @@ pub(crate) async fn prune_row_groups_by_bloom_filters<
if prune_group {
metrics.row_groups_pruned_bloom_filter.add(1);
} else {
if !stats.column_sbbf.is_empty() {
metrics.row_groups_matched_bloom_filter.add(1);
}
filtered.push(*idx);
}
}
Expand Down
17 changes: 17 additions & 0 deletions datafusion/core/tests/parquet/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,16 +117,33 @@ impl TestOutput {
self.metric_value("predicate_evaluation_errors")
}

/// Reads the `row_groups_matched_bloom_filter` metric.
///
/// The lookup is delegated to `metric_value`, and its `Option` result is
/// returned unchanged.
fn row_groups_matched_bloom_filter(&self) -> Option<usize> {
    let metric_name = "row_groups_matched_bloom_filter";
    self.metric_value(metric_name)
}

/// Reads the `row_groups_pruned_bloom_filter` metric.
///
/// The lookup is delegated to `metric_value`, and its `Option` result is
/// returned unchanged.
fn row_groups_pruned_bloom_filter(&self) -> Option<usize> {
    let metric_name = "row_groups_pruned_bloom_filter";
    self.metric_value(metric_name)
}

/// Reads the `row_groups_matched_statistics` metric.
///
/// The lookup is delegated to `metric_value`, and its `Option` result is
/// returned unchanged.
fn row_groups_matched_statistics(&self) -> Option<usize> {
    let metric_name = "row_groups_matched_statistics";
    self.metric_value(metric_name)
}

/// Reads the `row_groups_pruned_statistics` metric.
///
/// The lookup is delegated to `metric_value`, and its `Option` result is
/// returned unchanged.
fn row_groups_pruned_statistics(&self) -> Option<usize> {
    let metric_name = "row_groups_pruned_statistics";
    self.metric_value(metric_name)
}

/// Sum of the "matched" counters for bloom filters and statistics.
///
/// Returns `Some(total)` only when both `row_groups_matched_bloom_filter`
/// and `row_groups_matched_statistics` yield a value; if either is `None`,
/// the result is `None`.
///
/// NOTE(review): this is a plain sum of two counters, so a row group that
/// is counted by both would contribute twice — confirm that is intended.
fn row_groups_matched(&self) -> Option<usize> {
    // Both lookups are performed up front, before the results are combined.
    let by_bloom_filter = self.row_groups_matched_bloom_filter();
    let by_statistics = self.row_groups_matched_statistics();

    match (by_bloom_filter, by_statistics) {
        (Some(bloom), Some(stats)) => Some(bloom + stats),
        _ => None,
    }
}

/// The number of row_groups pruned
fn row_groups_pruned(&self) -> Option<usize> {
self.row_groups_pruned_bloom_filter()
Expand Down
Loading

0 comments on commit e53eb03

Please sign in to comment.