Skip to content

Commit

Permalink
PARQUET-1494: [C++] Recognize statistics built with UNSIGNED sort ord…
Browse files Browse the repository at this point in the history
…er by parquet-mr 1.10.0 onwards

Fixes the issue with min-max statistics built by parquet-mr 1.10+:
https://issues.apache.org/jira/browse/PARQUET-1494

Author: Ildar Musin <[email protected]>

Closes apache#3441 from zilder/fix_parquet_1494 and squashes the following commits:

c5ba5fc <Ildar Musin> Test case for min-max statistics on binary column built with parquet-mr
029f730 <Ildar Musin> PARQUET-1494: Recognize statistics built with UNSIGNED sort order by parquet-mr 1.10.0 onwards
  • Loading branch information
zilder authored and wesm committed Jan 23, 2019
1 parent d15cb45 commit 085034f
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 3 deletions.
11 changes: 9 additions & 2 deletions cpp/src/parquet/metadata.cc
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@ const ApplicationVersion& ApplicationVersion::PARQUET_CPP_FIXED_STATS_VERSION()
return version;
}

const ApplicationVersion& ApplicationVersion::PARQUET_MR_FIXED_STATS_VERSION() {
static ApplicationVersion version("parquet-mr", 1, 10, 0);
return version;
}

std::string ParquetVersionToString(ParquetVersion::type ver) {
switch (ver) {
case ParquetVersion::PARQUET_1_0:
Expand Down Expand Up @@ -547,8 +552,10 @@ bool ApplicationVersion::VersionEq(const ApplicationVersion& other_version) cons
bool ApplicationVersion::HasCorrectStatistics(Type::type col_type,
EncodedStatistics& statistics,
SortOrder::type sort_order) const {
// Parquet cpp version 1.3.0 onwards stats are computed correctly for all types
if ((application_ != "parquet-cpp") || (VersionLt(PARQUET_CPP_FIXED_STATS_VERSION()))) {
// parquet-cpp version 1.3.0 and parquet-mr 1.10.0 onwards stats are computed
// correctly for all types
if ((application_ == "parquet-cpp" && VersionLt(PARQUET_CPP_FIXED_STATS_VERSION())) ||
(application_ == "parquet-mr" && VersionLt(PARQUET_MR_FIXED_STATS_VERSION()))) {
// Only SIGNED are valid unless max and min are the same
// (in which case the sort order does not matter)
bool max_equals_min = statistics.has_min && statistics.has_max
Expand Down
1 change: 1 addition & 0 deletions cpp/src/parquet/metadata.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ class PARQUET_EXPORT ApplicationVersion {
static const ApplicationVersion& PARQUET_251_FIXED_VERSION();
static const ApplicationVersion& PARQUET_816_FIXED_VERSION();
static const ApplicationVersion& PARQUET_CPP_FIXED_STATS_VERSION();
static const ApplicationVersion& PARQUET_MR_FIXED_STATS_VERSION();
// Regular expression for the version format
// major . minor . patch unknown - prerelease.x + build info
// Eg: 1.5.0ab-cdh5.5.0+cd
Expand Down
28 changes: 28 additions & 0 deletions cpp/src/parquet/statistics-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -772,5 +772,33 @@ TEST(TestStatisticsDoubleNaN, NaNValues) {
ASSERT_EQ(min, -3.0);
ASSERT_EQ(max, 4.0);
}

// Test statistics for binary column with UNSIGNED sort order
TEST(TestStatisticsMinMax, Unsigned) {
std::string dir_string(test::get_data_dir());
std::stringstream ss;
ss << dir_string << "/binary.parquet";
auto path = ss.str();

// The file is generated by parquet-mr 1.10.0, the first version that
// supports correct statistics for binary data (see PARQUET-1025). It
// contains a single column of binary type. Data is just single byte values
// from 0x00 to 0x0B.
auto file_reader = ParquetFileReader::OpenFile(path);
auto rg_reader = file_reader->RowGroup(0);
auto metadata = rg_reader->metadata();
auto column_schema = metadata->schema()->Column(0);
ASSERT_EQ(SortOrder::UNSIGNED, column_schema->sort_order());

auto column_chunk = metadata->ColumnChunk(0);
ASSERT_TRUE(column_chunk->is_stats_set());

std::shared_ptr<RowGroupStatistics> stats = column_chunk->statistics();
ASSERT_TRUE(stats != NULL);
ASSERT_EQ(0, stats->null_count());
ASSERT_EQ(12, stats->num_values());
ASSERT_EQ(0x00, stats->EncodeMin()[0]);
ASSERT_EQ(0x0b, stats->EncodeMax()[0]);
}
} // namespace test
} // namespace parquet
2 changes: 1 addition & 1 deletion cpp/submodules/parquet-testing

0 comments on commit 085034f

Please sign in to comment.