Skip to content

Commit

Permalink
apacheGH-40633: [C++][Python] Fix ORC crash when file contains unknow…
Browse files Browse the repository at this point in the history
…n timezone (apache#45051)

### Rationale for this change

If the timezone database is present on the system, but does not contain a timezone referenced in a ORC file, the ORC reader will crash with an uncaught C++ exception.

This can happen for example on Ubuntu 24.04 where some timezone aliases have been removed from the main `tzdata` package to a `tzdata-legacy` package. If `tzdata-legacy` is not installed, trying to read a ORC file that references e.g. the "US/Pacific" timezone would crash.

Here is a backtrace excerpt:
```
#12 0x00007f1a3ce23a55 in std::terminate() () from /lib/x86_64-linux-gnu/libstdc++.so.6
#13 0x00007f1a3ce39391 in __cxa_throw () from /lib/x86_64-linux-gnu/libstdc++.so.6
apache#14 0x00007f1a3f4accc4 in orc::loadTZDB(std::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) ()
   from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900
apache#15 0x00007f1a3f4ad392 in std::call_once<orc::LazyTimezone::getImpl() const::{lambda()#1}>(std::once_flag&, orc::LazyTimezone::getImpl() const::{lambda()#1}&&)::{lambda()#2}::_FUN() () from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900
apache#16 0x00007f1a4298bec3 in __pthread_once_slow (once_control=0xa5ca7c8, init_routine=0x7f1a3ce69420 <__once_proxy>) at ./nptl/pthread_once.c:116
apache#17 0x00007f1a3f4a9ad0 in orc::LazyTimezone::getEpoch() const ()
   from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900
apache#18 0x00007f1a3f4e76b1 in orc::TimestampColumnReader::TimestampColumnReader(orc::Type const&, orc::StripeStreams&, bool) ()
   from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900
apache#19 0x00007f1a3f4e84ad in orc::buildReader(orc::Type const&, orc::StripeStreams&, bool, bool, bool) ()
   from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900
apache#20 0x00007f1a3f4e8dd7 in orc::StructColumnReader::StructColumnReader(orc::Type const&, orc::StripeStreams&, bool, bool) ()
   from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900
apache#21 0x00007f1a3f4e8532 in orc::buildReader(orc::Type const&, orc::StripeStreams&, bool, bool, bool) ()
   from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900
apache#22 0x00007f1a3f4925e9 in orc::RowReaderImpl::startNextStripe() ()
   from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900
apache#23 0x00007f1a3f492c9d in orc::RowReaderImpl::next(orc::ColumnVectorBatch&) ()
   from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900
apache#24 0x00007f1a3e6b251f in arrow::adapters::orc::ORCFileReader::Impl::ReadBatch(orc::RowReaderOptions const&, std::shared_ptr<arrow::Schema> const&, long) ()
   from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900
```

### What changes are included in this PR?

Catch C++ exceptions when iterating ORC batches instead of letting them slip through.

### Are these changes tested?

Yes.

### Are there any user-facing changes?

No.
* GitHub Issue: apache#40633

Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
pitrou authored Dec 18, 2024
1 parent 5ec8b64 commit 1e5d6e5
Showing 2 changed files with 65 additions and 9 deletions.
16 changes: 8 additions & 8 deletions cpp/src/arrow/adapters/orc/adapter.cc
Original file line number Diff line number Diff line change
@@ -145,28 +145,29 @@ class OrcStripeReader : public RecordBatchReader {

Status ReadNext(std::shared_ptr<RecordBatch>* out) override {
std::unique_ptr<liborc::ColumnVectorBatch> batch;
ORC_CATCH_NOT_OK(batch = row_reader_->createRowBatch(batch_size_));
std::unique_ptr<RecordBatchBuilder> builder;

ORC_BEGIN_CATCH_NOT_OK
batch = row_reader_->createRowBatch(batch_size_);

const liborc::Type& type = row_reader_->getSelectedType();
if (!row_reader_->next(*batch)) {
out->reset();
return Status::OK();
}

std::unique_ptr<RecordBatchBuilder> builder;
ARROW_ASSIGN_OR_RAISE(builder,
RecordBatchBuilder::Make(schema_, pool_, batch->numElements));

// The top-level type must be a struct to read into an arrow table
const auto& struct_batch = checked_cast<liborc::StructVectorBatch&>(*batch);

for (int i = 0; i < builder->num_fields(); i++) {
RETURN_NOT_OK(AppendBatch(type.getSubtype(i), struct_batch.fields[i], 0,
batch->numElements, builder->GetField(i)));
}
ORC_END_CATCH_NOT_OK

ARROW_ASSIGN_OR_RAISE(*out, builder->Flush());
return Status::OK();
return builder->Flush().Value(out);
}

private:
@@ -470,15 +471,13 @@ class ORCFileReader::Impl {
int64_t nrows) {
std::unique_ptr<liborc::RowReader> row_reader;
std::unique_ptr<liborc::ColumnVectorBatch> batch;
std::unique_ptr<RecordBatchBuilder> builder;

ORC_BEGIN_CATCH_NOT_OK
row_reader = reader_->createRowReader(opts);
batch = row_reader->createRowBatch(std::min(nrows, kReadRowsBatch));
ORC_END_CATCH_NOT_OK

std::unique_ptr<RecordBatchBuilder> builder;
ARROW_ASSIGN_OR_RAISE(builder, RecordBatchBuilder::Make(schema, pool_, nrows));

// The top-level type must be a struct to read into an arrow table
const auto& struct_batch = checked_cast<liborc::StructVectorBatch&>(*batch);

@@ -489,6 +488,7 @@ class ORCFileReader::Impl {
batch->numElements, builder->GetField(i)));
}
}
ORC_END_CATCH_NOT_OK

return builder->Flush();
}
58 changes: 57 additions & 1 deletion python/pyarrow/tests/test_orc.py
Original file line number Diff line number Diff line change
@@ -15,9 +15,14 @@
# specific language governing permissions and limitations
# under the License.

import pytest
import decimal
import datetime
from pathlib import Path
import shutil
import subprocess
import sys

import pytest

import pyarrow as pa
from pyarrow import fs
@@ -140,6 +145,57 @@ def test_example_using_json(filename, datadir):
check_example_file(path, table, need_fix=True)


def test_timezone_database_absent(datadir):
# Example file relies on the timezone "US/Pacific". It should gracefully
# fail, not crash, if the timezone database is not found.
path = datadir / 'TestOrcFile.testDate1900.orc'
code = f"""if 1:
import os
os.environ['TZDIR'] = '/tmp/non_existent'
from pyarrow import orc
try:
orc_file = orc.ORCFile({str(path)!r})
orc_file.read()
except Exception as e:
assert "time zone database" in str(e).lower(), e
else:
assert False, "Should have raised exception"
"""
subprocess.run([sys.executable, "-c", code], check=True)


def test_timezone_absent(datadir, tmpdir):
# Example file relies on the timezone "US/Pacific". It should gracefully
# fail, not crash, if the timezone database is present but the timezone
# is not found (GH-40633).
source_tzdir = Path('/usr/share/zoneinfo')
if not source_tzdir.exists():
pytest.skip(f"Test needs timezone database in {source_tzdir}")
tzdir = Path(tmpdir / 'zoneinfo')
try:
shutil.copytree(source_tzdir, tzdir, symlinks=True)
except OSError as e:
pytest.skip(f"Failed to copy timezone database: {e}")
(tzdir / 'US' / 'Pacific').unlink(missing_ok=True)

path = datadir / 'TestOrcFile.testDate1900.orc'
code = f"""if 1:
import os
os.environ['TZDIR'] = {str(tzdir)!r}
from pyarrow import orc
orc_file = orc.ORCFile({str(path)!r})
try:
orc_file.read()
except Exception as e:
assert "zoneinfo/US/Pacific" in str(e), e
else:
assert False, "Should have raised exception"
"""
subprocess.run([sys.executable, "-c", code], check=True)


def test_orcfile_empty(datadir):
from pyarrow import orc

0 comments on commit 1e5d6e5

Please sign in to comment.