From fdc008d69f89b1acdadc5c447145cbc9f47502f7 Mon Sep 17 00:00:00 2001 From: Ben Wiederhake Date: Mon, 27 Nov 2023 01:25:03 +0100 Subject: [PATCH 1/2] Extract read_exactly as reusable utility function --- include/osmium/io/detail/pbf_input_format.hpp | 27 ++----------------- include/osmium/io/detail/read_write.hpp | 24 +++++++++++++++++ 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/include/osmium/io/detail/pbf_input_format.hpp b/include/osmium/io/detail/pbf_input_format.hpp index 0f106008..9595d52e 100644 --- a/include/osmium/io/detail/pbf_input_format.hpp +++ b/include/osmium/io/detail/pbf_input_format.hpp @@ -114,29 +114,6 @@ namespace osmium { return size; } - /** - * Read exactly size bytes from fd into buffer. - * - * @pre Value in size parameter must fit in unsigned int - * @returns true if size bytes could be read - * false if EOF was encountered - */ - bool read_exactly(char* buffer, std::size_t size) { - std::size_t to_read = size; - - while (to_read > 0) { - auto const read_size = osmium::io::detail::reliable_read(m_fd, buffer + (size - to_read), static_cast(to_read)); - if (read_size == 0) { // EOF - return false; - } - to_read -= read_size; - } - - *m_offset_ptr += size; - - return true; - } - /** * Read 4 bytes in network byte order from file. They contain * the length of the following BlobHeader. @@ -144,7 +121,7 @@ namespace osmium { uint32_t read_blob_header_size_from_file() { if (m_fd != -1) { std::array buffer{}; - if (!read_exactly(buffer.data(), buffer.size())) { + if (!osmium::io::detail::read_exactly(m_fd, buffer.data(), static_cast(buffer.size()))) { return 0; // EOF } return check_size(get_size_in_network_byte_order(buffer.data())); @@ -230,7 +207,7 @@ namespace osmium { if (m_fd != -1) { buffer.resize(size); - if (!read_exactly(&*buffer.begin(), size)) { + if (!osmium::io::detail::read_exactly(m_fd, &*buffer.begin(), static_cast(size))) { throw osmium::pbf_error{"unexpected EOF"}; } } else { diff --git a/include/osmium/io/detail/read_write.hpp b/include/osmium/io/detail/read_write.hpp index bde056b6..94267239 100644 --- a/include/osmium/io/detail/read_write.hpp +++ b/include/osmium/io/detail/read_write.hpp @@ -36,6 +36,7 @@ DEALINGS IN THE SOFTWARE. #include #include +#include #include #include #include @@ -201,6 +202,29 @@ namespace osmium { return nread; } + /** + * Read exactly size bytes from fd into buffer. In contrast to reliable_read, + * this function will continue reading until either EOF or an error is encountered. + * + * @pre buffer Buffer for data to be read. Must be at least size bytes long. + * @returns true if size bytes could be read + * false if EOF was encountered + */ + inline bool read_exactly(int fd, char* buffer, unsigned int size) { + unsigned int to_read = size; + + while (to_read > 0) { + auto const read_size = reliable_read(fd, buffer + (size - to_read), to_read); + if (read_size == 0) { // EOF + return false; + } + assert(read_size <= to_read); + to_read -= read_size; + } + + return true; + } + inline void reliable_fsync(const int fd) { #ifdef _MSC_VER osmium::detail::disable_invalid_parameter_handler diph; From 058af3e8c7f967ebc5ef6761c7524d1aa9852018 Mon Sep 17 00:00:00 2001 From: Ben Wiederhake Date: Mon, 27 Nov 2023 01:31:45 +0100 Subject: [PATCH 2/2] Implement and test util::file_seek --- include/osmium/util/file.hpp | 16 ++++++++++ test/CMakeLists.txt | 1 + test/t/io/test_file_seek.cpp | 61 ++++++++++++++++++++++++++++++++++++ 3 files changed, 78 insertions(+) create mode 100644 test/t/io/test_file_seek.cpp diff --git a/include/osmium/util/file.hpp b/include/osmium/util/file.hpp index 23c53db8..4b1450a8 100644 --- a/include/osmium/util/file.hpp +++ b/include/osmium/util/file.hpp @@ -222,6 +222,22 @@ namespace osmium { return static_cast(offset); } + /** + * Set current offset into file. + * + * @param fd Open file descriptor. + * @param offset Desired absolute offset into the file + */ + inline void file_seek(int fd, size_t offset) noexcept { +#ifdef _MSC_VER + osmium::detail::disable_invalid_parameter_handler diph; + // https://msdn.microsoft.com/en-us/library/1yee101t.aspx + _lseeki64(fd, static_cast<__int64>(offset), SEEK_SET); +#else + ::lseek(fd, offset, SEEK_SET); +#endif + } + /** * Check whether the file descriptor refers to a TTY. * diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index f5c696d2..6eb6b811 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -187,6 +187,7 @@ add_unit_test(io test_compression_factory) add_unit_test(io test_file_formats) add_unit_test(io test_nocompression) add_unit_test(io test_output_utils) +add_unit_test(io test_file_seek) add_unit_test(io test_string_table) add_unit_test(io test_bzip2 ENABLE_IF ${BZIP2_FOUND} LIBS ${BZIP2_LIBRARIES}) diff --git a/test/t/io/test_file_seek.cpp b/test/t/io/test_file_seek.cpp new file mode 100644 index 00000000..63a02b32 --- /dev/null +++ b/test/t/io/test_file_seek.cpp @@ -0,0 +1,61 @@ +#include "catch.hpp" + +#include "utils.hpp" + +#include +#include + +/** + * Can read and seek around in files. + */ +TEST_CASE("Seek and read in files") { + /* gzipped data contains very few repetitions in the binary file format, + * which makes it easy to identify any problems. */ + int fd = osmium::io::detail::open_for_reading(with_data_dir("t/io/data.osm.gz")); + struct seek_expectation { + size_t offset; + unsigned char eight_bytes[8]; + }; + const seek_expectation expectations[] = { + { 0x00, {0x1f, 0x8b, 0x08, 0x08, 0x19, 0x4a, 0x18, 0x54} }, + { 0x00, {0x1f, 0x8b, 0x08, 0x08, 0x19, 0x4a, 0x18, 0x54} }, /* repeat / jump back */ + { 0x21, {0x56, 0xc6, 0x18, 0xc3, 0xea, 0x6d, 0x4f, 0xe0} }, /* unaligned */ + { 0xb3, {0xcd, 0x0a, 0xe7, 0x8f, 0xde, 0x00, 0x00, 0x00} }, /* close to end */ + { 0x21, {0x56, 0xc6, 0x18, 0xc3, 0xea, 0x6d, 0x4f, 0xe0} }, /* "long" backward jump */ + }; + for (const auto& expect : expectations) { + char actual_eight_bytes[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + osmium::util::file_seek(fd, expect.offset); + bool did_actually_read = osmium::io::detail::read_exactly(fd, &actual_eight_bytes[0], 8); + REQUIRE(did_actually_read); + for (int i = 0; i < 8; ++i) { + REQUIRE(expect.eight_bytes[i] == static_cast(actual_eight_bytes[i])); + } + } +} + +TEST_CASE("Seek close to end of file") { + /* gzipped data contains very few repetitions in the binary file format, + * which makes it easy to identify any problems. */ + int fd = osmium::io::detail::open_for_reading(with_data_dir("t/io/data.osm.gz")); + REQUIRE(osmium::util::file_size(with_data_dir("t/io/data.osm.gz")) == 187); + char actual_eight_bytes[8] = {1, 1, 1, 1, 1, 1, 1, 1}; + osmium::util::file_seek(fd, 186); + auto actually_read = osmium::io::detail::reliable_read(fd, &actual_eight_bytes[0], 8); + REQUIRE(actually_read == 1); + REQUIRE(actual_eight_bytes[0] == 0); + REQUIRE(actual_eight_bytes[1] == 1); +} + +TEST_CASE("Seek to exact end of file") { + /* gzipped data contains very few repetitions in the binary file format, + * which makes it easy to identify any problems. */ + int fd = osmium::io::detail::open_for_reading(with_data_dir("t/io/data.osm.gz")); + REQUIRE(osmium::util::file_size(with_data_dir("t/io/data.osm.gz")) == 187); + char actual_eight_bytes[8] = {1, 1, 1, 1, 1, 1, 1, 1}; + osmium::util::file_seek(fd, 187); + auto actually_read = osmium::io::detail::reliable_read(fd, &actual_eight_bytes[0], 8); + REQUIRE(actually_read == 0); + REQUIRE(actual_eight_bytes[0] == 1); + REQUIRE(actual_eight_bytes[1] == 1); +}