Summary of this patch (free text before the first "diff --git" header is
skipped by patch tools):

  * src/compression.jl, src/decompression.jl
      * bzerror is called with the error code only.
      * The `error` parameter of startproc/process is renamed to `error_ref`;
        the new bodies call the `error(...)` function.
      * A failing compress_init!/decompress_init! now throws instead of
        returning :error.
      * process throws if stream.state is C_NULL.
      * avail_in/avail_out are clamped to typemax(Cuint), and the consumed and
        produced byte counts are computed from the clamped values.
      * Decompressor process reports BZ_UNEXPECTED_EOF when it gets BZ_OK with
        empty input while output space remains, and throws OutOfMemoryError
        on BZ_MEM_ERROR.
  * src/libbzip2.jl
      * Adds Base.showerror for BZ2Error and drops the stream argument of
        bzerror.
  * test/
      * Adds Aqua checks, test_reuse_encoder, truncated-stream, data-error and
        error-printing tests, plus a separate big-memory test file.

diff --git a/src/compression.jl b/src/compression.jl
index 32fb444..b6da111 100644
--- a/src/compression.jl
+++ b/src/compression.jl
@@ -60,43 +60,55 @@ function TranscodingStreams.finalize(codec::Bzip2Compressor)
     if codec.stream.state != C_NULL
         code = compress_end!(codec.stream)
         if code != BZ_OK
-            bzerror(codec.stream, code)
+            bzerror(code)
         end
     end
     return
 end
 
-function TranscodingStreams.startproc(codec::Bzip2Compressor, ::Symbol, error::Error)
+function TranscodingStreams.startproc(codec::Bzip2Compressor, ::Symbol, error_ref::Error)
     if codec.stream.state != C_NULL
         code = compress_end!(codec.stream)
         if code != BZ_OK
-            error[] = BZ2Error(code)
+            error_ref[] = BZ2Error(code)
             return :error
         end
     end
     code = compress_init!(codec.stream, codec.blocksize100k, codec.verbosity, codec.workfactor)
-    if code != BZ_OK
-        error[] = BZ2Error(code)
-        return :error
+    # errors in compress_init! do not require clean up, so just throw
+    if code == BZ_OK
+        return :ok
+    elseif code == BZ_CONFIG_ERROR
+        error("BZ_CONFIG_ERROR: libbzip2 has been mis-compiled")
+    elseif code == BZ_PARAM_ERROR
+        error("BZ_PARAM_ERROR: this must be checked in Bzip2Compressor constructor")
+    elseif code == BZ_MEM_ERROR
+        throw(OutOfMemoryError())
+    else
+        error("unexpected libbzip2 error code: $(code)")
     end
-    return :ok
 end
 
-function TranscodingStreams.process(codec::Bzip2Compressor, input::Memory, output::Memory, error::Error)
+function TranscodingStreams.process(codec::Bzip2Compressor, input::Memory, output::Memory, error_ref::Error)
     stream = codec.stream
+    if stream.state == C_NULL
+        error("startproc must be called before process")
+    end
     stream.next_in = input.ptr
-    stream.avail_in = input.size
+    avail_in = min(input.size, typemax(Cuint))
+    stream.avail_in = avail_in
     stream.next_out = output.ptr
-    stream.avail_out = output.size
+    avail_out = min(output.size, typemax(Cuint))
+    stream.avail_out = avail_out
     code = compress!(stream, input.size > 0 ? BZ_RUN : BZ_FINISH)
-    Δin = Int(input.size - stream.avail_in)
-    Δout = Int(output.size - stream.avail_out)
+    Δin = Int(avail_in - stream.avail_in)
+    Δout = Int(avail_out - stream.avail_out)
     if code == BZ_RUN_OK || code == BZ_FINISH_OK
         return Δin, Δout, :ok
     elseif code == BZ_STREAM_END
         return Δin, Δout, :end
     else
-        error[] = BZ2Error(code)
+        error_ref[] = BZ2Error(code)
         return Δin, Δout, :error
     end
 end
diff --git a/src/decompression.jl b/src/decompression.jl
index 38220f1..8582b73 100644
--- a/src/decompression.jl
+++ b/src/decompression.jl
@@ -48,43 +48,62 @@ function TranscodingStreams.finalize(codec::Bzip2Decompressor)
     if codec.stream.state != C_NULL
         code = decompress_end!(codec.stream)
         if code != BZ_OK
-            bzerror(codec.stream, code)
+            bzerror(code)
         end
     end
     return
 end
 
-function TranscodingStreams.startproc(codec::Bzip2Decompressor, ::Symbol, error::Error)
+function TranscodingStreams.startproc(codec::Bzip2Decompressor, ::Symbol, error_ref::Error)
     if codec.stream.state != C_NULL
         code = decompress_end!(codec.stream)
         if code != BZ_OK
-            error[] = BZ2Error(code)
+            error_ref[] = BZ2Error(code)
             return :error
         end
     end
     code = decompress_init!(codec.stream, codec.verbosity, codec.small)
-    if code != BZ_OK
-        error[] = BZ2Error(code)
-        return :error
+    # errors in decompress_init! do not require clean up, so just throw
+    if code == BZ_OK
+        return :ok
+    elseif code == BZ_CONFIG_ERROR
+        error("BZ_CONFIG_ERROR: libbzip2 has been mis-compiled")
+    elseif code == BZ_PARAM_ERROR
+        error("BZ_PARAM_ERROR: this must be checked in Bzip2Decompressor constructor")
+    elseif code == BZ_MEM_ERROR
+        throw(OutOfMemoryError())
+    else
+        error("unexpected libbzip2 error code: $(code)")
     end
-    return :ok
 end
 
-function TranscodingStreams.process(codec::Bzip2Decompressor, input::Memory, output::Memory, error::Error)
+function TranscodingStreams.process(codec::Bzip2Decompressor, input::Memory, output::Memory, error_ref::Error)
     stream = codec.stream
+    if stream.state == C_NULL
+        error("startproc must be called before process")
+    end
     stream.next_in = input.ptr
-    stream.avail_in = input.size
+    avail_in = min(input.size, typemax(Cuint))
+    stream.avail_in = avail_in
     stream.next_out = output.ptr
-    stream.avail_out = output.size
+    avail_out = min(output.size, typemax(Cuint))
+    stream.avail_out = avail_out
     code = decompress!(stream)
-    Δin = Int(input.size - stream.avail_in)
-    Δout = Int(output.size - stream.avail_out)
+    Δin = Int(avail_in - stream.avail_in)
+    Δout = Int(avail_out - stream.avail_out)
     if code == BZ_OK
-        return Δin, Δout, :ok
+        if iszero(input.size) && !iszero(stream.avail_out)
+            error_ref[] = BZ2Error(BZ_UNEXPECTED_EOF)
+            return Δin, Δout, :error
+        else
+            return Δin, Δout, :ok
+        end
     elseif code == BZ_STREAM_END
         return Δin, Δout, :end
+    elseif code == BZ_MEM_ERROR
+        throw(OutOfMemoryError())
     else
-        error[] = BZ2Error(code)
+        error_ref[] = BZ2Error(code)
         return Δin, Δout, :error
     end
 end
diff --git a/src/libbzip2.jl b/src/libbzip2.jl
index 5736bac..48343f2 100644
--- a/src/libbzip2.jl
+++ b/src/libbzip2.jl
@@ -186,7 +186,29 @@ struct BZ2Error <: Exception
     code::Cint
 end
 
-function bzerror(stream::BZStream, code::Cint)
+function Base.showerror(io::IO, err::BZ2Error)
+    code = err.code
+    print(io, "BZ2Error: ")
+    if code == BZ_CONFIG_ERROR
+        print(io, "BZ_CONFIG_ERROR: the library has been improperly compiled on your platform")
+    elseif code == BZ_SEQUENCE_ERROR
+        print(io, "BZ_SEQUENCE_ERROR: invalid function sequence, there is a bug in CodecBzip2")
+    elseif code == BZ_PARAM_ERROR
+        print(io, "BZ_PARAM_ERROR: function parameter is out of range, there is a bug in CodecBzip2")
+    elseif code == BZ_DATA_ERROR
+        print(io, "BZ_DATA_ERROR: a data integrity error is detected in the compressed stream")
+    elseif code == BZ_DATA_ERROR_MAGIC
+        print(io, "BZ_DATA_ERROR_MAGIC: the compressed stream doesn't begin with the right magic bytes")
+    elseif code == BZ_UNEXPECTED_EOF
+        print(io, "BZ_UNEXPECTED_EOF: the compressed stream may be truncated")
+    else
+        print(io, "unknown bzip2 error code: ")
+        print(io, code)
+    end
+    nothing
+end
+
+function bzerror(code::Cint)
     @assert code < 0
     throw(BZ2Error(code))
 end
diff --git a/test/Project.toml b/test/Project.toml
index 77ae80a..a86fb10 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,4 +1,5 @@
 [deps]
+Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
 CodecBzip2 = "523fee87-0ab8-5b00-afb7-3ecf72e48cfd"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 TestsForCodecPackages = "c2e61002-3542-480d-8b3c-5f05cc4f8554"
diff --git a/test/big-mem-tests.jl b/test/big-mem-tests.jl
new file mode 100644
index 0000000..0856182
--- /dev/null
+++ b/test/big-mem-tests.jl
@@ -0,0 +1,50 @@
+# This file contains tests that require a large amount of memory (at least 25 GB)
+# and take a long time to run. The tests are designed to check the
+# compression and decompression functionality of the CodecBzip2 package
+# with very large inputs. These tests are not run with CI
+
+using Test
+using CodecBzip2
+
+@testset "Big Memory Tests" begin
+    Sys.WORD_SIZE == 64 || error("tests require 64 bit word size")
+    @info "compressing zeros"
+    for n in (2^32 - 1, 2^32, 2^32 +1)
+        @info "compressing"
+        local c = transcode(Bzip2Compressor, zeros(UInt8, n))
+        @info "decompressing"
+        local u = transcode(Bzip2Decompressor, c)
+        c = nothing
+        all_zero = all(iszero, u)
+        len_n = length(u) == n
+        @test all_zero && len_n
+    end
+
+    @info "compressing random"
+    for n in (2^32 - 1, 2^32, 2^32 +1)
+        local u = rand(UInt8, n)
+        @info "compressing"
+        local c = transcode(Bzip2Compressor, u)
+        @info "decompressing"
+        local u2 = transcode(Bzip2Decompressor, c)
+        c = nothing
+        are_equal = u == u2
+        @test are_equal
+    end
+
+    @info "decompressing huge concatenation"
+    uncompressed = rand(UInt8, 2^20)
+    @info "compressing"
+    compressed = transcode(Bzip2Compressor, uncompressed)
+    total_compressed = UInt8[]
+    sizehint!(total_compressed, length(compressed)*2^12)
+    total_uncompressed = UInt8[]
+    sizehint!(total_uncompressed, length(uncompressed)*2^12)
+    for i in 1:2^12
+        append!(total_uncompressed, uncompressed)
+        append!(total_compressed, compressed)
+    end
+    @test length(total_compressed) > 2^32
+    @info "decompressing"
+    @test total_uncompressed == transcode(Bzip2Decompressor, total_compressed)
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index d01e397..e49198e 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,12 +1,16 @@
 using CodecBzip2
 using Test
+using Aqua: Aqua
 import TranscodingStreams
 using TestsForCodecPackages:
     test_roundtrip_read,
     test_roundtrip_write,
     test_roundtrip_transcode,
     test_roundtrip_lines,
-    test_roundtrip_seekstart
+    test_roundtrip_seekstart,
+    test_reuse_encoder
+
+Aqua.test_all(CodecBzip2)
 
 @testset "Bzip2 Codec" begin
     codec = Bzip2Compressor()
@@ -41,9 +45,45 @@ using TestsForCodecPackages:
     test_roundtrip_lines(Bzip2CompressorStream, Bzip2DecompressorStream)
     test_roundtrip_seekstart(Bzip2CompressorStream, Bzip2DecompressorStream)
     test_roundtrip_transcode(Bzip2Compressor, Bzip2Decompressor)
+    test_reuse_encoder(Bzip2Compressor, Bzip2Decompressor)
 
     @test_throws ArgumentError Bzip2Compressor(blocksize100k=10)
     @test_throws ArgumentError Bzip2Compressor(workfactor=251)
     @test_throws ArgumentError Bzip2Compressor(verbosity=5)
     @test_throws ArgumentError Bzip2Decompressor(verbosity=5)
+
+    @testset "unexpected end of stream errors" begin
+        # issue #32
+        local uncompressed = rand(UInt8, 1000)
+        local compressed = transcode(Bzip2Compressor, uncompressed)
+        for i in 0:length(compressed)-1
+            @test_throws CodecBzip2.BZ2Error(CodecBzip2.BZ_UNEXPECTED_EOF) transcode(Bzip2Decompressor, compressed[1:i])
+        end
+        @test transcode(Bzip2Decompressor, compressed) == uncompressed
+        # compressing empty vector should still work
+        @test transcode(Bzip2Decompressor, transcode(Bzip2Compressor, UInt8[])) == UInt8[]
+    end
+    @testset "data errors" begin
+        @test_throws CodecBzip2.BZ2Error(CodecBzip2.BZ_DATA_ERROR_MAGIC) transcode(Bzip2Decompressor, zeros(UInt8, 10))
+        local uncompressed = rand(UInt8, 1000)
+        local compressed = transcode(Bzip2Compressor, uncompressed)
+        compressed[70] ⊻= 0x01
+        @test_throws CodecBzip2.BZ2Error(CodecBzip2.BZ_DATA_ERROR) transcode(Bzip2Decompressor, compressed)
+    end
+    @testset "error printing" begin
+        @test sprint(Base.showerror, CodecBzip2.BZ2Error(CodecBzip2.BZ_CONFIG_ERROR)) ==
+            "BZ2Error: BZ_CONFIG_ERROR: the library has been improperly compiled on your platform"
+        @test sprint(Base.showerror, CodecBzip2.BZ2Error(CodecBzip2.BZ_SEQUENCE_ERROR)) ==
+            "BZ2Error: BZ_SEQUENCE_ERROR: invalid function sequence, there is a bug in CodecBzip2"
+        @test sprint(Base.showerror, CodecBzip2.BZ2Error(CodecBzip2.BZ_PARAM_ERROR)) ==
+            "BZ2Error: BZ_PARAM_ERROR: function parameter is out of range, there is a bug in CodecBzip2"
+        @test sprint(Base.showerror, CodecBzip2.BZ2Error(CodecBzip2.BZ_UNEXPECTED_EOF)) ==
+            "BZ2Error: BZ_UNEXPECTED_EOF: the compressed stream may be truncated"
+        @test sprint(Base.showerror, CodecBzip2.BZ2Error(CodecBzip2.BZ_DATA_ERROR)) ==
+            "BZ2Error: BZ_DATA_ERROR: a data integrity error is detected in the compressed stream"
+        @test sprint(Base.showerror, CodecBzip2.BZ2Error(CodecBzip2.BZ_DATA_ERROR_MAGIC)) ==
+            "BZ2Error: BZ_DATA_ERROR_MAGIC: the compressed stream doesn't begin with the right magic bytes"
+        @test sprint(Base.showerror, CodecBzip2.BZ2Error(-100)) ==
+            "BZ2Error: unknown bzip2 error code: -100"
+    end
 end