Skip to content

Commit

Permalink
FlexUInt/FlexInt encoding/decoding
Browse files Browse the repository at this point in the history
  • Loading branch information
zslayton committed Dec 16, 2023
1 parent 783dd93 commit 925d210
Show file tree
Hide file tree
Showing 4 changed files with 806 additions and 0 deletions.
253 changes: 253 additions & 0 deletions src/lazy/binary/immutable_buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ use crate::binary::var_int::VarInt;
use crate::binary::var_uint::VarUInt;
use crate::lazy::binary::encoded_value::EncodedValue;
use crate::lazy::binary::raw::value::LazyRawBinaryValue;
use crate::lazy::encoder::binary::v1_1::flex_int::FlexInt;
use crate::lazy::encoder::binary::v1_1::flex_uint::FlexUInt;
use crate::result::IonFailure;
use crate::types::UInt;
use crate::{Int, IonError, IonResult, IonType};
Expand Down Expand Up @@ -157,6 +159,257 @@ impl<'a> ImmutableBuffer<'a> {
}
}

/// Reads a [`FlexInt`] from the buffer.
///
/// The encoding is first decoded as if it were an unsigned flex primitive; the resulting bits
/// are then sign-extended to produce the two's complement `i64` value.
pub fn read_flex_int(self) -> ParseResult<'a, FlexInt> {
    // FlexInt and FlexUInt share a layout, so decode the raw bits with the unsigned reader
    // (with sign extension support enabled for 10-byte encodings).
    let (flex_uint, remaining) = self.read_flex_primitive_as_uint("reading a FlexInt", true)?;
    let encoded_size = flex_uint.size_in_bytes();
    let raw_bits = flex_uint.value();

    // The sign lives in the most significant bit of the final encoded byte. Consult the
    // original input to find out whether that bit was set.
    let final_byte = self.bytes()[encoded_size - 1];
    let is_negative = (final_byte & 0b1000_0000) != 0;

    let signed_value = if is_negative {
        // An encoding shorter than 8 bytes leaves leading zero bits in the u64. For a negative
        // number, every one of those leading zeros (and the sign bit itself) must become a one.
        // An arithmetic right shift of `i64::MIN` smears the top bit down to the sign position.
        let sign_fill = i64::MIN >> raw_bits.leading_zeros();
        sign_fill | (raw_bits as i64)
    } else {
        // Non-negative values need no adjustment.
        raw_bits as i64
    };

    Ok((FlexInt::new(encoded_size, signed_value), remaining))
}

/// Reads a [`FlexUInt`] from the buffer.
///
/// Takes a branch-light fast path when at least 8 bytes of input are available and the
/// encoding's continuation bits fit in the first byte; otherwise defers to the general-purpose
/// (cold) reader.
#[inline]
pub fn read_flex_uint(self) -> ParseResult<'a, FlexUInt> {
    const FAST_PATH_MIN_INPUT_LEN: usize = 8;

    // The fast path needs a full 8 bytes to load into a u64, and it only understands encodings
    // whose continuation flags are all in the first byte (i.e. the first byte is non-zero).
    // One combined check keeps the number of branches in the common case to a minimum.
    let can_use_fast_path = self.len() >= FAST_PATH_MIN_INPUT_LEN && self.bytes()[0] != 0;
    if can_use_fast_path {
        let flex_uint = Self::read_small_flex_uint(self.bytes());
        let remaining = self.consume(flex_uint.size_in_bytes());
        return Ok((flex_uint, remaining));
    }

    // `read_flex_uint_slow` is #[cold], which discourages inlining it here and keeps this
    // method small enough for the fast path above to be inlined into callers.
    self.read_flex_uint_slow("reading a FlexUInt")
}

/// Helper method that decodes a [`FlexUInt`] whose continuation flags all live in the first
/// byte of `bytes` (that is, one with 7 or fewer bytes of magnitude).
// The caller is responsible for making sure `bytes` holds at least 8 bytes.
#[inline]
fn read_small_flex_uint(bytes: &[u8]) -> FlexUInt {
    debug_assert!(bytes.len() >= 8);
    // The count of trailing zero bits in the first byte, plus one for the terminating `1` flag,
    // is the total number of bytes in the encoding.
    let size_in_bytes = bytes[0].trailing_zeros() as usize + 1;
    let size_in_bits = 8 * size_in_bytes;
    // Build a mask that keeps only the bits belonging to this encoding. When the encoding
    // occupies the whole u64 (or more), a shift would overflow, so keep every bit instead.
    // TODO: Should this be a const cache of size_in_bits -> mask?
    let keep_mask = if size_in_bits < 64 {
        (1u64 << size_in_bits) - 1
    } else {
        u64::MAX
    };
    // Load the first 8 bytes of input as a little-endian u64. This usually picks up bytes that
    // follow the FlexUInt; the mask above throws those away.
    let mut word_bytes = [0u8; 8];
    word_bytes.copy_from_slice(&bytes[..8]);
    let raw_word = u64::from_le_bytes(word_bytes);
    // There is one continuation flag per encoded byte, so shifting right by the byte count
    // discards exactly the flag bits and leaves the magnitude.
    let magnitude = (raw_word & keep_mask) >> size_in_bytes;
    FlexUInt::new(size_in_bytes, magnitude)
}

/// Helper method that reads a [`FlexUInt`] from the buffer when input is potentially
/// incomplete or when the `FlexUInt` requires more than 8 bytes to represent.
///
/// `label` is forwarded to the general-purpose flex primitive reader along with
/// `support_sign_extension = false`, since the value being read is unsigned.
// Marked `cold` so the compiler can prioritize the common case instead.
#[cold]
fn read_flex_uint_slow(self, label: &'static str) -> ParseResult<'a, FlexUInt> {
// Delegate to the shared reader; `false` means the result is treated as unsigned.
self.read_flex_primitive_as_uint(label, false)
}

/// Helper method that reads a flex-encoded primitive from the buffer, returning it as a `FlexUInt`.
/// If an error occurs while reading, its description will include the supplied `label`.
///
/// The current implementation supports flex primitives with up to 64 bits of representation
/// beyond the leading header bits. Flex primitives requiring 10 bytes to encode have 70 magnitude
/// bits. If this value is unsigned (`support_sign_extension=false`), the six bits beyond the
/// supported 64 must all be `0`. If this value will later be re-interpreted as a signed value,
/// (`support_sign_extension=true`), then the six bits beyond the supported 64 must all be the
/// same as the 64th (highest supported) bit. This will allow encodings of up to 70 bits
/// to be correctly interpreted as positive, negative, or beyond the bounds of the 64 bit
/// limitation.
///
/// Returns an incomplete-data result if the buffer ends before the encoding does, and a
/// decoding error if the encoding is longer than 10 bytes or does not fit in 64 bits.
fn read_flex_primitive_as_uint(
    self,
    label: &'static str,
    support_sign_extension: bool,
) -> ParseResult<'a, FlexUInt> {
    // A closure that generates an incomplete data result at the current offset. This can be invoked
    // in a variety of early-return cases in this method.
    let incomplete = || IonResult::incomplete(label, self.offset);

    let bytes_available = self.bytes().len();
    if bytes_available == 0 {
        return incomplete();
    }

    // The `from_le_bytes` method we use to interpret data requires at least 8 bytes to be available.
    // There can be 1-2 bytes of header for a u64, leading to a maximum size of 10 bytes. If the input
    // buffer doesn't have at least 10 bytes, copy its contents into a temporary buffer that's
    // padded with 0 bytes. We round the size of the temp buffer to 16 as it produces slightly
    // nicer assembly than 10.
    let mut buffer = [0u8; 16];
    let bytes = if bytes_available >= 10 {
        self.bytes()
    } else {
        buffer[0..bytes_available].copy_from_slice(self.bytes());
        &buffer[..]
    };

    let first_byte = bytes[0];
    // If the first byte is not zero, all of the continuation flags are in the first byte and
    // the encoding is 8 or fewer bytes long (7 or fewer bytes of magnitude).
    if first_byte != 0 {
        let num_encoded_bytes = first_byte.trailing_zeros() as usize + 1;
        // Note that `bytes_available` is the number of bytes in the original unpadded input.
        // Our buffer may be 16 bytes long but only `bytes_available` of those are meaningful.
        if bytes_available < num_encoded_bytes {
            return incomplete();
        }
        // At this point, we know the original input contained all of the FlexUInt's bytes.
        // We can call `read_small_flex_uint` with the now-padded version of the buffer.
        // It will discard any bytes that are not part of the FlexUInt.
        let flex_uint = Self::read_small_flex_uint(bytes);
        let remaining = self.consume(flex_uint.size_in_bytes());
        return Ok((flex_uint, remaining));
    }

    // If we reach this point, the first byte was a zero. The encoding is at least 9 bytes in size.
    // We need to inspect the second byte to see how many more prefix bits there are.
    if bytes_available < 2 {
        return incomplete();
    }
    let second_byte = bytes[1];

    if second_byte & 0b11 == 0b00 {
        // The flag bits in the second byte indicate at least two more bytes, meaning the total
        // length is more than 10 bytes. We're not equipped to handle this. This path is shared
        // by signed and unsigned readers, so the message does not name a single encoding.
        return IonResult::decoding_error(
            "found a >10 byte Flex(U)Int too large to fit in 64 bits",
        );
    }

    if second_byte & 0b11 == 0b10 {
        // The lowest bit of the second byte is empty, the next lowest is not. The encoding
        // is 10 bytes.
        if bytes_available < 10 {
            return incomplete();
        }

        let flex_uint = self.read_10_byte_flex_primitive_as_uint(
            support_sign_extension,
            bytes,
            second_byte,
        )?;
        let remaining = self.consume(flex_uint.size_in_bytes());
        return Ok((flex_uint, remaining));
    }

    // The lowest bit of the second byte is set. The encoding is 9 bytes.
    if bytes_available < 9 {
        return incomplete();
    }
    // There are 57-63 bits of magnitude. We can decode the remaining bytes in a u64.
    let remaining_data = &bytes[1..9];
    // We know that the slice is 8 bytes long, so we can unwrap() the conversion to [u8; 8]
    // Lop off the lowest bit to discard the `end` flag.
    let value = u64::from_le_bytes(remaining_data[..8].try_into().unwrap()) >> 1;
    let flex_uint = FlexUInt::new(9, value);
    let remaining_input = self.consume(flex_uint.size_in_bytes());
    Ok((flex_uint, remaining_input))
}

/// Helper method that decodes a flex primitive whose encoding is exactly 10 bytes long.
///
/// A 10-byte encoding carries 70 bits after its 10 prefix bits, but only 64 can be stored.
/// This method verifies that the 6 bits that cannot be stored do not change the meaning of
/// the lower 64, and returns a decoding error if they do:
///   * If `support_sign_extension` is true, the 6 extra bits must all match the 64th bit.
///   * If `support_sign_extension` is false, the 6 extra bits must all be `0`.
///
/// `bytes` must contain at least 10 bytes, and `second_byte` must be `bytes[1]`.
fn read_10_byte_flex_primitive_as_uint(
    self,
    support_sign_extension: bool,
    bytes: &'a [u8],
    second_byte: u8,
) -> IonResult<FlexUInt> {
    // Little Endian byte diagram:
    //
    //       b0       b1       b2       b3
    //    PPPPPPPP MMMMMMPP MMMMMMMM MMMMMMMM
    //       b4       b5       b6       b7
    //    MMMMMMMM MMMMMMMM MMMMMMMM MMMMMMMM
    //       b8       b9
    //    MMMMMMMM XXXXXXMM
    //
    // P = Prefix (continuation) bit
    // M = Magnitude bit
    // X = An 'extra' bit; if `support_sign_extension` is true, these are sign bits.
    const ALL_EXTRA_BITS_SET: u64 = 0b11_1111;

    // Bytes b2 through b9 fit in a u64 exactly. Its low 58 bits are the upper part of the
    // magnitude and its high 6 bits are the 'extra' bits.
    let mut upper_bytes = [0u8; 8];
    upper_bytes.copy_from_slice(&bytes[2..10]);
    let upper_word = u64::from_le_bytes(upper_bytes);
    let extra_bits = upper_word >> 58;

    if !support_sign_extension {
        // Unsigned: any extra bit being set means the magnitude overflows a u64.
        if extra_bits != 0 {
            return IonResult::decoding_error(
                "found a 10-byte FlexUInt too large to fit in a u64",
            );
        }
    } else {
        // Signed: the extra bits must be copies of the value's 64th bit. `upper_word` begins
        // at the value's 7th bit, so the 64th bit of the value is at bit index 57 here.
        let top_value_bit_is_set = (upper_word >> 57) & 1 == 1;
        let required_extra_bits = if top_value_bit_is_set {
            ALL_EXTRA_BITS_SET
        } else {
            0
        };
        if extra_bits != required_extra_bits {
            // The extra bits disagree with the top bit, so this value needed more than
            // 64 bits to encode.
            return IonResult::decoding_error(
                "found a 10-byte FlexInt too large to fit in a i64",
            );
        }
    }

    // The first byte is all prefix bits and the low two bits of the second byte are the rest of
    // the prefix. The upper six bits of the second byte are the lowest six bits of the value.
    let low_six_bits = u64::from(second_byte >> 2);
    // Shifting left by 6 makes room for those low bits and pushes the extra bits out.
    let value = (upper_word << 6) | low_six_bits;
    Ok(FlexUInt::new(10, value))
}

/// Reads a `VarUInt` encoding primitive from the beginning of the buffer. If it is successful,
/// returns an `Ok(_)` containing its [VarUInt] representation.
///
Expand Down
Loading

0 comments on commit 925d210

Please sign in to comment.