FlexUInt/FlexInt encoding/decoding

amazon-ion · Dec 16, 2023 · 925d210 · 925d210
1 parent 783dd93
commit 925d210
Show file tree

Hide file tree

Showing 4 changed files with 806 additions and 0 deletions.
diff --git a/src/lazy/binary/immutable_buffer.rs b/src/lazy/binary/immutable_buffer.rs
@@ -8,6 +8,8 @@ use crate::binary::var_int::VarInt;
 use crate::binary::var_uint::VarUInt;
 use crate::lazy::binary::encoded_value::EncodedValue;
 use crate::lazy::binary::raw::value::LazyRawBinaryValue;
+use crate::lazy::encoder::binary::v1_1::flex_int::FlexInt;
+use crate::lazy::encoder::binary::v1_1::flex_uint::FlexUInt;
 use crate::result::IonFailure;
 use crate::types::UInt;
 use crate::{Int, IonError, IonResult, IonType};
@@ -157,6 +159,257 @@ impl<'a> ImmutableBuffer<'a> {
         }
     }
 
+    /// Decodes a [`FlexInt`] from the front of the buffer.
+    ///
+    /// On success, yields the decoded value paired with the buffer that follows the encoding.
+    /// Any error produced while decoding the underlying flex primitive is propagated unchanged.
+    pub fn read_flex_int(self) -> ParseResult<'a, FlexInt> {
+        // FlexInt and FlexUInt share a layout, so decode the bits as an unsigned value first
+        // (with sign extension checks enabled) and reinterpret them as two's complement below.
+        let (as_unsigned, rest) = self.read_flex_primitive_as_uint("reading a FlexInt", true)?;
+        let raw_bits = as_unsigned.value();
+        let encoded_len = as_unsigned.size_in_bytes();
+
+        // The sign lives in the highest bit of the final encoded byte, so consult the original
+        // input rather than the widened u64.
+        let final_byte = self.bytes()[encoded_len - 1];
+        let is_negative = final_byte & 0b1000_0000 != 0;
+
+        let signed_value = if is_negative {
+            // An encoding shorter than 8 bytes leaves leading zero bits in the u64. For a
+            // negative number those must all become ones: arithmetic-shifting a value whose only
+            // set bit is the top one smears that bit across every leading zero position.
+            let sign_fill = ((1u64 << 63) as i64) >> raw_bits.leading_zeros();
+            (raw_bits as i64) | sign_fill
+        } else {
+            // Non-negative: the unsigned bits already are the value.
+            raw_bits as i64
+        };
+
+        Ok((FlexInt::new(encoded_len, signed_value), rest))
+    }
+
+    /// Decodes a [`FlexUInt`] from the front of the buffer.
+    ///
+    /// On success, yields the decoded value paired with the buffer that follows the encoding.
+    #[inline]
+    pub fn read_flex_uint(self) -> ParseResult<'a, FlexUInt> {
+        const FAST_PATH_MIN_INPUT_LEN: usize = 8;
+
+        // The fast path needs just one combined test: at least 8 bytes of input are on hand
+        // (so a full u64 can be loaded) and the first byte is non-zero (so every continuation
+        // bit sits in that first byte). The length test comes first so that the first byte is
+        // only inspected when the buffer is known to be non-empty.
+        let fast_path_applies = self.len() >= FAST_PATH_MIN_INPUT_LEN && self.bytes()[0] != 0;
+        if fast_path_applies {
+            let flex_uint = Self::read_small_flex_uint(self.bytes());
+            let remaining = self.consume(flex_uint.size_in_bytes());
+            return Ok((flex_uint, remaining));
+        }
+
+        // Everything else (short input, or an encoding longer than 8 bytes) goes through the
+        // general-purpose reader. It is marked #[cold] to discourage inlining it, which keeps
+        // this method small enough for the fast path itself to be inlined.
+        self.read_flex_uint_slow("reading a FlexUInt")
+    }
+
+    /// Decodes a [`FlexUInt`] whose continuation bits all live in the first byte of `bytes`
+    /// (that is, an encoding with 7 or fewer bytes of magnitude).
+    // The caller is responsible for making sure `bytes` holds at least 8 bytes.
+    #[inline]
+    fn read_small_flex_uint(bytes: &[u8]) -> FlexUInt {
+        debug_assert!(bytes.len() >= 8);
+        // The number of trailing zero bits in the first byte, plus one, is the encoded length.
+        let size_in_bytes = bytes[0].trailing_zeros() as usize + 1;
+        let size_in_bits = 8 * size_in_bytes;
+        // Build a mask covering the low `size_in_bits` bits. When the shift amount reaches the
+        // width of a u64, `checked_shl` reports `None` and every bit is kept.
+        // TODO: Should this be a const cache of size_in_bits -> mask?
+        let low_bits_mask = match 1u64.checked_shl(size_in_bits as u32) {
+            Some(bit_above_encoding) => bit_above_encoding - 1,
+            None => u64::MAX,
+        };
+        // Load the first 8 bytes as a little-endian u64. This usually drags in bytes that lie
+        // past the end of the FlexUInt; the mask below throws them away.
+        let mut first_eight = [0u8; 8];
+        first_eight.copy_from_slice(&bytes[..8]);
+        let raw = u64::from_le_bytes(first_eight);
+        // There is one continuation flag per encoded byte, so the encoded length doubles as the
+        // number of low bits to shift out.
+        FlexUInt::new(size_in_bytes, (raw & low_bits_mask) >> size_in_bytes)
+    }
+
+    /// Helper method that reads a [`FlexUInt`] from the buffer when input is potentially
+    /// incomplete or when the `FlexUInt` requires more than 8 bytes to represent.
+    ///
+    /// `label` is forwarded to [`Self::read_flex_primitive_as_uint`], which does the actual
+    /// decoding. Sign extension support is turned off because the result is unsigned.
+    // Marked `cold` so the compiler can prioritize the common case instead.
+    #[cold]
+    fn read_flex_uint_slow(self, label: &'static str) -> ParseResult<'a, FlexUInt> {
+        self.read_flex_primitive_as_uint(label, false)
+    }
+
+    /// Helper method that reads a flex-encoded primitive from the buffer, returning it as a `FlexUInt`
+    /// along with the buffer that follows the encoding. Incomplete-data errors raised while
+    /// reading are constructed with the supplied `label` and the buffer's current offset.
+    ///
+    /// The current implementation supports flex primitives with up to 64 bits of representation
+    /// beyond the leading header bits. Flex primitives requiring 10 bytes to encode have 70 magnitude
+    /// bits. If this value is unsigned (`support_sign_extension=false`), the six bits beyond the
+    /// supported 64 must all be `0`. If this value will later be re-interpreted as a signed value,
+    /// (`support_sign_extension=true`), then the six bits beyond the supported 64 must all be the
+    /// same as the 64th (highest supported) bit. This will allow encodings of up to 70 bits
+    /// to be correctly interpreted as positive, negative, or beyond the bounds of the 64 bit
+    /// limitation.
+    ///
+    /// Encodings whose prefix indicates more than 10 bytes are rejected with a decoding error.
+    fn read_flex_primitive_as_uint(
+        self,
+        label: &'static str,
+        support_sign_extension: bool,
+    ) -> ParseResult<'a, FlexUInt> {
+        // A closure that generates an incomplete data result at the current offset. This can be invoked
+        // in a variety of early-return cases in this method.
+        let incomplete = || IonResult::incomplete(label, self.offset);
+
+        let bytes_available = self.bytes().len();
+        if bytes_available == 0 {
+            return incomplete();
+        }
+
+        // The `from_le_bytes` method we use to interpret data requires at least 8 bytes to be available.
+        // There can be 1-2 bytes of header for a u64, leading to a maximum size of 10 bytes. If the input
+        // buffer doesn't have at least 10 bytes, copy its contents into a temporary buffer that's
+        // padded with 0 bytes. We round the size of the temp buffer to 16 as it produces slightly
+        // nicer assembly than 10.
+        let mut buffer = [0u8; 16];
+        let bytes = if bytes_available >= 10 {
+            self.bytes()
+        } else {
+            buffer[0..bytes_available].copy_from_slice(self.bytes());
+            &buffer[..]
+        };
+
+        let first_byte = bytes[0];
+        // If the first byte is not zero, all of the continuation bits are in the first byte and
+        // the encoding is 8 or fewer bytes long (7 or fewer bytes of magnitude).
+        if first_byte != 0 {
+            let num_encoded_bytes = first_byte.trailing_zeros() as usize + 1;
+            // Note that `bytes_available` is the number of bytes in the original unpadded input.
+            // Our buffer may be 16 bytes long but only `bytes_available` of those are meaningful.
+            if bytes_available < num_encoded_bytes {
+                return incomplete();
+            }
+            // At this point, we know the original input contained all of the FlexUInt's bytes.
+            // We can call `read_small_flex_uint` with the now-padded version of the buffer.
+            // It will discard any bytes that are not part of the FlexUInt.
+            let flex_uint = Self::read_small_flex_uint(bytes);
+            let remaining = self.consume(flex_uint.size_in_bytes());
+            return Ok((flex_uint, remaining));
+        }
+
+        // If we reach this point, the first byte was a zero. The encoding is at least 9 bytes in size.
+        // We need to inspect the second byte to see how many more prefix bits there are.
+        if bytes_available < 2 {
+            return incomplete();
+        }
+        let second_byte = bytes[1];
+
+        // The low two bits of the second byte continue the prefix that began in the first byte.
+        match second_byte & 0b11 {
+            0b00 => {
+                // The flag bits in the second byte indicate at least two more bytes, meaning the
+                // total length is more than 10 bytes. We're not equipped to handle this.
+                IonResult::decoding_error(
+                    "found a >10 byte flex primitive too large to fit in 64 bits",
+                )
+            }
+            0b10 => {
+                // The lowest bit of the second byte is empty, the next lowest is not. The encoding
+                // is 10 bytes.
+                if bytes_available < 10 {
+                    return incomplete();
+                }
+
+                let flex_uint = self.read_10_byte_flex_primitive_as_uint(
+                    support_sign_extension,
+                    bytes,
+                    second_byte,
+                )?;
+                let remaining = self.consume(flex_uint.size_in_bytes());
+                Ok((flex_uint, remaining))
+            }
+            _ => {
+                // The lowest bit of the second byte is set. The encoding is 9 bytes.
+                if bytes_available < 9 {
+                    return incomplete();
+                }
+                // There are 57-63 bits of magnitude. We can decode the remaining bytes in a u64.
+                let remaining_data = &bytes[1..9];
+                // We know that the slice is 8 bytes long, so we can unwrap() the conversion to
+                // [u8; 8]. Lop off the lowest bit to discard the `end` flag.
+                let value = u64::from_le_bytes(remaining_data[..8].try_into().unwrap()) >> 1;
+                let flex_uint = FlexUInt::new(9, value);
+                let remaining_input = self.consume(flex_uint.size_in_bytes());
+                Ok((flex_uint, remaining_input))
+            }
+        }
+    }
+
+    /// Decodes a flex primitive that occupies exactly 10 bytes. This case gets its own method
+    /// because the encoding carries more bits than a `u64` can hold, so the surplus bits have
+    /// to be examined to detect overflow and to support signed re-interpretation.
+    ///
+    /// `bytes` must hold at least 10 bytes and `second_byte` is the byte at index 1 of `bytes`,
+    /// as supplied by the caller. Returns a decoding error if the surplus bits would change the
+    /// meaning of the 64 bits that are kept.
+    fn read_10_byte_flex_primitive_as_uint(
+        self,
+        support_sign_extension: bool,
+        bytes: &'a [u8],
+        second_byte: u8,
+    ) -> IonResult<FlexUInt> {
+        // A 10-byte encoding holds 10 prefix (continuation) bits, 64 bits of magnitude, and
+        // 6 surplus bits. The surplus bits cannot be stored, so they are only validated:
+        //   * signed values:   all 6 surplus bits must equal the 64th magnitude bit.
+        //   * unsigned values: all 6 surplus bits must be `0`.
+        //
+        // Little Endian byte diagram:
+        //
+        //      b0       b1       b2       b3
+        //   PPPPPPPP MMMMMMPP MMMMMMMM MMMMMMMM
+        //      b4       b5       b6       b7
+        //   MMMMMMMM MMMMMMMM MMMMMMMM MMMMMMMM
+        //      b8       b9
+        //   MMMMMMMM XXXXXXMM
+        //
+        // P = Prefix bit
+        // M = Magnitude bit
+        // X = A surplus bit; if `support_sign_extension` is true, these are sign bits.
+
+        // b1's upper six bits are the lowest six bits of the magnitude; its lower two bits are
+        // prefix bits that the caller has already examined.
+        let low_six_bits = u64::from(second_byte >> 2);
+
+        // b2..=b9 hold the other 58 magnitude bits followed by the 6 surplus bits. The slice is
+        // exactly 8 bytes long, so the array conversion cannot fail.
+        let tail: [u8; 8] = bytes[2..10].try_into().unwrap();
+        let high_bits = u64::from_le_bytes(tail);
+
+        // The surplus bits are the top six bits of `high_bits`.
+        let surplus_bits = high_bits >> 58;
+
+        if support_sign_extension {
+            // `high_bits` carries 58 bits of magnitude, so the value's 64th bit (index 63) is
+            // found at index 57 here. The surplus bits must replicate it.
+            let expected_surplus = if high_bits & (1 << 57) != 0 {
+                0b111111
+            } else {
+                0
+            };
+            if surplus_bits != expected_surplus {
+                // The surplus bits disagree with the top bit, so this value needed more than
+                // 64 bits to encode.
+                return IonResult::decoding_error(
+                    "found a 10-byte FlexInt too large to fit in a i64",
+                );
+            }
+        } else if surplus_bits != 0 {
+            // Unsigned: any set surplus bit puts the value beyond what a u64 can hold.
+            return IonResult::decoding_error("found a 10-byte FlexUInt too large to fit in a u64");
+        }
+
+        // Make room for the six low bits taken from b1 (the shift also discards the validated
+        // surplus bits) and merge the two pieces.
+        Ok(FlexUInt::new(10, (high_bits << 6) | low_six_bits))
+    }
+
     /// Reads a `VarUInt` encoding primitive from the beginning of the buffer. If it is successful,
     /// returns an `Ok(_)` containing its [VarUInt] representation.
     ///