From dedcb3994db0259d663e170dae922ac61fe383fe Mon Sep 17 00:00:00 2001 From: Folkert Date: Mon, 12 Feb 2024 15:40:13 +0100 Subject: [PATCH 1/5] refactor --- zlib-rs/src/inflate.rs | 6 +----- zlib-rs/src/read_buf.rs | 46 ++++++++++++++++++++++++++++++++++------- 2 files changed, 39 insertions(+), 13 deletions(-) diff --git a/zlib-rs/src/inflate.rs b/zlib-rs/src/inflate.rs index 64c043c5..1ebd3bc4 100644 --- a/zlib-rs/src/inflate.rs +++ b/zlib-rs/src/inflate.rs @@ -1237,11 +1237,7 @@ fn inflate_fast_help(state: &mut State, _start: usize) -> ReturnCode { // may need some bytes from the output if op < len as usize { - let len = len as usize - op; - - writer.copy_match(dist as usize, len); - } else { - // nothing? + writer.copy_match(dist as usize, len as usize - op); } } else if extra_safe { todo!() diff --git a/zlib-rs/src/read_buf.rs b/zlib-rs/src/read_buf.rs index 69400dfb..1cdd774f 100644 --- a/zlib-rs/src/read_buf.rs +++ b/zlib-rs/src/read_buf.rs @@ -333,15 +333,46 @@ impl<'a> ReadBuf<'a> { #[inline(always)] pub fn copy_match(&mut self, offset_from_end: usize, length: usize) { + #[cfg(target_arch = "x86_64")] + if std::is_x86_feature_detected!("avx2") { + return self.copy_match_avx2(offset_from_end, length); + } + + return self.copy_match_generic(offset_from_end, length); + } + + fn copy_match_generic(&mut self, offset_from_end: usize, length: usize) { let current = self.filled; let start = current.checked_sub(offset_from_end).expect("in bounds"); let end = start.checked_add(length).expect("in bounds"); - // Note also that the referenced string may overlap the current - // position; for example, if the last 2 bytes decoded have values - // X and Y, a string reference with - // adds X,Y,X,Y,X to the output stream. + if end > current { + if offset_from_end == 1 { + // this will just repeat this value many times + let element = self.buf[current - 1]; + self.buf[current..][..length].fill(element); + } else { + for i in 0..length { + self.buf[current + i] = self.buf[start + i]; + } + } + } else { + self.buf.copy_within(start..end, current); + } + + // safety: we just copied length initialized bytes right beyond self.filled + unsafe { self.assume_init(length) }; + + self.advance(length); + } + + #[cfg(target_arch = "x86_64")] + fn copy_match_avx2(&mut self, offset_from_end: usize, length: usize) { + let current = self.filled; + + let start = current.checked_sub(offset_from_end).expect("in bounds"); + let end = start.checked_add(length).expect("in bounds"); if end > self.filled { if offset_from_end == 1 { @@ -355,11 +386,9 @@ impl<'a> ReadBuf<'a> { } } else { let (before, after) = self.buf.split_at_mut(current); - let (d, _) = slice_as_chunks_mut::<_, 32>(after); - let chunk_count = (end - start).div_ceil(32); - if d.len() >= chunk_count { - for (s, d) in before[start..end].chunks(32).zip(d) { + if after.len() / 32 >= (end - start).div_ceil(32) { + for (s, d) in before[start..end].chunks(32).zip(after.chunks_mut(32)) { use std::arch::x86_64::{_mm256_loadu_si256, _mm256_storeu_si256}; unsafe { @@ -368,6 +397,7 @@ impl<'a> ReadBuf<'a> { } } } else { + // a full simd copy does not fit in the output buffer self.buf.copy_within(start..end, current); } } From 18e0057b3fab5f514d772ab0573915da308897c2 Mon Sep 17 00:00:00 2001 From: Folkert Date: Tue, 13 Feb 2024 15:02:57 +0100 Subject: [PATCH 2/5] try a trait --- zlib-rs/src/read_buf.rs | 314 ++++++++++++++-------------------------- 1 file changed, 109 insertions(+), 205 deletions(-) diff --git a/zlib-rs/src/read_buf.rs b/zlib-rs/src/read_buf.rs index 
1cdd774f..7432b20a 100644 --- a/zlib-rs/src/read_buf.rs +++ b/zlib-rs/src/read_buf.rs @@ -333,6 +333,23 @@ impl<'a> ReadBuf<'a> { #[inline(always)] pub fn copy_match(&mut self, offset_from_end: usize, length: usize) { + let current = self.filled; + // println!("({current}, {offset_from_end}, {length}),"); + + if false && current < (1 << 15) - 300 { + let mut f = std::fs::File::options() + .write(true) + .create(true) + .append(true) + .open("/tmp/copy_match.dat") + .unwrap(); + + use std::io::Write; + f.write_all(¤t.to_ne_bytes()); + f.write_all(&offset_from_end.to_ne_bytes()); + f.write_all(&length.to_ne_bytes()); + } + #[cfg(target_arch = "x86_64")] if std::is_x86_feature_detected!("avx2") { return self.copy_match_avx2(offset_from_end, length); @@ -341,7 +358,7 @@ impl<'a> ReadBuf<'a> { return self.copy_match_generic(offset_from_end, length); } - fn copy_match_generic(&mut self, offset_from_end: usize, length: usize) { + pub fn copy_match_generic(&mut self, offset_from_end: usize, length: usize) { let current = self.filled; let start = current.checked_sub(offset_from_end).expect("in bounds"); @@ -368,17 +385,32 @@ impl<'a> ReadBuf<'a> { } #[cfg(target_arch = "x86_64")] - fn copy_match_avx2(&mut self, offset_from_end: usize, length: usize) { + pub fn copy_match_sse(&mut self, offset_from_end: usize, length: usize) { let current = self.filled; let start = current.checked_sub(offset_from_end).expect("in bounds"); let end = start.checked_add(length).expect("in bounds"); + let safe_to_chunk = (current + length).next_multiple_of(16) <= self.buf.len(); + if end > self.filled { if offset_from_end == 1 { + use std::arch::x86_64::{_mm_set1_epi8, _mm_storeu_si128}; + // this will just repeat this value many times let element = self.buf[current - 1]; - self.buf[current..][..length].fill(element); + let b = unsafe { element.assume_init() }; + + if safe_to_chunk { + let chunk = unsafe { std::arch::x86_64::_mm_set1_epi8(b as i8) }; + for d in self.buf[current..][..length].chunks_mut(16) { + unsafe { + _mm_storeu_si128(d.as_mut_ptr().cast(), chunk); + } + } + } else { + self.buf[current..][..length].fill(element); + } } else { for i in 0..length { self.buf[current + i] = self.buf[start + i]; @@ -387,13 +419,13 @@ impl<'a> ReadBuf<'a> { } else { let (before, after) = self.buf.split_at_mut(current); - if after.len() / 32 >= (end - start).div_ceil(32) { - for (s, d) in before[start..end].chunks(32).zip(after.chunks_mut(32)) { - use std::arch::x86_64::{_mm256_loadu_si256, _mm256_storeu_si256}; + if safe_to_chunk { + for (s, d) in before[start..end].chunks(16).zip(after.chunks_mut(16)) { + use std::arch::x86_64::{_mm_loadu_si128, _mm_storeu_si128}; unsafe { - let chunk = _mm256_loadu_si256(s.as_ptr().cast()); - _mm256_storeu_si256(d.as_mut_ptr().cast(), chunk); + let chunk = _mm_loadu_si128(s.as_ptr().cast()); + _mm_storeu_si128(d.as_mut_ptr().cast(), chunk); } } } else { @@ -407,6 +439,37 @@ impl<'a> ReadBuf<'a> { self.advance(length); } + + #[cfg(target_arch = "x86_64")] + pub fn copy_match_avx2(&mut self, offset_from_end: usize, length: usize) { + let current = self.filled; + + let start = current.checked_sub(offset_from_end).expect("in bounds"); + let end = start.checked_add(length).expect("in bounds"); + + let safe_to_chunk = (current + length).next_multiple_of(32) <= self.buf.len(); + + if end > self.filled { + if offset_from_end == 1 { + // this will just repeat this value many times + let element = self.buf[current - 1]; + let b = unsafe { element.assume_init() }; + + 
self.buf[current..][..length].fill(element); + } else { + for i in 0..length { + self.buf[current + i] = self.buf[start + i]; + } + } + } else { + Avx2::copy_chunk(self.buf, current, start, end) + } + + // safety: we just copied length initialized bytes right beyond self.filled + unsafe { self.assume_init(length) }; + + self.advance(length); + } } fn slice_as_chunks(slice: &[T]) -> (&[[T; N]], &[T]) { @@ -473,231 +536,72 @@ unsafe fn slice_assume_init_mut(slice: &mut [MaybeUninit]) -> &mut [u8] { &mut *(slice as *mut [MaybeUninit] as *mut [u8]) } -trait ChunkSet { +trait ChunkCopy { const N: usize = core::mem::size_of::(); type Chunk; - unsafe fn memset_2(from: *const u8, chunk: &mut Self::Chunk); - unsafe fn memset_4(from: *const u8, chunk: &mut Self::Chunk); - unsafe fn memset_8(from: *const u8, chunk: &mut Self::Chunk); + /// Safety: must be valid to read a `Self::Chunk` value from `from` with an unaligned read. + unsafe fn load_chunk(from: *const MaybeUninit) -> Self::Chunk; - unsafe fn loadchunk(from: *const u8, chunk: &mut Self::Chunk); - unsafe fn storechunk(out: *mut u8, chunk: &Self::Chunk); + /// Safety: must be valid to write a `Self::Chunk` value to `out` with an unaligned write. + unsafe fn store_chunk(out: *mut MaybeUninit, chunk: Self::Chunk); #[inline(always)] - unsafe fn get_chunk_mag(buf: *const u8, chunk_rem: &mut usize, dist: usize) -> Self::Chunk { - let mut bytes_remaining = Self::N; - - let mut chunk = MaybeUninit::zeroed().assume_init(); - let mut cur_chunk = &mut chunk as *mut Self::Chunk as *mut u8; - - while bytes_remaining > 0 { - let cpy_dist = Ord::min(dist, bytes_remaining); - std::ptr::copy_nonoverlapping(buf, cur_chunk, cpy_dist); - bytes_remaining -= cpy_dist; - cur_chunk = cur_chunk.add(cpy_dist); - - // saves an expensive integer division (somewhere) - *chunk_rem = cpy_dist; - } - - chunk - } - - unsafe fn chunkcopy(mut out: *mut u8, mut from: *const u8, mut len: usize) -> *mut u8 { - assert!(len > 0, "chunkcopy should never have a length 0"); - - let mut chunk = MaybeUninit::zeroed().assume_init(); - - let align = ((len - 1) % Self::N) + 1; - Self::loadchunk(from, &mut chunk); - Self::storechunk(out, &chunk); - - out = out.add(align); - from = from.add(align); - len -= align; - - while len > 0 { - Self::loadchunk(from, &mut chunk); - Self::storechunk(out, &chunk); - - out = out.add(Self::N); - from = from.add(Self::N); - len -= Self::N; - } - - out - } + fn copy_chunk(buf: &mut [MaybeUninit], current: usize, start: usize, end: usize) { + let (before, after) = buf.split_at_mut(current); - unsafe fn chunkunroll(mut out: *mut u8, dist: &mut usize, len: &mut usize) -> *mut u8 { - let from = out.sub(*dist); - let mut chunk = MaybeUninit::zeroed().assume_init(); - - while *dist < *len && *dist < Self::N { - Self::loadchunk(from, &mut chunk); - Self::storechunk(out, &chunk); - - out = out.add(*dist); - *len -= *dist; - *dist += *dist; - } - - out - } - - unsafe fn chunkmemset(mut out: *mut u8, dist: usize, mut len: usize) -> *mut u8 { - assert!(dist > 0, "chunkmemset cannot have a distance 0"); - - let from = out.sub(dist); - - let mut chunk_load = MaybeUninit::zeroed().assume_init(); - let mut chunk_mod = 0; - - match dist { - 1 => { - std::ptr::write_bytes(out, *from, len); - return out.add(len); - } - _ if dist > Self::N => { - return Self::chunkcopy(out, out.sub(dist), len); - } - 2 => { - Self::memset_2(from, &mut chunk_load); - } - 4 => { - Self::memset_4(from, &mut chunk_load); - } - 8 => { - Self::memset_8(from, &mut chunk_load); - } - _ if dist == 
Self::N => { - Self::loadchunk(from, &mut chunk_load); - } - _ => { - chunk_load = Self::get_chunk_mag(from, &mut chunk_mod, dist); - } - } - - if chunk_mod == 0 { - while len >= 2 * Self::N { - Self::storechunk(out, &chunk_load); - Self::storechunk(out.add(Self::N), &chunk_load); - out = out.add(2 * Self::N); - len -= 2 * Self::N; - } - } - - let adv_amount = Self::N - chunk_mod; - assert!(adv_amount != 0, "{:?}", (Self::N, chunk_mod)); - while len >= Self::N { - Self::storechunk(out, &chunk_load); - len -= adv_amount; - out = out.add(adv_amount); - } - - if len != 0 { - std::ptr::copy_nonoverlapping(&chunk_load as *const _ as *const u8, out, len); - out = out.add(len); + if (end - start).next_multiple_of(32) <= after.len() { + let src = &before[start..end]; + let dst = after; + unsafe { Self::copy_chunk_unchecked(src, dst) } + } else { + // a full simd copy does not fit in the output buffer + buf.copy_within(start..end, current); } - - out } - unsafe fn chunkmemset_safe( - mut out: *mut u8, - dist: usize, - mut len: usize, - mut left: usize, - ) -> *mut u8 { - // TODO unaligned optimizations? - const ALIGN_MASK: usize = 7; - - len = Ord::min(len, left); - let mut from = out.sub(dist); - - while (out as usize & ALIGN_MASK) != 0 && len > 0 { - *out = *from; - out = out.add(1); - from = from.add(1); - - len -= 1; - left -= 1; - } - - if left < (3 * Self::N) { - while len > 0 { - *out = *from; - out = out.add(1); - from = from.add(1); - len -= 1; - } - return out; - } + /// # Safety + /// + /// - src.len().div_ceil(Self::N) >= dst.div_ceil(Self::N) + #[inline(always)] + unsafe fn copy_chunk_unchecked(src: &[MaybeUninit], dst: &mut [MaybeUninit]) { + // if this condition is false, the final simd write will go out of bounds + debug_assert!(src.len().div_ceil(Self::N) >= dst.len().div_ceil(Self::N)); - if len != 0 { - return Self::chunkmemset(out, dist, len); + for (s, d) in src.chunks(Self::N).zip(dst.chunks_mut(Self::N)) { + let chunk = Self::load_chunk(s.as_ptr()); + Self::store_chunk(d.as_mut_ptr(), chunk); } - - out } } -struct Standard; +struct Generic; -impl ChunkSet for Standard { +impl ChunkCopy for Generic { type Chunk = u64; - unsafe fn memset_2(from: *const u8, chunk: &mut Self::Chunk) { - let [a, b]: [u8; 2] = std::ptr::read(from.cast()); - *chunk = u64::from_ne_bytes([a, b, 0, 0, 0, 0, 0, 0]); - } - - unsafe fn memset_4(from: *const u8, chunk: &mut Self::Chunk) { - let [a, b, c, d]: [u8; 4] = std::ptr::read(from.cast()); - *chunk = u64::from_ne_bytes([a, b, c, d, 0, 0, 0, 0]); + unsafe fn load_chunk(from: *const MaybeUninit) -> Self::Chunk { + std::ptr::read_unaligned(from.cast()) } - unsafe fn memset_8(from: *const u8, chunk: &mut Self::Chunk) { - let tmp: [u8; 8] = std::ptr::read(from.cast()); - *chunk = u64::from_ne_bytes(tmp); - } - - unsafe fn loadchunk(from: *const u8, chunk: &mut Self::Chunk) { - let tmp: [u8; 8] = std::ptr::read(from.cast()); - *chunk = u64::from_ne_bytes(tmp); - } - - unsafe fn storechunk(out: *mut u8, chunk: &Self::Chunk) { - std::ptr::write(out as *mut [u8; 8], chunk.to_ne_bytes()); + unsafe fn store_chunk(out: *mut MaybeUninit, chunk: Self::Chunk) { + std::ptr::copy_nonoverlapping(chunk.to_ne_bytes().as_ptr().cast(), out, Self::N) } } struct Avx2; -impl ChunkSet for Avx2 { - type Chunk = std::arch::x86_64::__m256i; - - unsafe fn memset_2(from: *const u8, chunk: &mut Self::Chunk) { - let tmp: i16 = std::ptr::read_unaligned(from.cast()); - *chunk = std::arch::x86_64::_mm256_set1_epi16(tmp); - } - - unsafe fn memset_4(from: *const u8, chunk: &mut 
Self::Chunk) { - let tmp: i32 = std::ptr::read_unaligned(from.cast()); - *chunk = std::arch::x86_64::_mm256_set1_epi32(tmp); - } - - unsafe fn memset_8(from: *const u8, chunk: &mut Self::Chunk) { - let tmp: i64 = std::ptr::read_unaligned(from.cast()); - *chunk = std::arch::x86_64::_mm256_set1_epi64x(tmp); - } +impl ChunkCopy for Avx2 { + type Chunk = core::arch::x86_64::__m256i; - unsafe fn loadchunk(from: *const u8, chunk: &mut Self::Chunk) { - *chunk = std::arch::x86_64::_mm256_loadu_si256(from.cast()); + #[inline(always)] + unsafe fn load_chunk(from: *const MaybeUninit) -> Self::Chunk { + core::arch::x86_64::_mm256_loadu_si256(from.cast()) } - unsafe fn storechunk(out: *mut u8, chunk: &Self::Chunk) { - std::arch::x86_64::_mm256_storeu_si256(out as *mut Self::Chunk, *chunk); + #[inline(always)] + unsafe fn store_chunk(out: *mut MaybeUninit, chunk: Self::Chunk) { + core::arch::x86_64::_mm256_storeu_si256(out as *mut Self::Chunk, chunk); } } From f6cb824ed4989fa187ea1eadb21556486aeee76d Mon Sep 17 00:00:00 2001 From: Folkert Date: Tue, 13 Feb 2024 17:24:19 +0100 Subject: [PATCH 3/5] clean up simd primitives --- zlib-rs/src/read_buf.rs | 295 ++++++++++++++++++---------------------- 1 file changed, 136 insertions(+), 159 deletions(-) diff --git a/zlib-rs/src/read_buf.rs b/zlib-rs/src/read_buf.rs index 7432b20a..17440efc 100644 --- a/zlib-rs/src/read_buf.rs +++ b/zlib-rs/src/read_buf.rs @@ -295,70 +295,65 @@ impl<'a> ReadBuf<'a> { "buf.len() must fit in remaining()" ); - let amt = buf.len(); - // Cannot overflow, asserted above - let end = self.filled + amt; - - // // Safety: the length is asserted above - // unsafe { - // self.buf[self.filled..end] - // .as_mut_ptr() - // .cast::() - // .copy_from_nonoverlapping(buf.as_ptr(), amt); - // } - - let (it1, remainder) = slice_as_chunks::<_, 32>(buf); - let (it2, _) = slice_as_chunks_mut::<_, 32>(&mut self.buf[self.filled..]); - - for (d, s) in it2.iter_mut().zip(it1.iter()) { - use std::arch::x86_64::{_mm256_loadu_si256, _mm256_storeu_si256}; - unsafe { - let chunk = _mm256_loadu_si256(s.as_ptr().cast()); - _mm256_storeu_si256(d.as_mut_ptr().cast(), chunk); + 'blk: { + #[cfg(target_arch = "x86_64")] + if std::is_x86_feature_detected!("avx512f") { + break 'blk Self::copy_chunked_from_slice::( + self.buf, + self.filled, + buf, + ); } - } - unsafe { - self.buf[self.filled + buf.len() - remainder.len()..] 
- .as_mut_ptr() - .cast::() - .copy_from_nonoverlapping(remainder.as_ptr(), remainder.len()); - } + #[cfg(target_arch = "x86_64")] + if std::is_x86_feature_detected!("avx2") { + break 'blk Self::copy_chunked_from_slice::( + self.buf, + self.filled, + buf, + ); + } - if self.initialized < end { - self.initialized = end; - } + #[cfg(target_arch = "x86_64")] + if std::is_x86_feature_detected!("sse") { + break 'blk Self::copy_chunked_from_slice::( + self.buf, + self.filled, + buf, + ); + } + + Self::copy_chunked_from_slice::(self.buf, self.filled, buf) + }; + + let end = self.filled + buf.len(); + self.initialized = Ord::max(self.initialized, end); self.filled = end; } #[inline(always)] pub fn copy_match(&mut self, offset_from_end: usize, length: usize) { let current = self.filled; - // println!("({current}, {offset_from_end}, {length}),"); - - if false && current < (1 << 15) - 300 { - let mut f = std::fs::File::options() - .write(true) - .create(true) - .append(true) - .open("/tmp/copy_match.dat") - .unwrap(); - - use std::io::Write; - f.write_all(¤t.to_ne_bytes()); - f.write_all(&offset_from_end.to_ne_bytes()); - f.write_all(&length.to_ne_bytes()); + + #[cfg(target_arch = "x86_64")] + if std::is_x86_feature_detected!("avx512f") { + return self.copy_match_help::(offset_from_end, length); } #[cfg(target_arch = "x86_64")] if std::is_x86_feature_detected!("avx2") { - return self.copy_match_avx2(offset_from_end, length); + return self.copy_match_help::(offset_from_end, length); + } + + #[cfg(target_arch = "x86_64")] + if std::is_x86_feature_detected!("sse") { + return self.copy_match_help::(offset_from_end, length); } - return self.copy_match_generic(offset_from_end, length); + return self.copy_match_help::(offset_from_end, length); } - pub fn copy_match_generic(&mut self, offset_from_end: usize, length: usize) { + fn copy_match_help(&mut self, offset_from_end: usize, length: usize) { let current = self.filled; let start = current.checked_sub(offset_from_end).expect("in bounds"); @@ -375,7 +370,7 @@ impl<'a> ReadBuf<'a> { } } } else { - self.buf.copy_within(start..end, current); + Self::copy_chunked_within::(self.buf, current, start, end) } // safety: we just copied length initialized bytes right beyond self.filled @@ -384,91 +379,65 @@ impl<'a> ReadBuf<'a> { self.advance(length); } - #[cfg(target_arch = "x86_64")] - pub fn copy_match_sse(&mut self, offset_from_end: usize, length: usize) { - let current = self.filled; + #[inline(always)] + fn copy_chunked_from_slice(buf: &mut [MaybeUninit], current: usize, src: &[u8]) { + assert!(buf.len() - current > src.len()); - let start = current.checked_sub(offset_from_end).expect("in bounds"); - let end = start.checked_add(length).expect("in bounds"); + let mut it = src.chunks_exact(core::mem::size_of::()); - let safe_to_chunk = (current + length).next_multiple_of(16) <= self.buf.len(); + unsafe { + let mut dst = buf.as_mut_ptr().add(current); - if end > self.filled { - if offset_from_end == 1 { - use std::arch::x86_64::{_mm_set1_epi8, _mm_storeu_si128}; + for c in &mut it { + let chunk = C::load_chunk(c.as_ptr().cast()); + C::store_chunk(dst, chunk); - // this will just repeat this value many times - let element = self.buf[current - 1]; - let b = unsafe { element.assume_init() }; - - if safe_to_chunk { - let chunk = unsafe { std::arch::x86_64::_mm_set1_epi8(b as i8) }; - for d in self.buf[current..][..length].chunks_mut(16) { - unsafe { - _mm_storeu_si128(d.as_mut_ptr().cast(), chunk); - } - } - } else { - self.buf[current..][..length].fill(element); - } 
- } else { - for i in 0..length { - self.buf[current + i] = self.buf[start + i]; - } + dst = dst.add(core::mem::size_of::()); } - } else { - let (before, after) = self.buf.split_at_mut(current); - if safe_to_chunk { - for (s, d) in before[start..end].chunks(16).zip(after.chunks_mut(16)) { - use std::arch::x86_64::{_mm_loadu_si128, _mm_storeu_si128}; - - unsafe { - let chunk = _mm_loadu_si128(s.as_ptr().cast()); - _mm_storeu_si128(d.as_mut_ptr().cast(), chunk); - } - } - } else { - // a full simd copy does not fit in the output buffer - self.buf.copy_within(start..end, current); - } + let remainder = it.remainder(); + std::ptr::copy_nonoverlapping(remainder.as_ptr().cast(), dst, remainder.len()) } - - // safety: we just copied length initialized bytes right beyond self.filled - unsafe { self.assume_init(length) }; - - self.advance(length); } - #[cfg(target_arch = "x86_64")] - pub fn copy_match_avx2(&mut self, offset_from_end: usize, length: usize) { - let current = self.filled; - - let start = current.checked_sub(offset_from_end).expect("in bounds"); - let end = start.checked_add(length).expect("in bounds"); - - let safe_to_chunk = (current + length).next_multiple_of(32) <= self.buf.len(); - - if end > self.filled { - if offset_from_end == 1 { - // this will just repeat this value many times - let element = self.buf[current - 1]; - let b = unsafe { element.assume_init() }; - - self.buf[current..][..length].fill(element); - } else { - for i in 0..length { - self.buf[current + i] = self.buf[start + i]; - } + #[inline(always)] + fn copy_chunked_within( + buf: &mut [MaybeUninit], + current: usize, + start: usize, + end: usize, + ) { + if (end - start).next_multiple_of(core::mem::size_of::()) <= (buf.len() - current) { + unsafe { + Self::copy_chunk_unchecked::( + buf.as_ptr().add(start), + buf.as_mut_ptr().add(current), + buf.as_ptr().add(end), + ) } } else { - Avx2::copy_chunk(self.buf, current, start, end) + // a full simd copy does not fit in the output buffer + buf.copy_within(start..end, current); } + } - // safety: we just copied length initialized bytes right beyond self.filled - unsafe { self.assume_init(length) }; - - self.advance(length); + /// # Safety + /// + /// `src` must be safe to perform unaligned reads in `core::mem::size_of::()` chunks until + /// `end` is reached. `dst` must be safe to (unalingned) write that number of chunks. + #[inline(always)] + unsafe fn copy_chunk_unchecked( + mut src: *const MaybeUninit, + mut dst: *mut MaybeUninit, + end: *const MaybeUninit, + ) { + while src < end { + let chunk = C::load_chunk(src); + C::store_chunk(dst, chunk); + + src = src.add(core::mem::size_of::()); + dst = dst.add(core::mem::size_of::()); + } } } @@ -536,6 +505,14 @@ unsafe fn slice_assume_init_mut(slice: &mut [MaybeUninit]) -> &mut [u8] { &mut *(slice as *mut [MaybeUninit] as *mut [u8]) } +trait Chunk { + /// Safety: must be valid to read a `Self::Chunk` value from `from` with an unaligned read. + unsafe fn load_chunk(from: *const MaybeUninit) -> Self; + + /// Safety: must be valid to write a `Self::Chunk` value to `out` with an unaligned write. + unsafe fn store_chunk(out: *mut MaybeUninit, chunk: Self); +} + trait ChunkCopy { const N: usize = core::mem::size_of::(); @@ -546,62 +523,62 @@ trait ChunkCopy { /// Safety: must be valid to write a `Self::Chunk` value to `out` with an unaligned write. 
unsafe fn store_chunk(out: *mut MaybeUninit, chunk: Self::Chunk); +} - #[inline(always)] - fn copy_chunk(buf: &mut [MaybeUninit], current: usize, start: usize, end: usize) { - let (before, after) = buf.split_at_mut(current); +impl Chunk for u64 { + unsafe fn load_chunk(from: *const MaybeUninit) -> Self { + std::ptr::read_unaligned(from.cast()) + } - if (end - start).next_multiple_of(32) <= after.len() { - let src = &before[start..end]; - let dst = after; - unsafe { Self::copy_chunk_unchecked(src, dst) } - } else { - // a full simd copy does not fit in the output buffer - buf.copy_within(start..end, current); - } + unsafe fn store_chunk(out: *mut MaybeUninit, chunk: Self) { + std::ptr::copy_nonoverlapping( + chunk.to_ne_bytes().as_ptr().cast(), + out, + core::mem::size_of::(), + ) } +} - /// # Safety - /// - /// - src.len().div_ceil(Self::N) >= dst.div_ceil(Self::N) +impl Chunk for core::arch::x86_64::__m128i { #[inline(always)] - unsafe fn copy_chunk_unchecked(src: &[MaybeUninit], dst: &mut [MaybeUninit]) { - // if this condition is false, the final simd write will go out of bounds - debug_assert!(src.len().div_ceil(Self::N) >= dst.len().div_ceil(Self::N)); + unsafe fn load_chunk(from: *const MaybeUninit) -> Self { + core::arch::x86_64::_mm_loadu_si128(from.cast()) + } - for (s, d) in src.chunks(Self::N).zip(dst.chunks_mut(Self::N)) { - let chunk = Self::load_chunk(s.as_ptr()); - Self::store_chunk(d.as_mut_ptr(), chunk); - } + #[inline(always)] + unsafe fn store_chunk(out: *mut MaybeUninit, chunk: Self) { + core::arch::x86_64::_mm_storeu_si128(out as *mut Self, chunk); } } -struct Generic; - -impl ChunkCopy for Generic { - type Chunk = u64; - - unsafe fn load_chunk(from: *const MaybeUninit) -> Self::Chunk { - std::ptr::read_unaligned(from.cast()) +impl Chunk for core::arch::x86_64::__m256i { + #[inline(always)] + unsafe fn load_chunk(from: *const MaybeUninit) -> Self { + core::arch::x86_64::_mm256_loadu_si256(from.cast()) } - unsafe fn store_chunk(out: *mut MaybeUninit, chunk: Self::Chunk) { - std::ptr::copy_nonoverlapping(chunk.to_ne_bytes().as_ptr().cast(), out, Self::N) + #[inline(always)] + unsafe fn store_chunk(out: *mut MaybeUninit, chunk: Self) { + core::arch::x86_64::_mm256_storeu_si256(out as *mut Self, chunk); } } -struct Avx2; - -impl ChunkCopy for Avx2 { - type Chunk = core::arch::x86_64::__m256i; - +impl Chunk for core::arch::x86_64::__m512i { #[inline(always)] - unsafe fn load_chunk(from: *const MaybeUninit) -> Self::Chunk { - core::arch::x86_64::_mm256_loadu_si256(from.cast()) + unsafe fn load_chunk(from: *const MaybeUninit) -> Self { + // TODO AVX-512 is effectively unstable. + // We cross our fingers that LLVM optimizes this into a vmovdqu32 + // + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_loadu_si512&expand=3420&ig_expand=4110 + std::ptr::read_unaligned(from.cast()) } #[inline(always)] - unsafe fn store_chunk(out: *mut MaybeUninit, chunk: Self::Chunk) { - core::arch::x86_64::_mm256_storeu_si256(out as *mut Self::Chunk, chunk); + unsafe fn store_chunk(out: *mut MaybeUninit, chunk: Self) { + // TODO AVX-512 is effectively unstable. 
+ // We cross our fingers that LLVM optimizes this into a vmovdqu32 + // + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_storeu_si512&expand=3420&ig_expand=4110,6550 + std::ptr::write_unaligned(out.cast(), chunk) } } From 89f1afc788d3d746a39e624871388bfc8d377d05 Mon Sep 17 00:00:00 2001 From: Folkert Date: Tue, 13 Feb 2024 17:36:23 +0100 Subject: [PATCH 4/5] remove simd code that did not have any gainz --- zlib-rs/src/read_buf.rs | 93 ++++------------------------------------- 1 file changed, 7 insertions(+), 86 deletions(-) diff --git a/zlib-rs/src/read_buf.rs b/zlib-rs/src/read_buf.rs index 17440efc..5895bf71 100644 --- a/zlib-rs/src/read_buf.rs +++ b/zlib-rs/src/read_buf.rs @@ -295,36 +295,8 @@ impl<'a> ReadBuf<'a> { "buf.len() must fit in remaining()" ); - 'blk: { - #[cfg(target_arch = "x86_64")] - if std::is_x86_feature_detected!("avx512f") { - break 'blk Self::copy_chunked_from_slice::( - self.buf, - self.filled, - buf, - ); - } - - #[cfg(target_arch = "x86_64")] - if std::is_x86_feature_detected!("avx2") { - break 'blk Self::copy_chunked_from_slice::( - self.buf, - self.filled, - buf, - ); - } - - #[cfg(target_arch = "x86_64")] - if std::is_x86_feature_detected!("sse") { - break 'blk Self::copy_chunked_from_slice::( - self.buf, - self.filled, - buf, - ); - } - - Self::copy_chunked_from_slice::(self.buf, self.filled, buf) - }; + // using simd here (on x86_64) was not fruitful + self.buf[self.filled..][..buf.len()].copy_from_slice(slice_to_uninit(buf)); let end = self.filled + buf.len(); self.initialized = Ord::max(self.initialized, end); @@ -350,7 +322,7 @@ impl<'a> ReadBuf<'a> { return self.copy_match_help::(offset_from_end, length); } - return self.copy_match_help::(offset_from_end, length); + self.copy_match_help::(offset_from_end, length) } fn copy_match_help(&mut self, offset_from_end: usize, length: usize) { @@ -379,27 +351,6 @@ impl<'a> ReadBuf<'a> { self.advance(length); } - #[inline(always)] - fn copy_chunked_from_slice(buf: &mut [MaybeUninit], current: usize, src: &[u8]) { - assert!(buf.len() - current > src.len()); - - let mut it = src.chunks_exact(core::mem::size_of::()); - - unsafe { - let mut dst = buf.as_mut_ptr().add(current); - - for c in &mut it { - let chunk = C::load_chunk(c.as_ptr().cast()); - C::store_chunk(dst, chunk); - - dst = dst.add(core::mem::size_of::()); - } - - let remainder = it.remainder(); - std::ptr::copy_nonoverlapping(remainder.as_ptr().cast(), dst, remainder.len()) - } - } - #[inline(always)] fn copy_chunked_within( buf: &mut [MaybeUninit], @@ -441,28 +392,6 @@ impl<'a> ReadBuf<'a> { } } -fn slice_as_chunks(slice: &[T]) -> (&[[T; N]], &[T]) { - assert!(N != 0, "chunk size must be non-zero"); - let len = slice.len() / N; - let (multiple_of_n, remainder) = slice.split_at(len * N); - // SAFETY: We already panicked for zero, and ensured by construction - // that the length of the subslice is a multiple of N. - let array_slice: &[[T; N]] = - unsafe { std::slice::from_raw_parts(multiple_of_n.as_ptr().cast(), len) }; - (array_slice, remainder) -} - -fn slice_as_chunks_mut(slice: &mut [T]) -> (&mut [[T; N]], &mut [T]) { - assert!(N != 0, "chunk size must be non-zero"); - let len = slice.len() / N; - let (multiple_of_n, remainder) = slice.split_at_mut(len * N); - // SAFETY: We already panicked for zero, and ensured by construction - // that the length of the subslice is a multiple of N. 
- let array_slice: &mut [[T; N]] = - unsafe { std::slice::from_raw_parts_mut(multiple_of_n.as_mut_ptr().cast(), len) }; - (array_slice, remainder) -} - impl std::io::Write for ReadBuf<'_> { fn write(&mut self, buf: &[u8]) -> std::io::Result { if self.remaining() < buf.len() { @@ -491,6 +420,10 @@ impl fmt::Debug for ReadBuf<'_> { } } +fn slice_to_uninit(slice: &[u8]) -> &[MaybeUninit] { + unsafe { &*(slice as *const [u8] as *const [MaybeUninit]) } +} + unsafe fn slice_to_uninit_mut(slice: &mut [u8]) -> &mut [MaybeUninit] { &mut *(slice as *mut [u8] as *mut [MaybeUninit]) } @@ -513,18 +446,6 @@ trait Chunk { unsafe fn store_chunk(out: *mut MaybeUninit, chunk: Self); } -trait ChunkCopy { - const N: usize = core::mem::size_of::(); - - type Chunk; - - /// Safety: must be valid to read a `Self::Chunk` value from `from` with an unaligned read. - unsafe fn load_chunk(from: *const MaybeUninit) -> Self::Chunk; - - /// Safety: must be valid to write a `Self::Chunk` value to `out` with an unaligned write. - unsafe fn store_chunk(out: *mut MaybeUninit, chunk: Self::Chunk); -} - impl Chunk for u64 { unsafe fn load_chunk(from: *const MaybeUninit) -> Self { std::ptr::read_unaligned(from.cast()) From 57dffae8c9452b8280080f4ba568beaca3763da1 Mon Sep 17 00:00:00 2001 From: Folkert Date: Tue, 13 Feb 2024 17:43:19 +0100 Subject: [PATCH 5/5] add comment back --- zlib-rs/src/read_buf.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/zlib-rs/src/read_buf.rs b/zlib-rs/src/read_buf.rs index 5895bf71..ae5fdd1c 100644 --- a/zlib-rs/src/read_buf.rs +++ b/zlib-rs/src/read_buf.rs @@ -331,6 +331,11 @@ impl<'a> ReadBuf<'a> { let start = current.checked_sub(offset_from_end).expect("in bounds"); let end = start.checked_add(length).expect("in bounds"); + // Note also that the referenced string may overlap the current + // position; for example, if the last 2 bytes decoded have values + // X and Y, a string reference with + // adds X,Y,X,Y,X to the output stream. + if end > current { if offset_from_end == 1 { // this will just repeat this value many times
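
For readers following the series: the sketch below is not part of the patches. It is a minimal, standalone illustration of the overlapping-copy rule that `copy_match` implements (the "<length, distance>" semantics the restored comment describes), written against a plain `Vec<u8>` instead of the crate's `ReadBuf`/`Chunk` machinery; the function and variable names are illustrative only.

```rust
// Illustrative sketch only (not from the patches): LZ77-style match copying,
// where the source may overlap the bytes being written.
fn copy_match_sketch(buf: &mut Vec<u8>, offset_from_end: usize, length: usize) {
    let current = buf.len();
    let start = current.checked_sub(offset_from_end).expect("in bounds");

    if offset_from_end == 1 {
        // a distance of 1 just repeats the last byte `length` times
        let b = buf[current - 1];
        buf.resize(current + length, b);
    } else if offset_from_end < length {
        // the source overlaps the destination: bytes produced earlier in this
        // match become part of the source, so copy one byte at a time
        for i in 0..length {
            let b = buf[start + i];
            buf.push(b);
        }
    } else {
        // no overlap: a straight copy out of already-decoded history
        // (this is the case the patches accelerate with chunked/SIMD copies)
        buf.extend_from_within(start..start + length);
    }
}

fn main() {
    let mut out = b"XY".to_vec();
    // a reference with length 5 and distance 2 over "XY" appends X,Y,X,Y,X
    copy_match_sketch(&mut out, 2, 5);
    assert_eq!(out, b"XYXYXYX");
    println!("{}", String::from_utf8_lossy(&out));
}
```

The real `copy_chunk_unchecked` in the final patch relies on the same case split, but in the non-overlapping branch it copies whole 8/16/32-byte chunks and may deliberately write a little past `end`, which is why the callers first check that the rounded-up length still fits in the output buffer.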