diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
index 74b14e526..6796f63eb 100644
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -56,6 +56,7 @@ RUN apt install -y \
xvfb \
gdb \
protobuf-compiler \
+ protobuf-compiler-grpc \
bear \
libzmq3-dev \
rr
diff --git a/.gitignore b/.gitignore
index 8f7d7a5fb..76049fcd6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,6 +11,8 @@
### R ###
.RData
.Rhistory
+external/R/doc/manual/*.html
+external/R/doc/NEWS.pdf
..Rcheck
### C ###
diff --git a/.idea/code.iml b/.idea/code.iml
deleted file mode 100644
index d6ebd4805..000000000
--- a/.idea/code.iml
+++ /dev/null
@@ -1,9 +0,0 @@
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
index 8b9dc5d07..09665c35f 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -4,6 +4,7 @@
@@ -56,7 +57,7 @@
-
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
deleted file mode 100644
index 23968dc67..000000000
--- a/.idea/modules.xml
+++ /dev/null
@@ -1,8 +0,0 @@
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.jqwik-database b/.jqwik-database
deleted file mode 100644
index 711006c3d..000000000
Binary files a/.jqwik-database and /dev/null differ
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 8f2b7113d..930796ee5 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,3 +1,77 @@
{
- "java.compile.nullAnalysis.mode": "disabled"
+ "java.compile.nullAnalysis.mode": "disabled",
+ "files.associations": {
+ "array": "cpp",
+ "atomic": "cpp",
+ "bit": "cpp",
+ "*.tcc": "cpp",
+ "bitset": "cpp",
+ "cctype": "cpp",
+ "charconv": "cpp",
+ "chrono": "cpp",
+ "cinttypes": "cpp",
+ "clocale": "cpp",
+ "cmath": "cpp",
+ "compare": "cpp",
+ "complex": "cpp",
+ "concepts": "cpp",
+ "condition_variable": "cpp",
+ "cstdarg": "cpp",
+ "cstddef": "cpp",
+ "cstdint": "cpp",
+ "cstdio": "cpp",
+ "cstdlib": "cpp",
+ "cstring": "cpp",
+ "ctime": "cpp",
+ "cwchar": "cpp",
+ "cwctype": "cpp",
+ "deque": "cpp",
+ "list": "cpp",
+ "map": "cpp",
+ "set": "cpp",
+ "string": "cpp",
+ "unordered_map": "cpp",
+ "unordered_set": "cpp",
+ "vector": "cpp",
+ "exception": "cpp",
+ "algorithm": "cpp",
+ "functional": "cpp",
+ "iterator": "cpp",
+ "memory": "cpp",
+ "memory_resource": "cpp",
+ "numeric": "cpp",
+ "optional": "cpp",
+ "random": "cpp",
+ "ratio": "cpp",
+ "string_view": "cpp",
+ "system_error": "cpp",
+ "tuple": "cpp",
+ "type_traits": "cpp",
+ "utility": "cpp",
+ "format": "cpp",
+ "future": "cpp",
+ "initializer_list": "cpp",
+ "iomanip": "cpp",
+ "iosfwd": "cpp",
+ "iostream": "cpp",
+ "istream": "cpp",
+ "limits": "cpp",
+ "mutex": "cpp",
+ "new": "cpp",
+ "numbers": "cpp",
+ "ostream": "cpp",
+ "semaphore": "cpp",
+ "shared_mutex": "cpp",
+ "span": "cpp",
+ "sstream": "cpp",
+ "stdexcept": "cpp",
+ "stop_token": "cpp",
+ "streambuf": "cpp",
+ "thread": "cpp",
+ "cfenv": "cpp",
+ "typeinfo": "cpp",
+ "variant": "cpp",
+ "text_encoding": "cpp",
+ "expected": "cpp"
+ }
}
\ No newline at end of file
diff --git a/README.md b/README.md
index 66da8e329..6aaff9067 100644
--- a/README.md
+++ b/README.md
@@ -58,6 +58,8 @@ If the dev container is too slow, you can also setup on the host machine. To do
- **Solution:** ensure you have a Java 22 JDK installed, then run `JAVA_HOME= mvn …`
- e.g. if using IntelliJ on macOS, openJDK 22, set `JAVA_HOME=~/Library/Java/JavaVirtualMachines/openjdk-22/Contents/Home`
- In the devcontainer, `JAVA_HOME=/usr/lib/jvm/jdk`
+- **Problem:** some R symbols are not visible when JIT-compiling.
+  - **Solution:** make sure that `external/R` is up-to-date with `git submodule update --init --recursive` and that it is checked out on the right branch, `RSH-4-3-2`.
If you have a different issue than the above, [report it on GitHub](https://github.com/PRL-PRG/r-compile-server/issues/new/choose).
diff --git a/client/protocol/Makefile b/client/protocol/Makefile
deleted file mode 100644
index 9ba7e67dc..000000000
--- a/client/protocol/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-CPP_DEST := ../rsh/src
-JAVA_DEST := ../../server/src/main/java
-
-all:
- protoc --cpp_out=$(CPP_DEST) --java_out=$(JAVA_DEST) protocol.proto
diff --git a/client/protocol/protocol.proto b/client/protocol/protocol.proto
deleted file mode 100644
index 48a03de62..000000000
--- a/client/protocol/protocol.proto
+++ /dev/null
@@ -1,43 +0,0 @@
-syntax = "proto3";
-
-option java_multiple_files = true;
-option java_package = "org.prlprg.server.protocol";
-
-package rsh.server.protocol;
-
-message Request {
- oneof payload {
- HandshakeRequest handshake = 1;
- CompileRequest compile = 2;
- }
-}
-
-message HandshakeRequest {
- string Rsh_version = 1;
- string R_version = 2;
- string platform = 3;
- repeated string packages = 4;
-}
-
-message CompileRequest {
- string name = 2;
- // the closure to be compiled as CLOSXP
- bytes closure = 3;
- uint32 bc_optimization = 4;
- uint32 cc_optimization = 5;
-}
-
-message CompiledFunction {
- string name = 2;
- // content of the object file as spilled by the compiler
- bytes native_code = 3;
- // the constants used by the native code as VECSXP
- bytes constants = 4;
-}
-
-message CompileResponse {
- oneof data {
- string failure = 2;
- CompiledFunction result = 3;
- }
-}
diff --git a/client/rsh/Makefile b/client/rsh/Makefile
index 0f6ac8196..7b1d039ca 100644
--- a/client/rsh/Makefile
+++ b/client/rsh/Makefile
@@ -83,3 +83,5 @@ benchmark:
fi; \
done
+test6:
+ $(LLVM_R) -f test6.R
\ No newline at end of file
diff --git a/client/rsh/NAMESPACE b/client/rsh/NAMESPACE
index 669b869c5..72b0b703b 100644
--- a/client/rsh/NAMESPACE
+++ b/client/rsh/NAMESPACE
@@ -1,5 +1,6 @@
export(rsh_cmpfun)
export(rsh_compile)
+export(init_client)
export(rsh_jit_disable)
export(rsh_jit_enable)
export(is_compiled)
diff --git a/client/rsh/R/rsh.R b/client/rsh/R/rsh.R
index 41d19a7ff..3aa7f09a2 100644
--- a/client/rsh/R/rsh.R
+++ b/client/rsh/R/rsh.R
@@ -6,6 +6,7 @@ NULL
# save the original compiler::cmpfun
.gnur_cmpfun <- compiler::cmpfun
+
# Because of the ORC JIT we need all the native symbols registered globally
# (as RTLD_GLOBAL) so the ORC linker can find them. Unfortunately, R does
# not provide a way to instruct the namespace loader to load pass the
@@ -31,6 +32,19 @@ NULL
)
.Call(C_initialize)
+
+ # for the client, so that it is not GC-ed
+ env[[".rsh_client"]] <- init_client("0.0.0.0", 8980L)
+}
+
+#' Initialize the Rsh client
+#'
+#' @param address IP address of the server
+#' @param port port of the server
+#' @export
+init_client <- function(address="0.0.0.0", port=8980L) {
+ .rsh_client <- .Call(C_init_client, address, port, installed.packages()[,1])
+ .rsh_client
}
#' Activate the Rsh JIT
@@ -71,6 +85,10 @@ rsh_compile <- function(f, options) {
options$inplace <- TRUE
}
+ if(is.null(options$tier)) {
+ options$tier <- "optimized"
+ }
+
invisible(.Call(C_compile, f, options))
}
@@ -118,3 +136,11 @@ rsh_override_cmpfun <- function(f) {
lockBinding("cmpfun", compiler_ns)
}
+#' Get the total size of the messages sent
+#' and received by the server, in bytes
+#' @return integer vector of size 2, the first element is the total size of requests,
+#' and the second element is the total size of responses
+#' @export
+rsh_total_size <- function() {
+ .Call(C_get_total_size)
+}
\ No newline at end of file
diff --git a/client/rsh/inst/xxhash.hpp b/client/rsh/inst/xxhash.hpp
new file mode 100644
index 000000000..886454841
--- /dev/null
+++ b/client/rsh/inst/xxhash.hpp
@@ -0,0 +1,2226 @@
+#pragma once
+#include <cstdint>
+#include <cstring>
+#include <array>
+#include <type_traits>
+#include <string>
+#include <vector>
+
+/*
+xxHash - Extremely Fast Hash algorithm
+Header File
+Copyright (C) 2012-2024, Yann Collet.
+Copyright (C) 2017-2024, Red Gavin.
+All rights reserved.
+
+BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+You can contact the author at :
+- xxHash source repository : https://github.com/Cyan4973/xxHash
+- xxHash C++ port repository : https://github.com/RedSpah/xxhash_cpp
+*/
+
+/* Intrinsics
+* Sadly has to be included in the global namespace or literally everything breaks
+*/
+#if (defined(__ARM_NEON) && defined(__APPLE__))
+#include "sse2neon.h"
+#else
+#include <immintrin.h>
+#endif
+
+namespace xxh
+{
+ /* *************************************
+ * Versioning
+ ***************************************/
+
+ namespace version
+ {
+ constexpr int cpp_version_major = 0;
+ constexpr int cpp_version_minor = 8;
+ constexpr int cpp_version_release = 1;
+ }
+
+ constexpr uint32_t version_number()
+ {
+ return version::cpp_version_major * 10000 + version::cpp_version_minor * 100 + version::cpp_version_release;
+ }
+
+
+ /* *************************************
+ * Basic Types - Predefining uint128_t for intrin
+ ***************************************/
+
+ namespace typedefs
+ {
+ struct alignas(16) uint128_t
+ {
+ uint64_t low64 = 0;
+ uint64_t high64 = 0;
+
+ bool operator==(const uint128_t & other)
+ {
+ return (low64 == other.low64 && high64 == other.high64);
+ }
+
+ bool operator>(const uint128_t & other)
+ {
+ return (high64 > other.high64 || low64 > other.low64);
+ }
+
+ bool operator>=(const uint128_t & other)
+ {
+ return (*this > other || *this == other);
+ }
+
+ bool operator<(const uint128_t & other)
+ {
+ return !(*this >= other);
+ }
+
+ bool operator<=(const uint128_t & other)
+ {
+ return !(*this > other);
+ }
+
+ bool operator!=(const uint128_t & other)
+ {
+ return !(*this == other);
+ }
+
+ uint128_t(uint64_t low, uint64_t high) : low64(low), high64(high) {}
+
+ uint128_t() {}
+ };
+
+ }
+
+ using uint128_t = typedefs::uint128_t;
+
+
+ /* *************************************
+ * Compiler / Platform Specific Features
+ ***************************************/
+
+ namespace intrin
+ {
+ /*!XXH_CPU_LITTLE_ENDIAN :
+ * This is a CPU endian detection macro, will be
+ * automatically set to 1 (little endian) if it is left undefined.
+ * If compiling for a big endian system (why), XXH_CPU_LITTLE_ENDIAN has to be explicitly defined as 0.
+ */
+#ifndef XXH_CPU_LITTLE_ENDIAN
+# define XXH_CPU_LITTLE_ENDIAN 1
+#endif
+
+
+ /* Vectorization Detection
+ * NOTE: XXH_NEON and XXH_VSX aren't supported in this C++ port.
+ * The primary reason is that I don't have access to an ARM and PowerPC
+ * machines to test them, and the secondary reason is that I even doubt anyone writing
+ * code for such machines would bother using a C++ port rather than the original C version.
+ */
+#ifndef XXH_VECTOR /* can be predefined on command line */
+# if defined(__AVX512F__)
+# define XXH_VECTOR 3 /* AVX512 for Skylake and Icelake */
+# elif defined(__AVX2__)
+# define XXH_VECTOR 2 /* AVX2 for Haswell and Bulldozer */
+# elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
+# define XXH_VECTOR 1 /* SSE2 for Pentium 4 and all x86_64 */
+# else
+# define XXH_VECTOR 0 /* Portable scalar version */
+# endif
+#endif
+
+ constexpr int vector_mode = XXH_VECTOR;
+
+#if XXH_VECTOR == 3 /* AVX512 for Skylake and Icelake */
+ constexpr int acc_align = 64;
+ using avx512_underlying = __m512i;
+ using avx2_underlying = __m256i;
+ using sse2_underlying = __m128i;
+#elif XXH_VECTOR == 2 /* AVX2 for Haswell and Bulldozer */
+ constexpr int acc_align = 32;
+ using avx512_underlying = void;
+ using avx2_underlying = __m256i;
+ using sse2_underlying = __m128i;
+#elif XXH_VECTOR == 1 /* SSE2 for Pentium 4 and all x86_64 */
+ using avx512_underlying = void;
+ using avx2_underlying = void; //std::array<__m128i, 2>;
+ using sse2_underlying = __m128i;
+ constexpr int acc_align = 16;
+#else /* Portable scalar version */
+ using avx512_underlying = void;
+ using avx2_underlying = void; //std::array;
+ using sse2_underlying = void; //std::array;
+ constexpr int acc_align = 8;
+#endif
+
+
+ /* Compiler Specifics
+ * Defines inline macros and includes specific compiler's instrinsics.
+ * */
+#ifdef XXH_FORCE_INLINE /* First undefining the symbols in case they're already defined */
+# undef XXH_FORCE_INLINE
+#endif
+#ifdef XXH_NO_INLINE
+# undef XXH_NO_INLINE
+#endif
+
+#ifdef _MSC_VER /* Visual Studio */
+# pragma warning(disable : 4127)
+# define XXH_FORCE_INLINE static __forceinline
+# define XXH_NO_INLINE static __declspec(noinline)
+# include <intrin.h>
+#elif defined(__GNUC__) /* Clang / GCC */
+# define XXH_FORCE_INLINE static inline __attribute__((always_inline))
+# define XXH_NO_INLINE static __attribute__((noinline))
+#if (defined(__ARM_NEON) && defined(__APPLE__))
+# include "sse2neon.h"
+# else
+# include <immintrin.h>
+# endif
+#else
+# define XXH_FORCE_INLINE static inline
+# define XXH_NO_INLINE static
+#endif
+
+
+ /* Prefetch
+ * Can be disabled by defining XXH_NO_PREFETCH
+ */
+#if defined(XXH_NO_PREFETCH)
+ XXH_FORCE_INLINE void prefetch(const void* ptr) {}
+#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+ XXH_FORCE_INLINE void prefetch(const void* ptr) { _mm_prefetch((const char*)(ptr), _MM_HINT_T0); }
+#elif defined(__GNUC__)
+ XXH_FORCE_INLINE void prefetch(const void* ptr) { __builtin_prefetch((ptr), 0, 3); }
+#else
+ XXH_FORCE_INLINE void prefetch(const void* ptr) {}
+#endif
+
+
+ /* Restrict
+ * Defines macro for restrict, which in C++ is sadly just a compiler extension (for now).
+ * Can be disabled by defining XXH_NO_RESTRICT
+ */
+#ifdef XXH_RESTRICT
+# undef XXH_RESTRICT
+#endif
+
+#if (defined(__GNUC__) || defined(_MSC_VER)) && defined(__cplusplus) && !defined(XXH_NO_RESTRICT)
+# define XXH_RESTRICT __restrict
+#else
+# define XXH_RESTRICT
+#endif
+
+
+ /* Likely / Unlikely
+ * Defines macros for Likely / Unlikely, which are official in C++20, but sadly this library aims the previous standard.
+ * Not present on MSVC.
+ * Can be disabled by defining XXH_NO_BRANCH_HINTS
+ */
+#if ((defined(__GNUC__) && (__GNUC__ >= 3)) || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || defined(__clang__)) && !defined(XXH_NO_BRANCH_HINTS)
+# define XXH_likely(x) __builtin_expect(x, 1)
+# define XXH_unlikely(x) __builtin_expect(x, 0)
+#else
+# define XXH_likely(x) (x)
+# define XXH_unlikely(x) (x)
+#endif
+
+
+ namespace bit_ops
+ {
+#if defined(_MSC_VER)
+ static inline uint32_t rotl32(uint32_t x, int32_t r) { return _rotl(x, r); }
+ static inline uint64_t rotl64(uint64_t x, int32_t r) { return _rotl64(x, r); }
+ static inline uint32_t rotr32(uint32_t x, int32_t r) { return _rotr(x, r); }
+ static inline uint64_t rotr64(uint64_t x, int32_t r) { return _rotr64(x, r); }
+#else
+ static inline uint32_t rotl32(uint32_t x, int32_t r) { return ((x << r) | (x >> (32 - r))); }
+ static inline uint64_t rotl64(uint64_t x, int32_t r) { return ((x << r) | (x >> (64 - r))); }
+ static inline uint32_t rotr32(uint32_t x, int32_t r) { return ((x >> r) | (x << (32 - r))); }
+ static inline uint64_t rotr64(uint64_t x, int32_t r) { return ((x >> r) | (x << (64 - r))); }
+#endif
+
+
+#if defined(_MSC_VER) /* Visual Studio */
+ static inline uint32_t swap32(uint32_t x) { return _byteswap_ulong(x); }
+ static inline uint64_t swap64(uint64_t x) { return _byteswap_uint64(x); }
+#elif defined(__GNUC__)
+ static inline uint32_t swap32(uint32_t x) { return __builtin_bswap32(x); }
+ static inline uint64_t swap64(uint64_t x) { return __builtin_bswap64(x); }
+#else
+ static inline uint32_t swap32(uint32_t x) { return ((x << 24) & 0xff000000) | ((x << 8) & 0x00ff0000) | ((x >> 8) & 0x0000ff00) | ((x >> 24) & 0x000000ff); }
+ static inline uint64_t swap64(uint64_t x) { return ((x << 56) & 0xff00000000000000ULL) | ((x << 40) & 0x00ff000000000000ULL) | ((x << 24) & 0x0000ff0000000000ULL) | ((x << 8) & 0x000000ff00000000ULL) | ((x >> 8) & 0x00000000ff000000ULL) | ((x >> 24) & 0x0000000000ff0000ULL) | ((x >> 40) & 0x000000000000ff00ULL) | ((x >> 56) & 0x00000000000000ffULL); }
+#endif
+
+
+#if defined(_MSC_VER) && defined(_M_IX86) // Only for 32-bit MSVC.
+ XXH_FORCE_INLINE uint64_t mult32to64(uint32_t x, uint32_t y) { return __emulu(x, y); }
+#else
+ XXH_FORCE_INLINE uint64_t mult32to64(uint32_t x, uint32_t y) { return (uint64_t)(uint32_t)(x) * (uint64_t)(uint32_t)(y); }
+#endif
+
+
+#if defined(__GNUC__) && !defined(__clang__) && defined(__i386__)
+ __attribute__((__target__("no-sse")))
+#endif
+ static inline uint128_t mult64to128(uint64_t lhs, uint64_t rhs)
+ {
+
+#if defined(__GNUC__) && !defined(__wasm__) \
+ && defined(__SIZEOF_INT128__) \
+ || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
+
+ __uint128_t product = (__uint128_t)lhs * (__uint128_t)rhs;
+ uint128_t r128;
+ r128.low64 = (uint64_t)(product);
+ r128.high64 = (uint64_t)(product >> 64);
+ return r128;
+
+#elif defined(_M_X64) || defined(_M_IA64)
+
+#ifndef _MSC_VER
+# pragma intrinsic(_umul128)
+#endif
+ uint64_t product_high;
+ uint64_t const product_low = _umul128(lhs, rhs, &product_high);
+ uint128_t r128;
+ r128.low64 = product_low;
+ r128.high64 = product_high;
+ return r128;
+
+#else
+ uint64_t const lo_lo = bit_ops::mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
+ uint64_t const hi_lo = bit_ops::mult32to64(lhs >> 32, rhs & 0xFFFFFFFF);
+ uint64_t const lo_hi = bit_ops::mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
+ uint64_t const hi_hi = bit_ops::mult32to64(lhs >> 32, rhs >> 32);
+
+ /* Now add the products together. These will never overflow. */
+ uint64_t const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
+ uint64_t const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
+ uint64_t const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
+
+ uint128_t r128;
+ r128.low64 = lower;
+ r128.high64 = upper;
+ return r128;
+#endif
+ }
+ }
+ }
+
+
+ /* *************************************
+ * Basic Types - Everything else
+ ***************************************/
+
+ namespace typedefs
+ {
+ /* *************************************
+ * Basic Types - Detail
+ ***************************************/
+
+ template <size_t N>
+ struct hash_type
+ {
+ using type = void;
+ };
+
+ template <>
+ struct hash_type<32>
+ {
+ using type = uint32_t;
+ };
+
+ template <>
+ struct hash_type<64>
+ {
+ using type = uint64_t;
+ };
+
+ template <>
+ struct hash_type<128>
+ {
+ using type = uint128_t;
+ };
+
+
+ template <size_t N>
+ struct vec_type
+ {
+ using type = void;
+ };
+
+ template <>
+ struct vec_type<64>
+ {
+ using type = uint64_t;
+ };
+
+ template <>
+ struct vec_type<128>
+ {
+ using type = intrin::sse2_underlying;
+ };
+
+ template <>
+ struct vec_type<256>
+ {
+ using type = intrin::avx2_underlying;
+ };
+
+ template <>
+ struct vec_type<512>
+ {
+ using type = intrin::avx512_underlying;
+ };
+
+ /* Rationale
+ * On the surface level uint_type appears to be pointless,
+ * as it is just a copy of hash_type. They do use the same types,
+ * that is true, but the reasoning for the difference is aimed at humans,
+ * not the compiler, as a difference between values that are 'just' numbers,
+ * and those that represent actual hash values.
+ */
+ template <size_t N>
+ struct uint_type
+ {
+ using type = void;
+ };
+
+ template <>
+ struct uint_type<32>
+ {
+ using type = uint32_t;
+ };
+
+ template <>
+ struct uint_type<64>
+ {
+ using type = uint64_t;
+ };
+
+ template <>
+ struct uint_type<128>
+ {
+ using type = uint128_t;
+ };
+ }
+
+ template <size_t N>
+ using hash_t = typename typedefs::hash_type<N>::type;
+ using hash32_t = hash_t<32>;
+ using hash64_t = hash_t<64>;
+ using hash128_t = hash_t<128>;
+
+ template <size_t N>
+ using vec_t = typename typedefs::vec_type<N>::type;
+ using vec64_t = vec_t<64>;
+ using vec128_t = vec_t<128>;
+ using vec256_t = vec_t<256>;
+ using vec512_t = vec_t<512>;
+
+ template <size_t N>
+ using uint_t = typename typedefs::uint_type<N>::type;
+
+
+
+ /* *************************************
+ * Bit Operations
+ ***************************************/
+
+ namespace bit_ops
+ {
+ /* ****************************************
+ * Bit Operations
+ ******************************************/
+
+ template <size_t N>
+ static inline uint_t<N> rotl(uint_t<N> n, int32_t r)
+ {
+ if constexpr (N == 32)
+ {
+ return intrin::bit_ops::rotl32(n, r);
+ }
+
+ if constexpr (N == 64)
+ {
+ return intrin::bit_ops::rotl64(n, r);
+ }
+ }
+
+ template <size_t N>
+ static inline uint_t<N> rotr(uint_t<N> n, int32_t r)
+ {
+ if constexpr (N == 32)
+ {
+ return intrin::bit_ops::rotr32(n, r);
+ }
+
+ if constexpr (N == 64)
+ {
+ return intrin::bit_ops::rotr64(n, r);
+ }
+ }
+
+ template <size_t N>
+ static inline uint_t<N> swap(uint_t<N> n)
+ {
+ if constexpr (N == 32)
+ {
+ return intrin::bit_ops::swap32(n);
+ }
+
+ if constexpr (N == 64)
+ {
+ return intrin::bit_ops::swap64(n);
+ }
+ }
+
+ template <size_t N>
+ static inline vec_t<N> mul32to64(vec_t<N> x, vec_t<N> y)
+ {
+ if constexpr (N == 64)
+ {
+ return intrin::bit_ops::mult32to64(static_cast<uint32_t>(x), static_cast<uint32_t>(y));
+ }
+ else
+ {
+ return 0;
+ }
+ }
+
+ static inline uint128_t mul64to128(uint64_t x, uint64_t y)
+ {
+ return intrin::bit_ops::mult64to128(x, y);
+ }
+
+ static inline uint64_t mul128fold64(uint64_t x, uint64_t y)
+ {
+ uint128_t product = mul64to128(x, y);
+
+ return (product.low64 ^ product.high64);
+ }
+ }
+
+
+ /* *************************************
+ * Memory Functions
+ ***************************************/
+
+ namespace mem_ops
+ {
+
+ /* *************************************
+ * Endianness
+ ***************************************/
+
+ constexpr bool is_little_endian()
+ {
+ return (XXH_CPU_LITTLE_ENDIAN == 1);
+ }
+
+
+ /* *************************************
+ * Memory Access
+ ***************************************/
+
+ template <size_t N>
+ static inline uint_t<N> read(const void* memPtr)
+ {
+ uint_t<N> val;
+
+ memcpy(&val, memPtr, sizeof(val));
+ return val;
+ }
+
+ template <size_t N>
+ static inline uint_t<N> readLE(const void* ptr)
+ {
+ if constexpr (is_little_endian())
+ {
+ return read<N>(ptr);
+ }
+ else
+ {
+ return bit_ops::swap<N>(read<N>(ptr));
+ }
+ }
+
+ template <size_t N>
+ static inline uint_t<N> readBE(const void* ptr)
+ {
+ if constexpr (is_little_endian())
+ {
+ return bit_ops::swap<N>(read<N>(ptr));
+ }
+ else
+ {
+ return read<N>(ptr);
+ }
+ }
+
+ template <size_t N>
+ static void writeLE(void* dst, uint_t<N> v)
+ {
+ if constexpr (!is_little_endian())
+ {
+ v = bit_ops::swap<N>(v);
+ }
+
+ memcpy(dst, &v, sizeof(v));
+ }
+ }
+
+
+ /* *************************************
+ * Vector Functions
+ ***************************************/
+
+ namespace vec_ops
+ {
+ template <size_t N>
+ XXH_FORCE_INLINE vec_t<N> loadu(const vec_t<N>* input)
+ {
+ static_assert(!(N != 128 && N != 256 && N != 64 && N != 512), "Invalid template argument passed to xxh::vec_ops::loadu");
+
+ if constexpr (N == 128)
+ {
+ return _mm_loadu_si128(input);
+ }
+
+ if constexpr (N == 256)
+ {
+ return _mm256_loadu_si256(input);
+ }
+
+ if constexpr (N == 512)
+ {
+ return _mm512_loadu_si512(input);
+ }
+
+ if constexpr (N == 64)
+ {
+ return mem_ops::readLE<64>(input);
+ }
+
+ }
+
+
+ // 'xorv' instead of 'xor' because 'xor' is a weird wacky alternate operator expression thing.
+ template <size_t N>
+ XXH_FORCE_INLINE vec_t<N> xorv(vec_t<N> a, vec_t<N> b)
+ {
+ static_assert(!(N != 128 && N != 256 && N != 64 && N != 512), "Invalid argument passed to xxh::vec_ops::xorv");
+
+ if constexpr (N == 128)
+ {
+ return _mm_xor_si128(a, b);
+ }
+
+ if constexpr (N == 256)
+ {
+ return _mm256_xor_si256(a, b);
+ }
+
+ if constexpr (N == 512)
+ {
+ return _mm512_xor_si512(a, b);
+ }
+
+ if constexpr (N == 64)
+ {
+ return a ^ b;
+ }
+ }
+
+
+ template <size_t N>
+ XXH_FORCE_INLINE vec_t<N> mul(vec_t<N> a, vec_t<N> b)
+ {
+ static_assert(!(N != 128 && N != 256 && N != 64 && N != 512), "Invalid argument passed to xxh::vec_ops::mul");
+
+ if constexpr (N == 128)
+ {
+ return _mm_mul_epu32(a, b);
+ }
+
+ if constexpr (N == 256)
+ {
+ return _mm256_mul_epu32(a, b);
+ }
+
+ if constexpr (N == 512)
+ {
+ return _mm512_mul_epu32(a, b);
+ }
+
+ if constexpr (N == 64)
+ {
+ return a * b;
+ }
+ }
+
+
+ template <size_t N>
+ XXH_FORCE_INLINE vec_t<N> add(vec_t<N> a, vec_t<N> b)
+ {
+ static_assert(!(N != 128 && N != 256 && N != 64 && N != 512), "Invalid argument passed to xxh::vec_ops::add");
+
+ if constexpr (N == 128)
+ {
+ return _mm_add_epi64(a, b);
+ }
+
+ if constexpr (N == 256)
+ {
+ return _mm256_add_epi64(a, b);
+ }
+
+ if constexpr (N == 512)
+ {
+ return _mm512_add_epi64(a, b);
+ }
+
+ if constexpr (N == 64)
+ {
+ return a + b;
+ }
+ }
+
+
+ template <size_t N, uint8_t S1, uint8_t S2, uint8_t S3, uint8_t S4>
+ XXH_FORCE_INLINE vec_t<N> shuffle(vec_t<N> a)
+ {
+ static_assert(!(N != 128 && N != 256 && N != 64 && N != 512), "Invalid argument passed to xxh::vec_ops::shuffle");
+
+ if constexpr (N == 128)
+ {
+ return _mm_shuffle_epi32(a, _MM_SHUFFLE(S1, S2, S3, S4));
+ }
+
+ if constexpr (N == 256)
+ {
+ return _mm256_shuffle_epi32(a, _MM_SHUFFLE(S1, S2, S3, S4));
+ }
+
+ if constexpr (N == 512)
+ {
+ return _mm512_shuffle_epi32(a, _MM_SHUFFLE(S1, S2, S3, S4));
+ }
+
+ if constexpr (N == 64)
+ {
+ return a;
+ }
+ }
+
+
+ template <size_t N>
+ XXH_FORCE_INLINE vec_t<N> set1(int64_t a)
+ {
+
+#if (defined(__ARM_NEON) && defined(__APPLE__))
+ static_assert(!(N != 128 && N != 64), "Invalid argument passed to xxh::vec_ops::set1");
+#else
+ static_assert(!(N != 128 && N != 256 && N != 64 && N != 512), "Invalid argument passed to xxh::vec_ops::set1");
+ if constexpr (N == 256)
+ {
+ return _mm256_set1_epi32(static_cast(a));
+ }
+
+ if constexpr (N == 512)
+ {
+ return _mm512_set1_epi32(static_cast(a));
+ }
+#endif
+
+ if constexpr (N == 128)
+ {
+ return _mm_set1_epi32(static_cast(a));
+ }
+
+ if constexpr (N == 64)
+ {
+ return a;
+ }
+ }
+
+
+ template <size_t N>
+ XXH_FORCE_INLINE vec_t<N> srli(vec_t<N> n, int a)
+ {
+ static_assert(!(N != 128 && N != 256 && N != 64 && N != 512), "Invalid argument passed to xxh::vec_ops::srli");
+
+ if constexpr (N == 128)
+ {
+ return _mm_srli_epi64(n, a);
+ }
+
+ if constexpr (N == 256)
+ {
+ return _mm256_srli_epi64(n, a);
+ }
+
+ if constexpr (N == 512)
+ {
+ return _mm512_srli_epi64(n, a);
+ }
+
+ if constexpr (N == 64)
+ {
+ return n >> a;
+ }
+ }
+
+
+ template <size_t N>
+ XXH_FORCE_INLINE vec_t<N> slli(vec_t<N> n, int a)
+ {
+ static_assert(!(N != 128 && N != 256 && N != 64 && N != 512), "Invalid argument passed to xxh::vec_ops::slli");
+
+ if constexpr (N == 128)
+ {
+ return _mm_slli_epi64(n, a);
+ }
+
+ if constexpr (N == 256)
+ {
+ return _mm256_slli_epi64(n, a);
+ }
+
+ if constexpr (N == 512)
+ {
+ return _mm512_slli_epi64(n, a);
+ }
+
+ if constexpr (N == 64)
+ {
+ return n << a;
+ }
+ }
+ }
+
+ /* *************************************
+ * Canonical represenation
+ ***************************************/
+
+ template <size_t bit_mode>
+ struct canonical_t
+ {
+ std::array<uint8_t, bit_mode / 8> digest{ 0 };
+
+ canonical_t(hash_t<bit_mode> hash)
+ {
+ if constexpr (bit_mode < 128)
+ {
+ if (mem_ops::is_little_endian())
+ {
+ hash = bit_ops::swap(hash);
+ }
+
+ memcpy(digest.data(), &hash, sizeof(canonical_t));
+ }
+ else
+ {
+ if (mem_ops::is_little_endian())
+ {
+ hash.low64 = bit_ops::swap<64>(hash.low64);
+ hash.high64 = bit_ops::swap<64>(hash.high64);
+ }
+
+ memcpy(digest.data(), &hash.high64, sizeof(hash.high64));
+ memcpy(digest.data() + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
+ }
+ }
+
+ hash_t<bit_mode> get_hash() const
+ {
+ if constexpr (bit_mode < 128)
+ {
+ return mem_ops::readBE<bit_mode>(&digest);
+ }
+ else
+ {
+ return { mem_ops::readBE<64>(&digest[8]), mem_ops::readBE<64>(&digest) };
+ }
+ }
+ };
+
+ using canonical32_t = canonical_t<32>;
+ using canonical64_t = canonical_t<64>;
+ using canonical128_t = canonical_t<128>;
+
+ template <size_t bit_mode>
+ inline hash_t<bit_mode> to_canonical(hash_t<bit_mode> hash)
+ {
+ static_assert(!(bit_mode != 128 && bit_mode != 64 && bit_mode != 32), "Canonical form can only be obtained from 32, 64 and 128 bit hashes.");
+ canonical_t<bit_mode> canon(hash);
+ hash_t<bit_mode> res;
+ memcpy(&res, &canon, bit_mode / 8);
+
+ return res;
+ }
+
+
+ /* *************************************
+ * Algorithm Implementation - xxhash
+ ***************************************/
+
+ namespace detail
+ {
+ using namespace mem_ops;
+ using namespace bit_ops;
+
+
+ /* *************************************
+ * Constants
+ ***************************************/
+
+ constexpr static std::array<uint32_t, 5> primes32 = { 2654435761U, 2246822519U, 3266489917U, 668265263U, 374761393U };
+ constexpr static std::array<uint64_t, 5> primes64 = { 11400714785074694791ULL, 14029467366897019727ULL, 1609587929392839161ULL, 9650029242287828579ULL, 2870177450012600261ULL };
+
+ template <size_t N>
+ constexpr uint_t<N> PRIME(uint64_t n)
+ {
+ if constexpr (N == 32)
+ {
+ return primes32[n - 1];
+ }
+ else
+ {
+ return primes64[n - 1];
+ }
+ }
+
+
+ /* *************************************
+ * Functions
+ ***************************************/
+
+ template <size_t N>
+ XXH_FORCE_INLINE uint_t<N> avalanche(uint_t<N> hash)
+ {
+ if constexpr (N == 32)
+ {
+ hash ^= hash >> 15;
+ hash *= PRIME<32>(2);
+ hash ^= hash >> 13;
+ hash *= PRIME<32>(3);
+ hash ^= hash >> 16;
+ return hash;
+ }
+ else if constexpr (N == 64)
+ {
+ hash ^= hash >> 33;
+ hash *= PRIME<64>(2);
+ hash ^= hash >> 29;
+ hash *= PRIME<64>(3);
+ hash ^= hash >> 32;
+ return hash;
+ }
+ else return 0;
+ }
+
+ template <size_t N>
+ XXH_FORCE_INLINE uint_t<N> round(uint_t<N> seed, uint_t<N> input)
+ {
+ seed += input * PRIME<N>(2);
+
+ if constexpr (N == 32)
+ {
+ seed = rotl<N>(seed, 13);
+ }
+ else
+ {
+ seed = rotl<N>(seed, 31);
+ }
+
+ seed *= PRIME<N>(1);
+ return seed;
+ }
+
+ XXH_FORCE_INLINE uint64_t mergeRound64(hash64_t acc, uint64_t val)
+ {
+ val = round<64>(0, val);
+ acc ^= val;
+ acc = acc * PRIME<64>(1) + PRIME<64>(4);
+ return acc;
+ }
+
+ XXH_FORCE_INLINE void endian_align_sub_mergeround(hash64_t& hash_ret, uint64_t v1, uint64_t v2, uint64_t v3, uint64_t v4)
+ {
+ hash_ret = mergeRound64(hash_ret, v1);
+ hash_ret = mergeRound64(hash_ret, v2);
+ hash_ret = mergeRound64(hash_ret, v3);
+ hash_ret = mergeRound64(hash_ret, v4);
+ }
+
+ template <size_t N>
+ static inline hash_t<N> endian_align_sub_ending(hash_t<N> hash_ret, const uint8_t* p, const uint8_t* bEnd)
+ {
+ if constexpr (N == 32)
+ {
+ while ((p + 4) <= bEnd)
+ {
+ hash_ret += readLE<32>(p) * PRIME<32>(3);
+ hash_ret = rotl<32>(hash_ret, 17) * PRIME<32>(4);
+ p += 4;
+ }
+
+ while (p < bEnd)
+ {
+ hash_ret += (*p) * PRIME<32>(5);
+ hash_ret = rotl<32>(hash_ret, 11) * PRIME<32>(1);
+ p++;
+ }
+
+ return avalanche<32>(hash_ret);
+ }
+ else
+ {
+ while (p + 8 <= bEnd)
+ {
+ const uint64_t k1 = round<64>(0, readLE<64>(p));
+
+ hash_ret ^= k1;
+ hash_ret = rotl<64>(hash_ret, 27) * PRIME<64>(1) + PRIME<64>(4);
+ p += 8;
+ }
+
+ if (p + 4 <= bEnd)
+ {
+ hash_ret ^= static_cast(readLE<32>(p))* PRIME<64>(1);
+ hash_ret = rotl<64>(hash_ret, 23) * PRIME<64>(2) + PRIME<64>(3);
+ p += 4;
+ }
+
+ while (p < bEnd)
+ {
+ hash_ret ^= (*p) * PRIME<64>(5);
+ hash_ret = rotl<64>(hash_ret, 11) * PRIME<64>(1);
+ p++;
+ }
+
+ return avalanche<64>(hash_ret);
+ }
+ }
+
+ template <size_t N>
+ static inline hash_t<N> endian_align(const void* input, size_t len, uint_t<N> seed)
+ {
+ static_assert(!(N != 32 && N != 64), "You can only call endian_align in 32 or 64 bit mode.");
+
+ const uint8_t* p = static_cast<const uint8_t*>(input);
+ const uint8_t* bEnd = p + len;
+ hash_t<N> hash_ret;
+
+ if (len >= (N / 2))
+ {
+ const uint8_t* const limit = bEnd - (N / 2);
+ uint_t<N> v1 = seed + PRIME<N>(1) + PRIME<N>(2);
+ uint_t<N> v2 = seed + PRIME<N>(2);
+ uint_t<N> v3 = seed + 0;
+ uint_t<N> v4 = seed - PRIME<N>(1);
+
+ do
+ {
+ v1 = round<N>(v1, readLE<N>(p));
+ p += (N / 8);
+ v2 = round<N>(v2, readLE<N>(p));
+ p += (N / 8);
+ v3 = round<N>(v3, readLE<N>(p));
+ p += (N / 8);
+ v4 = round<N>(v4, readLE<N>(p));
+ p += (N / 8);
+ }
+ while (p <= limit);
+
+ hash_ret = rotl<N>(v1, 1) + rotl<N>(v2, 7) + rotl<N>(v3, 12) + rotl<N>(v4, 18);
+
+ if constexpr (N == 64)
+ {
+ endian_align_sub_mergeround(hash_ret, v1, v2, v3, v4);
+ }
+ }
+ else
+ {
+ hash_ret = seed + PRIME<N>(5);
+ }
+
+ hash_ret += static_cast<hash_t<N>>(len);
+
+ return endian_align_sub_ending<N>(hash_ret, p, bEnd);
+ }
+ }
+
+
+ /* *************************************
+ * Algorithm Implementation - xxhash3
+ ***************************************/
+
+ namespace detail3
+ {
+ using namespace vec_ops;
+ using namespace detail;
+ using namespace mem_ops;
+ using namespace bit_ops;
+
+
+ /* *************************************
+ * Enums
+ ***************************************/
+
+ enum class vec_mode : uint8_t { scalar = 0, sse2 = 1, avx2 = 2, avx512 = 3 };
+
+
+ /* *************************************
+ * Constants
+ ***************************************/
+
+ constexpr uint64_t secret_default_size = 192;
+ constexpr uint64_t secret_size_min = 136;
+ constexpr uint64_t secret_consume_rate = 8;
+ constexpr uint64_t stripe_len = 64;
+ constexpr uint64_t acc_nb = 8;
+ constexpr uint64_t prefetch_distance = 384;
+ constexpr uint64_t secret_lastacc_start = 7;
+ constexpr uint64_t secret_mergeaccs_start = 11;
+ constexpr uint64_t midsize_max = 240;
+ constexpr uint64_t midsize_startoffset = 3;
+ constexpr uint64_t midsize_lastoffset = 17;
+
+ constexpr vec_mode vector_mode = static_cast<vec_mode>(intrin::vector_mode);
+ constexpr uint64_t acc_align = intrin::acc_align;
+ constexpr std::array<uint64_t, 4> vector_bit_width { 64, 128, 256, 512 };
+
+
+ /* *************************************
+ * Defaults
+ ***************************************/
+
+ alignas(64) constexpr uint8_t default_secret[secret_default_size] = {
+ 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
+ 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
+ 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
+ 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
+ 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
+ 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
+ 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
+ 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
+ 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
+ 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
+ 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
+ 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
+ };
+
+ constexpr std::array<uint64_t, 8> init_acc = { PRIME<32>(3), PRIME<64>(1), PRIME<64>(2), PRIME<64>(3), PRIME<64>(4), PRIME<32>(2), PRIME<64>(5), PRIME<32>(1) };
+
+
+ /* *************************************
+ * Functions
+ ***************************************/
+
+ XXH_FORCE_INLINE hash_t<64> avalanche(hash_t<64> h64)
+ {
+ constexpr uint64_t avalanche_mul_prime = 0x165667919E3779F9ULL;
+
+ h64 ^= h64 >> 37;
+ h64 *= avalanche_mul_prime;
+ h64 ^= h64 >> 32;
+ return h64;
+ }
+
+ XXH_FORCE_INLINE hash_t<64> rrmxmx(hash_t<64> h64, uint64_t len)
+ {
+ h64 ^= rotl<64>(h64, 49) ^ rotl<64>(h64, 24);
+ h64 *= 0x9FB21C651E98DF25ULL;
+ h64 ^= (h64 >> 35) + len;
+ h64 *= 0x9FB21C651E98DF25ULL;
+ h64 ^= (h64 >> 28);
+ return h64;
+ }
+
+ XXH_FORCE_INLINE void combine_16(void* dest, hash128_t h128)
+ {
+ writeLE<64>(dest, readLE<64>(dest) ^ h128.low64);
+ writeLE<64>((uint8_t*)dest + 8, readLE<64>((uint8_t*)dest + 8) ^ h128.high64);
+ }
+
+ XXH_FORCE_INLINE void accumulate_512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT input, const void* XXH_RESTRICT secret)
+ {
+ constexpr uint64_t bits = vector_bit_width[static_cast(vector_mode)];
+
+ using vec_t = vec_t;
+
+ alignas(sizeof(vec_t)) vec_t* const xacc = static_cast(acc);
+ const vec_t* const xinput = static_cast(input);
+ const vec_t* const xsecret = static_cast(secret);
+
+ for (size_t i = 0; i < stripe_len / sizeof(vec_t); i++)
+ {
+ vec_t const data_vec = loadu(xinput + i);
+ vec_t const key_vec = loadu(xsecret + i);
+ vec_t const data_key = xorv(data_vec, key_vec);
+ vec_t product = set1(0);
+
+ if constexpr (vector_mode == vec_mode::scalar)
+ {
+ product = mul32to64(srli(slli(data_key, 32),32), srli(data_key, 32));
+ xacc[i ^ 1] = add(xacc[i ^ 1], data_vec);
+ xacc[i] = add(xacc[i], product);
+ }
+ else
+ {
+ vec_t const data_key_lo = shuffle(data_key);
+ product = mul(data_key, data_key_lo);
+
+ vec_t const data_swap = shuffle(data_vec);
+ vec_t const sum = add(xacc[i], data_swap);
+ xacc[i] = add(sum, product);
+ }
+ }
+ }
+
+ XXH_FORCE_INLINE void scramble_acc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+ {
+ constexpr uint64_t bits = vector_bit_width[static_cast(vector_mode)];;
+
+ using vec_t = vec_t;
+
+ alignas(sizeof(vec_t)) vec_t* const xacc = (vec_t*)acc;
+ const vec_t* const xsecret = (const vec_t*)secret;
+
+ for (size_t i = 0; i < stripe_len / sizeof(vec_t); i++)
+ {
+ vec_t const acc_vec = xacc[i];
+ vec_t const shifted = srli(acc_vec, 47);
+ vec_t const data_vec = xorv(acc_vec, shifted);
+ vec_t const key_vec = loadu(xsecret + i);
+ vec_t const data_key = xorv(data_vec, key_vec);
+
+ if constexpr (vector_mode == vec_mode::scalar)
+ {
+ xacc[i] = mul(data_key, set1(PRIME<32>(1)));
+ }
+ else
+ {
+ vec_t const prime32 = set1(PRIME<32>(1));
+ vec_t const data_key_hi = shuffle(data_key);
+ vec_t const prod_lo = mul(data_key, prime32);
+ vec_t const prod_hi = mul(data_key_hi, prime32);
+
+ xacc[i] = add(prod_lo, vec_ops::slli(prod_hi, 32));
+ }
+ }
+ }
+
+ XXH_FORCE_INLINE void accumulate(uint64_t* XXH_RESTRICT acc, const uint8_t* XXH_RESTRICT input, const uint8_t* XXH_RESTRICT secret, size_t nbStripes)
+ {
+ for (size_t n = 0; n < nbStripes; n++)
+ {
+ const uint8_t* const in = input + n * stripe_len;
+
+ intrin::prefetch(in + prefetch_distance);
+ accumulate_512(acc, in, secret + n * secret_consume_rate);
+ }
+ }
+
+ XXH_FORCE_INLINE void hash_long_internal_loop(uint64_t* XXH_RESTRICT acc, const uint8_t* XXH_RESTRICT input, size_t len, const uint8_t* XXH_RESTRICT secret, size_t secretSize)
+ {
+ size_t const nb_rounds = (secretSize - stripe_len) / secret_consume_rate;
+ size_t const block_len = stripe_len * nb_rounds;
+ size_t const nb_blocks = (len-1) / block_len;
+
+ for (size_t n = 0; n < nb_blocks; n++)
+ {
+ accumulate(acc, input + n * block_len, secret, nb_rounds);
+ scramble_acc(acc, secret + secretSize - stripe_len);
+ }
+
+ /* last partial block */
+ size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / stripe_len;
+
+ accumulate(acc, input + nb_blocks * block_len, secret, nbStripes);
+
+ /* last stripe */
+ const uint8_t* const p = input + len - stripe_len;
+
+ accumulate_512(acc, p, secret + secretSize - stripe_len - secret_lastacc_start);
+ }
+
+ XXH_FORCE_INLINE uint64_t mix_2_accs(const uint64_t* XXH_RESTRICT acc, const uint8_t* XXH_RESTRICT secret)
+ {
+ return mul128fold64(acc[0] ^ readLE<64>(secret), acc[1] ^ readLE<64>(secret + 8));
+ }
+
+ XXH_FORCE_INLINE uint64_t merge_accs(const uint64_t* XXH_RESTRICT acc, const uint8_t* XXH_RESTRICT secret, uint64_t start)
+ {
+ uint64_t result64 = start;
+
+ result64 += mix_2_accs(acc + 0, secret + 0);
+ result64 += mix_2_accs(acc + 2, secret + 16);
+ result64 += mix_2_accs(acc + 4, secret + 32);
+ result64 += mix_2_accs(acc + 6, secret + 48);
+
+ return avalanche(result64);
+ }
+
+ XXH_FORCE_INLINE void init_custom_secret(uint8_t* customSecret, uint64_t seed)
+ {
+ for (uint64_t i = 0; i < secret_default_size / 16; i++)
+ {
+ writeLE<64>(customSecret + i * 16, readLE<64>(default_secret + i * 16) + seed);
+ writeLE<64>(customSecret + i * 16 + 8, readLE<64>(default_secret + i * 16 + 8) - seed);
+ }
+ }
+
+ template <size_t N>
+ XXH_FORCE_INLINE hash_t<N> len_1to3(const uint8_t* input, size_t len, const uint8_t* secret, uint64_t seed)
+ {
+ if constexpr (N == 64)
+ {
+ uint8_t const c1 = input[0];
+ uint8_t const c2 = input[len >> 1];
+ uint8_t const c3 = input[len - 1];
+ uint32_t const combined = ((uint32_t)c1 << 16) | (((uint32_t)c2) << 24) | (((uint32_t)c3) << 0) | (((uint32_t)len) << 8);
+ uint64_t const bitflip = (readLE<32>(secret) ^ readLE<32>(secret + 4)) + seed;
+ uint64_t const keyed = (uint64_t)combined ^ bitflip;
+ return detail::avalanche<64>(keyed);
+ }
+ else
+ {
+ uint8_t const c1 = input[0];
+ uint8_t const c2 = input[len >> 1];
+ uint8_t const c3 = input[len - 1];
+ uint32_t const combinedl = ((uint32_t)c1 << 16) + (((uint32_t)c2) << 24) + (((uint32_t)c3) << 0) + (((uint32_t)len) << 8);
+ uint32_t const combinedh = rotl<32>(swap<32>(combinedl), 13);
+ uint64_t const bitflipl = (readLE<32>(secret) ^ readLE<32>(secret + 4)) + seed;
+ uint64_t const bitfliph = (readLE<32>(secret + 8) ^ readLE<32>(secret + 12)) - seed;
+ uint64_t const keyed_lo = (uint64_t)combinedl ^ bitflipl;
+ uint64_t const keyed_hi = (uint64_t)combinedh ^ bitfliph;
+ hash128_t const h128 = { detail::avalanche<64>(keyed_lo), detail::avalanche<64>(keyed_hi)};
+
+ return h128;
+ }
+ }
+
+ template <size_t N>
+ XXH_FORCE_INLINE hash_t<N> len_4to8(const uint8_t* input, size_t len, const uint8_t* secret, uint64_t seed)
+ {
+ constexpr uint64_t mix_constant = 0x9FB21C651E98DF25ULL;
+
+ seed ^= (uint64_t)swap<32>((uint32_t)seed) << 32;
+
+ if constexpr (N == 64)
+ {
+ uint32_t const input1 = readLE<32>(input);
+ uint32_t const input2 = readLE<32>(input + len - 4);
+ uint64_t const bitflip = (readLE<64>(secret + 8) ^ readLE<64>(secret + 16)) - seed;
+ uint64_t const input64 = input2 + ((uint64_t)input1 << 32);
+ uint64_t keyed = input64 ^ bitflip;
+
+ return rrmxmx(keyed, len);
+ }
+ else
+ {
+ uint32_t const input_lo = readLE<32>(input);
+ uint32_t const input_hi = readLE<32>(input + len - 4);
+ uint64_t const input_64 = input_lo + ((uint64_t)input_hi << 32);
+ uint64_t const bitflip = (readLE<64>(secret + 16) ^ readLE<64>(secret + 24)) + seed;
+ uint64_t const keyed = input_64 ^ bitflip;
+ uint128_t m128 = mul64to128(keyed, PRIME<64>(1) + (len << 2));
+
+ m128.high64 += (m128.low64 << 1);
+ m128.low64 ^= (m128.high64 >> 3);
+ m128.low64 ^= (m128.low64 >> 35);
+ m128.low64 *= mix_constant;
+ m128.low64 ^= (m128.low64 >> 28);
+ m128.high64 = avalanche(m128.high64);
+
+ return m128;
+ }
+ }
+
+ template <size_t N>
+ XXH_FORCE_INLINE hash_t<N> len_9to16(const uint8_t* input, size_t len, const uint8_t* secret, uint64_t seed)
+ {
+ if constexpr (N == 64)
+ {
+ uint64_t const bitflip1 = (readLE<64>(secret + 24) ^ readLE<64>(secret + 32)) + seed;
+ uint64_t const bitflip2 = (readLE<64>(secret + 40) ^ readLE<64>(secret + 48)) - seed;
+ uint64_t const input_lo = readLE<64>(input) ^ bitflip1;
+ uint64_t const input_hi = readLE<64>(input + len - 8) ^ bitflip2;
+ uint64_t const acc = len + swap<64>(input_lo) + input_hi + mul128fold64(input_lo, input_hi);
+
+ return avalanche(acc);
+ }
+ else
+ {
+ uint64_t const bitflipl = (readLE<64>(secret + 32) ^ readLE<64>(secret + 40)) - seed;
+ uint64_t const bitfliph = (readLE<64>(secret + 48) ^ readLE<64>(secret + 56)) + seed;
+ uint64_t const input_lo = readLE<64>(input);
+ uint64_t input_hi = readLE<64>(input + len - 8);
+ uint128_t m128 = mul64to128(input_lo ^ input_hi ^ bitflipl, PRIME<64>(1));
+
+ m128.low64 += (uint64_t)(len - 1) << 54;
+ input_hi ^= bitfliph;
+
+ if constexpr (sizeof(void*) < sizeof(uint64_t)) // 32-bit version
+ {
+ m128.high64 += (input_hi & 0xFFFFFFFF00000000) + mul32to64((uint32_t)input_hi, PRIME<32>(2));
+ }
+ else
+ {
+ m128.high64 += input_hi + mul32to64((uint32_t)input_hi, PRIME<32>(2) - 1);
+ }
+
+ m128.low64 ^= swap<64>(m128.high64);
+
+ hash128_t h128 = mul64to128(m128.low64, PRIME<64>(2));
+
+ h128.high64 += m128.high64 * PRIME<64>(2);
+ h128.low64 = avalanche(h128.low64);
+ h128.high64 = avalanche(h128.high64);
+
+ return h128;
+ }
+ }
+
+ template <size_t N>
+ XXH_FORCE_INLINE hash_t<N> len_0to16(const uint8_t* input, size_t len, const uint8_t* secret, uint64_t seed)
+ {
+ if (XXH_likely(len > 8))
+ {
+ return len_9to16(input, len, secret, seed);
+ }
+ else if (XXH_likely(len >= 4))
+ {
+ return len_4to8(input, len, secret, seed);
+ }
+ else if (len)
+ {
+ return len_1to3(input, len, secret, seed);
+ }
+ else
+ {
+ if constexpr (N == 64)
+ {
+ return detail::avalanche<64>((seed) ^ (readLE<64>(secret + 56) ^ readLE<64>(secret + 64)));
+ }
+ else
+ {
+ uint64_t const bitflipl = readLE<64>(secret + 64) ^ readLE<64>(secret + 72);
+ uint64_t const bitfliph = readLE<64>(secret + 80) ^ readLE<64>(secret + 88);
+
+ return hash128_t(detail::avalanche<64>(( seed) ^ bitflipl), detail::avalanche<64>(( seed) ^ bitfliph));
+ }
+ }
+ }
+
+ template <size_t N>
+ XXH_FORCE_INLINE hash_t<N> hash_long_internal(const uint8_t* XXH_RESTRICT input, size_t len, const uint8_t* XXH_RESTRICT secret = default_secret, size_t secretSize = sizeof(default_secret))
+ {
+ alignas(acc_align) std::array<uint64_t, acc_nb> acc = init_acc;
+
+ if constexpr (N == 64)
+ {
+ hash_long_internal_loop(acc.data(), input, len, secret, secretSize);
+
+ /* converge into final hash */
+ return merge_accs(acc.data(), secret + secret_mergeaccs_start, (uint64_t)len * PRIME<64>(1));
+ }
+ else
+ {
+ hash_long_internal_loop(acc.data(), input, len, secret, secretSize);
+
+ /* converge into final hash */
+ uint64_t const low64 = merge_accs(acc.data(), secret + secret_mergeaccs_start, (uint64_t)len * PRIME<64>(1));
+ uint64_t const high64 = merge_accs(acc.data(), secret + secretSize - sizeof(acc) - secret_mergeaccs_start, ~((uint64_t)len * PRIME<64>(2)));
+
+ return hash128_t(low64, high64);
+ }
+ }
+
+ XXH_FORCE_INLINE uint64_t mix_16b(const uint8_t* XXH_RESTRICT input, const uint8_t* XXH_RESTRICT secret, uint64_t seed)
+ {
+ uint64_t const input_lo = readLE<64>(input);
+ uint64_t const input_hi = readLE<64>(input + 8);
+
+ return mul128fold64(input_lo ^ (readLE<64>(secret) + seed), input_hi ^ (readLE<64>(secret + 8) - seed));
+ }
+
+ XXH_FORCE_INLINE uint128_t mix_32b(uint128_t acc, const uint8_t* input1, const uint8_t* input2, const uint8_t* secret, uint64_t seed)
+ {
+ acc.low64 += mix_16b(input1, secret + 0, seed);
+ acc.low64 ^= readLE<64>(input2) + readLE<64>(input2 + 8);
+ acc.high64 += mix_16b(input2, secret + 16, seed);
+ acc.high64 ^= readLE<64>(input1) + readLE<64>(input1 + 8);
+
+ return acc;
+ }
+
+ template <size_t N>
+ XXH_FORCE_INLINE hash_t<N> len_17to128(const uint8_t* XXH_RESTRICT input, size_t len, const uint8_t* XXH_RESTRICT secret, uint64_t seed)
+ {
+ if constexpr (N == 64)
+ {
+ hash64_t acc = len * PRIME<64>(1);
+
+ if (len > 32)
+ {
+ if (len > 64)
+ {
+ if (len > 96)
+ {
+ acc += mix_16b(input + 48, secret + 96, seed);
+ acc += mix_16b(input + len - 64, secret + 112, seed);
+ }
+
+ acc += mix_16b(input + 32, secret + 64, seed);
+ acc += mix_16b(input + len - 48, secret + 80, seed);
+ }
+
+ acc += mix_16b(input + 16, secret + 32, seed);
+ acc += mix_16b(input + len - 32, secret + 48, seed);
+ }
+
+ acc += mix_16b(input + 0, secret + 0, seed);
+ acc += mix_16b(input + len - 16, secret + 16, seed);
+
+ return avalanche(acc);
+ }
+ else
+ {
+ hash128_t acc = { len * PRIME<64>(1), 0 };
+
+ if (len > 32)
+ {
+ if (len > 64)
+ {
+ if (len > 96)
+ {
+ acc = mix_32b(acc, input + 48, input + len - 64, secret + 96, seed);
+ }
+
+ acc = mix_32b(acc, input + 32, input + len - 48, secret + 64, seed);
+ }
+
+ acc = mix_32b(acc, input + 16, input + len - 32, secret + 32, seed);
+ }
+
+ acc = mix_32b(acc, input, input + len - 16, secret, seed);
+
+ uint64_t const low64 = acc.low64 + acc.high64;
+ uint64_t const high64 = (acc.low64 * PRIME<64>(1)) + (acc.high64 * PRIME<64>(4)) + ((len - seed) * PRIME<64>(2));
+
+ return { avalanche(low64), (uint64_t)0 - avalanche(high64) };
+ }
+ }
+
+ template <size_t N>
+ XXH_NO_INLINE hash_t<N> len_129to240(const uint8_t* XXH_RESTRICT input, size_t len, const uint8_t* XXH_RESTRICT secret, uint64_t seed)
+ {
+ if constexpr (N == 64)
+ {
+ uint64_t acc = len * PRIME<64>(1);
+ size_t const nbRounds = len / 16;
+
+ for (size_t i = 0; i < 8; i++)
+ {
+ acc += mix_16b(input + (i * 16), secret + (i * 16), seed);
+ }
+
+ acc = avalanche(acc);
+
+ for (size_t i = 8; i < nbRounds; i++)
+ {
+ acc += mix_16b(input + (i * 16), secret + ((i - 8) * 16) + midsize_startoffset, seed);
+ }
+
+ /* last bytes */
+ acc += mix_16b(input + len - 16, secret + secret_size_min - midsize_lastoffset, seed);
+
+ return avalanche(acc);
+ }
+ else
+ {
+ hash128_t acc;
+ uint64_t const nbRounds = len / 32;
+
+ acc.low64 = len * PRIME<64>(1);
+ acc.high64 = 0;
+
+ for (size_t i = 0; i < 4; i++)
+ {
+ acc = mix_32b(acc, input + (i * 32), input + (i * 32) + 16, secret + (i * 32), seed);
+ }
+
+ acc.low64 = avalanche(acc.low64);
+ acc.high64 = avalanche(acc.high64);
+
+ for (size_t i = 4; i < nbRounds; i++)
+ {
+ acc = mix_32b(acc, input + (i * 32), input + (i * 32) + 16, secret + midsize_startoffset + ((i - 4) * 32), seed);
+ }
+
+ /* last bytes */
+ acc = mix_32b(acc, input + len - 16, input + len - 32, secret + secret_size_min - midsize_lastoffset - 16, 0ULL - seed);
+
+ uint64_t const low64 = acc.low64 + acc.high64;
+ uint64_t const high64 = (acc.low64 * PRIME<64>(1)) + (acc.high64 * PRIME<64>(4)) + ((len - seed) * PRIME<64>(2));
+
+ return { avalanche(low64), (uint64_t)0 - avalanche(high64) };
+ }
+
+ }
+
+ template <size_t N>
+ XXH_NO_INLINE hash_t<N> xxhash3_impl(const void* XXH_RESTRICT input, size_t len, hash64_t seed, const void* XXH_RESTRICT secret = default_secret, size_t secretSize = secret_default_size)
+ {
+
+ alignas(64) uint8_t custom_secret[secret_default_size];
+
+ const void* short_secret = secret;
+
+ if (seed != 0)
+ {
+ init_custom_secret(custom_secret, seed);
+ short_secret = default_secret;
+ }
+
+ if (len <= 16)
+ {
+ return len_0to16<N>(static_cast<const uint8_t*>(input), len, static_cast<const uint8_t*>(short_secret), seed);
+ }
+ else if (len <= 128)
+ {
+ return len_17to128<N>(static_cast<const uint8_t*>(input), len, static_cast<const uint8_t*>(short_secret), seed);
+ }
+ else if (len <= midsize_max)
+ {
+ return len_129to240<N>(static_cast<const uint8_t*>(input), len, static_cast<const uint8_t*>(short_secret), seed);
+ }
+ else
+ {
+ return hash_long_internal<N>(static_cast<const uint8_t*>(input), len, static_cast<const uint8_t*>(((seed == 0) ? secret : ((secret == default_secret) ? custom_secret : secret))), ((seed == 0) ? secretSize : ((secret == default_secret) ? secret_default_size : secretSize)));
+ }
+ }
+
+ XXH_NO_INLINE void generate_secret(void* secret_buffer, size_t secret_size, const void* custom_seed, size_t seed_size)
+ {
+ if (seed_size == 0)
+ {
+ custom_seed = default_secret;
+ seed_size = secret_default_size;
+ }
+
+ size_t pos = 0;
+ while (pos < secret_size)
+ {
+ size_t const copy_len = std::min(secret_size - pos, seed_size);
+ memcpy((uint8_t*)secret_buffer + pos, custom_seed, copy_len);
+ pos += copy_len;
+ }
+
+ size_t const nbseg16 = secret_size / 16;
+ canonical128_t scrambled(xxhash3_impl<128>(custom_seed, seed_size, 0));
+ for (size_t n = 0; n < nbseg16; n++)
+ {
+ hash128_t const h128 = xxhash3_impl<128>(&scrambled, sizeof(scrambled), n);
+ combine_16((uint8_t*)secret_buffer + n * 16, h128);
+ }
+
+ combine_16((uint8_t*)secret_buffer + secret_size - 16, scrambled.get_hash());
+ }
+ }
+
+
+ /* *************************************
+ * Public Access Point - xxhash
+ ***************************************/
+
+ template <size_t bit_mode>
+ inline hash_t<bit_mode> xxhash(const void* input, size_t len, uint_t<bit_mode> seed = 0)
+ {
+ static_assert(!(bit_mode != 32 && bit_mode != 64), "xxhash can only be used in 32 and 64 bit modes.");
+ return detail::endian_align<bit_mode>(input, len, seed);
+ }
+
+ template <size_t bit_mode, typename T>
+ inline hash_t<bit_mode> xxhash(const std::basic_string<T>& input, uint_t<bit_mode> seed = 0)
+ {
+ static_assert(!(bit_mode != 32 && bit_mode != 64), "xxhash can only be used in 32 and 64 bit modes.");
+ return detail::endian_align<bit_mode>(static_cast<const void*>(input.data()), input.length() * sizeof(T), seed);
+ }
+
+ template <size_t bit_mode, typename ContiguousIterator>
+ inline hash_t<bit_mode> xxhash(ContiguousIterator begin, ContiguousIterator end, uint_t<bit_mode> seed = 0)
+ {
+ static_assert(!(bit_mode != 32 && bit_mode != 64), "xxhash can only be used in 32 and 64 bit modes.");
+ using T = typename std::decay_t<decltype(*end)>;
+ return detail::endian_align<bit_mode>(static_cast<const void*>(&*begin), (end - begin) * sizeof(T), seed);
+ }
+
+ template <size_t bit_mode, typename T>
+ inline hash_t<bit_mode> xxhash(const std::vector<T>& input, uint_t<bit_mode> seed = 0)
+ {
+ static_assert(!(bit_mode != 32 && bit_mode != 64), "xxhash can only be used in 32 and 64 bit modes.");
+ return detail::endian_align<bit_mode>(static_cast<const void*>(input.data()), input.size() * sizeof(T), seed);
+ }
+
+ template <size_t bit_mode, typename T, size_t AN>
+ inline hash_t<bit_mode> xxhash(const std::array<T, AN>& input, uint_t<bit_mode> seed = 0)
+ {
+ static_assert(!(bit_mode != 32 && bit_mode != 64), "xxhash can only be used in 32 and 64 bit modes.");
+ return detail::endian_align<bit_mode>(static_cast<const void*>(input.data()), AN * sizeof(T), seed);
+ }
+
+ template <size_t bit_mode, typename T>
+ inline hash_t<bit_mode> xxhash(const std::initializer_list<T>& input, uint_t<bit_mode> seed = 0)
+ {
+ static_assert(!(bit_mode != 32 && bit_mode != 64), "xxhash can only be used in 32 and 64 bit modes.");
+ return detail::endian_align<bit_mode>(static_cast<const void*>(input.begin()), input.size() * sizeof(T), seed);
+ }
+
+
+ /* *************************************
+ * Public Access Point - xxhash3
+ ***************************************/
+
+ template <size_t bit_mode>
+ inline hash_t<bit_mode> xxhash3(const void* input, size_t len, uint64_t seed = 0)
+ {
+ static_assert(!(bit_mode != 128 && bit_mode != 64), "xxhash3 can only be used in 64 and 128 bit modes.");
+ return detail3::xxhash3_impl<bit_mode>(input, len, seed);
+ }
+
+ template <size_t bit_mode>
+ inline hash_t<bit_mode> xxhash3(const void* input, size_t len, const void* secret, size_t secretSize, uint64_t seed = 0)
+ {
+ static_assert(!(bit_mode != 128 && bit_mode != 64), "xxhash3 can only be used in 64 and 128 bit modes.");
+ return detail3::xxhash3_impl<bit_mode>(input, len, seed, secret, secretSize);
+ }
+
+ template <size_t bit_mode, typename T>
+ inline hash_t<bit_mode> xxhash3(const std::basic_string<T>& input, uint64_t seed = 0)
+ {
+ static_assert(!(bit_mode != 128 && bit_mode != 64), "xxhash3 can only be used in 64 and 128 bit modes.");
+ return detail3::xxhash3_impl(static_cast