diff --git a/curve25519-dalek/Cargo.toml b/curve25519-dalek/Cargo.toml index c4fb28a38..298bd6e30 100644 --- a/curve25519-dalek/Cargo.toml +++ b/curve25519-dalek/Cargo.toml @@ -69,7 +69,7 @@ legacy_compatibility = [] group = ["dep:group", "rand_core"] group-bits = ["group", "ff/bits"] -[target.'cfg(all(not(curve25519_dalek_backend = "fiat"), not(curve25519_dalek_backend = "serial"), target_arch = "x86_64"))'.dependencies] +[target.'cfg(all(not(curve25519_dalek_backend = "fiat"), not(curve25519_dalek_backend = "serial"), any(target_arch = "x86_64", target_arch = "aarch64", target_arch = "arm")))'.dependencies] curve25519-dalek-derive = { version = "0.1", path = "../curve25519-dalek-derive" } [lints.rust.unexpected_cfgs] diff --git a/curve25519-dalek/build.rs b/curve25519-dalek/build.rs index 97fa28524..ddf671bb8 100644 --- a/curve25519-dalek/build.rs +++ b/curve25519-dalek/build.rs @@ -3,7 +3,7 @@ #![deny(clippy::unwrap_used, dead_code)] #[allow(non_camel_case_types)] -#[derive(PartialEq, Debug)] +#[derive(PartialEq, Clone, Copy, Debug)] enum DalekBits { Dalek32, Dalek64, @@ -51,31 +51,34 @@ fn main() { } // Backend overrides / defaults - let curve25519_dalek_backend = - match std::env::var("CARGO_CFG_CURVE25519_DALEK_BACKEND").as_deref() { - Ok("fiat") => "fiat", - Ok("serial") => "serial", - Ok("simd") => { - // simd can only be enabled on x86_64 & 64bit target_pointer_width - match is_capable_simd(&target_arch, curve25519_dalek_bits) { + let curve25519_dalek_backend = match std::env::var("CARGO_CFG_CURVE25519_DALEK_BACKEND") + .as_deref() + { + Ok("fiat") => "fiat", + Ok("serial") => "serial", + Ok("simd") => { + // simd can only be enabled on x86_64 & 64bit target_pointer_width, or + // armv7 & 32bit target_pointer_width + match is_capable_simd(&target_arch, curve25519_dalek_bits) { true => "simd", // If override is not possible this must result to compile error // See: issues/532 - false => panic!("Could not override curve25519_dalek_backend to simd"), + false 
=> panic!("Could not override curve25519_dalek_backend to simd for arch {target_arch} and {curve25519_dalek_bits} bits"), } - } - // default between serial / simd (if potentially capable) - _ => match is_capable_simd(&target_arch, curve25519_dalek_bits) { - true => "simd", - false => "serial", - }, - }; + } + // default between serial / simd (if potentially capable) + _ => match is_capable_simd(&target_arch, curve25519_dalek_bits) { + true => "simd", + false => "serial", + }, + }; println!("cargo:rustc-cfg=curve25519_dalek_backend=\"{curve25519_dalek_backend}\""); } // Is the target arch & curve25519_dalek_bits potentially simd capable ? fn is_capable_simd(arch: &str, bits: DalekBits) -> bool { - arch == "x86_64" && bits == DalekBits::Dalek64 + (arch == "x86_64" || arch == "aarch64") && bits == DalekBits::Dalek64 + || arch == "arm" && bits == DalekBits::Dalek32 } // Deterministic cfg(curve25519_dalek_bits) when this is not explicitly set. diff --git a/curve25519-dalek/src/backend/mod.rs b/curve25519-dalek/src/backend/mod.rs index 9ad1dd3de..4fcc7f627 100644 --- a/curve25519-dalek/src/backend/mod.rs +++ b/curve25519-dalek/src/backend/mod.rs @@ -44,16 +44,22 @@ pub mod vector; #[derive(Copy, Clone)] enum BackendKind { - #[cfg(curve25519_dalek_backend = "simd")] + #[cfg(all(curve25519_dalek_backend = "simd", target_arch = "x86_64"))] Avx2, - #[cfg(all(curve25519_dalek_backend = "simd", nightly))] + #[cfg(all(curve25519_dalek_backend = "simd", nightly, target_arch = "x86_64"))] Avx512, + #[cfg(all(curve25519_dalek_backend = "simd", nightly, target_arch = "aarch64"))] + Neon, Serial, } #[inline] fn get_selected_backend() -> BackendKind { - #[cfg(all(curve25519_dalek_backend = "simd", nightly))] + #[cfg(all(curve25519_dalek_backend = "simd", nightly, target_arch = "aarch64"))] + { + return BackendKind::Neon; + } + #[cfg(all(curve25519_dalek_backend = "simd", nightly, target_arch = "x86_64"))] { cpufeatures::new!(cpuid_avx512, "avx512ifma", "avx512vl"); let token_avx512: 
cpuid_avx512::InitToken = cpuid_avx512::init(); @@ -62,7 +68,7 @@ fn get_selected_backend() -> BackendKind { } } - #[cfg(curve25519_dalek_backend = "simd")] + #[cfg(all(curve25519_dalek_backend = "simd", target_arch = "x86_64"))] { cpufeatures::new!(cpuid_avx2, "avx2"); let token_avx2: cpuid_avx2::InitToken = cpuid_avx2::init(); @@ -85,12 +91,15 @@ where use crate::traits::VartimeMultiscalarMul; match get_selected_backend() { - #[cfg(curve25519_dalek_backend = "simd")] + #[cfg(all(curve25519_dalek_backend = "simd", target_arch="x86_64"))] BackendKind::Avx2 => vector::scalar_mul::pippenger::spec_avx2::Pippenger::optional_multiscalar_mul::(scalars, points), - #[cfg(all(curve25519_dalek_backend = "simd", nightly))] + #[cfg(all(curve25519_dalek_backend = "simd", nightly, target_arch="x86_64"))] BackendKind::Avx512 => vector::scalar_mul::pippenger::spec_avx512ifma_avx512vl::Pippenger::optional_multiscalar_mul::(scalars, points), + #[cfg(all(curve25519_dalek_backend = "simd", nightly, target_arch="aarch64"))] + BackendKind::Neon => + vector::scalar_mul::pippenger::spec_neon::Pippenger::optional_multiscalar_mul::(scalars, points), BackendKind::Serial => serial::scalar_mul::pippenger::Pippenger::optional_multiscalar_mul::(scalars, points), } @@ -98,12 +107,14 @@ where #[cfg(feature = "alloc")] pub(crate) enum VartimePrecomputedStraus { - #[cfg(curve25519_dalek_backend = "simd")] + #[cfg(all(curve25519_dalek_backend = "simd", target_arch = "x86_64"))] Avx2(vector::scalar_mul::precomputed_straus::spec_avx2::VartimePrecomputedStraus), - #[cfg(all(curve25519_dalek_backend = "simd", nightly))] + #[cfg(all(curve25519_dalek_backend = "simd", nightly, target_arch = "x86_64"))] Avx512ifma( vector::scalar_mul::precomputed_straus::spec_avx512ifma_avx512vl::VartimePrecomputedStraus, ), + #[cfg(all(curve25519_dalek_backend = "simd", nightly, target_arch = "aarch64"))] + Neon(vector::scalar_mul::precomputed_straus::spec_neon::VartimePrecomputedStraus), 
Scalar(serial::scalar_mul::precomputed_straus::VartimePrecomputedStraus), } @@ -117,12 +128,15 @@ impl VartimePrecomputedStraus { use crate::traits::VartimePrecomputedMultiscalarMul; match get_selected_backend() { - #[cfg(curve25519_dalek_backend = "simd")] + #[cfg(all(curve25519_dalek_backend = "simd", target_arch="x86_64"))] BackendKind::Avx2 => VartimePrecomputedStraus::Avx2(vector::scalar_mul::precomputed_straus::spec_avx2::VartimePrecomputedStraus::new(static_points)), - #[cfg(all(curve25519_dalek_backend = "simd", nightly))] + #[cfg(all(curve25519_dalek_backend = "simd", nightly, target_arch="x86_64"))] BackendKind::Avx512 => VartimePrecomputedStraus::Avx512ifma(vector::scalar_mul::precomputed_straus::spec_avx512ifma_avx512vl::VartimePrecomputedStraus::new(static_points)), + #[cfg(all(curve25519_dalek_backend = "simd", nightly, target_arch="aarch64"))] + BackendKind::Neon => + VartimePrecomputedStraus::Neon(vector::scalar_mul::precomputed_straus::spec_neon::VartimePrecomputedStraus::new(static_points)), BackendKind::Serial => VartimePrecomputedStraus::Scalar(serial::scalar_mul::precomputed_straus::VartimePrecomputedStraus::new(static_points)) } @@ -144,18 +158,24 @@ impl VartimePrecomputedStraus { use crate::traits::VartimePrecomputedMultiscalarMul; match self { - #[cfg(curve25519_dalek_backend = "simd")] + #[cfg(all(curve25519_dalek_backend = "simd", target_arch = "x86_64"))] VartimePrecomputedStraus::Avx2(inner) => inner.optional_mixed_multiscalar_mul( static_scalars, dynamic_scalars, dynamic_points, ), - #[cfg(all(curve25519_dalek_backend = "simd", nightly))] + #[cfg(all(curve25519_dalek_backend = "simd", nightly, target_arch = "x86_64"))] VartimePrecomputedStraus::Avx512ifma(inner) => inner.optional_mixed_multiscalar_mul( static_scalars, dynamic_scalars, dynamic_points, ), + #[cfg(all(curve25519_dalek_backend = "simd", nightly, target_arch = "aarch64"))] + VartimePrecomputedStraus::Neon(inner) => inner.optional_mixed_multiscalar_mul( + static_scalars, + 
dynamic_scalars, + dynamic_points, + ), VartimePrecomputedStraus::Scalar(inner) => inner.optional_mixed_multiscalar_mul( static_scalars, dynamic_scalars, @@ -177,16 +197,20 @@ where use crate::traits::MultiscalarMul; match get_selected_backend() { - #[cfg(curve25519_dalek_backend = "simd")] + #[cfg(all(curve25519_dalek_backend = "simd", target_arch = "x86_64"))] BackendKind::Avx2 => { vector::scalar_mul::straus::spec_avx2::Straus::multiscalar_mul::(scalars, points) } - #[cfg(all(curve25519_dalek_backend = "simd", nightly))] + #[cfg(all(curve25519_dalek_backend = "simd", nightly, target_arch = "x86_64"))] BackendKind::Avx512 => { vector::scalar_mul::straus::spec_avx512ifma_avx512vl::Straus::multiscalar_mul::( scalars, points, ) } + #[cfg(all(curve25519_dalek_backend = "simd", nightly, target_arch = "aarch64"))] + BackendKind::Neon => { + vector::scalar_mul::straus::spec_neon::Straus::multiscalar_mul::(scalars, points) + } BackendKind::Serial => { serial::scalar_mul::straus::Straus::multiscalar_mul::(scalars, points) } @@ -204,19 +228,25 @@ where use crate::traits::VartimeMultiscalarMul; match get_selected_backend() { - #[cfg(curve25519_dalek_backend = "simd")] + #[cfg(all(curve25519_dalek_backend = "simd", target_arch = "x86_64"))] BackendKind::Avx2 => { vector::scalar_mul::straus::spec_avx2::Straus::optional_multiscalar_mul::( scalars, points, ) } - #[cfg(all(curve25519_dalek_backend = "simd", nightly))] + #[cfg(all(curve25519_dalek_backend = "simd", nightly, target_arch = "x86_64"))] BackendKind::Avx512 => { vector::scalar_mul::straus::spec_avx512ifma_avx512vl::Straus::optional_multiscalar_mul::< I, J, >(scalars, points) } + #[cfg(all(curve25519_dalek_backend = "simd", nightly, target_arch = "aarch64"))] + BackendKind::Neon => { + vector::scalar_mul::straus::spec_neon::Straus::optional_multiscalar_mul::( + scalars, points, + ) + } BackendKind::Serial => { serial::scalar_mul::straus::Straus::optional_multiscalar_mul::(scalars, points) } @@ -226,12 +256,14 @@ where 
/// Perform constant-time, variable-base scalar multiplication. pub fn variable_base_mul(point: &EdwardsPoint, scalar: &Scalar) -> EdwardsPoint { match get_selected_backend() { - #[cfg(curve25519_dalek_backend = "simd")] + #[cfg(all(curve25519_dalek_backend = "simd", target_arch = "x86_64"))] BackendKind::Avx2 => vector::scalar_mul::variable_base::spec_avx2::mul(point, scalar), - #[cfg(all(curve25519_dalek_backend = "simd", nightly))] + #[cfg(all(curve25519_dalek_backend = "simd", nightly, target_arch = "x86_64"))] BackendKind::Avx512 => { vector::scalar_mul::variable_base::spec_avx512ifma_avx512vl::mul(point, scalar) } + #[cfg(all(curve25519_dalek_backend = "simd", nightly, target_arch = "aarch64"))] + BackendKind::Neon => vector::scalar_mul::variable_base::spec_neon::mul(point, scalar), BackendKind::Serial => serial::scalar_mul::variable_base::mul(point, scalar), } } @@ -240,12 +272,14 @@ pub fn variable_base_mul(point: &EdwardsPoint, scalar: &Scalar) -> EdwardsPoint #[allow(non_snake_case)] pub fn vartime_double_base_mul(a: &Scalar, A: &EdwardsPoint, b: &Scalar) -> EdwardsPoint { match get_selected_backend() { - #[cfg(curve25519_dalek_backend = "simd")] + #[cfg(all(curve25519_dalek_backend = "simd", target_arch = "x86_64"))] BackendKind::Avx2 => vector::scalar_mul::vartime_double_base::spec_avx2::mul(a, A, b), - #[cfg(all(curve25519_dalek_backend = "simd", nightly))] + #[cfg(all(curve25519_dalek_backend = "simd", nightly, target_arch = "x86_64"))] BackendKind::Avx512 => { vector::scalar_mul::vartime_double_base::spec_avx512ifma_avx512vl::mul(a, A, b) } + #[cfg(all(curve25519_dalek_backend = "simd", nightly, target_arch = "aarch64"))] + BackendKind::Neon => vector::scalar_mul::vartime_double_base::spec_neon::mul(a, A, b), BackendKind::Serial => serial::scalar_mul::vartime_double_base::mul(a, A, b), } } diff --git a/curve25519-dalek/src/backend/vector/mod.rs b/curve25519-dalek/src/backend/vector/mod.rs index 2839dca45..dff816f75 100644 --- 
a/curve25519-dalek/src/backend/vector/mod.rs +++ b/curve25519-dalek/src/backend/vector/mod.rs @@ -12,11 +12,16 @@ #![doc = include_str!("../../../docs/parallel-formulas.md")] #[allow(missing_docs)] +#[cfg(target_arch = "x86_64")] pub mod packed_simd; +#[cfg(target_arch = "x86_64")] pub mod avx2; -#[cfg(nightly)] +#[cfg(all(nightly, target_arch = "x86_64"))] pub mod ifma; +#[cfg(all(nightly, any(target_arch = "arm", target_arch = "aarch64")))] +pub mod neon; + pub mod scalar_mul; diff --git a/curve25519-dalek/src/backend/vector/neon/constants.rs b/curve25519-dalek/src/backend/vector/neon/constants.rs new file mode 100644 index 000000000..befcf1f05 --- /dev/null +++ b/curve25519-dalek/src/backend/vector/neon/constants.rs @@ -0,0 +1,1500 @@ +// -*- mode: rust; -*- +// +// This file is part of curve25519-dalek. +// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence +// 2021-2022 Robrecht Blancquaert +// See LICENSE for licensing information. +// +// Authors: +// - Isis Agora Lovecruft +// - Henry de Valence +// - Robrecht Blancquaert + +//! This module contains constants used by the NEON backend. + +use super::packed_simd::{u32x4, u32x4x2}; + +use crate::backend::vector::neon::edwards::{CachedPoint, ExtendedPoint}; +use crate::backend::vector::neon::field::FieldElement2625x4; +use crate::window::NafLookupTable8; + +/// The identity element as an `ExtendedPoint`. +pub(crate) static EXTENDEDPOINT_IDENTITY: ExtendedPoint = ExtendedPoint(FieldElement2625x4([ + u32x4x2::new(u32x4::const_new(0, 1, 0, 0), u32x4::const_new(1, 0, 0, 0)), + u32x4x2::new(u32x4::const_splat(0), u32x4::const_splat(0)), + u32x4x2::new(u32x4::const_splat(0), u32x4::const_splat(0)), + u32x4x2::new(u32x4::const_splat(0), u32x4::const_splat(0)), + u32x4x2::new(u32x4::const_splat(0), u32x4::const_splat(0)), +])); + +/// The identity element as a `CachedPoint`. 
+pub(crate) static CACHEDPOINT_IDENTITY: CachedPoint = CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(121647, 121666, 0, 0), + u32x4::const_new(243332, 67108845, 0, 33554431), + ), + u32x4x2::new( + u32x4::const_new(67108864, 0, 33554431, 0), + u32x4::const_new(0, 67108863, 0, 33554431), + ), + u32x4x2::new( + u32x4::const_new(67108863, 0, 33554431, 0), + u32x4::const_new(0, 67108863, 0, 33554431), + ), + u32x4x2::new( + u32x4::const_new(67108863, 0, 33554431, 0), + u32x4::const_new(0, 67108863, 0, 33554431), + ), + u32x4x2::new( + u32x4::const_new(67108863, 0, 33554431, 0), + u32x4::const_new(0, 67108863, 0, 33554431), + ), +])); + +/// The low limbs of (2p, 2p, 2p, 2p), so that +/// ```ascii,no_run +/// (2p, 2p, 2p, 2p) = [P_TIMES_2_LO, P_TIMES_2_HI, P_TIMES_2_HI, P_TIMES_2_HI, P_TIMES_2_HI] +/// ``` +pub(crate) static P_TIMES_2_LO: u32x4x2 = u32x4x2::new( + u32x4::const_new(67108845 << 1, 67108845 << 1, 33554431 << 1, 33554431 << 1), + u32x4::const_new(67108845 << 1, 67108845 << 1, 33554431 << 1, 33554431 << 1), +); + +/// The high limbs of (2p, 2p, 2p, 2p), so that +/// ```ascii,no_run +/// (2p, 2p, 2p, 2p) = [P_TIMES_2_LO, P_TIMES_2_HI, P_TIMES_2_HI, P_TIMES_2_HI, P_TIMES_2_HI] +/// ``` +pub(crate) static P_TIMES_2_HI: u32x4x2 = u32x4x2::new( + u32x4::const_new(67108863 << 1, 67108863 << 1, 33554431 << 1, 33554431 << 1), + u32x4::const_new(67108863 << 1, 67108863 << 1, 33554431 << 1, 33554431 << 1), +); + +/// The low limbs of (16p, 16p, 16p, 16p), so that +/// ```ascii,no_run +/// (16p, 16p, 16p, 16p) = [P_TIMES_16_LO, P_TIMES_16_HI, P_TIMES_16_HI, P_TIMES_16_HI, P_TIMES_16_HI] +/// ``` +pub(crate) static P_TIMES_16_LO: u32x4x2 = u32x4x2::new( + u32x4::const_new(67108845 << 4, 67108845 << 4, 33554431 << 4, 33554431 << 4), + u32x4::const_new(67108845 << 4, 67108845 << 4, 33554431 << 4, 33554431 << 4), +); + +/// The high limbs of (16p, 16p, 16p, 16p), so that +/// ```ascii,no_run +/// (16p, 16p, 16p, 16p) = [P_TIMES_16_LO, P_TIMES_16_HI, 
P_TIMES_16_HI, P_TIMES_16_HI, P_TIMES_16_HI] +/// ``` +pub(crate) static P_TIMES_16_HI: u32x4x2 = u32x4x2::new( + u32x4::const_new(67108863 << 4, 67108863 << 4, 33554431 << 4, 33554431 << 4), + u32x4::const_new(67108863 << 4, 67108863 << 4, 33554431 << 4, 33554431 << 4), +); + +/// Odd multiples of the Ed25519 basepoint: +pub(crate) static BASEPOINT_ODD_LOOKUP_TABLE: NafLookupTable8 = NafLookupTable8([ + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(3571425, 10045002, 19036563, 1096096), + u32x4::const_new(243332, 65897020, 0, 28963681), + ), + u32x4x2::new( + u32x4::const_new(30896895, 63055514, 1614915, 5095970), + u32x4::const_new(0, 53791688, 0, 31258312), + ), + u32x4x2::new( + u32x4::const_new(13347627, 40339464, 2236269, 11185503), + u32x4::const_new(0, 22520087, 0, 8659512), + ), + u32x4x2::new( + u32x4::const_new(11125413, 29139905, 32037254, 28360723), + u32x4::const_new(0, 64556417, 0, 9635759), + ), + u32x4x2::new( + u32x4::const_new(33268144, 47262491, 4336918, 15795740), + u32x4::const_new(0, 22027545, 0, 4846528), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(47099681, 31447946, 29365447, 24740513), + u32x4::const_new(42991046, 18317844, 16051644, 21404226), + ), + u32x4x2::new( + u32x4::const_new(31708133, 28909527, 2366091, 13703791), + u32x4::const_new(469246, 54159622, 2601402, 32988002), + ), + u32x4x2::new( + u32x4::const_new(63432457, 30251794, 15163516, 18491340), + u32x4::const_new(28144087, 35605455, 13682295, 18474872), + ), + u32x4x2::new( + u32x4::const_new(12221607, 4967598, 26061980, 26008006), + u32x4::const_new(20226147, 9726961, 17410, 18051083), + ), + u32x4x2::new( + u32x4::const_new(60569645, 62487085, 11911242, 21920922), + u32x4::const_new(4092105, 38186967, 22431483, 31366585), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(18147205, 62587998, 2554617, 536692), + u32x4::const_new(11924528, 26674131, 17645433, 24341419), + ), + 
u32x4x2::new( + u32x4::const_new(11573357, 27579485, 31491870, 29000885), + u32x4::const_new(10800976, 51902791, 28076395, 20464029), + ), + u32x4x2::new( + u32x4::const_new(56031649, 10856669, 11791193, 26769430), + u32x4::const_new(25306956, 5922200, 6630685, 9385098), + ), + u32x4x2::new( + u32x4::const_new(31319348, 23906711, 16290213, 32142166), + u32x4::const_new(61106354, 17181823, 3548308, 12022566), + ), + u32x4x2::new( + u32x4::const_new(5904298, 50218605, 11826440, 5492249), + u32x4::const_new(10379071, 3472255, 172742, 31948344), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(10625852, 15193821, 22918394, 23676410), + u32x4::const_new(53695416, 54987793, 10067515, 11747680), + ), + u32x4x2::new( + u32x4::const_new(65013325, 1309652, 29616320, 28922974), + u32x4::const_new(60360891, 19621771, 9938982, 30406429), + ), + u32x4x2::new( + u32x4::const_new(54967954, 65931918, 5595602, 25719523), + u32x4::const_new(64909864, 30566415, 15945272, 8495317), + ), + u32x4x2::new( + u32x4::const_new(1167157, 55265018, 11507029, 31641054), + u32x4::const_new(43497904, 2367338, 12937761, 27517066), + ), + u32x4x2::new( + u32x4::const_new(656704, 2544994, 13006713, 480979), + u32x4::const_new(38471594, 62541240, 25353597, 11531760), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(22176662, 3984313, 27495285, 4110608), + u32x4::const_new(2909584, 30594106, 15677919, 2549183), + ), + u32x4x2::new( + u32x4::const_new(33979105, 62269905, 2071511, 6894756), + u32x4::const_new(53189950, 47232857, 6408191, 6123225), + ), + u32x4x2::new( + u32x4::const_new(32553873, 63948030, 12612401, 3633166), + u32x4::const_new(24054373, 37626618, 14481327, 8520484), + ), + u32x4x2::new( + u32x4::const_new(56552486, 10749438, 12034813, 28811946), + u32x4::const_new(1445640, 36755601, 12104575, 10257833), + ), + u32x4x2::new( + u32x4::const_new(22795808, 48761311, 1136056, 9380768), + u32x4::const_new(1411523, 5341811, 
27318329, 9686767), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(21157200, 39156966, 20473176, 4934657), + u32x4::const_new(61478183, 45121537, 5429856, 13035023), + ), + u32x4x2::new( + u32x4::const_new(7954529, 58789246, 31440083, 7054221), + u32x4::const_new(38438565, 36856107, 1364112, 14548122), + ), + u32x4x2::new( + u32x4::const_new(26120083, 36321360, 4919997, 31687496), + u32x4::const_new(33757765, 36237559, 15243054, 32163861), + ), + u32x4x2::new( + u32x4::const_new(25878307, 46544824, 19455951, 2414935), + u32x4::const_new(16844726, 56521560, 32680554, 26660660), + ), + u32x4x2::new( + u32x4::const_new(48360220, 43407178, 12187042, 24925816), + u32x4::const_new(7423722, 25746484, 12814654, 17395963), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(63153652, 32195955, 4087908, 8431689), + u32x4::const_new(30392384, 47203165, 8986649, 9053039), + ), + u32x4x2::new( + u32x4::const_new(63659241, 47988767, 2931872, 19953600), + u32x4::const_new(11747107, 51610101, 20952181, 13364887), + ), + u32x4x2::new( + u32x4::const_new(3659197, 58790649, 5930099, 2605312), + u32x4::const_new(28477896, 580728, 20579735, 2610622), + ), + u32x4x2::new( + u32x4::const_new(41781607, 17161358, 10690531, 24368015), + u32x4::const_new(47027031, 36742339, 5414694, 13156365), + ), + u32x4x2::new( + u32x4::const_new(13237853, 51182423, 8954802, 29006542), + u32x4::const_new(22643989, 56896541, 22830593, 10289708), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(1401265, 58846825, 30911620, 32239180), + u32x4::const_new(15391552, 15200821, 6339309, 16403588), + ), + u32x4x2::new( + u32x4::const_new(55913797, 29541724, 1664461, 21709410), + u32x4::const_new(38470488, 47097092, 17674945, 32666066), + ), + u32x4x2::new( + u32x4::const_new(22844482, 10797709, 27548106, 31638735), + u32x4::const_new(34500968, 26611503, 19727211, 13160873), + ), + u32x4x2::new( + 
u32x4::const_new(31485204, 14496164, 13981208, 10276888), + u32x4::const_new(5748808, 35024436, 2740987, 7479021), + ), + u32x4x2::new( + u32x4::const_new(58541207, 14866135, 32344041, 545930), + u32x4::const_new(62661488, 6941250, 27940205, 11976112), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(39849808, 44781685, 15697329, 24387845), + u32x4::const_new(12501486, 50260092, 23199481, 31929024), + ), + u32x4x2::new( + u32x4::const_new(24823070, 27956017, 27034296, 10316465), + u32x4::const_new(47664045, 11152446, 15719183, 30181617), + ), + u32x4x2::new( + u32x4::const_new(20771189, 19969144, 31433937, 19185213), + u32x4::const_new(27565920, 10384445, 2893359, 9255362), + ), + u32x4x2::new( + u32x4::const_new(42894974, 11925545, 32134441, 32738810), + u32x4::const_new(55916336, 32479272, 19563550, 5511385), + ), + u32x4x2::new( + u32x4::const_new(17857161, 47809169, 14564114, 27997751), + u32x4::const_new(33024640, 38669671, 31956536, 27313245), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(58237774, 15917425, 18872208, 19394230), + u32x4::const_new(17374297, 6101419, 4839741, 6596900), + ), + u32x4x2::new( + u32x4::const_new(66947393, 15744215, 18368993, 17750160), + u32x4::const_new(41006525, 9205497, 2629667, 32170865), + ), + u32x4x2::new( + u32x4::const_new(66481381, 1919414, 28338762, 7372967), + u32x4::const_new(33819153, 4156199, 27126309, 12739816), + ), + u32x4x2::new( + u32x4::const_new(44117158, 58545296, 22521371, 11809712), + u32x4::const_new(28998792, 50731010, 30215699, 25748377), + ), + u32x4x2::new( + u32x4::const_new(23561284, 4160244, 9035405, 24895184), + u32x4::const_new(39761639, 59253416, 8684759, 22487864), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(12671134, 56419053, 16092401, 30038207), + u32x4::const_new(4002647, 47822606, 7151311, 28430768), + ), + u32x4x2::new( + u32x4::const_new(61041684, 35765374, 30598048, 19666539), + 
u32x4::const_new(44150175, 40140037, 290469, 28442674), + ), + u32x4x2::new( + u32x4::const_new(18847796, 1371617, 33316881, 13199936), + u32x4::const_new(43646578, 17068881, 12074900, 1537415), + ), + u32x4x2::new( + u32x4::const_new(10052225, 38316070, 27469797, 5297537), + u32x4::const_new(50725570, 20435349, 10339121, 2779737), + ), + u32x4x2::new( + u32x4::const_new(18372189, 15466385, 24762130, 22217964), + u32x4::const_new(23503887, 47844464, 10415034, 2606889), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(55082775, 45300503, 16032654, 5964396), + u32x4::const_new(17743504, 24634761, 19493066, 5184611), + ), + u32x4x2::new( + u32x4::const_new(50172633, 35093294, 10040575, 23616256), + u32x4::const_new(4543900, 61852191, 4049821, 7423669), + ), + u32x4x2::new( + u32x4::const_new(20295398, 40009376, 10487190, 15670429), + u32x4::const_new(51972856, 58649552, 20436392, 3432497), + ), + u32x4x2::new( + u32x4::const_new(35189420, 54117751, 12825868, 6283038), + u32x4::const_new(27540739, 30648758, 22658912, 9466689), + ), + u32x4x2::new( + u32x4::const_new(51737549, 40725785, 17409814, 25201086), + u32x4::const_new(21156239, 34176168, 26814520, 5956424), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(8211442, 8014184, 6260823, 22108096), + u32x4::const_new(32182620, 51844847, 2466270, 28582231), + ), + u32x4x2::new( + u32x4::const_new(27199739, 3848333, 31738017, 10892045), + u32x4::const_new(4963982, 65391770, 32551997, 28906469), + ), + u32x4x2::new( + u32x4::const_new(16606846, 32207068, 26404535, 7614129), + u32x4::const_new(45416902, 65584718, 13821785, 2646060), + ), + u32x4x2::new( + u32x4::const_new(36090634, 57981287, 32247670, 22837502), + u32x4::const_new(31003861, 55448117, 6062915, 20369975), + ), + u32x4x2::new( + u32x4::const_new(27381403, 50578107, 522631, 29521058), + u32x4::const_new(31137497, 40220737, 27628049, 1824195), + ), + ])), + CachedPoint(FieldElement2625x4([ + 
u32x4x2::new( + u32x4::const_new(59402443, 17056879, 29262689, 6131785), + u32x4::const_new(52551472, 43367471, 29423199, 18899208), + ), + u32x4x2::new( + u32x4::const_new(5749414, 43514612, 11365899, 21514624), + u32x4::const_new(65591890, 60945892, 19841732, 5628567), + ), + u32x4x2::new( + u32x4::const_new(19334369, 52500268, 12307673, 5267367), + u32x4::const_new(3212103, 9035822, 29142161, 30520954), + ), + u32x4x2::new( + u32x4::const_new(57261330, 6819646, 22089161, 9800373), + u32x4::const_new(55155453, 62250856, 13766735, 25244545), + ), + u32x4x2::new( + u32x4::const_new(54370226, 61888301, 24496089, 2540581), + u32x4::const_new(65637506, 60274355, 18154273, 11687259), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(12521903, 26014045, 13995625, 33360175), + u32x4::const_new(23605474, 7376434, 27229267, 17195036), + ), + u32x4x2::new( + u32x4::const_new(59482891, 10074423, 574357, 3857753), + u32x4::const_new(61377787, 50306685, 5241065, 20234396), + ), + u32x4x2::new( + u32x4::const_new(23674717, 6997172, 20771841, 16858511), + u32x4::const_new(40565304, 29973136, 7049812, 14585010), + ), + u32x4x2::new( + u32x4::const_new(1427477, 13295732, 31762066, 31499740), + u32x4::const_new(60419925, 54666164, 22009424, 8089609), + ), + u32x4x2::new( + u32x4::const_new(58154031, 41593020, 15342328, 957047), + u32x4::const_new(38937260, 37037498, 24871992, 32973409), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(30654745, 51286025, 21206982, 2433562), + u32x4::const_new(12780105, 31732574, 33087964, 33081189), + ), + u32x4x2::new( + u32x4::const_new(66640017, 42720009, 16567620, 15300745), + u32x4::const_new(1530367, 33001123, 20930247, 21042661), + ), + u32x4x2::new( + u32x4::const_new(15003356, 5294119, 22985605, 18928772), + u32x4::const_new(32628461, 18230172, 14773298, 27193722), + ), + u32x4x2::new( + u32x4::const_new(27555, 65346287, 17017174, 7837720), + u32x4::const_new(21499787, 
42855613, 22474984, 13675085), + ), + u32x4x2::new( + u32x4::const_new(24164369, 50130116, 5973149, 24152073), + u32x4::const_new(1577334, 25400030, 18648484, 32228854), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(49518649, 59119280, 31670678, 20396561), + u32x4::const_new(61728330, 651402, 176032, 9529498), + ), + u32x4x2::new( + u32x4::const_new(61765532, 9082232, 32794568, 15526956), + u32x4::const_new(48543100, 32614212, 19001206, 25680229), + ), + u32x4x2::new( + u32x4::const_new(32086091, 10373081, 8996131, 31822823), + u32x4::const_new(35788988, 49973190, 30542040, 17858455), + ), + u32x4x2::new( + u32x4::const_new(48130197, 58121889, 27753291, 29923268), + u32x4::const_new(54448075, 43300790, 9336565, 15770022), + ), + u32x4x2::new( + u32x4::const_new(57725546, 20557498, 9366233, 16023566), + u32x4::const_new(16189031, 2837363, 24315301, 27003505), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(28286608, 10767548, 18220739, 5413236), + u32x4::const_new(48253387, 58255702, 11864864, 28527159), + ), + u32x4x2::new( + u32x4::const_new(45038176, 58655197, 25648758, 10951484), + u32x4::const_new(42564382, 34542843, 23146954, 22234334), + ), + u32x4x2::new( + u32x4::const_new(14858710, 24978793, 15040559, 4379220), + u32x4::const_new(47621477, 40271440, 15650420, 1998736), + ), + u32x4x2::new( + u32x4::const_new(24106391, 9626149, 344505, 25253814), + u32x4::const_new(34579800, 59687089, 25718289, 25904133), + ), + u32x4x2::new( + u32x4::const_new(1981195, 37751302, 26132048, 1764722), + u32x4::const_new(13288231, 28808622, 12531301, 18292949), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(13869851, 31448904, 14963539, 7581293), + u32x4::const_new(20536485, 35021083, 21257574, 33356609), + ), + u32x4x2::new( + u32x4::const_new(36903364, 18429241, 11097857, 5943856), + u32x4::const_new(60583077, 40015815, 30509523, 31915271), + ), + u32x4x2::new( + 
u32x4::const_new(49161801, 40681915, 67892, 25454357), + u32x4::const_new(22779677, 25798439, 15964829, 5863227), + ), + u32x4x2::new( + u32x4::const_new(60810637, 4496471, 5217137, 14095116), + u32x4::const_new(50942411, 50712663, 2507380, 26844507), + ), + u32x4x2::new( + u32x4::const_new(34579752, 53519385, 10859797, 18816024), + u32x4::const_new(42552864, 39478521, 6783896, 17277037), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(43287109, 27900723, 33182187, 2766754), + u32x4::const_new(17041989, 1018260, 33392790, 4830032), + ), + u32x4x2::new( + u32x4::const_new(60194178, 30788903, 24728888, 14513195), + u32x4::const_new(20897010, 28843233, 20111980, 17475240), + ), + u32x4x2::new( + u32x4::const_new(46042274, 19257042, 4628173, 31649727), + u32x4::const_new(27388316, 66631493, 11541886, 6408028), + ), + u32x4x2::new( + u32x4::const_new(57024680, 49536568, 32050358, 31321917), + u32x4::const_new(17437691, 49672356, 2884755, 20493991), + ), + u32x4x2::new( + u32x4::const_new(59553007, 46782643, 29001173, 1814088), + u32x4::const_new(21930692, 51319706, 14965872, 30748046), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(16441817, 36111849, 6900424, 602234), + u32x4::const_new(46522199, 16441484, 8135070, 21726541), + ), + u32x4x2::new( + u32x4::const_new(37711225, 32701959, 11679112, 13125533), + u32x4::const_new(32154135, 9407918, 26554289, 620848), + ), + u32x4x2::new( + u32x4::const_new(19233407, 30086864, 14679568, 2797374), + u32x4::const_new(4892806, 7993077, 247658, 5632804), + ), + u32x4x2::new( + u32x4::const_new(37427262, 26675495, 27125659, 13496131), + u32x4::const_new(50718473, 40115609, 28505351, 27837393), + ), + u32x4x2::new( + u32x4::const_new(196819, 18410429, 7070012, 21691388), + u32x4::const_new(29763371, 24754123, 9727048, 10930179), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(28319289, 40734650, 16225680, 24739184), + 
u32x4::const_new(64272368, 35356897, 7866648, 13635853), + ), + u32x4x2::new( + u32x4::const_new(34165295, 48328447, 27041670, 23643655), + u32x4::const_new(48949950, 52963288, 30411133, 6045174), + ), + u32x4x2::new( + u32x4::const_new(18583559, 41649834, 9813585, 26098520), + u32x4::const_new(25682734, 26733526, 19276490, 10654728), + ), + u32x4x2::new( + u32x4::const_new(34867476, 52715968, 5694571, 13380978), + u32x4::const_new(15134994, 1831255, 8608001, 17266401), + ), + u32x4x2::new( + u32x4::const_new(59925903, 44282172, 27802465, 1855069), + u32x4::const_new(14234749, 36635487, 11302294, 10938429), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(8373273, 49064494, 4932071, 32997499), + u32x4::const_new(38472880, 29335908, 14504412, 22460029), + ), + u32x4x2::new( + u32x4::const_new(31795930, 50785923, 25835990, 25790073), + u32x4::const_new(65669841, 11360450, 9969157, 9008164), + ), + u32x4x2::new( + u32x4::const_new(50262498, 45869261, 16124434, 15336007), + u32x4::const_new(882762, 42522623, 11277198, 26296377), + ), + u32x4x2::new( + u32x4::const_new(42332732, 59129236, 14452816, 567985), + u32x4::const_new(208061, 34722729, 32008143, 14828749), + ), + u32x4x2::new( + u32x4::const_new(17937794, 36846032, 32102665, 4442466), + u32x4::const_new(19745435, 31633451, 7146411, 15812027), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(30741269, 38648744, 12562645, 30092623), + u32x4::const_new(25073992, 28730659, 27911745, 30000958), + ), + u32x4x2::new( + u32x4::const_new(2859794, 25991700, 17776078, 27091930), + u32x4::const_new(2328322, 60061146, 18581824, 18039008), + ), + u32x4x2::new( + u32x4::const_new(58206333, 17917354, 1972306, 11853766), + u32x4::const_new(2655376, 60543390, 18416710, 13287440), + ), + u32x4x2::new( + u32x4::const_new(62746330, 61423885, 21246577, 2266675), + u32x4::const_new(60099139, 14804707, 14772234, 20679434), + ), + u32x4x2::new( + 
u32x4::const_new(26987698, 15488817, 715616, 2339565), + u32x4::const_new(51980752, 17333865, 21965103, 10839820), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(18672548, 57660959, 16042910, 19519287), + u32x4::const_new(62865851, 17580961, 26628347, 23774759), + ), + u32x4x2::new( + u32x4::const_new(368070, 3464471, 25888304, 30370559), + u32x4::const_new(52396053, 45426828, 28745251, 9246829), + ), + u32x4x2::new( + u32x4::const_new(29090099, 57950037, 23104657, 4903923), + u32x4::const_new(10987778, 56163684, 23621539, 10332760), + ), + u32x4x2::new( + u32x4::const_new(53338235, 44851161, 21606845, 31069622), + u32x4::const_new(4243630, 34464392, 11286454, 5802022), + ), + u32x4x2::new( + u32x4::const_new(46710757, 63389067, 11642865, 1980986), + u32x4::const_new(12967337, 28162061, 3854192, 30432268), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(12179834, 41005450, 12809619, 33525228), + u32x4::const_new(4624405, 46957889, 16968743, 11827816), + ), + u32x4x2::new( + u32x4::const_new(51521162, 12466775, 31791271, 15303651), + u32x4::const_new(49798465, 62714504, 6509600, 12918560), + ), + u32x4x2::new( + u32x4::const_new(20445559, 1756449, 28848701, 7920171), + u32x4::const_new(9835040, 5900071, 28757409, 12376688), + ), + u32x4x2::new( + u32x4::const_new(18259496, 14281012, 21767026, 10232236), + u32x4::const_new(20000226, 12400540, 4104902, 23570543), + ), + u32x4x2::new( + u32x4::const_new(3687440, 26546648, 13328821, 26841081), + u32x4::const_new(49822734, 22334054, 244496, 24862543), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(59523541, 62195428, 3853227, 13954801), + u32x4::const_new(12387708, 47627615, 27221350, 17899572), + ), + u32x4x2::new( + u32x4::const_new(63193587, 36343307, 14595132, 6880795), + u32x4::const_new(1364792, 37648434, 3259017, 20536046), + ), + u32x4x2::new( + u32x4::const_new(30362834, 10440372, 9574624, 11729232), + 
u32x4::const_new(63861613, 21748389, 5530846, 2721586), + ), + u32x4x2::new( + u32x4::const_new(18339760, 1550632, 17170271, 25732971), + u32x4::const_new(28459263, 63142237, 21642345, 31557672), + ), + u32x4x2::new( + u32x4::const_new(10611282, 5204623, 18049257, 214175), + u32x4::const_new(19432723, 49809070, 26010406, 27449522), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(19770733, 26478685, 9464541, 29158041), + u32x4::const_new(28604307, 45196604, 7586524, 6641859), + ), + u32x4x2::new( + u32x4::const_new(65654484, 52230498, 30886612, 19112823), + u32x4::const_new(47271809, 38942611, 16020035, 10773481), + ), + u32x4x2::new( + u32x4::const_new(27464323, 54451016, 20646645, 17732915), + u32x4::const_new(23008717, 53626684, 3253189, 15614410), + ), + u32x4x2::new( + u32x4::const_new(52381752, 40693008, 7063024, 28469981), + u32x4::const_new(51159478, 44543211, 19941777, 5985451), + ), + u32x4x2::new( + u32x4::const_new(13553668, 35524849, 14788737, 1883845), + u32x4::const_new(12385775, 47958835, 29135466, 1776722), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(36719806, 20827965, 23175373, 32996806), + u32x4::const_new(42041892, 65708790, 5467143, 20884008), + ), + u32x4x2::new( + u32x4::const_new(43256281, 40770646, 17244063, 31959819), + u32x4::const_new(64366384, 43544617, 25057754, 12628720), + ), + u32x4x2::new( + u32x4::const_new(17337782, 58472057, 27906934, 15305274), + u32x4::const_new(30292418, 39284317, 16946773, 24806712), + ), + u32x4x2::new( + u32x4::const_new(6485126, 32447403, 16261486, 13561940), + u32x4::const_new(49439635, 10738368, 16419889, 8897231), + ), + u32x4x2::new( + u32x4::const_new(44812203, 40122262, 25496058, 2759794), + u32x4::const_new(25295304, 52178368, 24154195, 29334408), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(42307254, 57217102, 1088936, 3832827), + u32x4::const_new(33905401, 23130334, 6958056, 12622851), + 
), + u32x4x2::new( + u32x4::const_new(3881189, 14870059, 19712830, 6071598), + u32x4::const_new(38147944, 60776394, 3427938, 13765703), + ), + u32x4x2::new( + u32x4::const_new(7666911, 24227591, 17077136, 22967588), + u32x4::const_new(6874639, 30915523, 11451695, 24292224), + ), + u32x4x2::new( + u32x4::const_new(13659529, 31984463, 28764736, 20506164), + u32x4::const_new(64729627, 49321636, 28284636, 25472371), + ), + u32x4x2::new( + u32x4::const_new(39360308, 42281399, 9446504, 868960), + u32x4::const_new(49227724, 21351115, 30561851, 11292096), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(7071115, 46444090, 5387916, 15432877), + u32x4::const_new(27226682, 41506862, 2398278, 3978240), + ), + u32x4x2::new( + u32x4::const_new(51009614, 54216973, 24368938, 31392616), + u32x4::const_new(38456150, 62313644, 6729154, 99724), + ), + u32x4x2::new( + u32x4::const_new(17474332, 62857913, 2619930, 30659308), + u32x4::const_new(18268181, 32809239, 22826292, 24561895), + ), + u32x4x2::new( + u32x4::const_new(38187020, 67003092, 14118280, 16500577), + u32x4::const_new(18808560, 64983716, 25712929, 32518261), + ), + u32x4x2::new( + u32x4::const_new(25735813, 62284262, 10824872, 20558596), + u32x4::const_new(48149681, 31162667, 22608274, 26285185), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(963440, 63742255, 10230323, 25515008), + u32x4::const_new(32506414, 6105697, 25980317, 24645129), + ), + u32x4x2::new( + u32x4::const_new(7162189, 8101249, 14679265, 33443386), + u32x4::const_new(2002396, 8541405, 19442276, 4795881), + ), + u32x4x2::new( + u32x4::const_new(8116694, 51463069, 4415528, 25599140), + u32x4::const_new(55805721, 39582709, 6719436, 30033839), + ), + u32x4x2::new( + u32x4::const_new(14468202, 42181869, 25188826, 9639755), + u32x4::const_new(47546189, 62711146, 32762447, 18338064), + ), + u32x4x2::new( + u32x4::const_new(33880058, 32810909, 8969931, 13095238), + u32x4::const_new(38360605, 
40138517, 9246134, 4928058), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(63655588, 17883670, 9410246, 26162761), + u32x4::const_new(5000571, 7349225, 23785252, 32751089), + ), + u32x4x2::new( + u32x4::const_new(28568737, 10733123, 9342397, 21570673), + u32x4::const_new(54096560, 32467591, 20494687, 21511513), + ), + u32x4x2::new( + u32x4::const_new(47675157, 47932807, 29250946, 15672208), + u32x4::const_new(59760469, 9945465, 14939287, 18437405), + ), + u32x4x2::new( + u32x4::const_new(37985267, 8609815, 31573002, 3373596), + u32x4::const_new(47828883, 20834216, 13248616, 24154292), + ), + u32x4x2::new( + u32x4::const_new(5543543, 29553242, 3386453, 30501150), + u32x4::const_new(25058089, 15236571, 8814395, 32462955), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(39158670, 15322548, 20495103, 3312736), + u32x4::const_new(14557171, 12985179, 8044741, 3176899), + ), + u32x4x2::new( + u32x4::const_new(24673290, 29693310, 21412266, 18324699), + u32x4::const_new(2154518, 40329021, 17500543, 3954277), + ), + u32x4x2::new( + u32x4::const_new(36758685, 38738957, 165513, 14691866), + u32x4::const_new(3070475, 10424235, 17096536, 16896898), + ), + u32x4x2::new( + u32x4::const_new(59790459, 43094586, 8720681, 10423589), + u32x4::const_new(1122030, 31545615, 4463786, 31811293), + ), + u32x4x2::new( + u32x4::const_new(49778992, 60881044, 20509974, 5832494), + u32x4::const_new(64155961, 31483358, 4511231, 20307815), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(2863373, 40876242, 26865913, 24067353), + u32x4::const_new(15726407, 40919070, 12953902, 9931535), + ), + u32x4x2::new( + u32x4::const_new(60934877, 42512204, 21649141, 21945190), + u32x4::const_new(52211954, 60984193, 7046207, 5363493), + ), + u32x4x2::new( + u32x4::const_new(4205971, 64068464, 18197273, 7327176), + u32x4::const_new(51527794, 21166920, 20669933, 11828242), + ), + u32x4x2::new( + 
u32x4::const_new(59782815, 49617225, 15379924, 457923), + u32x4::const_new(9320508, 21498914, 3242540, 31563182), + ), + u32x4x2::new( + u32x4::const_new(27714753, 8664670, 3366162, 26338598), + u32x4::const_new(56775518, 25796006, 13129151, 21388876), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(59276548, 49972346, 16795002, 33455915), + u32x4::const_new(48430097, 53857205, 18627071, 32474471), + ), + u32x4x2::new( + u32x4::const_new(42160315, 50705892, 13530540, 28012698), + u32x4::const_new(19833221, 55886870, 20191784, 9644313), + ), + u32x4x2::new( + u32x4::const_new(20372416, 28414713, 24084234, 31804096), + u32x4::const_new(33815377, 36131001, 17251241, 18291088), + ), + u32x4x2::new( + u32x4::const_new(56234667, 14920441, 2033267, 29572003), + u32x4::const_new(1724043, 45519699, 17873735, 501988), + ), + u32x4x2::new( + u32x4::const_new(50031659, 31517850, 15697583, 1016845), + u32x4::const_new(43104661, 54769582, 8008601, 27257051), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(52951491, 66542164, 14853573, 30444631), + u32x4::const_new(12045973, 24321813, 16545674, 18160646), + ), + u32x4x2::new( + u32x4::const_new(60107911, 1126003, 5947677, 19486116), + u32x4::const_new(41119984, 30860440, 7935395, 13354438), + ), + u32x4x2::new( + u32x4::const_new(17841328, 11063269, 1664538, 26687568), + u32x4::const_new(6268968, 22280371, 17275484, 4523163), + ), + u32x4x2::new( + u32x4::const_new(15886041, 56799482, 15446552, 21712778), + u32x4::const_new(1005290, 17827215, 4978741, 6854882), + ), + u32x4x2::new( + u32x4::const_new(34319277, 47731002, 20321804, 28544575), + u32x4::const_new(29591814, 63376351, 24754545, 26001714), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(66783087, 5234346, 46102, 8566476), + u32x4::const_new(19947339, 20180418, 25398238, 3726678), + ), + u32x4x2::new( + u32x4::const_new(63890180, 46380965, 20674069, 5366544), + 
u32x4::const_new(59661487, 48406612, 31533614, 7071217), + ), + u32x4x2::new( + u32x4::const_new(13104676, 1406631, 24326736, 19854367), + u32x4::const_new(61039528, 11019904, 31967425, 19219275), + ), + u32x4x2::new( + u32x4::const_new(39003597, 30143957, 15351834, 8639435), + u32x4::const_new(57309582, 61436794, 15830475, 10090318), + ), + u32x4x2::new( + u32x4::const_new(45923044, 6700175, 99413, 21263025), + u32x4::const_new(23762647, 53905481, 6063914, 10065424), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(42822326, 57678669, 4052879, 25452667), + u32x4::const_new(54049411, 2373092, 22337016, 7701046), + ), + u32x4x2::new( + u32x4::const_new(44382355, 43307377, 16761537, 30373573), + u32x4::const_new(49790216, 23230748, 25655306, 10519391), + ), + u32x4x2::new( + u32x4::const_new(919475, 59371245, 1273450, 25558666), + u32x4::const_new(9724711, 8556709, 25755845, 10887647), + ), + u32x4x2::new( + u32x4::const_new(25465699, 44651158, 17658392, 11257418), + u32x4::const_new(29735193, 22885150, 7094716, 26828565), + ), + u32x4x2::new( + u32x4::const_new(48237389, 47661599, 27054393, 7328070), + u32x4::const_new(27280193, 65616691, 23062005, 4170709), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(26535281, 60238317, 30343788, 25790743), + u32x4::const_new(37993933, 24614372, 9523840, 10401918), + ), + u32x4x2::new( + u32x4::const_new(2783987, 29468958, 4697011, 19804475), + u32x4::const_new(37246678, 46797720, 10261254, 18942252), + ), + u32x4x2::new( + u32x4::const_new(58135580, 60247753, 25301938, 6844561), + u32x4::const_new(20949454, 39844754, 4552026, 919057), + ), + u32x4x2::new( + u32x4::const_new(6694071, 44126261, 32285330, 31370180), + u32x4::const_new(24603698, 53328179, 13971149, 5325636), + ), + u32x4x2::new( + u32x4::const_new(64879487, 582094, 17982081, 19190425), + u32x4::const_new(24951286, 26923842, 29077174, 33286062), + ), + ])), + CachedPoint(FieldElement2625x4([ + 
u32x4x2::new( + u32x4::const_new(54863941, 67016431, 1224043, 23371240), + u32x4::const_new(62940074, 52101083, 13523637, 30366406), + ), + u32x4x2::new( + u32x4::const_new(36324581, 25407485, 18258623, 4698602), + u32x4::const_new(50300544, 2658516, 26300935, 2611030), + ), + u32x4x2::new( + u32x4::const_new(27183975, 21791014, 18105064, 9875199), + u32x4::const_new(58118912, 54198635, 6400311, 14767984), + ), + u32x4x2::new( + u32x4::const_new(33918318, 42937962, 14809334, 22136592), + u32x4::const_new(10636588, 29082337, 29829692, 28549776), + ), + u32x4x2::new( + u32x4::const_new(61080905, 854212, 12202487, 20004503), + u32x4::const_new(9256495, 6903981, 20567109, 347423), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(41391822, 34336880, 22362564, 14247996), + u32x4::const_new(12115604, 41583344, 7639288, 28910945), + ), + u32x4x2::new( + u32x4::const_new(62066617, 59758859, 26665947, 11614812), + u32x4::const_new(65737664, 45704543, 30324810, 12868376), + ), + u32x4x2::new( + u32x4::const_new(17491771, 43589814, 9454919, 26047850), + u32x4::const_new(52629282, 39304244, 3868968, 19296062), + ), + u32x4x2::new( + u32x4::const_new(17826638, 30413590, 32534225, 32741469), + u32x4::const_new(15012391, 14365713, 33039233, 14791399), + ), + u32x4x2::new( + u32x4::const_new(64115596, 59197067, 32739005, 23275744), + u32x4::const_new(32954320, 22241406, 20788442, 4942942), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(31956192, 59570132, 2784352, 4237732), + u32x4::const_new(47222312, 4860927, 18658867, 15279314), + ), + u32x4x2::new( + u32x4::const_new(63240583, 28160478, 23524941, 13390861), + u32x4::const_new(66437406, 57718120, 33345312, 28896298), + ), + u32x4x2::new( + u32x4::const_new(39026193, 46239965, 21440243, 25070488), + u32x4::const_new(64012383, 60999016, 16517060, 29565907), + ), + u32x4x2::new( + u32x4::const_new(18118181, 60161496, 4212092, 23976240), + u32x4::const_new(36277753, 
62363144, 5816868, 16964362), + ), + u32x4x2::new( + u32x4::const_new(18196138, 62490693, 281468, 7934713), + u32x4::const_new(56027312, 62015725, 4837237, 32932252), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(29885826, 51028067, 30418143, 33438769), + u32x4::const_new(62542283, 39442528, 31535876, 143299), + ), + u32x4x2::new( + u32x4::const_new(17143063, 56709783, 14451852, 15782104), + u32x4::const_new(32762665, 14047066, 26295037, 5432487), + ), + u32x4x2::new( + u32x4::const_new(75151, 533606, 7539077, 30926189), + u32x4::const_new(38410914, 23771680, 4872443, 29199566), + ), + u32x4x2::new( + u32x4::const_new(61522396, 48934708, 16223126, 207380), + u32x4::const_new(11171993, 47975147, 14164574, 352966), + ), + u32x4x2::new( + u32x4::const_new(15449006, 56530757, 26796528, 12045834), + u32x4::const_new(63738697, 40667227, 33001582, 9101885), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(43331297, 18431341, 25801195, 17267698), + u32x4::const_new(19365485, 57295202, 22218985, 21284590), + ), + u32x4x2::new( + u32x4::const_new(2429849, 19152559, 10762172, 22564684), + u32x4::const_new(21880390, 66866426, 20357935, 22641906), + ), + u32x4x2::new( + u32x4::const_new(19771185, 31652693, 3666117, 28136958), + u32x4::const_new(23624283, 55101502, 6313920, 6783662), + ), + u32x4x2::new( + u32x4::const_new(3487137, 7092443, 11001876, 26196524), + u32x4::const_new(47319246, 44542068, 17594073, 15027760), + ), + u32x4x2::new( + u32x4::const_new(49563607, 32191113, 4991283, 25400512), + u32x4::const_new(46539152, 4155103, 32368171, 201203), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(20548943, 14334571, 4073874, 6368588), + u32x4::const_new(53208883, 56484515, 15970071, 25561889), + ), + u32x4x2::new( + u32x4::const_new(49915097, 44030795, 11202344, 29284344), + u32x4::const_new(60258023, 66225712, 8075764, 12383512), + ), + u32x4x2::new( + 
u32x4::const_new(45248912, 4933668, 9592153, 5819559), + u32x4::const_new(31030983, 38174071, 32435814, 7442522), + ), + u32x4x2::new( + u32x4::const_new(62688129, 48218381, 22089545, 12897361), + u32x4::const_new(21050881, 34278889, 7569163, 3225449), + ), + u32x4x2::new( + u32x4::const_new(19050183, 51089071, 32935757, 22640195), + u32x4::const_new(66122318, 47144608, 18743677, 25177079), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(41186817, 46681702, 31819867, 32997133), + u32x4::const_new(38559207, 27147015, 30293819, 16762988), + ), + u32x4x2::new( + u32x4::const_new(24154689, 51762873, 23883879, 13510519), + u32x4::const_new(55338250, 61224161, 11663149, 30803960), + ), + u32x4x2::new( + u32x4::const_new(18104238, 14117824, 11724021, 21362053), + u32x4::const_new(65704761, 35530242, 13498058, 33522849), + ), + u32x4x2::new( + u32x4::const_new(63812888, 23995539, 28920539, 24005193), + u32x4::const_new(26412223, 36582218, 4251418, 26160309), + ), + u32x4x2::new( + u32x4::const_new(16822053, 66064082, 3482145, 31979593), + u32x4::const_new(45937188, 54475379, 612917, 7976478), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(46509314, 55327128, 8944536, 274914), + u32x4::const_new(26432930, 53829300, 21192572, 3569894), + ), + u32x4x2::new( + u32x4::const_new(20919764, 64356651, 30642344, 17215170), + u32x4::const_new(20335124, 11203745, 18663316, 19024174), + ), + u32x4x2::new( + u32x4::const_new(59297055, 53842463, 3680204, 9806710), + u32x4::const_new(54004169, 51484914, 29807998, 20134199), + ), + u32x4x2::new( + u32x4::const_new(14781592, 22628010, 26877930, 25880359), + u32x4::const_new(30434803, 190607, 30184292, 8991040), + ), + u32x4x2::new( + u32x4::const_new(64400983, 64591751, 854562, 28216111), + u32x4::const_new(20010398, 50414793, 9803872, 22687008), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(15091184, 32550863, 8818643, 4244752), + 
u32x4::const_new(43123513, 64565526, 408838, 13206998), + ), + u32x4x2::new( + u32x4::const_new(16405061, 60379639, 31489017, 20949281), + u32x4::const_new(27568751, 38734986, 8364264, 12451020), + ), + u32x4x2::new( + u32x4::const_new(16005217, 58008076, 1406778, 26546927), + u32x4::const_new(39571784, 56365493, 31274296, 8918790), + ), + u32x4x2::new( + u32x4::const_new(23271122, 19453469, 27718201, 32742670), + u32x4::const_new(234332, 36785342, 22601675, 14331046), + ), + u32x4x2::new( + u32x4::const_new(40636025, 22442705, 22115403, 23745859), + u32x4::const_new(41164945, 61012, 12499614, 542137), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(62776018, 32835413, 17373246, 17187309), + u32x4::const_new(54469193, 21770290, 15923753, 28996575), + ), + u32x4x2::new( + u32x4::const_new(59385210, 63082298, 12568449, 8509004), + u32x4::const_new(9483342, 16105238, 5756054, 26890758), + ), + u32x4x2::new( + u32x4::const_new(53987996, 38201748, 5521661, 19060159), + u32x4::const_new(18663191, 9093637, 27786835, 31189196), + ), + u32x4x2::new( + u32x4::const_new(65872678, 43635130, 27903055, 25020300), + u32x4::const_new(65772737, 38110437, 5213502, 21909342), + ), + u32x4x2::new( + u32x4::const_new(4438979, 9680838, 10212446, 4764184), + u32x4::const_new(13235684, 58245995, 20264570, 21024049), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(60835961, 48209103, 31049052, 4688268), + u32x4::const_new(12426713, 59829045, 22302488, 29008521), + ), + u32x4x2::new( + u32x4::const_new(50401667, 29716596, 23531224, 7581281), + u32x4::const_new(49071895, 6952617, 14934683, 8218256), + ), + u32x4x2::new( + u32x4::const_new(1601446, 36631413, 31774811, 29625330), + u32x4::const_new(56786114, 8331539, 23129509, 19783344), + ), + u32x4x2::new( + u32x4::const_new(59514327, 64513110, 1772300, 5701338), + u32x4::const_new(5737511, 16147555, 9461515, 5703271), + ), + u32x4x2::new( + u32x4::const_new(33072974, 
54300426, 11940114, 1308663), + u32x4::const_new(15627555, 4931627, 28443714, 20924342), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(18135013, 20358426, 4922557, 10015355), + u32x4::const_new(65729669, 34786528, 26248549, 29194359), + ), + u32x4x2::new( + u32x4::const_new(797666, 34997544, 24316856, 25107230), + u32x4::const_new(24612576, 4761401, 15307321, 32404252), + ), + u32x4x2::new( + u32x4::const_new(16501152, 60565831, 9487105, 9316022), + u32x4::const_new(24986054, 31917592, 3962024, 2501883), + ), + u32x4x2::new( + u32x4::const_new(63356796, 50432342, 18044926, 30566881), + u32x4::const_new(42032028, 31415202, 13524600, 16119907), + ), + u32x4x2::new( + u32x4::const_new(3927286, 57022374, 9265437, 21620772), + u32x4::const_new(19481940, 3806938, 24836192, 14572399), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(10785787, 46564798, 368445, 33181384), + u32x4::const_new(5319843, 52687136, 30347110, 29837357), + ), + u32x4x2::new( + u32x4::const_new(56436732, 47859251, 24141084, 22250712), + u32x4::const_new(59046084, 4963427, 33463413, 17168859), + ), + u32x4x2::new( + u32x4::const_new(15512044, 6366740, 4737504, 27644548), + u32x4::const_new(30307977, 25037929, 14593903, 12836490), + ), + u32x4x2::new( + u32x4::const_new(63878897, 34013023, 5860752, 7244096), + u32x4::const_new(3689461, 57012135, 18389096, 11589351), + ), + u32x4x2::new( + u32x4::const_new(4682110, 36302830, 653422, 22316819), + u32x4::const_new(14081831, 5657024, 11088376, 24110612), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(39907267, 45940262, 24887471, 18342609), + u32x4::const_new(878445, 40456159, 12019082, 345107), + ), + u32x4x2::new( + u32x4::const_new(12794982, 28893944, 9447505, 11387200), + u32x4::const_new(16961963, 13916996, 10893728, 25898006), + ), + u32x4x2::new( + u32x4::const_new(44934162, 53465865, 3583620, 1102334), + u32x4::const_new(53917811, 63478576, 
2426066, 10389549), + ), + u32x4x2::new( + u32x4::const_new(45096036, 37595344, 19367718, 20257175), + u32x4::const_new(10280866, 41653449, 27665642, 375926), + ), + u32x4x2::new( + u32x4::const_new(45847901, 24064074, 32494820, 32204556), + u32x4::const_new(10720704, 51079060, 1297436, 29853825), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(66303987, 36060363, 16494578, 24962147), + u32x4::const_new(11971403, 49538586, 25060560, 1964341), + ), + u32x4x2::new( + u32x4::const_new(25988481, 27641502, 24909517, 27237087), + u32x4::const_new(66646363, 52777626, 16360849, 10459972), + ), + u32x4x2::new( + u32x4::const_new(43930529, 34374176, 31225968, 8807030), + u32x4::const_new(10394758, 35904854, 25325589, 19335583), + ), + u32x4x2::new( + u32x4::const_new(25094697, 34380951, 20051185, 32287161), + u32x4::const_new(11739332, 53887441, 30517319, 26601892), + ), + u32x4x2::new( + u32x4::const_new(8868546, 35635502, 32513071, 28248087), + u32x4::const_new(51946989, 14222744, 19198839, 23261841), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(51218008, 5070126, 11046681, 5320810), + u32x4::const_new(61212079, 34104447, 23895089, 6460727), + ), + u32x4x2::new( + u32x4::const_new(39843528, 46278671, 10426120, 25624792), + u32x4::const_new(66658766, 37140083, 28933107, 12969597), + ), + u32x4x2::new( + u32x4::const_new(59635793, 40220191, 5751421, 173680), + u32x4::const_new(58321825, 740337, 1412847, 7682623), + ), + u32x4x2::new( + u32x4::const_new(975962, 56440763, 20812276, 22631115), + u32x4::const_new(49095824, 19883130, 2419746, 31043648), + ), + u32x4x2::new( + u32x4::const_new(66208703, 39669328, 22525915, 3748897), + u32x4::const_new(65994776, 34533552, 8126286, 18326047), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(64176557, 3912400, 19351673, 30068471), + u32x4::const_new(31190055, 24221683, 33142424, 28698542), + ), + u32x4x2::new( + 
u32x4::const_new(34784792, 4109933, 3867193, 19557314), + u32x4::const_new(2112512, 32715890, 24550117, 16595976), + ), + u32x4x2::new( + u32x4::const_new(35542761, 48024875, 10925431, 31526577), + u32x4::const_new(66577735, 23189821, 13375709, 1735095), + ), + u32x4x2::new( + u32x4::const_new(59699254, 43854093, 29783239, 24777271), + u32x4::const_new(19600372, 39924461, 2896720, 1472185), + ), + u32x4x2::new( + u32x4::const_new(56389656, 35980854, 33172342, 1370336), + u32x4::const_new(23707480, 57654949, 7850973, 12655016), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(38372660, 57101970, 7044964, 12732710), + u32x4::const_new(57535705, 6043201, 30858914, 10946592), + ), + u32x4x2::new( + u32x4::const_new(21023468, 6946992, 26403324, 23901823), + u32x4::const_new(35695559, 23440687, 4763891, 6514074), + ), + u32x4x2::new( + u32x4::const_new(28662273, 30933699, 9352242, 26354829), + u32x4::const_new(37402243, 3145176, 8770289, 525937), + ), + u32x4x2::new( + u32x4::const_new(54933102, 36695832, 3281859, 4755022), + u32x4::const_new(23043294, 32794379, 15618886, 23602412), + ), + u32x4x2::new( + u32x4::const_new(9931565, 29897140, 2480737, 24193701), + u32x4::const_new(7833615, 2284939, 893926, 13421882), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(22917795, 22088359, 28978099, 19794863), + u32x4::const_new(60542318, 29878494, 31053731, 9080720), + ), + u32x4x2::new( + u32x4::const_new(23679072, 52547035, 28424916, 20647332), + u32x4::const_new(4008761, 28267029, 12961289, 1589095), + ), + u32x4x2::new( + u32x4::const_new(55616194, 26678929, 14998265, 23274397), + u32x4::const_new(54625466, 46244264, 28627706, 33030665), + ), + u32x4x2::new( + u32x4::const_new(11527330, 6449415, 26531607, 3472938), + u32x4::const_new(41541592, 62607682, 19862690, 20564723), + ), + u32x4x2::new( + u32x4::const_new(32843805, 49066843, 28425824, 19521495), + u32x4::const_new(48792073, 48242878, 27392443, 
13175986), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(16185025, 61537525, 2961305, 1492442), + u32x4::const_new(25123147, 3095034, 31896958, 33089615), + ), + u32x4x2::new( + u32x4::const_new(64748157, 18336595, 16522231, 25426312), + u32x4::const_new(65718949, 35485695, 30554083, 10205918), + ), + u32x4x2::new( + u32x4::const_new(39626934, 39271045, 16420458, 9826240), + u32x4::const_new(56483981, 27128085, 3783403, 13360006), + ), + u32x4x2::new( + u32x4::const_new(30793778, 66771960, 17241420, 6564573), + u32x4::const_new(61102581, 29974476, 32385512, 9011754), + ), + u32x4x2::new( + u32x4::const_new(28068166, 11862220, 14323567, 12380617), + u32x4::const_new(52090465, 16029056, 24495309, 21409233), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(59411973, 57437124, 11695483, 17586857), + u32x4::const_new(16108987, 43449109, 31098002, 6248476), + ), + u32x4x2::new( + u32x4::const_new(42258047, 61595931, 29308533, 11742653), + u32x4::const_new(43042345, 27373650, 30165249, 21929989), + ), + u32x4x2::new( + u32x4::const_new(49907221, 9620337, 21888081, 20981082), + u32x4::const_new(56288861, 61562203, 33223566, 3582446), + ), + u32x4x2::new( + u32x4::const_new(57535017, 41003416, 22080416, 14463796), + u32x4::const_new(65518565, 18127889, 24370863, 33332664), + ), + u32x4x2::new( + u32x4::const_new(66655380, 6430175, 471782, 11947673), + u32x4::const_new(30596400, 18898659, 15930721, 4211851), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(6757410, 65455566, 13584784, 11362173), + u32x4::const_new(10797127, 24451471, 19541370, 29309435), + ), + u32x4x2::new( + u32x4::const_new(40360156, 17685025, 18326181, 3846903), + u32x4::const_new(13693365, 63049479, 31900359, 23385063), + ), + u32x4x2::new( + u32x4::const_new(52455038, 57513503, 22163311, 27095042), + u32x4::const_new(48610726, 66454160, 12085341, 26357004), + ), + u32x4x2::new( + 
u32x4::const_new(22097042, 14063840, 6705778, 14342902), + u32x4::const_new(66139825, 20702105, 31279090, 7495745), + ), + u32x4x2::new( + u32x4::const_new(27360710, 49314837, 18774847, 7146436), + u32x4::const_new(37066216, 42004961, 22409916, 10524446), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(1497507, 33054449, 11839906, 2960428), + u32x4::const_new(40538463, 18884538, 25018820, 4073970), + ), + u32x4x2::new( + u32x4::const_new(54484385, 43640735, 2808257, 20710708), + u32x4::const_new(39840730, 27222424, 21783544, 11848522), + ), + u32x4x2::new( + u32x4::const_new(45765237, 48200555, 9299019, 9393151), + u32x4::const_new(34818188, 56098995, 13575233, 21012731), + ), + u32x4x2::new( + u32x4::const_new(4265428, 49627650, 24960282, 9425650), + u32x4::const_new(47883651, 2797524, 11853190, 22877329), + ), + u32x4x2::new( + u32x4::const_new(25008173, 64199503, 380047, 12107343), + u32x4::const_new(12329448, 11914399, 764281, 29687002), + ), + ])), + CachedPoint(FieldElement2625x4([ + u32x4x2::new( + u32x4::const_new(35889734, 23047226, 4022841, 7017445), + u32x4::const_new(7274086, 53316179, 25100176, 15310676), + ), + u32x4x2::new( + u32x4::const_new(42409427, 30270106, 6823853, 31551384), + u32x4::const_new(40645017, 66489807, 18021817, 32669351), + ), + u32x4x2::new( + u32x4::const_new(39827134, 43680850, 28297996, 20258133), + u32x4::const_new(26058742, 52643238, 22238331, 21690533), + ), + u32x4x2::new( + u32x4::const_new(60808002, 17499995, 30042246, 29310584), + u32x4::const_new(48219954, 29389518, 8680514, 17844709), + ), + u32x4x2::new( + u32x4::const_new(6452896, 50116553, 9532047, 26821214), + u32x4::const_new(44524351, 50428429, 21904953, 12608048), + ), + ])), +]); diff --git a/curve25519-dalek/src/backend/vector/neon/edwards.rs b/curve25519-dalek/src/backend/vector/neon/edwards.rs new file mode 100644 index 000000000..ac773ec6a --- /dev/null +++ b/curve25519-dalek/src/backend/vector/neon/edwards.rs @@ -0,0 
+1,566 @@ +// -*- mode: rust; -*- +// +// This file is part of curve25519-dalek. +// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence +// 2021-2022 Robrecht Blancquaert +// See LICENSE for licensing information. +// +// Authors: +// - Isis Agora Lovecruft +// - Henry de Valence +// - Robrecht Blancquaert + +//! Parallel Edwards Arithmetic for Curve25519. +//! +//! This module currently has two point types: +//! +//! * `ExtendedPoint`: a point stored in vector-friendly format, with +//! vectorized doubling and addition; +//! +//! * `CachedPoint`: used for readdition. +//! +//! Details on the formulas can be found in the documentation for the +//! `avx2` module. +//! +//! Similar to field, code for this was mostly copied from avx2 module. + +#![allow(non_snake_case)] + +use core::convert::From; +use core::ops::{Add, Neg, Sub}; + +use curve25519_dalek_derive::unsafe_target_feature; +use subtle::Choice; +use subtle::ConditionallySelectable; + +use crate::edwards; +use crate::window::{LookupTable, NafLookupTable5, NafLookupTable8}; + +use crate::traits::Identity; + +use super::constants; +use super::field::{FieldElement2625x4, Lanes, Shuffle}; + +/// A point on Curve25519, using parallel Edwards formulas for curve +/// operations. +/// +/// # Invariant +/// +/// The coefficients of an `ExtendedPoint` are bounded with +/// \\( b < 0.007 \\). 
+#[derive(Copy, Clone, Debug)] +pub struct ExtendedPoint(pub(super) FieldElement2625x4); + +#[unsafe_target_feature("neon")] +impl From for ExtendedPoint { + fn from(P: edwards::EdwardsPoint) -> ExtendedPoint { + ExtendedPoint(FieldElement2625x4::new(&P.X, &P.Y, &P.Z, &P.T)) + } +} + +#[unsafe_target_feature("neon")] +impl From for edwards::EdwardsPoint { + fn from(P: ExtendedPoint) -> edwards::EdwardsPoint { + let tmp = P.0.split(); + edwards::EdwardsPoint { + X: tmp[0], + Y: tmp[1], + Z: tmp[2], + T: tmp[3], + } + } +} + +#[unsafe_target_feature("neon")] +impl ConditionallySelectable for ExtendedPoint { + fn conditional_select(a: &Self, b: &Self, choice: Choice) -> Self { + ExtendedPoint(FieldElement2625x4::conditional_select(&a.0, &b.0, choice)) + } + + fn conditional_assign(&mut self, other: &Self, choice: Choice) { + self.0.conditional_assign(&other.0, choice); + } +} + +#[unsafe_target_feature("neon")] +impl Default for ExtendedPoint { + fn default() -> ExtendedPoint { + ExtendedPoint::identity() + } +} + +#[unsafe_target_feature("neon")] +impl Identity for ExtendedPoint { + fn identity() -> ExtendedPoint { + constants::EXTENDEDPOINT_IDENTITY + } +} + +#[unsafe_target_feature("neon")] +impl ExtendedPoint { + /// Compute the double of this point. + pub fn double(&self) -> ExtendedPoint { + // Want to compute (X1 Y1 Z1 X1+Y1). + // Not sure how to do this less expensively than computing + // (X1 Y1 Z1 T1) --(256bit shuffle)--> (X1 Y1 X1 Y1) + // (X1 Y1 X1 Y1) --(2x128b shuffle)--> (Y1 X1 Y1 X1) + // and then adding. + + // Set tmp0 = (X1 Y1 X1 Y1) + let mut tmp0 = self.0.shuffle(Shuffle::ABAB); + + // Set tmp1 = (Y1 X1 Y1 X1) + let mut tmp1 = tmp0.shuffle(Shuffle::BADC); + + // Set tmp0 = (X1 Y1 Z1 X1+Y1) + tmp0 = self.0.blend(tmp0 + tmp1, Lanes::D); + + // Set tmp1 = tmp0^2, negating the D values + tmp1 = tmp0.square_and_negate_D(); + // Now tmp1 = (S1 S2 S3 -S4) with b < 0.007 + + // See discussion of bounds in the module-level documentation. 
+ // We want to compute + // + // + | S1 | S1 | S1 | S1 | + // + | S2 | | | S2 | + // + | | | S3 | | + // + | | | S3 | | + // + | | | |-S4 | + // + | | 2p | 2p | | + // - | | S2 | S2 | | + // ======================= + // S5 S6 S8 S9 + + let zero = FieldElement2625x4::zero(); + let S_1 = tmp1.shuffle(Shuffle::AAAA); + let S_2 = tmp1.shuffle(Shuffle::BBBB); + + tmp0 = zero.blend(tmp1 + tmp1, Lanes::C); + // tmp0 = (0, 0, 2S3, 0) + tmp0 = tmp0.blend(tmp1, Lanes::D); + // tmp0 = (0, 0, 2S3, -S4) + tmp0 = tmp0 + S_1; + // tmp0 = ( S1, S1, S1 + 2S3, S1 - S4) + tmp0 = tmp0 + zero.blend(S_2, Lanes::AD); + // tmp0 = (S1 + S2, S1, S1 + 2S3, S1 + S2 - S4) + tmp0 = tmp0 + zero.blend(S_2.negate_lazy(), Lanes::BC); + // tmp0 = (S1 + S2, S1 - S2, S1 - S2 + 2S3, S1 + S2 - S4) + // b < ( 1.01, 1.6, 2.33, 1.6) + // Now tmp0 = (S5, S6, S8, S9) + + // Set tmp1 = ( S9, S6, S6, S9) + // b < ( 1.6, 1.6, 1.6, 1.6) + tmp1 = tmp0.shuffle(Shuffle::DBBD); + // Set tmp0 = ( S8, S5, S8, S5) + // b < (2.33, 1.01, 2.33, 1.01) + tmp0 = tmp0.shuffle(Shuffle::CACA); + + // Bounds on (tmp0, tmp1) are (2.33, 1.6) < (2.5, 1.75). + ExtendedPoint(&tmp0 * &tmp1) + } + + pub fn mul_by_pow_2(&self, k: u32) -> ExtendedPoint { + let mut tmp: ExtendedPoint = *self; + for _ in 0..k { + tmp = tmp.double(); + } + tmp + } +} + +/// A cached point with some precomputed variables used for readdition. +/// +/// # Warning +/// +/// It is not safe to negate this point more than once. +/// +/// # Invariant +/// +/// As long as the `CachedPoint` is not repeatedly negated, its +/// coefficients will be bounded with \\( b < 1.0 \\). 
+#[derive(Copy, Clone, Debug)] +pub struct CachedPoint(pub(super) FieldElement2625x4); + +#[unsafe_target_feature("neon")] +impl From for CachedPoint { + fn from(P: ExtendedPoint) -> CachedPoint { + let mut x = P.0; + + x = x.blend(x.diff_sum(), Lanes::AB); + // x = (Y2 - X2, Y2 + X2, Z2, T2) = (S2 S3 Z2 T2) + + x = x * (121666, 121666, 2 * 121666, 2 * 121665); + // x = (121666*S2 121666*S3 2*121666*Z2 2*121665*T2) + + x = x.blend(-x, Lanes::D); + // x = (121666*S2 121666*S3 2*121666*Z2 -2*121665*T2) + + // The coefficients of the output are bounded with b < 0.007. + CachedPoint(x) + } +} + +#[unsafe_target_feature("neon")] +impl Default for CachedPoint { + fn default() -> CachedPoint { + CachedPoint::identity() + } +} + +#[unsafe_target_feature("neon")] +impl Identity for CachedPoint { + fn identity() -> CachedPoint { + constants::CACHEDPOINT_IDENTITY + } +} + +#[unsafe_target_feature("neon")] +impl ConditionallySelectable for CachedPoint { + fn conditional_select(a: &Self, b: &Self, choice: Choice) -> Self { + CachedPoint(FieldElement2625x4::conditional_select(&a.0, &b.0, choice)) + } + + fn conditional_assign(&mut self, other: &Self, choice: Choice) { + self.0.conditional_assign(&other.0, choice); + } +} + +#[unsafe_target_feature("neon")] +impl<'a> Neg for &'a CachedPoint { + type Output = CachedPoint; + /// Lazily negate the point. + /// + /// # Warning + /// + /// Because this method does not perform a reduction, it is not + /// safe to repeatedly negate a point. + fn neg(self) -> CachedPoint { + let swapped = self.0.shuffle(Shuffle::BACD); + CachedPoint(swapped.blend(swapped.negate_lazy(), Lanes::D)) + } +} + +#[unsafe_target_feature("neon")] +impl<'a, 'b> Add<&'b CachedPoint> for &'a ExtendedPoint { + type Output = ExtendedPoint; + + /// Add an `ExtendedPoint` and a `CachedPoint`. + fn add(self, other: &'b CachedPoint) -> ExtendedPoint { + // The coefficients of an `ExtendedPoint` are reduced after + // every operation. 
If the `CachedPoint` was negated, its + // coefficients grow by one bit. So on input, `self` is + // bounded with `b < 0.007` and `other` is bounded with + // `b < 1.0`. + + let mut tmp = self.0; + + tmp = tmp.blend(tmp.diff_sum(), Lanes::AB); + // tmp = (Y1-X1 Y1+X1 Z1 T1) = (S0 S1 Z1 T1) with b < 1.6 + + // (tmp, other) bounded with b < (1.6, 1.0) < (2.5, 1.75). + tmp = &tmp * &other.0; + // tmp = (S0*S2' S1*S3' Z1*Z2' T1*T2') = (S8 S9 S10 S11) + + tmp = tmp.shuffle(Shuffle::ABDC); + // tmp = (S8 S9 S11 S10) + + tmp = tmp.diff_sum(); + // tmp = (S9-S8 S9+S8 S10-S11 S10+S11) = (S12 S13 S14 S15) + + let t0 = tmp.shuffle(Shuffle::ADDA); + // t0 = (S12 S15 S15 S12) + let t1 = tmp.shuffle(Shuffle::CBCB); + // t1 = (S14 S13 S14 S13) + + // All coefficients of t0, t1 are bounded with b < 1.6. + // Return (S12*S14 S15*S13 S15*S14 S12*S13) = (X3 Y3 Z3 T3) + ExtendedPoint(&t0 * &t1) + } +} + +#[unsafe_target_feature("neon")] +impl<'a, 'b> Sub<&'b CachedPoint> for &'a ExtendedPoint { + type Output = ExtendedPoint; + + /// Implement subtraction by negating the point and adding. + /// + /// Empirically, this seems about the same cost as a custom + /// subtraction impl (maybe because the benefit is cancelled by + /// increased code size?) 
+ fn sub(self, other: &'b CachedPoint) -> ExtendedPoint { + self + &(-other) + } +} + +#[unsafe_target_feature("neon")] +impl<'a> From<&'a edwards::EdwardsPoint> for LookupTable { + fn from(point: &'a edwards::EdwardsPoint) -> Self { + let P = ExtendedPoint::from(*point); + let mut points = [CachedPoint::from(P); 8]; + for i in 0..7 { + points[i + 1] = (&P + &points[i]).into(); + } + LookupTable(points) + } +} + +#[unsafe_target_feature("neon")] +impl<'a> From<&'a edwards::EdwardsPoint> for NafLookupTable5 { + fn from(point: &'a edwards::EdwardsPoint) -> Self { + let A = ExtendedPoint::from(*point); + let mut Ai = [CachedPoint::from(A); 8]; + let A2 = A.double(); + for i in 0..7 { + Ai[i + 1] = (&A2 + &Ai[i]).into(); + } + // Now Ai = [A, 3A, 5A, 7A, 9A, 11A, 13A, 15A] + NafLookupTable5(Ai) + } +} + +#[unsafe_target_feature("neon")] +impl<'a> From<&'a edwards::EdwardsPoint> for NafLookupTable8 { + fn from(point: &'a edwards::EdwardsPoint) -> Self { + let A = ExtendedPoint::from(*point); + let mut Ai = [CachedPoint::from(A); 64]; + let A2 = A.double(); + for i in 0..63 { + Ai[i + 1] = (&A2 + &Ai[i]).into(); + } + // Now Ai = [A, 3A, 5A, 7A, 9A, 11A, 13A, 15A, ..., 127A] + NafLookupTable8(Ai) + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[rustfmt::skip] // Skip S8..S11 formatting + fn serial_add(P: edwards::EdwardsPoint, Q: edwards::EdwardsPoint) -> edwards::EdwardsPoint { + use crate::backend::serial::u64::field::FieldElement51; + + let (X1, Y1, Z1, T1) = (P.X, P.Y, P.Z, P.T); + let (X2, Y2, Z2, T2) = (Q.X, Q.Y, Q.Z, Q.T); + + macro_rules! 
print_var { + ($x:ident) => { + println!("{} = {:?}", stringify!($x), $x.as_bytes()); + }; + } + + let S0 = &Y1 - &X1; // R1 + let S1 = &Y1 + &X1; // R3 + let S2 = &Y2 - &X2; // R2 + let S3 = &Y2 + &X2; // R4 + print_var!(S0); + print_var!(S1); + print_var!(S2); + print_var!(S3); + println!(""); + + let S4 = &S0 * &S2; // R5 = R1 * R2 + let S5 = &S1 * &S3; // R6 = R3 * R4 + let S6 = &Z1 * &Z2; // R8 + let S7 = &T1 * &T2; // R7 + print_var!(S4); + print_var!(S5); + print_var!(S6); + print_var!(S7); + println!(""); + + let S8 = &S4 * &FieldElement51([ 121666,0,0,0,0]); // R5 + let S9 = &S5 * &FieldElement51([ 121666,0,0,0,0]); // R6 + let S10 = &S6 * &FieldElement51([2*121666,0,0,0,0]); // R8 + let S11 = &S7 * &(-&FieldElement51([2*121665,0,0,0,0])); // R7 + print_var!(S8); + print_var!(S9); + print_var!(S10); + print_var!(S11); + println!(""); + + let S12 = &S9 - &S8; // R1 + let S13 = &S9 + &S8; // R4 + let S14 = &S10 - &S11; // R2 + let S15 = &S10 + &S11; // R3 + print_var!(S12); + print_var!(S13); + print_var!(S14); + print_var!(S15); + println!(""); + + let X3 = &S12 * &S14; // R1 * R2 + let Y3 = &S15 * &S13; // R3 * R4 + let Z3 = &S15 * &S14; // R2 * R3 + let T3 = &S12 * &S13; // R1 * R4 + + edwards::EdwardsPoint { + X: X3, + Y: Y3, + Z: Z3, + T: T3, + } + } + + fn addition_test_helper(P: edwards::EdwardsPoint, Q: edwards::EdwardsPoint) { + // Test the serial implementation of the parallel addition formulas + let R_serial: edwards::EdwardsPoint = serial_add(P.into(), Q.into()).into(); + + // Test the vector implementation of the parallel readdition formulas + let cached_Q = CachedPoint::from(ExtendedPoint::from(Q)); + let R_vector: edwards::EdwardsPoint = (&ExtendedPoint::from(P) + &cached_Q).into(); + let S_vector: edwards::EdwardsPoint = (&ExtendedPoint::from(P) - &cached_Q).into(); + + println!("Testing point addition:"); + println!("P = {:?}", P); + println!("Q = {:?}", Q); + println!("cached Q = {:?}", cached_Q); + println!("R = P + Q = {:?}", &P + &Q); + 
println!("R_serial = {:?}", R_serial); + println!("R_vector = {:?}", R_vector); + println!("S = P - Q = {:?}", &P - &Q); + println!("S_vector = {:?}", S_vector); + assert_eq!(R_serial.compress(), (&P + &Q).compress()); + assert_eq!(R_vector.compress(), (&P + &Q).compress()); + assert_eq!(S_vector.compress(), (&P - &Q).compress()); + println!("OK!\n"); + } + + #[test] + fn vector_addition_vs_serial_addition_vs_edwards_extendedpoint() { + use crate::constants; + use crate::scalar::Scalar; + + println!("Testing id +- id"); + let P = edwards::EdwardsPoint::identity(); + let Q = edwards::EdwardsPoint::identity(); + addition_test_helper(P, Q); + + println!("Testing id +- B"); + let P = edwards::EdwardsPoint::identity(); + let Q = constants::ED25519_BASEPOINT_POINT; + addition_test_helper(P, Q); + + println!("Testing B +- B"); + let P = constants::ED25519_BASEPOINT_POINT; + let Q = constants::ED25519_BASEPOINT_POINT; + addition_test_helper(P, Q); + + println!("Testing B +- kB"); + let P = constants::ED25519_BASEPOINT_POINT; + let Q = constants::ED25519_BASEPOINT_TABLE * &Scalar::from(8475983829u64); + addition_test_helper(P, Q); + } + + fn serial_double(P: edwards::EdwardsPoint) -> edwards::EdwardsPoint { + let (X1, Y1, Z1, _T1) = (P.X, P.Y, P.Z, P.T); + + macro_rules! 
print_var { + ($x:ident) => { + println!("{} = {:?}", stringify!($x), $x.as_bytes()); + }; + } + + let S0 = &X1 + &Y1; // R1 + print_var!(S0); + println!(""); + + let S1 = X1.square(); + let S2 = Y1.square(); + let S3 = Z1.square(); + let S4 = S0.square(); + print_var!(S1); + print_var!(S2); + print_var!(S3); + print_var!(S4); + println!(""); + + let S5 = &S1 + &S2; + let S6 = &S1 - &S2; + let S7 = &S3 + &S3; + let S8 = &S7 + &S6; + let S9 = &S5 - &S4; + print_var!(S5); + print_var!(S6); + print_var!(S7); + print_var!(S8); + print_var!(S9); + println!(""); + + let X3 = &S8 * &S9; + let Y3 = &S5 * &S6; + let Z3 = &S8 * &S6; + let T3 = &S5 * &S9; + + edwards::EdwardsPoint { + X: X3, + Y: Y3, + Z: Z3, + T: T3, + } + } + + fn doubling_test_helper(P: edwards::EdwardsPoint) { + let R1: edwards::EdwardsPoint = serial_double(P.into()).into(); + let R2: edwards::EdwardsPoint = ExtendedPoint::from(P).double().into(); + println!("Testing point doubling:"); + println!("P = {:?}", P); + println!("(serial) R1 = {:?}", R1); + println!("(vector) R2 = {:?}", R2); + println!("P + P = {:?}", &P + &P); + assert_eq!(R1.compress(), (&P + &P).compress()); + assert_eq!(R2.compress(), (&P + &P).compress()); + println!("OK!\n"); + } + + #[test] + fn vector_doubling_vs_serial_doubling_vs_edwards_extendedpoint() { + use crate::constants; + use crate::scalar::Scalar; + + println!("Testing [2]id"); + let P = edwards::EdwardsPoint::identity(); + doubling_test_helper(P); + + println!("Testing [2]B"); + let P = constants::ED25519_BASEPOINT_POINT; + doubling_test_helper(P); + + println!("Testing [2]([k]B)"); + let P = constants::ED25519_BASEPOINT_TABLE * &Scalar::from(8475983829u64); + doubling_test_helper(P); + } + + #[test] + fn basepoint_odd_lookup_table_verify() { + use crate::backend::vector::neon::constants::BASEPOINT_ODD_LOOKUP_TABLE; + use crate::constants; + + let basepoint_odd_table = + NafLookupTable8::::from(&constants::ED25519_BASEPOINT_POINT); + println!("Testing basepoint table"); + 
+ let table_B = &BASEPOINT_ODD_LOOKUP_TABLE; + for (b_vec, base_vec) in table_B.0.iter().zip(basepoint_odd_table.0.iter()) { + println!("aa"); + let b_splits = b_vec.0.split(); + let base_splits = base_vec.0.split(); + + println!("{:?}", base_splits[0]); + println!("{:?}", base_splits[1]); + println!("{:?}", base_splits[2]); + println!("{:?}", base_splits[3]); + println!("----"); + println!("{:?}", b_splits[0]); + println!("{:?}", b_splits[1]); + println!("{:?}", b_splits[2]); + println!("{:?}", b_splits[3]); + + assert_eq!(base_splits[0], b_splits[0]); + assert_eq!(base_splits[1], b_splits[1]); + assert_eq!(base_splits[2], b_splits[2]); + assert_eq!(base_splits[3], b_splits[3]); + } + } +} diff --git a/curve25519-dalek/src/backend/vector/neon/field.rs b/curve25519-dalek/src/backend/vector/neon/field.rs new file mode 100644 index 000000000..fa951eb36 --- /dev/null +++ b/curve25519-dalek/src/backend/vector/neon/field.rs @@ -0,0 +1,856 @@ +// -*- mode: rust; -*- +// +// This file is part of curve25519-dalek. +// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence +// 2021-2022 Robrecht Blancquaert +// See LICENSE for licensing information. +// +// Authors: +// - Isis Agora Lovecruft +// - Henry de Valence +// - Robrecht Blancquaert + +//! More details on the algorithms can be found in the `avx2` +//! module. Here comments are mostly added only when needed +//! to explain differenes between the 'base' avx2 version and +//! this re-implementation for arm neon. + +//! The most major difference is the split of one vector of 8 +//! limbs into to vectors holding 4 limbs each. For the rest +//! changes where made to account for different structure in +//! arm instructions. 
+ +use core::ops::{Add, Mul, Neg}; + +use super::packed_simd::{i32x4, u32x2, u32x2x2, u32x4, u32x4x2, u64x2, u64x2x2}; +use crate::backend::serial::u64::field::FieldElement51; +use crate::backend::vector::neon::constants::{ + P_TIMES_16_HI, P_TIMES_16_LO, P_TIMES_2_HI, P_TIMES_2_LO, +}; + +#[cfg(target_arch = "aarch64")] +use core::arch::aarch64 as core_neon; +#[cfg(target_arch = "arm")] +use core::arch::arm as core_neon; + +use core_neon::{ + uint32x2_t, uint32x4_t, vcombine_u32, vmull_u32, vmulq_n_u32, vqshlq_u32, + vreinterpretq_u32_u64, vsubq_u64, vtrn1_u32, vuzp1_u32, +}; + +#[cfg(target_arch = "arm")] +use core::arch::arm::{vget_high_u32, vget_low_u32}; + +#[cfg(all(target_arch = "aarch64"))] +#[inline(always)] +fn vget_high_u32(v: uint32x4_t) -> uint32x2_t { + use core::arch::asm; + let o; + unsafe { + asm! ( + "DUP {o:d}, {v}.D[1]", + v = in(vreg) v, + o = out(vreg) o, + ) + } + o +} + +#[cfg(all(target_arch = "aarch64"))] +#[inline(always)] +fn vget_low_u32(v: uint32x4_t) -> uint32x2_t { + use core::arch::asm; + let o; + unsafe { + asm! ( + "DUP {o:d}, {v}.D[0]", + v = in(vreg) v, + o = out(vreg) o, + ) + } + o +} + +// Shuffle the lanes in a u32x4x2 +macro_rules! shuffle { + ($vec:expr , $index:expr) => { + unsafe { + let v_n: [u32; 8] = [ + $vec.extract::<0>(), + $vec.extract::<1>(), + $vec.extract::<2>(), + $vec.extract::<3>(), + $vec.extract::<4>(), + $vec.extract::<5>(), + $vec.extract::<6>(), + $vec.extract::<7>(), + ]; + u32x4x2::new( + core::mem::transmute::<[u32; 4], u32x4>([ + v_n[$index[0]], + v_n[$index[1]], + v_n[$index[2]], + v_n[$index[3]], + ]), + core::mem::transmute::<[u32; 4], u32x4>([ + v_n[$index[4]], + v_n[$index[5]], + v_n[$index[6]], + v_n[$index[7]], + ]), + ) + } + }; +} + +// Blend the lanes of two u32x4 +macro_rules! 
blend { + ($vec0: expr, $vec1: expr, $index:expr) => { + unsafe { + let v_n: [u32; 8] = [ + $vec0.extract::<0>(), + $vec0.extract::<1>(), + $vec0.extract::<2>(), + $vec0.extract::<3>(), + $vec1.extract::<0>(), + $vec1.extract::<1>(), + $vec1.extract::<2>(), + $vec1.extract::<3>(), + ]; + core::mem::transmute::<[u32; 4], u32x4>([ + v_n[$index[0]], + v_n[$index[1]], + v_n[$index[2]], + v_n[$index[3]], + ]) + } + }; +} + +/// Unpack 32-bit lanes: +/// ((a0, b0, a1, b1) ,(c0, d0, c1, d1)) +/// into +/// ((a0, b0), (c0, d0)) +/// ((a1, b1), (c1, d1)) +#[inline(always)] +fn unpack_pair(src: u32x4x2) -> (u32x2x2, u32x2x2) { + let a0: u32x2; + let a1: u32x2; + let b0: u32x2; + let b1: u32x2; + unsafe { + a0 = vget_low_u32(src.0 .0).into(); + a1 = vget_low_u32(src.0 .1).into(); + b0 = vget_high_u32(src.0 .0).into(); + b1 = vget_high_u32(src.0 .1).into(); + } + return (u32x2x2::new(a0, a1), u32x2x2::new(b0, b1)); +} + +/// ((a0, 0, b0, 0), (c0, 0, d0, 0)) +/// ((a1, 0, b1, 0), (c1, 0, d1, 0)) +/// into +/// ((a0, b0, a1, b1), (c0, d0, c1, d1)) +#[inline(always)] +#[rustfmt::skip] // Retain formatting of the return tuples +fn repack_pair(x: u32x4x2, y: u32x4x2) -> u32x4x2 { + unsafe { + u32x4x2::new( + vcombine_u32( + vtrn1_u32(vget_low_u32(x.0.0), vget_high_u32(x.0.0)), + vtrn1_u32(vget_low_u32(y.0.0), vget_high_u32(y.0.0))).into(), + vcombine_u32( + vtrn1_u32(vget_low_u32(x.0.1), vget_high_u32(x.0.1)), + vtrn1_u32(vget_low_u32(y.0.1), vget_high_u32(y.0.1))).into(), + ) + } +} + +#[derive(Copy, Clone, Debug)] +pub enum Lanes { + C, + D, + AB, + AC, + CD, + AD, + BC, + ABCD, +} + +#[derive(Copy, Clone, Debug)] +pub enum Shuffle { + AAAA, + BBBB, + CACA, + DBBD, + ADDA, + CBCB, + ABAB, + BADC, + BACD, + ABDC, +} + +#[derive(Clone, Copy, Debug)] +pub struct FieldElement2625x4(pub(crate) [u32x4x2; 5]); + +use subtle::Choice; +use subtle::ConditionallySelectable; + +impl ConditionallySelectable for FieldElement2625x4 { + fn conditional_select( + a: &FieldElement2625x4, + b: 
&FieldElement2625x4, + choice: Choice, + ) -> FieldElement2625x4 { + let mask = (-(choice.unwrap_u8() as i32)) as u32; + let mask_vec = u32x4x2::splat(mask); + FieldElement2625x4([ + a.0[0] ^ (mask_vec & (a.0[0] ^ b.0[0])), + a.0[1] ^ (mask_vec & (a.0[1] ^ b.0[1])), + a.0[2] ^ (mask_vec & (a.0[2] ^ b.0[2])), + a.0[3] ^ (mask_vec & (a.0[3] ^ b.0[3])), + a.0[4] ^ (mask_vec & (a.0[4] ^ b.0[4])), + ]) + } + + fn conditional_assign(&mut self, other: &FieldElement2625x4, choice: Choice) { + let mask = (-(choice.unwrap_u8() as i32)) as u32; + let mask_vec = u32x4x2::splat(mask); + self.0[0] ^= mask_vec & (self.0[0] ^ other.0[0]); + self.0[1] ^= mask_vec & (self.0[1] ^ other.0[1]); + self.0[2] ^= mask_vec & (self.0[2] ^ other.0[2]); + self.0[3] ^= mask_vec & (self.0[3] ^ other.0[3]); + self.0[4] ^= mask_vec & (self.0[4] ^ other.0[4]); + } +} + +impl FieldElement2625x4 { + pub fn split(&self) -> [FieldElement51; 4] { + let mut out = [FieldElement51::ZERO; 4]; + for i in 0..5 { + let a_2i = self.0[i].extract::<0>() as u64; + let b_2i = self.0[i].extract::<1>() as u64; + let a_2i_1 = self.0[i].extract::<2>() as u64; + let b_2i_1 = self.0[i].extract::<3>() as u64; + let c_2i = self.0[i].extract::<4>() as u64; + let d_2i = self.0[i].extract::<5>() as u64; + let c_2i_1 = self.0[i].extract::<6>() as u64; + let d_2i_1 = self.0[i].extract::<7>() as u64; + + out[0].0[i] = a_2i + (a_2i_1 << 26); + out[1].0[i] = b_2i + (b_2i_1 << 26); + out[2].0[i] = c_2i + (c_2i_1 << 26); + out[3].0[i] = d_2i + (d_2i_1 << 26); + } + + out + } + + #[inline] + pub fn shuffle(&self, control: Shuffle) -> FieldElement2625x4 { + #[inline(always)] + #[rustfmt::skip] // Retain format of the return tuples + fn shuffle_lanes(x: u32x4x2, control: Shuffle) -> u32x4x2 { + match control { + Shuffle::AAAA => shuffle!(x, [0, 0, 2, 2, 0, 0, 2, 2]), + Shuffle::BBBB => shuffle!(x, [1, 1, 3, 3, 1, 1, 3, 3]), + Shuffle::CACA => shuffle!(x, [4, 0, 6, 2, 4, 0, 6, 2]), + Shuffle::DBBD => shuffle!(x, [5, 1, 7, 3, 1, 5, 3, 
7]), + Shuffle::ADDA => shuffle!(x, [0, 5, 2, 7, 5, 0, 7, 2]), + Shuffle::CBCB => shuffle!(x, [4, 1, 6, 3, 4, 1, 6, 3]), + Shuffle::ABAB => shuffle!(x, [0, 1, 2, 3, 0, 1, 2, 3]), + Shuffle::BADC => shuffle!(x, [1, 0, 3, 2, 5, 4, 7, 6]), + Shuffle::BACD => shuffle!(x, [1, 0, 3, 2, 4, 5, 6, 7]), + Shuffle::ABDC => shuffle!(x, [0, 1, 2, 3, 5, 4, 7, 6]), + } + } + + FieldElement2625x4([ + shuffle_lanes(self.0[0], control), + shuffle_lanes(self.0[1], control), + shuffle_lanes(self.0[2], control), + shuffle_lanes(self.0[3], control), + shuffle_lanes(self.0[4], control), + ]) + } + + pub fn shuffleABAB(&self) -> FieldElement2625x4 { + self.shuffle(Shuffle::ABAB) + } + + pub fn shuffleBACD(&self) -> FieldElement2625x4 { + self.shuffle(Shuffle::BACD) + } + + // Can probably be sped up using multiple vset/vget instead of table + #[inline] + pub fn blend(&self, other: FieldElement2625x4, control: Lanes) -> FieldElement2625x4 { + #[inline(always)] + #[rustfmt::skip] // Retain format of the return tuples + fn blend_lanes(x: u32x4x2, y: u32x4x2, control: Lanes) -> u32x4x2 { + let x0 = u32x4::from(x.0.0); + let x1 = u32x4::from(x.0.1); + let y0 = u32x4::from(y.0.0); + let y1 = u32x4::from(y.0.1); + match control { + Lanes::C => u32x4x2::new(x0, blend!(y1, x1, [0, 5, 2, 7])), + Lanes::D => u32x4x2::new(x0, blend!(y1, x1, [4, 1, 6, 3])), + Lanes::AD => u32x4x2::new(blend!(y0, x0, [0, 5, 2, 7]), blend!(y1, x1, [4, 1, 6, 3])), + Lanes::AB => u32x4x2::new(y0, x1), + Lanes::AC => u32x4x2::new(blend!(y0, x0, [0, 5, 2, 7]), blend!(y1, x1, [0, 5, 2, 7])), + Lanes::CD => u32x4x2::new(x0, y1), + Lanes::BC => u32x4x2::new(blend!(y0, x0, [4, 1, 6, 3]), blend!(y1, x1, [0, 5, 2, 7])), + Lanes::ABCD => y, + } + } + + FieldElement2625x4([ + blend_lanes(self.0[0], other.0[0], control), + blend_lanes(self.0[1], other.0[1], control), + blend_lanes(self.0[2], other.0[2], control), + blend_lanes(self.0[3], other.0[3], control), + blend_lanes(self.0[4], other.0[4], control), + ]) + } + + pub fn zero() 
-> FieldElement2625x4 { + FieldElement2625x4([u32x4x2::splat(0); 5]) + } + + pub fn splat(x: &FieldElement51) -> FieldElement2625x4 { + FieldElement2625x4::new(x, x, x, x) + } + + pub fn new( + x0: &FieldElement51, + x1: &FieldElement51, + x2: &FieldElement51, + x3: &FieldElement51, + ) -> FieldElement2625x4 { + let mut buf = [u32x4x2::splat(0); 5]; + let low_26_bits = (1 << 26) - 1; + for i in 0..5 { + let a_2i = (x0.0[i] & low_26_bits) as u32; + let a_2i_1 = (x0.0[i] >> 26) as u32; + let b_2i = (x1.0[i] & low_26_bits) as u32; + let b_2i_1 = (x1.0[i] >> 26) as u32; + let c_2i = (x2.0[i] & low_26_bits) as u32; + let c_2i_1 = (x2.0[i] >> 26) as u32; + let d_2i = (x3.0[i] & low_26_bits) as u32; + let d_2i_1 = (x3.0[i] >> 26) as u32; + + buf[i] = u32x4x2::new( + u32x4::new(a_2i, b_2i, a_2i_1, b_2i_1), + u32x4::new(c_2i, d_2i, c_2i_1, d_2i_1), + ); + } + return FieldElement2625x4(buf).reduce(); + } + + #[inline] + pub fn negate_lazy(&self) -> FieldElement2625x4 { + FieldElement2625x4([ + P_TIMES_2_LO - self.0[0], + P_TIMES_2_HI - self.0[1], + P_TIMES_2_HI - self.0[2], + P_TIMES_2_HI - self.0[3], + P_TIMES_2_HI - self.0[4], + ]) + } + + #[inline] + pub fn diff_sum(&self) -> FieldElement2625x4 { + let tmp1 = self.shuffle(Shuffle::BADC); + let tmp2 = self.blend(self.negate_lazy(), Lanes::AC); + tmp1 + tmp2 + } + + pub fn reduce(&self) -> FieldElement2625x4 { + // Negated for shift right instead of left + let shifts = ( + i32x4::new(-26, -26, -25, -25), + i32x4::new(-26, -26, -25, -25), + ); + let masks = u32x4x2::new( + u32x4::new((1 << 26) - 1, (1 << 26) - 1, (1 << 25) - 1, (1 << 25) - 1), + u32x4::new((1 << 26) - 1, (1 << 26) - 1, (1 << 25) - 1, (1 << 25) - 1), + ); + + // Use mutliple transposes instead of table lookup? 
+ let rotated_carryout = |v: u32x4x2| -> u32x4x2 { + unsafe { + let c: u32x4x2 = u32x4x2::new( + vqshlq_u32(v.0 .0, shifts.0.into()).into(), + vqshlq_u32(v.0 .1, shifts.1.into()).into(), + ); + u32x4x2::new( + vcombine_u32(vget_high_u32(c.0 .0), vget_low_u32(c.0 .0)).into(), + vcombine_u32(vget_high_u32(c.0 .1), vget_low_u32(c.0 .1)).into(), + ) + } + }; + + let combine = |v_lo: u32x4x2, v_hi: u32x4x2| -> u32x4x2 { + unsafe { + u32x4x2::new( + vcombine_u32(vget_low_u32(v_lo.0 .0), vget_high_u32(v_hi.0 .0)).into(), + vcombine_u32(vget_low_u32(v_lo.0 .1), vget_high_u32(v_hi.0 .1)).into(), + ) + } + }; + + let mut v = self.0; + + let c10 = rotated_carryout(v[0]); + let mut com = combine(u32x4x2::splat(0), c10); + v[0] = (v[0] & masks) + com; + + let c32 = rotated_carryout(v[1]); + com = combine(c10, c32); + v[1] = (v[1] & masks) + com; + + let c54 = rotated_carryout(v[2]); + com = combine(c32, c54); + v[2] = (v[2] & masks) + com; + + let c76 = rotated_carryout(v[3]); + com = combine(c54, c76); + v[3] = (v[3] & masks) + com; + + let c98 = rotated_carryout(v[4]); + com = combine(c76, c98); + v[4] = (v[4] & masks) + com; + + #[rustfmt::skip] // Retain formatting of return tuple + let c9_19: u32x4x2 = unsafe { + let c9_19_spread: u32x4x2 = u32x4x2::new( + vmulq_n_u32(c98.0.0, 19).into(), + vmulq_n_u32(c98.0.1, 19).into(), + ); + + u32x4x2::new( + vcombine_u32(vget_low_u32(c9_19_spread.0.0), u32x2::splat(0).into()).into(), + vcombine_u32(vget_low_u32(c9_19_spread.0.1), u32x2::splat(0).into()).into()) + }; + v[0] = v[0] + c9_19; + + FieldElement2625x4(v) + } + + #[inline] + #[rustfmt::skip] // Retain formatting of carry and repacking + fn reduce64(mut z: [u64x2x2; 10]) -> FieldElement2625x4 { + #[allow(non_snake_case)] + let LOW_25_BITS: u64x2x2 = u64x2x2::splat((1 << 25) - 1); + #[allow(non_snake_case)] + let LOW_26_BITS: u64x2x2 = u64x2x2::splat((1 << 26) - 1); + + let carry = |z: &mut [u64x2x2; 10], i: usize| { + debug_assert!(i < 9); + if i % 2 == 0 { + z[i + 1] = z[i + 
1] + (z[i].shr::<26>()); + z[i] = z[i] & LOW_26_BITS; + } else { + z[i + 1] = z[i + 1] + (z[i].shr::<25>()); + z[i] = z[i] & LOW_25_BITS; + } + }; + + carry(&mut z, 0); carry(&mut z, 4); + carry(&mut z, 1); carry(&mut z, 5); + carry(&mut z, 2); carry(&mut z, 6); + carry(&mut z, 3); carry(&mut z, 7); + carry(&mut z, 4); carry(&mut z, 8); + + let c = z[9].shr::<25>(); + z[9] = z[9] & LOW_25_BITS; + let mut c0: u64x2x2 = c & LOW_26_BITS; + let mut c1: u64x2x2 = c.shr::<26>(); + + unsafe { + c0 = u64x2x2::new( + vmulq_n_u32(vreinterpretq_u32_u64(c0.0.0), 19).into(), + vmulq_n_u32(vreinterpretq_u32_u64(c0.0.1), 19).into()); + c1 = u64x2x2::new( + vmulq_n_u32(vreinterpretq_u32_u64(c1.0.0), 19).into(), + vmulq_n_u32(vreinterpretq_u32_u64(c1.0.1), 19).into()); + } + + z[0] = z[0] + c0; + z[1] = z[1] + c1; + carry(&mut z, 0); + + FieldElement2625x4([ + repack_pair(u32x4x2::new(z[0].0.0.into(), z[0].0.1.into()), u32x4x2::new(z[1].0.0.into(), z[1].0.1.into())), + repack_pair(u32x4x2::new(z[2].0.0.into(), z[2].0.1.into()), u32x4x2::new(z[3].0.0.into(), z[3].0.1.into())), + repack_pair(u32x4x2::new(z[4].0.0.into(), z[4].0.1.into()), u32x4x2::new(z[5].0.0.into(), z[5].0.1.into())), + repack_pair(u32x4x2::new(z[6].0.0.into(), z[6].0.1.into()), u32x4x2::new(z[7].0.0.into(), z[7].0.1.into())), + repack_pair(u32x4x2::new(z[8].0.0.into(), z[8].0.1.into()), u32x4x2::new(z[9].0.0.into(), z[9].0.1.into())), + ]) + } + + #[allow(non_snake_case)] + #[rustfmt::skip] // keep alignment of formulas + pub fn square_and_negate_D(&self) -> FieldElement2625x4 { + #[inline(always)] + fn m(x: u32x2x2, y: u32x2x2) -> u64x2x2 { + unsafe { + let z0: u64x2 = vmull_u32(x.0.0, y.0.0).into(); + let z1: u64x2 = vmull_u32(x.0.1, y.0.1).into(); + u64x2x2::new(z0, z1) + } + } + + #[inline(always)] + fn m_lo(x: u32x2x2, y: u32x2x2) -> u32x2x2 { + unsafe { + let x: u32x4x2 = u32x4x2::new( + vmull_u32(x.0.0, y.0.0).into(), + vmull_u32(x.0.1, y.0.1).into(), + ); + u32x2x2::new( + vuzp1_u32(vget_low_u32(x.0.0), 
vget_high_u32(x.0.0)).into(), + vuzp1_u32(vget_low_u32(x.0.1), vget_high_u32(x.0.1)).into() + ) + } + } + + let v19 = u32x2x2::new(u32x2::new(19, 19), u32x2::new(19, 19)); + + let (x0, x1) = unpack_pair(self.0[0]); + let (x2, x3) = unpack_pair(self.0[1]); + let (x4, x5) = unpack_pair(self.0[2]); + let (x6, x7) = unpack_pair(self.0[3]); + let (x8, x9) = unpack_pair(self.0[4]); + + let x0_2 = x0.shl::<1>(); + let x1_2 = x1.shl::<1>(); + let x2_2 = x2.shl::<1>(); + let x3_2 = x3.shl::<1>(); + let x4_2 = x4.shl::<1>(); + let x5_2 = x5.shl::<1>(); + let x6_2 = x6.shl::<1>(); + let x7_2 = x7.shl::<1>(); + + let x5_19 = m_lo(v19, x5); + let x6_19 = m_lo(v19, x6); + let x7_19 = m_lo(v19, x7); + let x8_19 = m_lo(v19, x8); + let x9_19 = m_lo(v19, x9); + + let z0 = m(x0, x0) + m(x2_2,x8_19) + m(x4_2,x6_19) + ((m(x1_2,x9_19) + m(x3_2,x7_19) + m(x5,x5_19)).shl::<1>()); + let z1 = m(x0_2,x1) + m(x3_2,x8_19) + m(x5_2,x6_19) + ((m(x2,x9_19) + m(x4,x7_19)).shl::<1>()); + let z2 = m(x0_2,x2) + m(x1_2,x1) + m(x4_2,x8_19) + m(x6,x6_19) + ((m(x3_2,x9_19) + m(x5_2,x7_19)).shl::<1>()); + let z3 = m(x0_2,x3) + m(x1_2,x2) + m(x5_2,x8_19) + ((m(x4,x9_19) + m(x6,x7_19)).shl::<1>()); + let z4 = m(x0_2,x4) + m(x1_2,x3_2) + m(x2, x2) + m(x6_2,x8_19) + ((m(x5_2,x9_19) + m(x7,x7_19)).shl::<1>()); + let z5 = m(x0_2,x5) + m(x1_2,x4) + m(x2_2,x3) + m(x7_2,x8_19) + ((m(x6,x9_19)).shl::<1>()); + let z6 = m(x0_2,x6) + m(x1_2,x5_2) + m(x2_2,x4) + m(x3_2,x3) + m(x8,x8_19) + ((m(x7_2,x9_19)).shl::<1>()); + let z7 = m(x0_2,x7) + m(x1_2,x6) + m(x2_2,x5) + m(x3_2,x4) + ((m(x8,x9_19)).shl::<1>()); + let z8 = m(x0_2,x8) + m(x1_2,x7_2) + m(x2_2,x6) + m(x3_2,x5_2) + m(x4,x4) + ((m(x9,x9_19)).shl::<1>()); + let z9 = m(x0_2,x9) + m(x1_2,x8) + m(x2_2,x7) + m(x3_2,x6) + m(x4_2,x5); + + + let low__p37 = u64x2x2::splat(0x3ffffed << 37); + let even_p37 = u64x2x2::splat(0x3ffffff << 37); + let odd__p37 = u64x2x2::splat(0x1ffffff << 37); + + let negate_D = |x_01: u64x2x2, p_01: u64x2x2| -> u64x2x2 { + unsafe { + + 
u64x2x2::new(u64x2(x_01.0.0), + vcombine_u32( + vget_low_u32(vreinterpretq_u32_u64(x_01.0.1)), + vget_high_u32(vreinterpretq_u32_u64(vsubq_u64(p_01.0.1, x_01.0.1)))).into()) + } + }; + + let z0s = negate_D(z0, low__p37); + let z1s = negate_D(z1, odd__p37); + let z2s = negate_D(z2, even_p37); + let z3s = negate_D(z3, odd__p37); + let z4s = negate_D(z4, even_p37); + let z5s = negate_D(z5, odd__p37); + let z6s = negate_D(z6, even_p37); + let z7s = negate_D(z7, odd__p37); + let z8s = negate_D(z8, even_p37); + let z9s = negate_D(z9, odd__p37); + + FieldElement2625x4::reduce64([z0s, z1s, z2s, z3s, z4s, z5s, z6s, z7s, z8s, z9s]) + } +} + +impl Neg for FieldElement2625x4 { + type Output = FieldElement2625x4; + #[inline] + fn neg(self) -> FieldElement2625x4 { + FieldElement2625x4([ + P_TIMES_16_LO - self.0[0], + P_TIMES_16_HI - self.0[1], + P_TIMES_16_HI - self.0[2], + P_TIMES_16_HI - self.0[3], + P_TIMES_16_HI - self.0[4], + ]) + .reduce() + } +} + +impl Add for FieldElement2625x4 { + type Output = FieldElement2625x4; + #[inline] + fn add(self, rhs: FieldElement2625x4) -> FieldElement2625x4 { + FieldElement2625x4([ + self.0[0] + rhs.0[0], + self.0[1] + rhs.0[1], + self.0[2] + rhs.0[2], + self.0[3] + rhs.0[3], + self.0[4] + rhs.0[4], + ]) + } +} + +impl Mul<(u32, u32, u32, u32)> for FieldElement2625x4 { + type Output = FieldElement2625x4; + #[inline] + #[rustfmt::skip] // Retain formatting of packing + fn mul(self, scalars: (u32, u32, u32, u32)) -> FieldElement2625x4 { + unsafe { + let consts = ( + u32x2::new(scalars.0, scalars.1), + u32x2::new(scalars.2, scalars.3), + ); + + let (b0, b1) = unpack_pair(self.0[0]); + let (b2, b3) = unpack_pair(self.0[1]); + let (b4, b5) = unpack_pair(self.0[2]); + let (b6, b7) = unpack_pair(self.0[3]); + let (b8, b9) = unpack_pair(self.0[4]); + + FieldElement2625x4::reduce64([ + u64x2x2::new(vmull_u32(b0.0.0, consts.0.into()).into(), vmull_u32(b0.0.1, consts.1.into()).into()), + u64x2x2::new(vmull_u32(b1.0.0, consts.0.into()).into(), 
vmull_u32(b1.0.1, consts.1.into()).into()), + u64x2x2::new(vmull_u32(b2.0.0, consts.0.into()).into(), vmull_u32(b2.0.1, consts.1.into()).into()), + u64x2x2::new(vmull_u32(b3.0.0, consts.0.into()).into(), vmull_u32(b3.0.1, consts.1.into()).into()), + u64x2x2::new(vmull_u32(b4.0.0, consts.0.into()).into(), vmull_u32(b4.0.1, consts.1.into()).into()), + u64x2x2::new(vmull_u32(b5.0.0, consts.0.into()).into(), vmull_u32(b5.0.1, consts.1.into()).into()), + u64x2x2::new(vmull_u32(b6.0.0, consts.0.into()).into(), vmull_u32(b6.0.1, consts.1.into()).into()), + u64x2x2::new(vmull_u32(b7.0.0, consts.0.into()).into(), vmull_u32(b7.0.1, consts.1.into()).into()), + u64x2x2::new(vmull_u32(b8.0.0, consts.0.into()).into(), vmull_u32(b8.0.1, consts.1.into()).into()), + u64x2x2::new(vmull_u32(b9.0.0, consts.0.into()).into(), vmull_u32(b9.0.1, consts.1.into()).into()) + ]) + } + } +} + +impl<'a, 'b> Mul<&'b FieldElement2625x4> for &'a FieldElement2625x4 { + type Output = FieldElement2625x4; + + #[rustfmt::skip] // Retain formatting of z_i computation + fn mul(self, rhs: &'b FieldElement2625x4) -> FieldElement2625x4 { + #[inline(always)] + fn m(x: u32x2x2, y: u32x2x2) -> u64x2x2 { + unsafe { + let z0: u64x2 = vmull_u32(x.0.0, y.0.0).into(); + let z1: u64x2 = vmull_u32(x.0.1, y.0.1).into(); + u64x2x2::new(z0, z1) + } + } + + #[inline(always)] + fn m_lo(x: u32x2x2, y: u32x2x2) -> u32x2x2 { + unsafe { + let x: u32x4x2 = u32x4x2::new( + vmull_u32(x.0.0, y.0.0).into(), + vmull_u32(x.0.1, y.0.1).into(), + ); + u32x2x2::new( + vuzp1_u32(vget_low_u32(x.0.0), vget_high_u32(x.0.0)).into(), + vuzp1_u32(vget_low_u32(x.0.1), vget_high_u32(x.0.1)).into() + ) + } + } + + + let (x0, x1) = unpack_pair(self.0[0]); + let (x2, x3) = unpack_pair(self.0[1]); + let (x4, x5) = unpack_pair(self.0[2]); + let (x6, x7) = unpack_pair(self.0[3]); + let (x8, x9) = unpack_pair(self.0[4]); + + let (y0, y1) = unpack_pair(rhs.0[0]); + let (y2, y3) = unpack_pair(rhs.0[1]); + let (y4, y5) = unpack_pair(rhs.0[2]); + let (y6, 
y7) = unpack_pair(rhs.0[3]); + let (y8, y9) = unpack_pair(rhs.0[4]); + + let v19 = u32x2x2::new(u32x2::new(19, 19), u32x2::new(19, 19)); + + let y1_19 = m_lo(v19, y1); + let y2_19 = m_lo(v19, y2); + let y3_19 = m_lo(v19, y3); + let y4_19 = m_lo(v19, y4); + let y5_19 = m_lo(v19, y5); + let y6_19 = m_lo(v19, y6); + let y7_19 = m_lo(v19, y7); + let y8_19 = m_lo(v19, y8); + let y9_19 = m_lo(v19, y9); + + let x1_2 = x1 + x1; + let x3_2 = x3 + x3; + let x5_2 = x5 + x5; + let x7_2 = x7 + x7; + let x9_2 = x9 + x9; + + let z0 = m(x0,y0) + m(x1_2,y9_19) + m(x2,y8_19) + m(x3_2,y7_19) + m(x4,y6_19) + m(x5_2,y5_19) + m(x6,y4_19) + m(x7_2,y3_19) + m(x8,y2_19) + m(x9_2,y1_19); + let z1 = m(x0,y1) + m(x1,y0) + m(x2,y9_19) + m(x3,y8_19) + m(x4,y7_19) + m(x5,y6_19) + m(x6,y5_19) + m(x7,y4_19) + m(x8,y3_19) + m(x9,y2_19); + let z2 = m(x0,y2) + m(x1_2,y1) + m(x2,y0) + m(x3_2,y9_19) + m(x4,y8_19) + m(x5_2,y7_19) + m(x6,y6_19) + m(x7_2,y5_19) + m(x8,y4_19) + m(x9_2,y3_19); + let z3 = m(x0,y3) + m(x1,y2) + m(x2,y1) + m(x3,y0) + m(x4,y9_19) + m(x5,y8_19) + m(x6,y7_19) + m(x7,y6_19) + m(x8,y5_19) + m(x9,y4_19); + let z4 = m(x0,y4) + m(x1_2,y3) + m(x2,y2) + m(x3_2,y1) + m(x4,y0) + m(x5_2,y9_19) + m(x6,y8_19) + m(x7_2,y7_19) + m(x8,y6_19) + m(x9_2,y5_19); + let z5 = m(x0,y5) + m(x1,y4) + m(x2,y3) + m(x3,y2) + m(x4,y1) + m(x5,y0) + m(x6,y9_19) + m(x7,y8_19) + m(x8,y7_19) + m(x9,y6_19); + let z6 = m(x0,y6) + m(x1_2,y5) + m(x2,y4) + m(x3_2,y3) + m(x4,y2) + m(x5_2,y1) + m(x6,y0) + m(x7_2,y9_19) + m(x8,y8_19) + m(x9_2,y7_19); + let z7 = m(x0,y7) + m(x1,y6) + m(x2,y5) + m(x3,y4) + m(x4,y3) + m(x5,y2) + m(x6,y1) + m(x7,y0) + m(x8,y9_19) + m(x9,y8_19); + let z8 = m(x0,y8) + m(x1_2,y7) + m(x2,y6) + m(x3_2,y5) + m(x4,y4) + m(x5_2,y3) + m(x6,y2) + m(x7_2,y1) + m(x8,y0) + m(x9_2,y9_19); + let z9 = m(x0,y9) + m(x1,y8) + m(x2,y7) + m(x3,y6) + m(x4,y5) + m(x5,y4) + m(x6,y3) + m(x7,y2) + m(x8,y1) + m(x9,y0); + + let f = |x: u64x2x2| -> u64x2x2 { + u64x2x2::new( + x.0.0.into(), + x.0.1.into() + ) + }; + + 
FieldElement2625x4::reduce64([ + f(z0), + f(z1), + f(z2), + f(z3), + f(z4), + f(z5), + f(z6), + f(z7), + f(z8), + f(z9), + ]) + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_unpack_repack_pair() { + let x0 = FieldElement51([10000 + (10001 << 26), 0, 0, 0, 0]); + let x1 = FieldElement51([10100 + (10101 << 26), 0, 0, 0, 0]); + let x2 = FieldElement51([10200 + (10201 << 26), 0, 0, 0, 0]); + let x3 = FieldElement51([10300 + (10301 << 26), 0, 0, 0, 0]); + + let vec = FieldElement2625x4::new(&x0, &x1, &x2, &x3); + + let src = vec.0[0]; + + let (a, b) = unpack_pair(src); + + let expected_a = u32x2x2::new(u32x2::new(10000, 10100), u32x2::new(10200, 10300)); + let expected_b = u32x2x2::new(u32x2::new(10001, 10101), u32x2::new(10201, 10301)); + + assert_eq!(a, expected_a); + assert_eq!(b, expected_b); + + let expected_src = repack_pair( + u32x4x2::new( + u32x4::new(a.extract::<0>(), 0, a.extract::<1>(), 0), + u32x4::new(a.extract::<2>(), 0, a.extract::<3>(), 0), + ), + u32x4x2::new( + u32x4::new(b.extract::<0>(), 0, b.extract::<1>(), 0), + u32x4::new(b.extract::<2>(), 0, b.extract::<3>(), 0), + ), + ); + + assert_eq!(src, expected_src); + } + + #[test] + fn scale_by_curve_constants() { + let mut x = FieldElement2625x4::splat(&FieldElement51::ONE); + + x = x * (121666, 121666, 2 * 121666, 2 * 121665); + + let xs = x.split(); + assert_eq!(xs[0], FieldElement51([121666, 0, 0, 0, 0])); + assert_eq!(xs[1], FieldElement51([121666, 0, 0, 0, 0])); + assert_eq!(xs[2], FieldElement51([2 * 121666, 0, 0, 0, 0])); + assert_eq!(xs[3], FieldElement51([2 * 121665, 0, 0, 0, 0])); + } + + #[test] + fn diff_sum_vs_serial() { + let x0 = FieldElement51([10000, 10001, 10002, 10003, 10004]); + let x1 = FieldElement51([10100, 10101, 10102, 10103, 10104]); + let x2 = FieldElement51([10200, 10201, 10202, 10203, 10204]); + let x3 = FieldElement51([10300, 10301, 10302, 10303, 10304]); + + let vec = FieldElement2625x4::new(&x0, &x1, &x2, &x3).diff_sum(); + + let result = 
vec.split(); + + assert_eq!(result[0], &x1 - &x0); + assert_eq!(result[1], &x1 + &x0); + assert_eq!(result[2], &x3 - &x2); + assert_eq!(result[3], &x3 + &x2); + } + + #[test] + fn square_vs_serial() { + let x0 = FieldElement51([10000, 10001, 10002, 10003, 10004]); + let x1 = FieldElement51([10100, 10101, 10102, 10103, 10104]); + let x2 = FieldElement51([10200, 10201, 10202, 10203, 10204]); + let x3 = FieldElement51([10300, 10301, 10302, 10303, 10304]); + + let vec = FieldElement2625x4::new(&x0, &x1, &x2, &x3); + + let result = vec.square_and_negate_D().split(); + + assert_eq!(result[0], &x0 * &x0); + assert_eq!(result[1], &x1 * &x1); + assert_eq!(result[2], &x2 * &x2); + assert_eq!(result[3], -&(&x3 * &x3)); + } + + #[test] + fn multiply_vs_serial() { + let x0 = FieldElement51([10000, 10001, 10002, 10003, 10004]); + let x1 = FieldElement51([10100, 10101, 10102, 10103, 10104]); + let x2 = FieldElement51([10200, 10201, 10202, 10203, 10204]); + let x3 = FieldElement51([10300, 10301, 10302, 10303, 10304]); + + let vec = FieldElement2625x4::new(&x0, &x1, &x2, &x3); + let vecprime = vec.clone(); + + let result = (&vec * &vecprime).split(); + + assert_eq!(result[0], &x0 * &x0); + assert_eq!(result[1], &x1 * &x1); + assert_eq!(result[2], &x2 * &x2); + assert_eq!(result[3], &x3 * &x3); + } + + #[test] + fn new_split_roundtrips() { + let x0 = FieldElement51::from_bytes(&[0x10; 32]); + let x1 = FieldElement51::from_bytes(&[0x11; 32]); + let x2 = FieldElement51::from_bytes(&[0x12; 32]); + let x3 = FieldElement51::from_bytes(&[0x13; 32]); + + let vec = FieldElement2625x4::new(&x0, &x1, &x2, &x3); + + let splits = vec.split(); + + assert_eq!(x0, splits[0]); + assert_eq!(x1, splits[1]); + assert_eq!(x2, splits[2]); + assert_eq!(x3, splits[3]); + } +} diff --git a/curve25519-dalek/src/backend/vector/neon/mod.rs b/curve25519-dalek/src/backend/vector/neon/mod.rs new file mode 100644 index 000000000..0e279f715 --- /dev/null +++ b/curve25519-dalek/src/backend/vector/neon/mod.rs @@ 
-0,0 +1,20 @@
+// -*- mode: rust; -*-
+//
+// This file is part of curve25519-dalek.
+// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
+// See LICENSE for licensing information.
+//
+// Authors:
+// - Isis Agora Lovecruft
+// - Henry de Valence
+// - Robrecht Blancquaert
+
+pub(crate) mod field;
+
+pub(crate) mod edwards;
+
+pub(crate) mod constants;
+
+pub(crate) use self::edwards::{CachedPoint, ExtendedPoint};
+
+pub mod packed_simd;
diff --git a/curve25519-dalek/src/backend/vector/neon/packed_simd.rs b/curve25519-dalek/src/backend/vector/neon/packed_simd.rs
new file mode 100644
index 000000000..6f0c71525
--- /dev/null
+++ b/curve25519-dalek/src/backend/vector/neon/packed_simd.rs
@@ -0,0 +1,548 @@
+// -*- mode: rust; -*-
+//
+// This file is part of curve25519-dalek.
+// See LICENSE for licensing information.
+
+//! This module defines wrappers over platform-specific SIMD types to make them
+//! more convenient to use.
+//!
+//! This is an adaptation of `crate::backend::vector::packed_simd.rs` for NEON.
+
+use core::ops::{Add, AddAssign, BitAnd, BitAndAssign, BitXor, BitXorAssign, Sub};
+
+#[cfg(target_arch = "aarch64")]
+use core::arch::aarch64 as neon;
+#[cfg(target_arch = "arm")]
+use core::arch::arm as neon;
+
+use core::simd::*;
+
+// Generates the newtype `$ty` wrapping `neon::$internal_ty` (lanes of type
+// `$lane_ty`) together with conversions to and from the raw vector, all-lane
+// equality, `+`, `-`, `&`, `^`, and const-generic lane extraction and shifts.
+// Each operation forwards to the intrinsic named in the invocation.
+// NOTE(review): the `unsafe` blocks presumably rely on NEON being available
+// wherever these types are used; confirm against the callers.
+macro_rules! impl_shared {
+    (
+        $ty:ident, // Name of the struct
+        $lane_ty:ident,
+        $internal_ty: ident,
+        $beq_intrinsic:ident,
+        $add_intrinsic:ident,
+        $sub_intrinsic:ident,
+        $and_intrinsic:ident,
+        $xor_intrinsic:ident,
+        $shl_intrinsic:ident,
+        $shr_intrinsic:ident,
+        $extract_intrinsic:ident
+    ) => {
+        #[allow(non_camel_case_types)]
+        #[derive(Copy, Clone, Debug)]
+        #[repr(transparent)]
+        pub struct $ty(pub neon::$internal_ty);
+
+        impl From<$ty> for neon::$internal_ty {
+            #[inline]
+            fn from(value: $ty) -> neon::$internal_ty {
+                value.0
+            }
+        }
+
+        impl From<neon::$internal_ty> for $ty {
+            #[inline]
+            fn from(value: neon::$internal_ty) -> $ty {
+                $ty(value)
+            }
+        }
+
+        impl PartialEq for $ty {
+            #[inline]
+            fn eq(&self, rhs: &$ty) -> bool {
+                // Equal only when *every* lane matches: the compare intrinsic sets
+                // a lane to all-ones on a match, so AND-fold the whole mask rather
+                // than testing lane 0 alone.
+                const LANES: usize = core::mem::size_of::<neon::$internal_ty>()
+                    / core::mem::size_of::<$lane_ty>();
+                // SAFETY: the mask vector and `[$lane_ty; LANES]` have the same
+                // size, and every bit pattern is a valid unsigned integer.
+                let mask: [$lane_ty; LANES] =
+                    unsafe { core::mem::transmute(neon::$beq_intrinsic(self.0, rhs.0)) };
+                mask.iter().fold($lane_ty::MAX, |acc, &lane| acc & lane) == $lane_ty::MAX
+            }
+        }
+
+        impl Eq for $ty {}
+
+        impl Add for $ty {
+            type Output = Self;
+
+            #[inline]
+            fn add(self, rhs: $ty) -> Self {
+                unsafe { neon::$add_intrinsic(self.0, rhs.0).into() }
+            }
+        }
+
+        impl AddAssign for $ty {
+            #[inline]
+            fn add_assign(&mut self, rhs: $ty) {
+                *self = *self + rhs
+            }
+        }
+
+        impl Sub for $ty {
+            type Output = Self;
+
+            #[inline]
+            fn sub(self, rhs: $ty) -> Self {
+                unsafe { neon::$sub_intrinsic(self.0, rhs.0).into() }
+            }
+        }
+
+        impl BitAnd for $ty {
+            type Output = Self;
+
+            #[inline]
+            fn bitand(self, rhs: $ty) -> Self {
+                unsafe { neon::$and_intrinsic(self.0, rhs.0).into() }
+            }
+        }
+
+        impl BitAndAssign for $ty {
+            #[inline]
+            fn bitand_assign(&mut self, rhs: $ty) {
+                *self = *self & rhs;
+            }
+        }
+
+        impl BitXor for $ty {
+            type Output = Self;
+
+            #[inline]
+            fn bitxor(self, rhs: $ty) -> Self {
+                unsafe { neon::$xor_intrinsic(self.0, rhs.0).into() }
+            }
+        }
+
+        impl BitXorAssign for $ty {
+            #[inline]
+            fn bitxor_assign(&mut self, rhs: $ty) {
+                *self = *self ^ rhs;
+            }
+        }
+
+        impl $ty {
+            /// Returns lane `N` of the vector.
+            #[inline]
+            pub fn extract<const N: i32>(self) -> $lane_ty {
+                unsafe { neon::$extract_intrinsic(self.0, N) as $lane_ty }
+            }
+
+            /// Shifts every lane left by `N` bits.
+            #[inline]
+            pub fn shl<const N: i32>(self) -> Self {
+                unsafe { neon::$shl_intrinsic(self.0, N).into() }
+            }
+
+            /// Shifts every lane right by `N` bits.
+            #[inline]
+            pub fn shr<const N: i32>(self) -> Self {
+                unsafe { neon::$shr_intrinsic(self.0, N).into() }
+            }
+        }
+    };
+}
+
+impl_shared!(
+    u32x4,
+    u32,
+    uint32x4_t,
+    vceqq_u32,
+    vaddq_u32,
+    vsubq_u32,
+    vandq_u32,
+    veorq_u32,
+    vshlq_n_u32,
+    vshrq_n_u32,
+    vgetq_lane_u32
+);
+
+impl u32x4 {
+    // SAFETY (all constructors below): `Self` is `#[repr(transparent)]` over
+    // `uint32x4_t`, and `transmute` only compiles when it matches `[u32; 4]` in size.
+    /// Builds a vector from four lane values.
+    #[inline]
+    pub fn new(x0: u32, x1: u32, x2: u32, x3: u32) -> Self {
+        unsafe { core::mem::transmute::<[u32; 4], Self>([x0, x1, x2, x3]) }
+    }
+
+    /// `const` counterpart of [`u32x4::new`].
+    #[inline]
+    pub const fn const_new(x0: u32, x1: u32, x2: u32, x3: u32) -> Self {
+        unsafe { core::mem::transmute::<[u32; 4], Self>([x0, x1, x2, x3]) }
+    }
+
+    /// Builds a vector with every lane set to `x`.
+    #[inline]
+    pub fn splat(x: u32) -> Self {
+        unsafe { core::mem::transmute::<[u32; 4], Self>([x, x, x, x]) }
+    }
+
+    /// `const` counterpart of [`u32x4::splat`].
+    #[inline]
+    pub const fn const_splat(x: u32) -> Self {
+        unsafe { core::mem::transmute::<[u32; 4], Self>([x, x, x, x]) }
+    }
+}
+
+/// Reinterprets the bits of a `u64x2` as a raw `uint32x4_t` (no value conversion).
+impl From<u64x2> for neon::uint32x4_t {
+    #[inline]
+    fn from(value: u64x2) -> neon::uint32x4_t {
+        unsafe { neon::vreinterpretq_u32_u64(value.into()) }
+    }
+}
+
+/// Reinterprets the bits of a raw `uint64x2_t` as a `u32x4` (no value conversion).
+impl From<neon::uint64x2_t> for u32x4 {
+    #[inline]
+    fn from(value: neon::uint64x2_t) -> u32x4 {
+        unsafe { neon::vreinterpretq_u32_u64(value).into() }
+    }
+}
+
+/// Reinterprets the bits of a `u64x2` as a `u32x4` (no value conversion).
+impl From<u64x2> for u32x4 {
+    #[inline]
+    fn from(value: u64x2) -> u32x4 {
+        Into::<neon::uint32x4_t>::into(value).into()
+    }
+}
+
+impl_shared!(
+    u32x2,
+    u32,
+    uint32x2_t,
+    vceq_u32,
+    vadd_u32,
+    vsub_u32,
+    vand_u32,
+    veor_u32,
+    vshl_n_u32,
+    vshr_n_u32,
+    vget_lane_u32
+);
+
+impl u32x2 {
+    /// Builds a vector from two lane values.
+    #[inline]
+    pub fn new(x0: u32, x1: u32) -> Self {
+        unsafe { core::mem::transmute::<[u32; 2], Self>([x0, x1]) }
+    }
+
+    /// Builds a vector with both lanes set to `x`.
+    #[inline]
+    pub fn splat(x: u32) -> Self {
+        unsafe { core::mem::transmute::<[u32; 2], Self>([x, x]) }
+    }
+}
+
+impl_shared!(
+    u64x2,
+    u64,
+    uint64x2_t,
+    vceqq_u64,
+    vaddq_u64,
+    vsubq_u64,
+    vandq_u64,
+    veorq_u64,
+    vshlq_n_u64,
+    vshrq_n_u64,
+    vgetq_lane_u64
+);
+
+impl u64x2 {
+    /// Builds a vector from two lane values.
+    #[inline]
+    pub fn new(x0: u64, x1: u64) -> Self {
+        unsafe { core::mem::transmute::<[u64; 2], Self>([x0, x1]) }
+    }
+
+    /// Builds a vector with both lanes set to `x`.
+    #[inline]
+    pub fn splat(x: u64) -> Self {
+        unsafe { core::mem::transmute::<[u64; 2], Self>([x, x]) }
+    }
+}
+
+/// Reinterprets the bits of a raw `uint32x4_t` as a `u64x2` (no value conversion).
+impl From<neon::uint32x4_t> for u64x2 {
+    #[inline]
+    fn from(value: neon::uint32x4_t) -> u64x2 {
+        unsafe { neon::vreinterpretq_u64_u32(value).into() }
+    }
+}
+
+/// Pair of `uint32x4_t` vectors, treated as one eight-lane `u32` vector.
+#[allow(non_camel_case_types)]
+#[derive(Copy, Clone, Debug)]
+#[repr(transparent)]
+pub struct u32x4x2(pub neon::uint32x4x2_t);
+
+impl PartialEq for u32x4x2 {
+    fn eq(&self, other: &Self) -> bool {
+        u32x4::from(self.0 .0) == u32x4::from(other.0 .0)
+            && u32x4::from(self.0 .1) == u32x4::from(other.0 .1)
+    }
+}
+
+impl From<u32x4x2> for neon::uint32x4x2_t {
+    #[inline]
+    fn from(value: u32x4x2) -> neon::uint32x4x2_t {
+        value.0
+    }
+}
+
+impl From<neon::uint32x4x2_t> for u32x4x2 {
+    #[inline]
+    fn from(value: neon::uint32x4x2_t) -> u32x4x2 {
+        u32x4x2(value)
+    }
+}
+
+impl BitXor for u32x4x2 {
+    type Output = Self;
+
+    fn bitxor(self, rhs: Self) -> Self::Output {
+        Self::new(
+            u32x4::from(self.0 .0) ^ u32x4::from(rhs.0 .0),
+            u32x4::from(self.0 .1) ^ u32x4::from(rhs.0 .1),
+        )
+    }
+}
+
+impl BitXorAssign for u32x4x2 {
+    fn bitxor_assign(&mut self, rhs: Self) {
+        *self = *self ^ rhs;
+    }
+}
+
+impl BitAnd for u32x4x2 {
+    type Output = Self;
+
+    fn bitand(self, rhs: Self) -> Self::Output {
+        Self::new(
+            u32x4::from(self.0 .0) & u32x4::from(rhs.0 .0),
+            u32x4::from(self.0 .1) & u32x4::from(rhs.0 .1),
+        )
+    }
+}
+
+impl Add for u32x4x2 {
+    type Output = Self;
+
+    fn add(self, rhs: Self) -> Self::Output {
+        Self::new(
+            u32x4::from(self.0 .0) + u32x4::from(rhs.0 .0),
+            u32x4::from(self.0 .1) + u32x4::from(rhs.0 .1),
+        )
+    }
+}
+
+impl Sub for u32x4x2 {
+    type Output = Self;
+
+    fn sub(self, rhs: Self) -> Self::Output {
+        Self::new(
+            u32x4::from(self.0 .0) - u32x4::from(rhs.0 .0),
+            u32x4::from(self.0 .1) - u32x4::from(rhs.0 .1),
+        )
+    }
+}
+
+impl u32x4x2 {
+    /// Builds the pair `(x0, x1)`.
+    #[inline]
+    pub const fn new(x0: u32x4, x1: u32x4) -> Self {
+        Self(neon::uint32x4x2_t(x0.0, x1.0))
+    }
+
+    /// Builds a pair with all eight lanes set to `x`.
+    #[inline]
+    pub const fn splat(x: u32) -> Self {
+        Self(neon::uint32x4x2_t(
+            u32x4::const_splat(x).0,
+            u32x4::const_splat(x).0,
+        ))
+    }
+
+    /// Returns lane `N` (`0..=7`, first vector's lanes first); any other `N` panics.
+    #[inline]
+    pub fn extract<const N: i32>(self) -> u32 {
+        match N {
+            0 => u32x4::from(self.0 .0).extract::<0>(),
+            1 => u32x4::from(self.0 .0).extract::<1>(),
+            2 => u32x4::from(self.0 .0).extract::<2>(),
+            3 => u32x4::from(self.0 .0).extract::<3>(),
+            4 => u32x4::from(self.0 .1).extract::<0>(),
+            5 => u32x4::from(self.0 .1).extract::<1>(),
+            6 => u32x4::from(self.0 .1).extract::<2>(),
+            7 => u32x4::from(self.0 .1).extract::<3>(),
+            _ => unreachable!(),
+        }
+    }
+}
+
+/// Pair of `uint32x2_t` vectors, treated as one four-lane `u32` vector.
+#[allow(non_camel_case_types)]
+#[derive(Copy, Clone, Debug)]
+#[repr(transparent)]
+pub struct u32x2x2(pub neon::uint32x2x2_t);
+
+impl PartialEq for u32x2x2 {
+    fn eq(&self, other: &Self) -> bool {
+        u32x2::from(self.0 .0) == u32x2::from(other.0 .0)
+            && u32x2::from(self.0 .1) == u32x2::from(other.0 .1)
+    }
+}
+
+impl Add for u32x2x2 {
+    type Output = Self;
+
+    fn add(self, rhs: Self) -> Self::Output {
+        Self::new(
+            u32x2::from(self.0 .0) + u32x2::from(rhs.0 .0),
+            u32x2::from(self.0 .1) + u32x2::from(rhs.0 .1),
+        )
+    }
+}
+
+impl u32x2x2 {
+    /// Builds the pair `(x0, x1)`.
+    #[inline]
+    pub fn new(x0: u32x2, x1: u32x2) -> Self {
+        Self(neon::uint32x2x2_t(x0.0, x1.0))
+    }
+
+    /// Shifts every lane of both vectors left by `N` bits.
+    #[inline]
+    pub fn shl<const N: i32>(self) -> Self {
+        Self(neon::uint32x2x2_t(
+            u32x2::from(self.0 .0).shl::<N>().0,
+            u32x2::from(self.0 .1).shl::<N>().0,
+        ))
+    }
+
+    /// Returns lane `N` (`0..=3`, first vector's lanes first); any other `N` panics.
+    #[inline]
+    pub fn extract<const N: i32>(self) -> u32 {
+        match N {
+            0 => u32x2::from(self.0 .0).extract::<0>(),
+            1 => u32x2::from(self.0 .0).extract::<1>(),
+            2 => u32x2::from(self.0 .1).extract::<0>(),
+            3 => u32x2::from(self.0 .1).extract::<1>(),
+            _ => unreachable!(),
+        }
+    }
+}
+
+/// Wrapper around the NEON `int32x4_t` vector.
+#[allow(non_camel_case_types)]
+#[derive(Copy, Clone, Debug)]
+#[repr(transparent)]
+pub struct i32x4(neon::int32x4_t);
+
+impl From<i32x4> for neon::int32x4_t {
+    #[inline]
+    fn from(value: i32x4) -> neon::int32x4_t {
+        value.0
+    }
+}
+
+impl From<neon::int32x4_t> for i32x4 {
+    #[inline]
+    fn from(value: neon::int32x4_t) -> i32x4 {
+        i32x4(value)
+    }
+}
+
+impl i32x4 {
+    /// Builds a vector from four lane values.
+    #[inline]
+    pub fn new(x0: i32, x1: i32, x2: i32, x3: i32) -> Self {
+        unsafe { core::mem::transmute::<[i32; 4], Self>([x0, x1, x2, x3]) }
+    }
+}
+
+/// Pair of `uint64x2_t` vectors, treated as one four-lane `u64` vector.
+#[allow(non_camel_case_types)]
+#[derive(Copy, Clone, Debug)]
+#[repr(transparent)]
+pub struct u64x2x2(pub neon::uint64x2x2_t);
+
+impl Add for u64x2x2 {
+    type Output = Self;
+
+    #[inline]
+    fn add(self, rhs: Self) -> Self {
+        Self(neon::uint64x2x2_t(
+            (u64x2::from(self.0 .0) + u64x2::from(rhs.0 .0)).into(),
+            (u64x2::from(self.0 .1) + u64x2::from(rhs.0 .1)).into(),
+        ))
+    }
+}
+
+impl BitAnd for u64x2x2 {
+    type Output = Self;
+
+    fn bitand(self, rhs: Self) -> Self::Output {
+        Self(neon::uint64x2x2_t(
+            (u64x2::from(self.0 .0) & u64x2::from(rhs.0 .0)).into(),
+            (u64x2::from(self.0 .1) & u64x2::from(rhs.0 .1)).into(),
+        ))
+    }
+}
+
+impl u64x2x2 {
+    /// Builds the pair `(x0, x1)`.
+    #[inline]
+    pub fn new(x0: u64x2, x1: u64x2) -> Self {
+        Self(neon::uint64x2x2_t(x0.0, x1.0))
+    }
+
+    /// Builds a pair with all four lanes set to `x`.
+    #[inline]
+    pub fn splat(x: u64) -> Self {
+        Self::new(u64x2::splat(x), u64x2::splat(x))
+    }
+
+    /// Returns lane `N` (`0..=3`, first vector's lanes first); any other `N` panics.
+    #[inline]
+    pub fn extract<const N: i32>(self) -> u64 {
+        match N {
+            0 => u64x2::from(self.0 .0).extract::<0>(),
+            1 => u64x2::from(self.0 .0).extract::<1>(),
+            2 => u64x2::from(self.0 .1).extract::<0>(),
+            3 => u64x2::from(self.0 .1).extract::<1>(),
+            _ => unreachable!(),
+        }
+    }
+
+    /// Shifts every lane of both vectors left by `N` bits.
+    #[inline]
+    pub fn shl<const N: i32>(self) -> Self {
+        Self(neon::uint64x2x2_t(
+            u64x2::from(self.0 .0).shl::<N>().into(),
+            u64x2::from(self.0 .1).shl::<N>().into(),
+        ))
+    }
+
+    /// Shifts every lane of both vectors right by `N` bits.
+    #[inline]
+    pub fn shr<const N: i32>(self) -> Self {
+        Self(neon::uint64x2x2_t(
+            u64x2::from(self.0 .0).shr::<N>().into(),
+            u64x2::from(self.0 .1).shr::<N>().into(),
+        ))
+    }
+}
diff --git a/curve25519-dalek/src/backend/vector/scalar_mul/pippenger.rs b/curve25519-dalek/src/backend/vector/scalar_mul/pippenger.rs index 1376c4eab..5a643b8b5 100644 --- a/curve25519-dalek/src/backend/vector/scalar_mul/pippenger.rs +++ b/curve25519-dalek/src/backend/vector/scalar_mul/pippenger.rs @@ -10,8 +10,9 @@ #![allow(non_snake_case)] 
#[curve25519_dalek_derive::unsafe_target_feature_specialize( - "avx2", - conditional("avx512ifma,avx512vl", nightly) + conditional("avx2", target_arch = "x86_64"), + conditional("avx512ifma,avx512vl", all(nightly, target_arch = "x86_64")), + conditional("neon", all(nightly, target_arch = "aarch64")) )] pub mod spec { @@ -26,6 +27,9 @@ pub mod spec { #[for_target_feature("avx512ifma")] use crate::backend::vector::ifma::{CachedPoint, ExtendedPoint}; + #[for_target_feature("neon")] + use crate::backend::vector::neon::{CachedPoint, ExtendedPoint}; + use crate::edwards::EdwardsPoint; use crate::scalar::Scalar; use crate::traits::{Identity, VartimeMultiscalarMul}; diff --git a/curve25519-dalek/src/backend/vector/scalar_mul/precomputed_straus.rs b/curve25519-dalek/src/backend/vector/scalar_mul/precomputed_straus.rs index 1f16ab3e1..500510b31 100644 --- a/curve25519-dalek/src/backend/vector/scalar_mul/precomputed_straus.rs +++ b/curve25519-dalek/src/backend/vector/scalar_mul/precomputed_straus.rs @@ -12,8 +12,9 @@ #![allow(non_snake_case)] #[curve25519_dalek_derive::unsafe_target_feature_specialize( - "avx2", - conditional("avx512ifma,avx512vl", nightly) + conditional("avx2", target_arch = "x86_64"), + conditional("avx512ifma,avx512vl", all(nightly, target_arch = "x86_64")), + conditional("neon", all(nightly, target_arch = "aarch64")) )] pub mod spec { @@ -28,6 +29,9 @@ pub mod spec { #[for_target_feature("avx512ifma")] use crate::backend::vector::ifma::{CachedPoint, ExtendedPoint}; + #[for_target_feature("neon")] + use crate::backend::vector::neon::{CachedPoint, ExtendedPoint}; + use crate::edwards::EdwardsPoint; use crate::scalar::Scalar; use crate::traits::Identity; diff --git a/curve25519-dalek/src/backend/vector/scalar_mul/straus.rs b/curve25519-dalek/src/backend/vector/scalar_mul/straus.rs index 413e6fd9a..40bf0d9f7 100644 --- a/curve25519-dalek/src/backend/vector/scalar_mul/straus.rs +++ b/curve25519-dalek/src/backend/vector/scalar_mul/straus.rs @@ -12,8 +12,9 @@ 
#![allow(non_snake_case)] #[curve25519_dalek_derive::unsafe_target_feature_specialize( - "avx2", - conditional("avx512ifma,avx512vl", nightly) + conditional("avx2", target_arch = "x86_64"), + conditional("avx512ifma,avx512vl", all(nightly, target_arch = "x86_64")), + conditional("neon", all(nightly, target_arch = "aarch64")) )] pub mod spec { @@ -31,6 +32,9 @@ pub mod spec { #[for_target_feature("avx512ifma")] use crate::backend::vector::ifma::{CachedPoint, ExtendedPoint}; + #[for_target_feature("neon")] + use crate::backend::vector::neon::{CachedPoint, ExtendedPoint}; + use crate::edwards::EdwardsPoint; use crate::scalar::Scalar; use crate::traits::{Identity, MultiscalarMul, VartimeMultiscalarMul}; diff --git a/curve25519-dalek/src/backend/vector/scalar_mul/variable_base.rs b/curve25519-dalek/src/backend/vector/scalar_mul/variable_base.rs index 9f924f286..00bd21cb0 100644 --- a/curve25519-dalek/src/backend/vector/scalar_mul/variable_base.rs +++ b/curve25519-dalek/src/backend/vector/scalar_mul/variable_base.rs @@ -1,8 +1,12 @@ #![allow(non_snake_case)] #[curve25519_dalek_derive::unsafe_target_feature_specialize( - "avx2", - conditional("avx512ifma,avx512vl", nightly) + conditional("avx2", target_arch = "x86_64"), + conditional("avx512ifma,avx512vl", all(nightly, target_arch = "x86_64")), + conditional( + "neon", + all(nightly, any(target_arch = "aarch64", target_arch = "arm")) + ) )] pub mod spec { @@ -12,6 +16,9 @@ pub mod spec { #[for_target_feature("avx512ifma")] use crate::backend::vector::ifma::{CachedPoint, ExtendedPoint}; + #[for_target_feature("neon")] + use crate::backend::vector::neon::{CachedPoint, ExtendedPoint}; + use crate::edwards::EdwardsPoint; use crate::scalar::Scalar; use crate::traits::Identity; diff --git a/curve25519-dalek/src/backend/vector/scalar_mul/vartime_double_base.rs b/curve25519-dalek/src/backend/vector/scalar_mul/vartime_double_base.rs index ea2af8ad4..83fcdcfe1 100644 --- 
a/curve25519-dalek/src/backend/vector/scalar_mul/vartime_double_base.rs +++ b/curve25519-dalek/src/backend/vector/scalar_mul/vartime_double_base.rs @@ -12,8 +12,9 @@ #![allow(non_snake_case)] #[curve25519_dalek_derive::unsafe_target_feature_specialize( - "avx2", - conditional("avx512ifma,avx512vl", nightly) + conditional("avx2", target_arch = "x86_64"), + conditional("avx512ifma,avx512vl", all(nightly, target_arch = "x86_64")), + conditional("neon", all(nightly, target_arch = "aarch64")) )] pub mod spec { @@ -25,6 +26,9 @@ pub mod spec { #[for_target_feature("avx512ifma")] use crate::backend::vector::ifma::{CachedPoint, ExtendedPoint}; + #[for_target_feature("neon")] + use crate::backend::vector::neon::{CachedPoint, ExtendedPoint}; + #[cfg(feature = "precomputed-tables")] #[for_target_feature("avx2")] use crate::backend::vector::avx2::constants::BASEPOINT_ODD_LOOKUP_TABLE; @@ -33,6 +37,10 @@ pub mod spec { #[for_target_feature("avx512ifma")] use crate::backend::vector::ifma::constants::BASEPOINT_ODD_LOOKUP_TABLE; + #[cfg(feature = "precomputed-tables")] + #[for_target_feature("neon")] + use crate::backend::vector::neon::constants::BASEPOINT_ODD_LOOKUP_TABLE; + use crate::edwards::EdwardsPoint; use crate::scalar::Scalar; use crate::traits::Identity; diff --git a/curve25519-dalek/src/lib.rs b/curve25519-dalek/src/lib.rs index d8666453c..cab2a85d9 100644 --- a/curve25519-dalek/src/lib.rs +++ b/curve25519-dalek/src/lib.rs @@ -22,6 +22,13 @@ all(curve25519_dalek_backend = "simd", nightly), feature(avx512_target_feature) )] +#![cfg_attr(all(nightly, target_arch = "aarch64"), feature(portable_simd))] +#![cfg_attr(all(nightly, target_arch = "arm"), feature(portable_simd))] +#![cfg_attr(all(nightly, target_arch = "arm"), feature(arm_target_feature))] +#![cfg_attr( + all(nightly, target_arch = "arm"), + feature(stdarch_arm_neon_intrinsics) +)] #![cfg_attr(docsrs, feature(doc_auto_cfg, doc_cfg, doc_cfg_hide))] #![cfg_attr(docsrs, doc(cfg_hide(docsrs)))] 
//------------------------------------------------------------------------