[chore] use rayon work-stealing to improve evaluate_h (#28)

* chore: use rayon par_iter for more work-stealing in evaluate_h
* chore: bump version to 0.4.1
* chore: turn off profiling
* chore: add CI to check wasm build
* feat: use Scroll's FFT instead of Taiko's on x86
  empirically it has better performance (lower memory bandwidth?)
* chore: remove target wasm32-unknown-unknown
  https://docs.rs/getrandom/latest/getrandom/#webassembly-support
axiom-crypto · Nov 23, 2023 · f335ffc · f335ffc
1 parent e841084
commit f335ffc
Show file tree

Hide file tree

Showing 4 changed files with 84 additions and 23 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -21,6 +21,27 @@ jobs:
           command: test
           args: --verbose --release --all --all-features
 
+  build:
+    name: Build target ${{ matrix.target }}
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        target:
+          - wasm32-wasi
+
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions-rs/toolchain@v1
+        with:
+          override: false
+      - name: Add target
+        run: rustup target add ${{ matrix.target }}
+      - name: cargo build
+        uses: actions-rs/cargo@v1
+        with:
+          command: build
+          args: --no-default-features --features batch,circuit-params --target ${{ matrix.target }}
+
   example:
     name: Examples on ubuntu
     runs-on: ubuntu-latest

diff --git a/halo2_proofs/Cargo.toml b/halo2_proofs/Cargo.toml
@@ -1,12 +1,14 @@
 [package]
 name = "halo2-axiom"
-version = "0.4.0"
+version = "0.4.1"
 authors = [
     "Sean Bowe <[email protected]>",
     "Ying Tong Lai <[email protected]>",
     "Daira Hopwood <[email protected]>",
     "Jack Grigg <[email protected]>",
-    "Privacy Scaling Explorations team", "Taiko Labs", "Intrinsic Technologies"
+    "Privacy Scaling Explorations team",
+    "Taiko Labs",
+    "Intrinsic Technologies",
 ]
 edition = "2021"
 rust-version = "1.73.0"
@@ -63,7 +65,7 @@ group = "0.13"
 pairing = "0.23"
 halo2curves = { package = "halo2curves-axiom", version = "0.4.2", default-features = false, features = ["bits", "bn256-table", "derive_serde"] }
 rand = "0.8"
-rand_core = { version = "0.6", default-features = false}
+rand_core = { version = "0.6", default-features = false }
 tracing = "0.1"
 blake2b_simd = "1"
 rustc-hash = "1.1"
@@ -94,12 +96,7 @@ getrandom = { version = "0.2", features = ["js"] }
 default = ["batch", "multicore", "circuit-params"]
 multicore = ["maybe-rayon/threads"]
 dev-graph = ["plotters", "tabbycat"]
-test-dev-graph = [
-    "dev-graph",
-    "plotters/bitmap_backend",
-    "plotters/bitmap_encoder",
-    "plotters/ttf",
-]
+test-dev-graph = ["dev-graph", "plotters/bitmap_backend", "plotters/bitmap_encoder", "plotters/ttf"]
 gadget-traces = ["backtrace"]
 # thread-safe-region = []
 sanity-checks = []

diff --git a/halo2_proofs/src/fft.rs b/halo2_proofs/src/fft.rs
@@ -17,6 +17,10 @@ pub fn fft<Scalar: Field, G: FftGroup<Scalar>>(
     data: &FFTData<Scalar>,
     inverse: bool,
 ) {
+    // Empirically, the parallel implementation needs less memory bandwidth, which makes it faster on x86_64.
+    #[cfg(target_arch = "x86_64")]
+    parallel::fft(a, omega, log_n, data, inverse);
+    #[cfg(not(target_arch = "x86_64"))]
     recursive::fft(a, omega, log_n, data, inverse)
 }
 
@@ -52,6 +56,18 @@ mod tests {
         );
         end_timer!(start);
 
+        let mut c = input.clone();
+        let l_c = c.len();
+        let start = start_timer!(|| format!("parallel fft {} ({})", a.len(), num_threads));
+        fft::parallel::fft(
+            &mut c,
+            domain.get_omega(),
+            k,
+            domain.get_fft_data(l_c),
+            false,
+        );
+        end_timer!(start);
+
         let mut b = input;
         let l_b = b.len();
         let start = start_timer!(|| format!("recursive fft {} ({})", a.len(), num_threads));
@@ -67,6 +83,7 @@ mod tests {
         for i in 0..n {
             //log_info(format!("{}: {} {}", i, a[i], b[i]));
             assert_eq!(a[i], b[i]);
+            assert_eq!(a[i], c[i]);
         }
     }
 

diff --git a/halo2_proofs/src/plonk/evaluation.rs b/halo2_proofs/src/plonk/evaluation.rs
@@ -1,12 +1,16 @@
 #![allow(clippy::too_many_arguments)]
+
 use crate::multicore;
 use crate::plonk::{lookup, permutation, Any, ProvingKey};
 use crate::poly::Basis;
 use crate::{
     arithmetic::{parallelize, CurveAffine},
     poly::{Coeff, ExtendedLagrangeCoeff, LagrangeCoeff, Polynomial, Rotation},
 };
+#[cfg(feature = "profile")]
+use ark_std::{end_timer, start_timer};
 use ff::{Field, PrimeField, WithSmallOrderMulGroup};
+use multicore::{IntoParallelIterator, ParallelIterator};
 
 use super::{ConstraintSystem, Expression};
 
@@ -286,9 +290,10 @@ impl<C: CurveAffine> Evaluator<C> {
         let mut current_extended_omega = one;
         let value_parts: Vec<Polynomial<C::ScalarExt, LagrangeCoeff>> = (0..num_parts)
             .map(|_| {
-                let fixed: Vec<Polynomial<C::ScalarExt, LagrangeCoeff>> = pk
-                    .fixed_polys
-                    .iter()
+                #[cfg(feature = "profile")]
+                let fixed_timer = start_timer!(|| "Fixed coeff_to_extended_part");
+                let fixed: Vec<Polynomial<C::ScalarExt, LagrangeCoeff>> = (&pk.fixed_polys)
+                    .into_par_iter()
                     .map(|p| domain.coeff_to_extended_part(p.clone(), current_extended_omega))
                     .collect();
                 let fixed = &fixed[..];
@@ -297,10 +302,14 @@ impl<C: CurveAffine> Evaluator<C> {
                     domain.coeff_to_extended_part(pk.l_last.clone(), current_extended_omega);
                 let l_active_row =
                     domain.coeff_to_extended_part(pk.l_active_row.clone(), current_extended_omega);
+                #[cfg(feature = "profile")]
+                end_timer!(fixed_timer);
 
+                #[cfg(feature = "profile")]
+                let advice_timer = start_timer!(|| "Advice coeff_to_extended_part");
                 // Calculate the advice and instance cosets
                 let advice: Vec<Vec<Polynomial<C::Scalar, LagrangeCoeff>>> = advice_polys
-                    .iter()
+                    .into_par_iter()
                     .map(|advice_polys| {
                         advice_polys
                             .iter()
@@ -310,8 +319,12 @@ impl<C: CurveAffine> Evaluator<C> {
                             .collect()
                     })
                     .collect();
+                #[cfg(feature = "profile")]
+                end_timer!(advice_timer);
+                #[cfg(feature = "profile")]
+                let instance_timer = start_timer!(|| "Instance coeff_to_extended_part");
                 let instance: Vec<Vec<Polynomial<C::Scalar, LagrangeCoeff>>> = instance_polys
-                    .iter()
+                    .into_par_iter()
                     .map(|instance_polys| {
                         instance_polys
                             .iter()
@@ -321,6 +334,8 @@ impl<C: CurveAffine> Evaluator<C> {
                             .collect()
                     })
                     .collect();
+                #[cfg(feature = "profile")]
+                end_timer!(instance_timer);
 
                 let mut values = domain.empty_lagrange();
 
@@ -332,6 +347,8 @@ impl<C: CurveAffine> Evaluator<C> {
                     .zip(lookups.iter())
                     .zip(permutations.iter())
                 {
+                    #[cfg(feature = "profile")]
+                    let timer = start_timer!(|| "Custom gates");
                     // Custom gates
                     multicore::scope(|scope| {
                         let chunk_size = (size + num_threads - 1) / num_threads;
@@ -360,7 +377,11 @@ impl<C: CurveAffine> Evaluator<C> {
                             });
                         }
                     });
+                    #[cfg(feature = "profile")]
+                    end_timer!(timer);
 
+                    #[cfg(feature = "profile")]
+                    let timer = start_timer!(|| "Permutations");
                     // Permutations
                     let sets = &permutation.sets;
                     if !sets.is_empty() {
@@ -372,22 +393,21 @@ impl<C: CurveAffine> Evaluator<C> {
                         let permutation_product_cosets: Vec<
                             Polynomial<C::ScalarExt, LagrangeCoeff>,
                         > = sets
-                            .iter()
+                            .into_par_iter()
                             .map(|set| {
                                 domain.coeff_to_extended_part(
                                     set.permutation_product_poly.clone(),
                                     current_extended_omega,
                                 )
                             })
                             .collect();
-                        let permutation_cosets: Vec<Polynomial<C::ScalarExt, LagrangeCoeff>> = pk
-                            .permutation
-                            .polys
-                            .iter()
-                            .map(|p| {
-                                domain.coeff_to_extended_part(p.clone(), current_extended_omega)
-                            })
-                            .collect();
+                        let permutation_cosets: Vec<Polynomial<C::ScalarExt, LagrangeCoeff>> =
+                            (&pk.permutation.polys)
+                                .into_par_iter()
+                                .map(|p| {
+                                    domain.coeff_to_extended_part(p.clone(), current_extended_omega)
+                                })
+                                .collect();
 
                         let first_set_permutation_product_coset =
                             permutation_product_cosets.first().unwrap();
@@ -473,7 +493,11 @@ impl<C: CurveAffine> Evaluator<C> {
                             }
                         });
                     }
+                    #[cfg(feature = "profile")]
+                    end_timer!(timer);
 
+                    #[cfg(feature = "profile")]
+                    let timer = start_timer!(|| "Lookups");
                     // Lookups
                     for (n, lookup) in lookups.iter().enumerate() {
                         // Polynomials required for this lookup.
@@ -554,6 +578,8 @@ impl<C: CurveAffine> Evaluator<C> {
                             }
                         });
                     }
+                    #[cfg(feature = "profile")]
+                    end_timer!(timer);
                 }
                 current_extended_omega *= extended_omega;
                 values