From 0fa7cf54d5044c7edf7c0a5f5d009f51f66c5653 Mon Sep 17 00:00:00 2001
From: danielntmd
Date: Mon, 19 Aug 2024 14:04:12 -0700
Subject: [PATCH 1/9] Added: Benchmarking and testing HtD copy vs DtD copy

---
 stwo_gpu_backend/benches/bit_reverse.rs | 84 ++++++++++++++++++++++---
 1 file changed, 77 insertions(+), 7 deletions(-)

diff --git a/stwo_gpu_backend/benches/bit_reverse.rs b/stwo_gpu_backend/benches/bit_reverse.rs
index 60555bf4..c76f35fe 100644
--- a/stwo_gpu_backend/benches/bit_reverse.rs
+++ b/stwo_gpu_backend/benches/bit_reverse.rs
@@ -1,4 +1,4 @@
-use criterion::{criterion_group, criterion_main, Criterion};
+use criterion::{criterion_group, criterion_main, BatchSize, Criterion};
 use itertools::Itertools;
 use rand::rngs::SmallRng;
 use rand::{Rng, SeedableRng};
@@ -6,12 +6,12 @@ use stwo_gpu_backend::{cuda::BaseFieldVec, cuda::SecureFieldVec, CudaBackend};
 use stwo_prover::core::backend::{Column, ColumnOps};
 use stwo_prover::core::fields::{m31::BaseField, qm31::SecureField};
 
-pub fn gpu_bit_reverse_base_field(c: &mut Criterion) {
+pub fn gpu_bit_reverse_base_field_iter(c: &mut Criterion) {
     const BITS: usize = 28;
     let size = 1 << BITS;
     let mut data = BaseFieldVec::from_vec((0..size).map(BaseField::from).collect_vec());
 
-    c.bench_function(&format!("gpu bit_rev base_field {} bit", BITS), |b| {
+    c.bench_function(&format!("gpu bit_rev base_field {} bit single reference", BITS), |b| {
         b.iter(|| {
             <CudaBackend as ColumnOps<BaseField>>::bit_reverse_column(&mut data);
         })
@@ -19,16 +19,86 @@ pub fn gpu_bit_reverse_base_field(c: &mut Criterion) {
 }
 
 pub fn gpu_bit_reverse_secure_field(c: &mut Criterion) {
-    const BITS: usize = 28;
+    const BITS: usize = 26;
     let size = 1 << BITS;
 
     let mut rng = SmallRng::seed_from_u64(0);
-    let mut data = SecureFieldVec::from_vec((0..size).map(|_| rng.gen()).collect());
+    let data = SecureFieldVec::from_vec((0..size).map(|_| rng.gen()).collect());
     assert_eq!(data.len(), size);
 
     c.bench_function(&format!("gpu bit_rev secure_field {} bit", BITS), |b| {
+        b.iter_batched(|| data.clone(),
+            |mut data| <CudaBackend as ColumnOps<SecureField>>::bit_reverse_column(&mut data),
+            BatchSize::PerIteration
+        );
+    });
+}
+
+pub fn gpu_bit_reverse_base_field_with_large_drop(c: &mut Criterion) {
+    const BITS: usize = 28;
+    let size = 1 << BITS;
+    let data = BaseFieldVec::from_vec((0..size).map(BaseField::from).collect_vec());
+
+    c.bench_function(&format!("gpu bit_rev base_field with large drop {} bit", BITS), |b| {
+        b.iter_with_large_drop(|| {
+            let mut data = data.clone();
+            move || {
+                <CudaBackend as ColumnOps<BaseField>>::bit_reverse_column(&mut data)
+            }
+        });
+    });
+}
+
+pub fn gpu_bit_reverse_base_field_iter_batched_dtd_copy(c: &mut Criterion) {
+    const BITS: usize = 28;
+    let size = 1 << BITS;
+    let data = BaseFieldVec::from_vec((0..size).map(BaseField::from).collect_vec());
+
+    c.bench_function(&format!("gpu bit_rev base_field {} bit multiple setup dtd", BITS), |b| {
+        b.iter_batched(||
+            data.clone(),
+            |mut data| <CudaBackend as ColumnOps<BaseField>>::bit_reverse_column(&mut data),
+            BatchSize::PerIteration,
+        );
+    });
+}
+
+pub fn gpu_bit_reverse_base_field_iter_batched_htd_copy(c: &mut Criterion) {
+    const BITS: usize = 28;
+    let size = 1 << BITS;
+    //let data = BaseFieldVec::from_vec((0..size).map(BaseField::from).collect_vec());
+
+    c.bench_function(&format!("gpu bit_rev base_field {} bit multiple setup htd", BITS), |b| {
+        b.iter_batched(||
+            BaseFieldVec::from_vec((0..size).map(BaseField::from).collect_vec()).clone(),
+            |mut data| <CudaBackend as ColumnOps<BaseField>>::bit_reverse_column(&mut data),
+            BatchSize::PerIteration,
+        );
+    });
+}
+
+pub fn gpu_bit_reverse_base_field_iter_initializing(c: &mut Criterion) {
+    const BITS: usize = 28;
+    let size = 1 << BITS;
+    let vec = (0..size).map(BaseField::from).collect_vec();
+
+    c.bench_function(&format!("gpu bit_rev base_field {} bit initializing", BITS), |b| {
+        b.iter_with_setup(
+            || vec.clone(),
+            |cloned_vec| BaseFieldVec::from_vec(cloned_vec)
+        )
+    });
+}
+
+pub fn gpu_bit_reverse_base_field_iter_cloning(c: &mut Criterion) {
+    const BITS: usize = 28;
+    let size = 1 << BITS;
+    let data = BaseFieldVec::from_vec((0..size).map(BaseField::from).collect_vec());
+
+    c.bench_function(&format!("gpu bit_rev base_field {} bit cloning", BITS), |b| {
         b.iter(|| {
-            <CudaBackend as ColumnOps<SecureField>>::bit_reverse_column(&mut data);
+            let _ = data.clone();
+
         })
     });
 }
@@ -36,5 +106,5 @@ pub fn gpu_bit_reverse_secure_field(c: &mut Criterion) {
 criterion_group!(
     name = bit_reverse;
     config = Criterion::default().sample_size(10);
-    targets = gpu_bit_reverse_base_field, gpu_bit_reverse_secure_field);
+    targets = gpu_bit_reverse_base_field_iter_initializing,gpu_bit_reverse_base_field_iter_cloning); // gpu_bit_reverse_base_field_iter, gpu_bit_reverse_base_field_with_large_drop, gpu_bit_reverse_base_field_iter_batched_dtd_copy, gpu_bit_reverse_base_field_iter_batched_htd_copy); //, gpu_bit_reverse_secure_field);
 criterion_main!(bit_reverse);

From 5f252f69bd3e4851d41a05c188adb56658c7ef85 Mon Sep 17 00:00:00 2001
From: danielntmd
Date: Mon, 19 Aug 2024 16:57:40 -0700
Subject: [PATCH 2/9] Added: Eval and Quotient Benchmarking

---
 stwo_gpu_backend/Cargo.toml               |  8 ++++
 stwo_gpu_backend/benches/bit_reverse.rs   | 56 ++++++++++++++---------
 stwo_gpu_backend/benches/eval_at_point.rs | 37 +++++++++++++++
 stwo_gpu_backend/benches/quotients.rs     | 56 +++++++++++++++++++++++
 stwo_gpu_backend/src/column.rs            |  7 +--
 5 files changed, 140 insertions(+), 24 deletions(-)
 create mode 100644 stwo_gpu_backend/benches/eval_at_point.rs
 create mode 100644 stwo_gpu_backend/benches/quotients.rs

diff --git a/stwo_gpu_backend/Cargo.toml b/stwo_gpu_backend/Cargo.toml
index 2e5ca9d3..0c2aed64 100644
--- a/stwo_gpu_backend/Cargo.toml
+++ b/stwo_gpu_backend/Cargo.toml
@@ -26,3 +26,11 @@ harness = false
 [[bench]]
 name = "batch_inverse"
 harness = false
+
+[[bench]]
+name = "eval_at_point"
+harness = false
+
+[[bench]]
+name = "quotients"
+harness = false
\ No newline at end of file

diff --git a/stwo_gpu_backend/benches/bit_reverse.rs b/stwo_gpu_backend/benches/bit_reverse.rs
index c76f35fe..3d67fb1a 100644
--- a/stwo_gpu_backend/benches/bit_reverse.rs
+++ b/stwo_gpu_backend/benches/bit_reverse.rs
@@ -6,12 +6,12 @@ use stwo_gpu_backend::{cuda::BaseFieldVec, cuda::SecureFieldVec, CudaBackend};
 use stwo_prover::core::backend::{Column, ColumnOps};
 use stwo_prover::core::fields::{m31::BaseField, qm31::SecureField};
 
-pub fn gpu_bit_reverse_base_field_iter(c: &mut Criterion) {
+pub fn gpu_bit_reverse_base_field(c: &mut Criterion) {
     const BITS: usize = 28;
     let size = 1 << BITS;
     let mut data = BaseFieldVec::from_vec((0..size).map(BaseField::from).collect_vec());
 
-    c.bench_function(&format!("gpu bit_rev base_field {} bit single reference", BITS), |b| {
+    c.bench_function(&format!("gpu bit_rev base_field {} bit", BITS), |b| {
         b.iter(|| {
             <CudaBackend as ColumnOps<BaseField>>::bit_reverse_column(&mut data);
         })
@@ -19,33 +19,47 @@ pub fn gpu_bit_reverse_base_field(c: &mut Criterion) {
 }
 
 pub fn gpu_bit_reverse_secure_field(c: &mut Criterion) {
-    const BITS: usize = 26;
+    const BITS: usize = 28;
     let size = 1 << BITS;
 
     let mut rng = SmallRng::seed_from_u64(0);
-    let data = SecureFieldVec::from_vec((0..size).map(|_| rng.gen()).collect());
+    let mut data = SecureFieldVec::from_vec((0..size).map(|_| rng.gen()).collect());
     assert_eq!(data.len(), size);
 
     c.bench_function(&format!("gpu bit_rev secure_field {} bit", BITS), |b| {
-        b.iter_batched(|| data.clone(),
-            |mut data| <CudaBackend as ColumnOps<SecureField>>::bit_reverse_column(&mut data),
-            BatchSize::PerIteration
-        );
+        b.iter(|| {
+            <CudaBackend as ColumnOps<SecureField>>::bit_reverse_column(&mut data);
+        })
     });
 }
 
-pub fn gpu_bit_reverse_base_field_with_large_drop(c: &mut Criterion) {
+pub fn cpu_bit_rev(c: &mut Criterion) {
+    use stwo_prover::core::utils::bit_reverse;
+    // TODO(andrew): Consider using same size for all.
     const BITS: usize = 28;
     let size = 1 << BITS;
-    let data = BaseFieldVec::from_vec((0..size).map(BaseField::from).collect_vec());
+    let data = (0..size).map(BaseField::from).collect_vec();
+    c.bench_function(&format!("cpu bit_rev {} bit", BITS), |b| {
+        b.iter_batched(
+            || data.clone(),
+            |mut data| bit_reverse(&mut data),
+            BatchSize::LargeInput,
+        );
+    });
+}
 
-    c.bench_function(&format!("gpu bit_rev base_field with large drop {} bit", BITS), |b| {
-        b.iter_with_large_drop(|| {
-            let mut data = data.clone();
-            move || {
-                <CudaBackend as ColumnOps<BaseField>>::bit_reverse_column(&mut data)
-            }
-        });
+pub fn simd_bit_rev(c: &mut Criterion) {
+    use stwo_prover::core::backend::simd::bit_reverse::bit_reverse_m31;
+    use stwo_prover::core::backend::simd::column::BaseColumn;
+    const BITS: usize = 28;
+    let size = 1 << BITS;
+    let data = (0..size).map(BaseField::from).collect::<BaseColumn>();
+    c.bench_function(&format!("simd bit_rev {} bit", BITS), |b| {
+        b.iter_batched(
+            || data.data.clone(),
+            |mut data| bit_reverse_m31(&mut data),
+            BatchSize::LargeInput,
+        );
     });
 }
 
@@ -58,7 +72,7 @@ pub fn gpu_bit_reverse_base_field_iter_batched_dtd_copy(c: &mut Criterion) {
         b.iter_batched(||
             data.clone(),
             |mut data| <CudaBackend as ColumnOps<BaseField>>::bit_reverse_column(&mut data),
-            BatchSize::PerIteration,
+            BatchSize::LargeInput,
         );
     });
 }
@@ -70,9 +84,9 @@ pub fn gpu_bit_reverse_base_field_iter_batched_htd_copy(c: &mut Criterion) {
 
     c.bench_function(&format!("gpu bit_rev base_field {} bit multiple setup htd", BITS), |b| {
         b.iter_batched(||
-            BaseFieldVec::from_vec((0..size).map(BaseField::from).collect_vec()).clone(),
+            BaseFieldVec::from_vec((0..size).map(BaseField::from).collect_vec()),
             |mut data| <CudaBackend as ColumnOps<BaseField>>::bit_reverse_column(&mut data),
-            BatchSize::PerIteration,
+            BatchSize::LargeInput,
         );
     });
 }
@@ -106,5 +120,5 @@ pub fn gpu_bit_reverse_base_field_iter_cloning(c: &mut Criterion) {
 criterion_group!(
     name = bit_reverse;
     config = Criterion::default().sample_size(10);
-    targets = gpu_bit_reverse_base_field_iter_initializing,gpu_bit_reverse_base_field_iter_cloning); // gpu_bit_reverse_base_field_iter, gpu_bit_reverse_base_field_with_large_drop, gpu_bit_reverse_base_field_iter_batched_dtd_copy, gpu_bit_reverse_base_field_iter_batched_htd_copy); //, gpu_bit_reverse_secure_field);
+    targets = cpu_bit_rev, simd_bit_rev, gpu_bit_reverse_base_field_iter_batched_dtd_copy, gpu_bit_reverse_base_field_iter_batched_htd_copy); //, gpu_bit_reverse_secure_field);
 criterion_main!(bit_reverse);

diff --git a/stwo_gpu_backend/benches/eval_at_point.rs b/stwo_gpu_backend/benches/eval_at_point.rs
new file mode 100644
index 00000000..b5fd9b98
--- /dev/null
+++ b/stwo_gpu_backend/benches/eval_at_point.rs
@@ -0,0 +1,37 @@
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use rand::rngs::SmallRng;
+use rand::{Rng, SeedableRng};
+use stwo_gpu_backend::CudaBackend;
+use stwo_prover::core::backend::cpu::CpuBackend;
+use stwo_prover::core::backend::simd::SimdBackend;
+use stwo_prover::core::circle::CirclePoint;
+use stwo_prover::core::fields::m31::BaseField;
+use stwo_prover::core::poly::circle::{CirclePoly, PolyOps};
+
+const LOG_SIZE: u32 = 20;
+
+fn bench_eval_at_secure_point<B: PolyOps>(c: &mut Criterion, id: &str) {
+    let poly = CirclePoly::<B>::new((0..1 << LOG_SIZE).map(BaseField::from).collect());
+    let mut rng = SmallRng::seed_from_u64(0);
+    let x = rng.gen();
+    let y = rng.gen();
+    let point = CirclePoint { x, y };
+    c.bench_function(
+        &format!("{id} eval_at_secure_field_point 2^{LOG_SIZE}"),
+        |b| {
+            b.iter(|| B::eval_at_point(black_box(&poly), black_box(point)));
+        },
+    );
+}
+
+fn eval_at_secure_point_benches(c: &mut Criterion) {
+    bench_eval_at_secure_point::<SimdBackend>(c, "simd");
+    bench_eval_at_secure_point::<CpuBackend>(c, "cpu");
+    bench_eval_at_secure_point::<CudaBackend>(c, "cuda");
+}
+
+criterion_group!(
+    name = eval_at_point;
+    config = Criterion::default().sample_size(10);
+    targets = eval_at_secure_point_benches);
+criterion_main!(eval_at_point);

diff --git a/stwo_gpu_backend/benches/quotients.rs b/stwo_gpu_backend/benches/quotients.rs
new file mode 100644
index 00000000..97231f67
--- /dev/null
+++ b/stwo_gpu_backend/benches/quotients.rs
@@ -0,0 +1,56 @@
+#![feature(iter_array_chunks)]
+
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use itertools::Itertools;
+use stwo_gpu_backend::CudaBackend;
+use stwo_prover::core::backend::cpu::CpuBackend;
+use stwo_prover::core::backend::simd::SimdBackend;
+use stwo_prover::core::circle::SECURE_FIELD_CIRCLE_GEN;
+use stwo_prover::core::fields::m31::BaseField;
+use stwo_prover::core::fields::qm31::SecureField;
+use stwo_prover::core::pcs::quotients::{ColumnSampleBatch, QuotientOps};
+use stwo_prover::core::poly::circle::{CanonicCoset, CircleEvaluation};
+use stwo_prover::core::poly::BitReversedOrder;
+
+// TODO(andrew): Consider removing const generics and making all sizes the same.
+fn bench_quotients<B: QuotientOps, const LOG_N_ROWS: u32, const LOG_N_COLS: u32>(
+    c: &mut Criterion,
+    id: &str,
+) {
+    let domain = CanonicCoset::new(LOG_N_ROWS).circle_domain();
+    let values = (0..domain.size()).map(BaseField::from).collect();
+    let col = CircleEvaluation::<B, BaseField, BitReversedOrder>::new(domain, values);
+    let cols = (0..1 << LOG_N_COLS).map(|_| col.clone()).collect_vec();
+    let col_refs = cols.iter().collect_vec();
+    let random_coeff = SecureField::from_u32_unchecked(0, 1, 2, 3);
+    let a = SecureField::from_u32_unchecked(5, 6, 7, 8);
+    let samples = vec![ColumnSampleBatch {
+        point: SECURE_FIELD_CIRCLE_GEN,
+        columns_and_values: (0..1 << LOG_N_COLS).map(|i| (i, a)).collect(),
+    }];
+    c.bench_function(
+        &format!("{id} quotients 2^{LOG_N_COLS} x 2^{LOG_N_ROWS}"),
+        |b| {
+            b.iter_with_large_drop(|| {
+                B::accumulate_quotients(
+                    black_box(domain),
+                    black_box(&col_refs),
+                    black_box(random_coeff),
+                    black_box(&samples),
+                )
+            })
+        },
+    );
+}
+
+fn quotients_benches(c: &mut Criterion) {
+    bench_quotients::(c, "simd");
+    bench_quotients::(c, "cpu");
+    bench_quotients::(c, "cuda");
+}
+
+criterion_group!(
+    name = quotients;
+    config = Criterion::default().sample_size(10);
+    targets = quotients_benches);
+criterion_main!(quotients);

diff --git a/stwo_gpu_backend/src/column.rs b/stwo_gpu_backend/src/column.rs
index c69bf611..3eed91da 100644
--- a/stwo_gpu_backend/src/column.rs
+++ b/stwo_gpu_backend/src/column.rs
@@ -4,7 +4,7 @@ use stwo_prover::core::{
 };
 use stwo_prover::core::vcs::blake2_hash::Blake2sHash;
 
-use crate::{backend::CudaBackend, cuda};
+use crate::{backend::CudaBackend, cuda::{self, BaseFieldVec}};
 
 impl ColumnOps<BaseField> for CudaBackend {
     type Column = cuda::BaseFieldVec;
@@ -56,8 +56,9 @@ impl Column<BaseField> for cuda::BaseFieldVec {
 }
 
 impl FromIterator<BaseField> for cuda::BaseFieldVec {
-    fn from_iter<T: IntoIterator<Item = BaseField>>(_iter: T) -> Self {
-        todo!()
+    fn from_iter<T: IntoIterator<Item = BaseField>>(iter: T) -> Self {
+        let vec: Vec<BaseField> = iter.into_iter().collect();
+        BaseFieldVec::from_vec(vec)
     }
 }

From 719ee2829ef20c53bd2b6d8e837ca6b8d6a37f5d Mon Sep 17 00:00:00 2001
From: danielntmd
Date: Mon, 26 Aug 2024 21:15:57 -0700
Subject: [PATCH 3/9] New test numbers

---
 stwo_gpu_backend/benches/quotients.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/stwo_gpu_backend/benches/quotients.rs b/stwo_gpu_backend/benches/quotients.rs
index 97231f67..3ca2226f 100644
--- a/stwo_gpu_backend/benches/quotients.rs
+++ b/stwo_gpu_backend/benches/quotients.rs
@@ -44,9 +44,9 @@ fn bench_quotients
 }
 
 fn quotients_benches(c: &mut Criterion) {
-    bench_quotients::(c, "simd");
-    bench_quotients::(c, "cpu");
-    bench_quotients::(c, "cuda");
+    //bench_quotients::(c, "simd");
+    //bench_quotients::(c, "cpu");
+    bench_quotients::(c, "cuda");
 }
 
 criterion_group!(

From ecf5f4a14c25dc6fe97769a1c2dfa4df92750ebb Mon Sep 17 00:00:00 2001
From: danielntmd
Date: Fri, 30 Aug 2024 15:09:15 -0700
Subject: [PATCH 4/9] Increased performance: raise max threads per block by
 reducing register pressure through inlined address references (changes from
 struct_impl branch)

---
 cuda/src/quotients.cu | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/cuda/src/quotients.cu b/cuda/src/quotients.cu
index 3adfd4c8..f2b0d2b3 100644
--- a/cuda/src/quotients.cu
+++ b/cuda/src/quotients.cu
@@ -74,7 +74,7 @@ __global__ void column_line_and_batch_random_coeffs(
         line_coeffs_sizes[tid] = sample_batches[tid].size;
         size_t sample_batches_offset = tid * line_coeffs_sizes[tid] * 3;
 
-        qm31 alpha = qm31{cm31{m31{1}, m31{0}}, cm31{m31{0}, m31{0}}};
+        qm31 alpha = {{1, 0}, {0, 0}};
 
         for(size_t j = 0; j < sample_batches[tid].size; ++j) {
             qm31 sampled_value = sample_batches[tid].values[j];
@@ -145,17 +145,14 @@ __global__ void accumulate_quotients_in_gpu(
             denominator_inverses
         );
 
-        int i = 0;
-
-        qm31 row_accumulator = qm31{cm31{0, 0}, cm31{0, 0}};
+        qm31 row_accumulator = {{0, 0}, {0, 0}};
         int line_coeffs_offset = 0;
-        while (i < sample_size) {
-            column_sample_batch sample_batch = sample_batches[i];
+
+        for(int i = 0; i < sample_size; ++i) {
             qm31 *line_coeffs = &flattened_line_coeffs[line_coeffs_offset * 3];
-            qm31 batch_coeff = batch_random_coeffs[i];
             int line_coeffs_size = line_coeffs_sizes[i];
+            qm31 numerator = {{0, 0}, {0, 0}};
 
-            qm31 numerator = qm31{cm31{0, 0}, cm31{0, 0}};
             for(int j = 0; j < line_coeffs_size; j++) {
                 qm31 a = line_coeffs[3 * j + 0];
                 qm31 b = line_coeffs[3 * j + 1];
@@ -168,16 +165,14 @@ __global__ void accumulate_quotients_in_gpu(
                 numerator = add(numerator, sub(value, linear_term));
             }
 
-            row_accumulator = add(mul(row_accumulator, batch_coeff), mul(numerator, denominator_inverses[i]));
+            row_accumulator = add(mul(row_accumulator, batch_random_coeffs[i]), mul(numerator, denominator_inverses[i]));
             line_coeffs_offset += line_coeffs_size;
-            i++;
         }
 
         result_column_0[row] = row_accumulator.a.a;
         result_column_1[row] = row_accumulator.a.b;
         result_column_2[row] = row_accumulator.b.a;
         result_column_3[row] = row_accumulator.b.b;
-
     }
 }
 
@@ -250,9 +245,8 @@ void accumulate_quotients(
         batch_random_coeffs_device
     );
 
-    // TODO: set to 1024
-    block_dim = 512;
-    num_blocks = (domain_size + block_dim - 1) / block_dim;
+    block_dim = domain_size < THREAD_COUNT_MAX ? domain_size : THREAD_COUNT_MAX;
+    num_blocks = block_dim < THREAD_COUNT_MAX ? 1 : (domain_size + block_dim - 1) / block_dim;
     accumulate_quotients_in_gpu<<<num_blocks, block_dim>>>(
         half_coset_initial_index,
         half_coset_step_size,
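[Editor's note: PATCH 4 drops the local copies `column_sample_batch sample_batch = sample_batches[i];` and `qm31 batch_coeff = batch_random_coeffs[i];` and reads through the array reference instead. Below is a minimal CUDA sketch of that idea; the struct and kernels are illustrative stand-ins, not code from the patch, and actual register allocation is ultimately up to the nvcc optimizer.]

    struct sample {
        const int *columns;
        int size;
    };

    // Copying the whole element keeps every field live in registers for the
    // duration of the inner loop, which raises per-thread register pressure.
    __global__ void copy_element(const sample *batches, int n, int *out) {
        int acc = 0;
        for (int i = 0; i < n; ++i) {
            sample s = batches[i];
            for (int j = 0; j < s.size; ++j) {
                acc += s.columns[j];
            }
        }
        out[threadIdx.x] = acc;
    }

    // Reading fields through the array reference lets the compiler reload
    // them as needed; fewer live registers per thread can raise the largest
    // launchable block size (here, from 512 toward the 1024 cap).
    __global__ void read_through_reference(const sample *batches, int n, int *out) {
        int acc = 0;
        for (int i = 0; i < n; ++i) {
            for (int j = 0; j < batches[i].size; ++j) {
                acc += batches[i].columns[j];
            }
        }
        out[threadIdx.x] = acc;
    }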
From 07c966fd862fe9c9ba7fd48df3c489605dda73dc Mon Sep 17 00:00:00 2001
From: danielntmd
Date: Fri, 30 Aug 2024 15:11:49 -0700
Subject: [PATCH 5/9] Clean

---
 stwo_gpu_backend/benches/bit_reverse.rs | 32 +++----------------------
 stwo_gpu_backend/benches/quotients.rs   |  3 +--
 2 files changed, 4 insertions(+), 31 deletions(-)

diff --git a/stwo_gpu_backend/benches/bit_reverse.rs b/stwo_gpu_backend/benches/bit_reverse.rs
index 3d67fb1a..5c298812 100644
--- a/stwo_gpu_backend/benches/bit_reverse.rs
+++ b/stwo_gpu_backend/benches/bit_reverse.rs
@@ -35,9 +35,9 @@ pub fn gpu_bit_reverse_secure_field(c: &mut Criterion) {
 
 pub fn cpu_bit_rev(c: &mut Criterion) {
     use stwo_prover::core::utils::bit_reverse;
-    // TODO(andrew): Consider using same size for all.
     const BITS: usize = 28;
     let size = 1 << BITS;
+
     let data = (0..size).map(BaseField::from).collect_vec();
     c.bench_function(&format!("cpu bit_rev {} bit", BITS), |b| {
         b.iter_batched(
@@ -51,6 +51,7 @@ pub fn cpu_bit_rev(c: &mut Criterion) {
 pub fn simd_bit_rev(c: &mut Criterion) {
     use stwo_prover::core::backend::simd::bit_reverse::bit_reverse_m31;
     use stwo_prover::core::backend::simd::column::BaseColumn;
+
     const BITS: usize = 28;
     let size = 1 << BITS;
     let data = (0..size).map(BaseField::from).collect::<BaseColumn>();
@@ -80,7 +81,6 @@ pub fn gpu_bit_reverse_base_field_iter_batched_htd_copy(c: &mut Criterion) {
     const BITS: usize = 28;
     let size = 1 << BITS;
-    //let data = BaseFieldVec::from_vec((0..size).map(BaseField::from).collect_vec());
 
     c.bench_function(&format!("gpu bit_rev base_field {} bit multiple setup htd", BITS), |b| {
         b.iter_batched(||
@@ -91,34 +91,8 @@ pub fn gpu_bit_reverse_base_field_iter_batched_htd_copy(c: &mut Criterion) {
     });
 }
 
-pub fn gpu_bit_reverse_base_field_iter_initializing(c: &mut Criterion) {
-    const BITS: usize = 28;
-    let size = 1 << BITS;
-    let vec = (0..size).map(BaseField::from).collect_vec();
-
-    c.bench_function(&format!("gpu bit_rev base_field {} bit initializing", BITS), |b| {
-        b.iter_with_setup(
-            || vec.clone(),
-            |cloned_vec| BaseFieldVec::from_vec(cloned_vec)
-        )
-    });
-}
-
-pub fn gpu_bit_reverse_base_field_iter_cloning(c: &mut Criterion) {
-    const BITS: usize = 28;
-    let size = 1 << BITS;
-    let data = BaseFieldVec::from_vec((0..size).map(BaseField::from).collect_vec());
-
-    c.bench_function(&format!("gpu bit_rev base_field {} bit cloning", BITS), |b| {
-        b.iter(|| {
-            let _ = data.clone();
-
-        })
-    });
-}
-
 criterion_group!(
     name = bit_reverse;
     config = Criterion::default().sample_size(10);
-    targets = cpu_bit_rev, simd_bit_rev, gpu_bit_reverse_base_field_iter_batched_dtd_copy, gpu_bit_reverse_base_field_iter_batched_htd_copy); //, gpu_bit_reverse_secure_field);
+    targets = cpu_bit_rev, simd_bit_rev, gpu_bit_reverse_base_field_iter_batched_dtd_copy, gpu_bit_reverse_base_field_iter_batched_htd_copy, gpu_bit_reverse_secure_field);
 criterion_main!(bit_reverse);

diff --git a/stwo_gpu_backend/benches/quotients.rs b/stwo_gpu_backend/benches/quotients.rs
index 3ca2226f..a2a72a17 100644
--- a/stwo_gpu_backend/benches/quotients.rs
+++ b/stwo_gpu_backend/benches/quotients.rs
@@ -12,7 +12,6 @@ use stwo_prover::core::fields::qm31::SecureField;
 use stwo_prover::core::pcs::quotients::{ColumnSampleBatch, QuotientOps};
 use stwo_prover::core::poly::circle::{CanonicCoset, CircleEvaluation};
 use stwo_prover::core::poly::BitReversedOrder;
 
-// TODO(andrew): Consider removing const generics and making all sizes the same.
 fn bench_quotients<B: QuotientOps, const LOG_N_ROWS: u32, const LOG_N_COLS: u32>(
     c: &mut Criterion,
     id: &str,
 ) {
@@ -46,7 +45,7 @@ fn bench_quotients
 fn quotients_benches(c: &mut Criterion) {
     //bench_quotients::(c, "simd");
     //bench_quotients::(c, "cpu");
-    bench_quotients::(c, "cuda");
+    bench_quotients::(c, "cuda");
 }
 
 criterion_group!(

From 379c235a6eb03de22915b6b962a8667895eb7988 Mon Sep 17 00:00:00 2001
From: danielntmd
Date: Fri, 30 Aug 2024 15:23:12 -0700
Subject: [PATCH 6/9] Clean

---
 cuda/src/quotients.cu | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/cuda/src/quotients.cu b/cuda/src/quotients.cu
index f2b0d2b3..14ae8ab1 100644
--- a/cuda/src/quotients.cu
+++ b/cuda/src/quotients.cu
@@ -38,7 +38,7 @@ void column_sample_batches_for(
     column_sample_batch *result
 ) {
     unsigned int offset = 0;
-    for (unsigned int index = 0; index < sample_size; index++) {
+    for (size_t index = 0; index < sample_size; ++index) {
         result[index].point = sample_points[index];
         result[index].columns = &sample_column_indexes[offset];
         result[index].values = &sample_column_values[offset];
@@ -98,7 +98,7 @@ __device__ void denominator_inverse(
     const point domain_point,
     cm31 *flat_denominators)
 {
-    for (unsigned int i = 0; i < sample_size; i++) {
+    for (size_t i = 0; i < sample_size; ++i) {
         cm31 prx = sample_batches[i].point.x.a;
         cm31 pry = sample_batches[i].point.y.a;
         cm31 pix = sample_batches[i].point.x.b;
@@ -145,13 +145,13 @@ __global__ void accumulate_quotients_in_gpu(
 
         qm31 row_accumulator = {{0, 0}, {0, 0}};
         int line_coeffs_offset = 0;
-        
-        for(int i = 0; i < sample_size; ++i) {
+
+        for(size_t i = 0; i < sample_size; ++i) {
             qm31 *line_coeffs = &flattened_line_coeffs[line_coeffs_offset * 3];
             int line_coeffs_size = line_coeffs_sizes[i];
             qm31 numerator = {{0, 0}, {0, 0}};
 
-            for(int j = 0; j < line_coeffs_size; j++) {
+            for(size_t j = 0; j < line_coeffs_size; ++j) {
                 qm31 a = line_coeffs[3 * j + 0];
                 qm31 b = line_coeffs[3 * j + 1];
                 qm31 c = line_coeffs[3 * j + 2];

From d91c270733e3cf34d882adb1634c4516702d4e27 Mon Sep 17 00:00:00 2001
From: danielntmd
Date: Fri, 30 Aug 2024 15:27:16 -0700
Subject: [PATCH 7/9] Clean

---
 cuda/src/quotients.cu | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cuda/src/quotients.cu b/cuda/src/quotients.cu
index 14ae8ab1..4fde2e9d 100644
--- a/cuda/src/quotients.cu
+++ b/cuda/src/quotients.cu
@@ -38,7 +38,7 @@ void column_sample_batches_for(
     column_sample_batch *result
 ) {
     unsigned int offset = 0;
-    for (size_t index = 0; index < sample_size; ++index) {
+    for (unsigned int index = 0; index < sample_size; ++index) {
         result[index].point = sample_points[index];
         result[index].columns = &sample_column_indexes[offset];
         result[index].values = &sample_column_values[offset];
@@ -98,7 +98,7 @@ __device__ void denominator_inverse(
     const point domain_point,
     cm31 *flat_denominators)
 {
-    for (size_t i = 0; i < sample_size; ++i) {
+    for (unsigned int i = 0; i < sample_size; ++i) {
         cm31 prx = sample_batches[i].point.x.a;
         cm31 pry = sample_batches[i].point.y.a;
         cm31 pix = sample_batches[i].point.x.b;
@@ -148,12 +148,12 @@ __global__ void accumulate_quotients_in_gpu(
 
         qm31 row_accumulator = {{0, 0}, {0, 0}};
         int line_coeffs_offset = 0;
 
-        for(size_t i = 0; i < sample_size; ++i) {
+        for(int i = 0; i < sample_size; ++i) {
             qm31 *line_coeffs = &flattened_line_coeffs[line_coeffs_offset * 3];
             int line_coeffs_size = line_coeffs_sizes[i];
             qm31 numerator = {{0, 0}, {0, 0}};
 
-            for(size_t j = 0; j < line_coeffs_size; ++j) {
+            for(int j = 0; j < line_coeffs_size; ++j) {
                 qm31 a = line_coeffs[3 * j + 0];
                 qm31 b = line_coeffs[3 * j + 1];
                 qm31 c = line_coeffs[3 * j + 2];

From b4b7b32f31f1849e03e26a734d0042f1ab897859 Mon Sep 17 00:00:00 2001
From: danielntmd
Date: Tue, 17 Sep 2024 12:25:54 -0700
Subject: [PATCH 8/9] Added: Error printing

---
 cuda/src/quotients.cu | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/cuda/src/quotients.cu b/cuda/src/quotients.cu
index 4fde2e9d..71fd5a6c 100644
--- a/cuda/src/quotients.cu
+++ b/cuda/src/quotients.cu
@@ -268,6 +268,11 @@ void accumulate_quotients(
     );
     cudaDeviceSynchronize();
 
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        printf("CUDA Error: %s\n", cudaGetErrorString(err));
+    }
+
     free(sample_batches);
     cudaFree(sample_batches_device);
     cudaFree(denominator_inverses);
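[Editor's note: PATCH 8 below checks cudaGetLastError() once, after the final cudaDeviceSynchronize() in accumulate_quotients. A common extension is to wrap every runtime call and kernel launch; the macro here is a hypothetical helper in that spirit, not part of the patch.]

    #include <cstdio>
    #include <cuda_runtime.h>

    // Reports the file and line of the first failing call. Kernel launches
    // return no status, so pair a launch with cudaGetLastError() and check
    // the execution result after synchronizing.
    #define CUDA_CHECK(call)                                            \
        do {                                                            \
            cudaError_t err_ = (call);                                  \
            if (err_ != cudaSuccess) {                                  \
                fprintf(stderr, "CUDA Error: %s at %s:%d\n",            \
                        cudaGetErrorString(err_), __FILE__, __LINE__);  \
            }                                                           \
        } while (0)

    // Usage sketch:
    //   my_kernel<<<num_blocks, block_dim>>>(...);
    //   CUDA_CHECK(cudaGetLastError());
    //   CUDA_CHECK(cudaDeviceSynchronize());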
From 13c6878bc5ca0c4c59385f553fc6dc263ff539fa Mon Sep 17 00:00:00 2001
From: danielntmd
Date: Tue, 17 Sep 2024 15:40:49 -0700
Subject: [PATCH 9/9] Added: Launch thread count based on compute capability

---
 cuda/src/quotients.cu | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/cuda/src/quotients.cu b/cuda/src/quotients.cu
index 71fd5a6c..7d1b4620 100644
--- a/cuda/src/quotients.cu
+++ b/cuda/src/quotients.cu
@@ -3,7 +3,8 @@
 #include
 
 #define THREAD_COUNT_MAX 1024
-
+#define THREAD_COUNT_512 512
+#define COMPUTE_CAPABILITY_8 8
 typedef struct {
     secure_field_point point;
     uint32_t *columns;
@@ -195,6 +196,13 @@ void accumulate_quotients(
     uint32_t *result_column_3,
     uint32_t flattened_line_coeffs_size
 ) {
+    int device = 0;
+    cudaSetDevice(device);
+
+    cudaDeviceProp deviceProp;
+    cudaGetDeviceProperties(&deviceProp, device);
+
+
     int domain_log_size = log_2((int)domain_size);
 
     auto sample_batches = (column_sample_batch *)malloc(sizeof(column_sample_batch) * sample_size);
@@ -245,8 +253,16 @@ void accumulate_quotients(
         batch_random_coeffs_device
     );
 
-    block_dim = domain_size < THREAD_COUNT_MAX ? domain_size : THREAD_COUNT_MAX;
-    num_blocks = block_dim < THREAD_COUNT_MAX ? 1 : (domain_size + block_dim - 1) / block_dim;
+    // Launch threads based on Compute Capability
+    if(deviceProp.major >= COMPUTE_CAPABILITY_8) {
+        block_dim = domain_size < THREAD_COUNT_MAX ? domain_size : THREAD_COUNT_MAX;
+        num_blocks = block_dim < THREAD_COUNT_MAX ? 1 : (domain_size + block_dim - 1) / block_dim;
+    }
+    else {
+        block_dim = domain_size < THREAD_COUNT_512 ? domain_size : THREAD_COUNT_512;
+        num_blocks = block_dim < THREAD_COUNT_512 ? 1 : (domain_size + block_dim - 1) / block_dim;
+    }
+
     accumulate_quotients_in_gpu<<<num_blocks, block_dim>>>(
         half_coset_initial_index,
         half_coset_step_size,
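[Editor's note: PATCH 9 selects 1024 threads per block on compute capability 8.x and newer, falling back to 512 elsewhere. An alternative, sketched below under the assumption that the kernel carries no launch-bound constraints, is to let the runtime pick the block size with the occupancy API instead of branching on deviceProp.major; the kernel name and signature here are illustrative.]

    #include <cuda_runtime.h>

    __global__ void accumulate_rows(int *out) {
        // Illustrative stand-in for the real accumulation kernel.
        if (out) { out[threadIdx.x] = 0; }
    }

    void launch(unsigned int domain_size, int *out) {
        int min_grid_size = 0;
        int block_dim = 0;
        // Asks the runtime for the block size that maximizes occupancy for
        // this specific kernel on the current device, accounting for its
        // actual register and shared-memory usage.
        cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_dim, accumulate_rows, 0, 0);
        int num_blocks = (domain_size + block_dim - 1) / block_dim;
        accumulate_rows<<<num_blocks, block_dim>>>(out);
    }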