Optimization: Increase max threads per block #19

Open · wants to merge 9 commits into main

51 changes: 33 additions & 18 deletions cuda/src/quotients.cu
@@ -3,7 +3,8 @@
#include <cstdio>

#define THREAD_COUNT_MAX 1024

#define THREAD_COUNT_512 512
#define COMPUTE_CAPABILITY_8 8
typedef struct {
secure_field_point point;
uint32_t *columns;
@@ -38,7 +39,7 @@ void column_sample_batches_for(
column_sample_batch *result
) {
unsigned int offset = 0;
for (unsigned int index = 0; index < sample_size; index++) {
for (unsigned int index = 0; index < sample_size; ++index) {
result[index].point = sample_points[index];
result[index].columns = &sample_column_indexes[offset];
result[index].values = &sample_column_values[offset];
@@ -74,7 +75,7 @@ __global__ void column_line_and_batch_random_coeffs(
line_coeffs_sizes[tid] = sample_batches[tid].size;
size_t sample_batches_offset = tid * line_coeffs_sizes[tid] * 3;

qm31 alpha = qm31{cm31{m31{1}, m31{0}}, cm31{m31{0}, m31{0}}};
qm31 alpha = {{1, 0}, {0, 0}};

for(size_t j = 0; j < sample_batches[tid].size; ++j) {
qm31 sampled_value = sample_batches[tid].values[j];
@@ -98,7 +99,7 @@ __device__ void denominator_inverse(
const point domain_point,
cm31 *flat_denominators) {

for (unsigned int i = 0; i < sample_size; i++) {
for (unsigned int i = 0; i < sample_size; ++i) {
cm31 prx = sample_batches[i].point.x.a;
cm31 pry = sample_batches[i].point.y.a;
cm31 pix = sample_batches[i].point.x.b;
@@ -145,18 +146,15 @@ __global__ void accumulate_quotients_in_gpu(
denominator_inverses
);

int i = 0;

qm31 row_accumulator = qm31{cm31{0, 0}, cm31{0, 0}};
qm31 row_accumulator = {{0, 0}, {0, 0}};
int line_coeffs_offset = 0;
while (i < sample_size) {
column_sample_batch sample_batch = sample_batches[i];

for(int i = 0; i < sample_size; ++i) {
qm31 *line_coeffs = &flattened_line_coeffs[line_coeffs_offset * 3];
qm31 batch_coeff = batch_random_coeffs[i];
int line_coeffs_size = line_coeffs_sizes[i];
qm31 numerator = {{0, 0}, {0, 0}};

qm31 numerator = qm31{cm31{0, 0}, cm31{0, 0}};
for(int j = 0; j < line_coeffs_size; j++) {
for(int j = 0; j < line_coeffs_size; ++j) {
qm31 a = line_coeffs[3 * j + 0];
qm31 b = line_coeffs[3 * j + 1];
qm31 c = line_coeffs[3 * j + 2];
@@ -168,16 +166,14 @@ __global__ void accumulate_quotients_in_gpu(
numerator = add(numerator, sub(value, linear_term));
}

row_accumulator = add(mul(row_accumulator, batch_coeff), mul(numerator, denominator_inverses[i]));
row_accumulator = add(mul(row_accumulator, batch_random_coeffs[i]), mul(numerator, denominator_inverses[i]));
line_coeffs_offset += line_coeffs_size;
i++;
}

result_column_0[row] = row_accumulator.a.a;
result_column_1[row] = row_accumulator.a.b;
result_column_2[row] = row_accumulator.b.a;
result_column_3[row] = row_accumulator.b.b;

}
}

@@ -200,6 +196,13 @@ void accumulate_quotients(
uint32_t *result_column_3,
uint32_t flattened_line_coeffs_size
) {
int device = 0;
cudaSetDevice(device);

cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, device);


int domain_log_size = log_2((int)domain_size);

auto sample_batches = (column_sample_batch *)malloc(sizeof(column_sample_batch) * sample_size);
@@ -250,9 +253,16 @@
batch_random_coeffs_device
);

// TODO: set to 1024
block_dim = 512;
num_blocks = (domain_size + block_dim - 1) / block_dim;
// Launch threads based on Compute Capability
if(deviceProp.major >= COMPUTE_CAPABILITY_8) {
block_dim = domain_size < THREAD_COUNT_MAX ? domain_size : THREAD_COUNT_MAX;
num_blocks = block_dim < THREAD_COUNT_MAX ? 1 : (domain_size + block_dim - 1) / block_dim;
}
else {
block_dim = domain_size < THREAD_COUNT_512 ? domain_size : THREAD_COUNT_512;
num_blocks = block_dim < THREAD_COUNT_512 ? 1 : (domain_size + block_dim - 1) / block_dim;
}

accumulate_quotients_in_gpu<<<num_blocks, block_dim>>>(
half_coset_initial_index,
half_coset_step_size,
@@ -274,6 +284,11 @@
);
cudaDeviceSynchronize();

cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("CUDA Error: %s\n", cudaGetErrorString(err));
}

free(sample_batches);
cudaFree(sample_batches_device);
cudaFree(denominator_inverses);
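To make the new launch configuration easier to check, here is a minimal host-side sketch of the same block/grid arithmetic, assuming a hypothetical domain_size of 1 << 20. It is not the PR's code verbatim: it folds the single-block case into the ceiling division, which gives the same result whenever domain_size fits in one block.

#include <cstdio>
#include <cuda_runtime.h>

int main() {
    // Query the device, as the new accumulate_quotients host wrapper does.
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    unsigned int domain_size = 1u << 20;  // hypothetical example size
    // 1024 threads per block on compute capability 8.x and newer, else 512.
    unsigned int max_threads = prop.major >= 8 ? 1024 : 512;

    unsigned int block_dim = domain_size < max_threads ? domain_size : max_threads;
    unsigned int num_blocks = (domain_size + block_dim - 1) / block_dim;

    // domain_size = 2^20: 1024 blocks of 1024 threads on CC >= 8.0,
    // 2048 blocks of 512 threads on older devices.
    printf("block_dim=%u num_blocks=%u\n", block_dim, num_blocks);
    return 0;
}
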
8 changes: 8 additions & 0 deletions stwo_gpu_backend/Cargo.toml
@@ -26,3 +26,11 @@ harness = false
[[bench]]
name = "batch_inverse"
harness = false

[[bench]]
name = "eval_at_point"
harness = false

[[bench]]
name = "quotients"
harness = false
62 changes: 60 additions & 2 deletions stwo_gpu_backend/benches/bit_reverse.rs
@@ -1,4 +1,4 @@
use criterion::{criterion_group, criterion_main, Criterion};
use criterion::{criterion_group, criterion_main, BatchSize, Criterion};
use itertools::Itertools;
use rand::rngs::SmallRng;
use rand::{Rng, SeedableRng};
@@ -33,8 +33,66 @@ pub fn gpu_bit_reverse_secure_field(c: &mut Criterion) {
});
}

pub fn cpu_bit_rev(c: &mut Criterion) {
use stwo_prover::core::utils::bit_reverse;
const BITS: usize = 28;
let size = 1 << BITS;

let data = (0..size).map(BaseField::from).collect_vec();
c.bench_function(&format!("cpu bit_rev {} bit", BITS), |b| {
b.iter_batched(
|| data.clone(),
|mut data| bit_reverse(&mut data),
BatchSize::LargeInput,
);
});
}

pub fn simd_bit_rev(c: &mut Criterion) {
use stwo_prover::core::backend::simd::bit_reverse::bit_reverse_m31;
use stwo_prover::core::backend::simd::column::BaseColumn;

const BITS: usize = 28;
let size = 1 << BITS;
let data = (0..size).map(BaseField::from).collect::<BaseColumn>();
c.bench_function(&format!("simd bit_rev {} bit", BITS), |b| {
b.iter_batched(
|| data.data.clone(),
|mut data| bit_reverse_m31(&mut data),
BatchSize::LargeInput,
);
});
}

pub fn gpu_bit_reverse_base_field_iter_batched_dtd_copy(c: &mut Criterion) {
const BITS: usize = 28;
let size = 1 << BITS;
let data = BaseFieldVec::from_vec((0..size).map(BaseField::from).collect_vec());

c.bench_function(&format!("gpu bit_rev base_field {} bit multiple setup dtd", BITS), |b| {
b.iter_batched(||
data.clone(),
|mut data| <CudaBackend as ColumnOps<BaseField>>::bit_reverse_column(&mut data),
BatchSize::LargeInput,
);
});
}

pub fn gpu_bit_reverse_base_field_iter_batched_htd_copy(c: &mut Criterion) {
const BITS: usize = 28;
let size = 1 << BITS;

c.bench_function(&format!("gpu bit_rev base_field {} bit multiple setup htd", BITS), |b| {
b.iter_batched(||
BaseFieldVec::from_vec((0..size).map(BaseField::from).collect_vec()),
|mut data| <CudaBackend as ColumnOps<BaseField>>::bit_reverse_column(&mut data),
BatchSize::LargeInput,
);
});
}

criterion_group!(
name = bit_reverse;
config = Criterion::default().sample_size(10);
targets = gpu_bit_reverse_base_field, gpu_bit_reverse_secure_field);
targets = cpu_bit_rev, simd_bit_rev, gpu_bit_reverse_base_field_iter_batched_dtd_copy, gpu_bit_reverse_base_field_iter_batched_htd_copy, gpu_bit_reverse_secure_field);
criterion_main!(bit_reverse);
37 changes: 37 additions & 0 deletions stwo_gpu_backend/benches/eval_at_point.rs
@@ -0,0 +1,37 @@
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use rand::rngs::SmallRng;
use rand::{Rng, SeedableRng};
use stwo_gpu_backend::CudaBackend;
use stwo_prover::core::backend::cpu::CpuBackend;
use stwo_prover::core::backend::simd::SimdBackend;
use stwo_prover::core::circle::CirclePoint;
use stwo_prover::core::fields::m31::BaseField;
use stwo_prover::core::poly::circle::{CirclePoly, PolyOps};

const LOG_SIZE: u32 = 20;

fn bench_eval_at_secure_point<B: PolyOps>(c: &mut Criterion, id: &str) {
let poly = CirclePoly::new((0..1 << LOG_SIZE).map(BaseField::from).collect());
let mut rng = SmallRng::seed_from_u64(0);
let x = rng.gen();
let y = rng.gen();
let point = CirclePoint { x, y };
c.bench_function(
&format!("{id} eval_at_secure_field_point 2^{LOG_SIZE}"),
|b| {
b.iter(|| B::eval_at_point(black_box(&poly), black_box(point)));
},
);
}

fn eval_at_secure_point_benches(c: &mut Criterion) {
bench_eval_at_secure_point::<SimdBackend>(c, "simd");
bench_eval_at_secure_point::<CpuBackend>(c, "cpu");
bench_eval_at_secure_point::<CudaBackend>(c, "cuda");
}

criterion_group!(
name = eval_at_point;
config = Criterion::default().sample_size(10);
targets = eval_at_secure_point_benches);
criterion_main!(eval_at_point);
55 changes: 55 additions & 0 deletions stwo_gpu_backend/benches/quotients.rs
@@ -0,0 +1,55 @@
#![feature(iter_array_chunks)]

use criterion::{black_box, criterion_group, criterion_main, Criterion};
use itertools::Itertools;
use stwo_gpu_backend::CudaBackend;
use stwo_prover::core::backend::cpu::CpuBackend;
use stwo_prover::core::backend::simd::SimdBackend;
use stwo_prover::core::circle::SECURE_FIELD_CIRCLE_GEN;
use stwo_prover::core::fields::m31::BaseField;
use stwo_prover::core::fields::qm31::SecureField;
use stwo_prover::core::pcs::quotients::{ColumnSampleBatch, QuotientOps};
use stwo_prover::core::poly::circle::{CanonicCoset, CircleEvaluation};
use stwo_prover::core::poly::BitReversedOrder;

fn bench_quotients<B: QuotientOps, const LOG_N_ROWS: u32, const LOG_N_COLS: u32>(
c: &mut Criterion,
id: &str,
) {
let domain = CanonicCoset::new(LOG_N_ROWS).circle_domain();
let values = (0..domain.size()).map(BaseField::from).collect();
let col = CircleEvaluation::<B, BaseField, BitReversedOrder>::new(domain, values);
let cols = (0..1 << LOG_N_COLS).map(|_| col.clone()).collect_vec();
let col_refs = cols.iter().collect_vec();
let random_coeff = SecureField::from_u32_unchecked(0, 1, 2, 3);
let a = SecureField::from_u32_unchecked(5, 6, 7, 8);
let samples = vec![ColumnSampleBatch {
point: SECURE_FIELD_CIRCLE_GEN,
columns_and_values: (0..1 << LOG_N_COLS).map(|i| (i, a)).collect(),
}];
c.bench_function(
&format!("{id} quotients 2^{LOG_N_COLS} x 2^{LOG_N_ROWS}"),
|b| {
b.iter_with_large_drop(|| {
B::accumulate_quotients(
black_box(domain),
black_box(&col_refs),
black_box(random_coeff),
black_box(&samples),
)
})
},
);
}

fn quotients_benches(c: &mut Criterion) {
//bench_quotients::<SimdBackend, 20, 8>(c, "simd");
//bench_quotients::<CpuBackend, 20, 8>(c, "cpu");
bench_quotients::<CudaBackend, 20, 8>(c, "cuda");
}

criterion_group!(
name = quotients;
config = Criterion::default().sample_size(10);
targets = quotients_benches);
criterion_main!(quotients);
7 changes: 4 additions & 3 deletions stwo_gpu_backend/src/column.rs
@@ -4,7 +4,7 @@ use stwo_prover::core::{
};
use stwo_prover::core::vcs::blake2_hash::Blake2sHash;

use crate::{backend::CudaBackend, cuda};
use crate::{backend::CudaBackend, cuda::{self, BaseFieldVec}};

impl ColumnOps<BaseField> for CudaBackend {
type Column = cuda::BaseFieldVec;
@@ -56,8 +56,9 @@ impl Column<BaseField> for cuda::BaseFieldVec {
}

impl FromIterator<BaseField> for cuda::BaseFieldVec {
fn from_iter<T: IntoIterator<Item = BaseField>>(_iter: T) -> Self {
todo!()
fn from_iter<T: IntoIterator<Item = BaseField>>(iter: T) -> Self {
let vec: Vec<BaseField> = iter.into_iter().collect();
BaseFieldVec::from_vec(vec)
}
}
