Optimization: Increase max threads per block #19

Open · wants to merge 9 commits into main

51 changes: 33 additions & 18 deletions cuda/src/quotients.cu
@@ -3,7 +3,8 @@
#include <cstdio>

#define THREAD_COUNT_MAX 1024

#define THREAD_COUNT_512 512
#define COMPUTE_CAPABILITY_8 8
typedef struct {
secure_field_point point;
uint32_t *columns;
@@ -38,7 +39,7 @@ void column_sample_batches_for(
column_sample_batch *result
) {
unsigned int offset = 0;
for (unsigned int index = 0; index < sample_size; index++) {
for (unsigned int index = 0; index < sample_size; ++index) {
result[index].point = sample_points[index];
result[index].columns = &sample_column_indexes[offset];
result[index].values = &sample_column_values[offset];
@@ -74,7 +75,7 @@ __global__ void column_line_and_batch_random_coeffs(
line_coeffs_sizes[tid] = sample_batches[tid].size;
size_t sample_batches_offset = tid * line_coeffs_sizes[tid] * 3;

qm31 alpha = qm31{cm31{m31{1}, m31{0}}, cm31{m31{0}, m31{0}}};
qm31 alpha = {{1, 0}, {0, 0}};

for(size_t j = 0; j < sample_batches[tid].size; ++j) {
qm31 sampled_value = sample_batches[tid].values[j];
@@ -98,7 +99,7 @@ __device__ void denominator_inverse(
const point domain_point,
cm31 *flat_denominators) {

for (unsigned int i = 0; i < sample_size; i++) {
for (unsigned int i = 0; i < sample_size; ++i) {
cm31 prx = sample_batches[i].point.x.a;
cm31 pry = sample_batches[i].point.y.a;
cm31 pix = sample_batches[i].point.x.b;
@@ -145,18 +146,15 @@ __global__ void accumulate_quotients_in_gpu(
denominator_inverses
);

int i = 0;

qm31 row_accumulator = qm31{cm31{0, 0}, cm31{0, 0}};
qm31 row_accumulator = {{0, 0}, {0, 0}};
int line_coeffs_offset = 0;
while (i < sample_size) {
column_sample_batch sample_batch = sample_batches[i];

for(int i = 0; i < sample_size; ++i) {
qm31 *line_coeffs = &flattened_line_coeffs[line_coeffs_offset * 3];
qm31 batch_coeff = batch_random_coeffs[i];
int line_coeffs_size = line_coeffs_sizes[i];
qm31 numerator = {{0, 0}, {0, 0}};

qm31 numerator = qm31{cm31{0, 0}, cm31{0, 0}};
for(int j = 0; j < line_coeffs_size; j++) {
for(int j = 0; j < line_coeffs_size; ++j) {
qm31 a = line_coeffs[3 * j + 0];
qm31 b = line_coeffs[3 * j + 1];
qm31 c = line_coeffs[3 * j + 2];
@@ -168,16 +166,14 @@ __global__ void accumulate_quotients_in_gpu(
numerator = add(numerator, sub(value, linear_term));
}

row_accumulator = add(mul(row_accumulator, batch_coeff), mul(numerator, denominator_inverses[i]));
row_accumulator = add(mul(row_accumulator, batch_random_coeffs[i]), mul(numerator, denominator_inverses[i]));
line_coeffs_offset += line_coeffs_size;
i++;
}

result_column_0[row] = row_accumulator.a.a;
result_column_1[row] = row_accumulator.a.b;
result_column_2[row] = row_accumulator.b.a;
result_column_3[row] = row_accumulator.b.b;

}
}

@@ -200,6 +196,13 @@ void accumulate_quotients(
uint32_t *result_column_3,
uint32_t flattened_line_coeffs_size
) {
int device = 0;
cudaSetDevice(device);

cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, device);


int domain_log_size = log_2((int)domain_size);

auto sample_batches = (column_sample_batch *)malloc(sizeof(column_sample_batch) * sample_size);
@@ -250,9 +253,16 @@
batch_random_coeffs_device
);

// TODO: set to 1024
block_dim = 512;
num_blocks = (domain_size + block_dim - 1) / block_dim;
// Launch threads based on Compute Capability
if(deviceProp.major >= COMPUTE_CAPABILITY_8) {
block_dim = domain_size < THREAD_COUNT_MAX ? domain_size : THREAD_COUNT_MAX;
num_blocks = block_dim < THREAD_COUNT_MAX ? 1 : (domain_size + block_dim - 1) / block_dim;
}
else {
block_dim = domain_size < THREAD_COUNT_512 ? domain_size : THREAD_COUNT_512;
num_blocks = block_dim < THREAD_COUNT_512 ? 1 : (domain_size + block_dim - 1) / block_dim;
}

accumulate_quotients_in_gpu<<<num_blocks, block_dim>>>(
half_coset_initial_index,
half_coset_step_size,
@@ -274,6 +284,11 @@
);
cudaDeviceSynchronize();

cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("CUDA Error: %s\n", cudaGetErrorString(err));
}

free(sample_batches);
cudaFree(sample_batches_device);
cudaFree(denominator_inverses);
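To make the new launch configuration easier to check, here is a minimal host-side sketch of the same block/grid arithmetic, assuming a hypothetical domain_size of 1 << 20. It is not the PR's code verbatim: it folds the single-block case into the ceiling division, which gives the same result whenever domain_size fits in one block.

#include <cstdio>
#include <cuda_runtime.h>

int main() {
    // Query the device, as the new accumulate_quotients host wrapper does.
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    unsigned int domain_size = 1u << 20;  // hypothetical example size
    // 1024 threads per block on compute capability 8.x and newer, else 512.
    unsigned int max_threads = prop.major >= 8 ? 1024 : 512;

    unsigned int block_dim = domain_size < max_threads ? domain_size : max_threads;
    unsigned int num_blocks = (domain_size + block_dim - 1) / block_dim;

    // domain_size = 2^20: 1024 blocks of 1024 threads on CC >= 8.0,
    // 2048 blocks of 512 threads on older devices.
    printf("block_dim=%u num_blocks=%u\n", block_dim, num_blocks);
    return 0;
}
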
8 changes: 8 additions & 0 deletions stwo_gpu_backend/Cargo.toml
@@ -26,3 +26,11 @@ harness = false
[[bench]]
name = "batch_inverse"
harness = false

[[bench]]
name = "eval_at_point"
harness = false

[[bench]]
name = "quotients"
harness = false
62 changes: 60 additions & 2 deletions stwo_gpu_backend/benches/bit_reverse.rs
@@ -1,4 +1,4 @@
use criterion::{criterion_group, criterion_main, Criterion};
use criterion::{criterion_group, criterion_main, BatchSize, Criterion};
use itertools::Itertools;
use rand::rngs::SmallRng;
use rand::{Rng, SeedableRng};
@@ -33,8 +33,66 @@ pub fn gpu_bit_reverse_secure_field(c: &mut Criterion) {
});
}

pub fn cpu_bit_rev(c: &mut Criterion) {
use stwo_prover::core::utils::bit_reverse;
const BITS: usize = 28;
let size = 1 << BITS;

let data = (0..size).map(BaseField::from).collect_vec();
c.bench_function(&format!("cpu bit_rev {} bit", BITS), |b| {
b.iter_batched(
|| data.clone(),
|mut data| bit_reverse(&mut data),
BatchSize::LargeInput,
);
});
}

pub fn simd_bit_rev(c: &mut Criterion) {
use stwo_prover::core::backend::simd::bit_reverse::bit_reverse_m31;
use stwo_prover::core::backend::simd::column::BaseColumn;

const BITS: usize = 28;
let size = 1 << BITS;
let data = (0..size).map(BaseField::from).collect::<BaseColumn>();
c.bench_function(&format!("simd bit_rev {} bit", BITS), |b| {
b.iter_batched(
|| data.data.clone(),
|mut data| bit_reverse_m31(&mut data),
BatchSize::LargeInput,
);
});
}

pub fn gpu_bit_reverse_base_field_iter_batched_dtd_copy(c: &mut Criterion) {
const BITS: usize = 28;
let size = 1 << BITS;
let data = BaseFieldVec::from_vec((0..size).map(BaseField::from).collect_vec());

c.bench_function(&format!("gpu bit_rev base_field {} bit multiple setup dtd", BITS), |b| {
b.iter_batched(||
data.clone(),
|mut data| <CudaBackend as ColumnOps<BaseField>>::bit_reverse_column(&mut data),
BatchSize::LargeInput,
);
});
}

pub fn gpu_bit_reverse_base_field_iter_batched_htd_copy(c: &mut Criterion) {
const BITS: usize = 28;
let size = 1 << BITS;

c.bench_function(&format!("gpu bit_rev base_field {} bit multiple setup htd", BITS), |b| {
b.iter_batched(||
BaseFieldVec::from_vec((0..size).map(BaseField::from).collect_vec()),
|mut data| <CudaBackend as ColumnOps<BaseField>>::bit_reverse_column(&mut data),
BatchSize::LargeInput,
);
});
}

criterion_group!(
name = bit_reverse;
config = Criterion::default().sample_size(10);
targets = gpu_bit_reverse_base_field, gpu_bit_reverse_secure_field);
targets = cpu_bit_rev, simd_bit_rev, gpu_bit_reverse_base_field_iter_batched_dtd_copy, gpu_bit_reverse_base_field_iter_batched_htd_copy, gpu_bit_reverse_secure_field);
criterion_main!(bit_reverse);
37 changes: 37 additions & 0 deletions stwo_gpu_backend/benches/eval_at_point.rs
@@ -0,0 +1,37 @@
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use rand::rngs::SmallRng;
use rand::{Rng, SeedableRng};
use stwo_gpu_backend::CudaBackend;
use stwo_prover::core::backend::cpu::CpuBackend;
use stwo_prover::core::backend::simd::SimdBackend;
use stwo_prover::core::circle::CirclePoint;
use stwo_prover::core::fields::m31::BaseField;
use stwo_prover::core::poly::circle::{CirclePoly, PolyOps};

const LOG_SIZE: u32 = 20;

fn bench_eval_at_secure_point<B: PolyOps>(c: &mut Criterion, id: &str) {
let poly = CirclePoly::new((0..1 << LOG_SIZE).map(BaseField::from).collect());
let mut rng = SmallRng::seed_from_u64(0);
let x = rng.gen();
let y = rng.gen();
let point = CirclePoint { x, y };
c.bench_function(
&format!("{id} eval_at_secure_field_point 2^{LOG_SIZE}"),
|b| {
b.iter(|| B::eval_at_point(black_box(&poly), black_box(point)));
},
);
}

fn eval_at_secure_point_benches(c: &mut Criterion) {
bench_eval_at_secure_point::<SimdBackend>(c, "simd");
bench_eval_at_secure_point::<CpuBackend>(c, "cpu");
bench_eval_at_secure_point::<CudaBackend>(c, "cuda");
}

criterion_group!(
name = eval_at_point;
config = Criterion::default().sample_size(10);
targets = eval_at_secure_point_benches);
criterion_main!(eval_at_point);
55 changes: 55 additions & 0 deletions stwo_gpu_backend/benches/quotients.rs
@@ -0,0 +1,55 @@
#![feature(iter_array_chunks)]

use criterion::{black_box, criterion_group, criterion_main, Criterion};
use itertools::Itertools;
use stwo_gpu_backend::CudaBackend;
use stwo_prover::core::backend::cpu::CpuBackend;
use stwo_prover::core::backend::simd::SimdBackend;
use stwo_prover::core::circle::SECURE_FIELD_CIRCLE_GEN;
use stwo_prover::core::fields::m31::BaseField;
use stwo_prover::core::fields::qm31::SecureField;
use stwo_prover::core::pcs::quotients::{ColumnSampleBatch, QuotientOps};
use stwo_prover::core::poly::circle::{CanonicCoset, CircleEvaluation};
use stwo_prover::core::poly::BitReversedOrder;

fn bench_quotients<B: QuotientOps, const LOG_N_ROWS: u32, const LOG_N_COLS: u32>(
c: &mut Criterion,
id: &str,
) {
let domain = CanonicCoset::new(LOG_N_ROWS).circle_domain();
let values = (0..domain.size()).map(BaseField::from).collect();
let col = CircleEvaluation::<B, BaseField, BitReversedOrder>::new(domain, values);
let cols = (0..1 << LOG_N_COLS).map(|_| col.clone()).collect_vec();
let col_refs = cols.iter().collect_vec();
let random_coeff = SecureField::from_u32_unchecked(0, 1, 2, 3);
let a = SecureField::from_u32_unchecked(5, 6, 7, 8);
let samples = vec![ColumnSampleBatch {
point: SECURE_FIELD_CIRCLE_GEN,
columns_and_values: (0..1 << LOG_N_COLS).map(|i| (i, a)).collect(),
}];
c.bench_function(
&format!("{id} quotients 2^{LOG_N_COLS} x 2^{LOG_N_ROWS}"),
|b| {
b.iter_with_large_drop(|| {
B::accumulate_quotients(
black_box(domain),
black_box(&col_refs),
black_box(random_coeff),
black_box(&samples),
)
})
},
);
}

fn quotients_benches(c: &mut Criterion) {
//bench_quotients::<SimdBackend, 20, 8>(c, "simd");
//bench_quotients::<CpuBackend, 20, 8>(c, "cpu");
bench_quotients::<CudaBackend, 20, 8>(c, "cuda");
}

criterion_group!(
name = quotients;
config = Criterion::default().sample_size(10);
targets = quotients_benches);
criterion_main!(quotients);
7 changes: 4 additions & 3 deletions stwo_gpu_backend/src/column.rs
@@ -4,7 +4,7 @@ use stwo_prover::core::{
};
use stwo_prover::core::vcs::blake2_hash::Blake2sHash;

use crate::{backend::CudaBackend, cuda};
use crate::{backend::CudaBackend, cuda::{self, BaseFieldVec}};

impl ColumnOps<BaseField> for CudaBackend {
type Column = cuda::BaseFieldVec;
@@ -56,8 +56,9 @@ impl Column<BaseField> for cuda::BaseFieldVec {
}

impl FromIterator<BaseField> for cuda::BaseFieldVec {
fn from_iter<T: IntoIterator<Item = BaseField>>(_iter: T) -> Self {
todo!()
fn from_iter<T: IntoIterator<Item = BaseField>>(iter: T) -> Self {
let vec: Vec<BaseField> = iter.into_iter().collect();
BaseFieldVec::from_vec(vec)
}
}
