Commit
reorganized code
ostueker committed Jun 24, 2024
1 parent 910b02d commit 3c06558
Showing 20 changed files with 757 additions and 4 deletions.
4 changes: 2 additions & 2 deletions _episodes/07-architecture.md
@@ -43,7 +43,7 @@ Let's run some device diagnostics on a V100 GPU to print out some of its properties
> ## Device diagnostic code `device_diagnostic.cu`
>
> This is the code for `device_diagnostic.cu` that can also be downloaded from:
-> https://raw.githubusercontent.com/acenet-arc/ACENET_Summer_School_GPGPU/gh-pages/code/device_diagnostic.cu
+> https://raw.githubusercontent.com/acenet-arc/ACENET_Summer_School_GPGPU/gh-pages/code/07-architecture/device_diagnostic.cu
>
> ~~~~
> /*
@@ -110,7 +110,7 @@ Let's run some device diagnostics on a V100 GPU to print out some of its properties
$ cd ~/scratch
$ mkdir diagnostics
$ cd diagnostics
-$ wget https://raw.githubusercontent.com/acenet-arc/ACENET_Summer_School_GPGPU/gh-pages/code/device_diagnostic.cu
+$ wget https://raw.githubusercontent.com/acenet-arc/ACENET_Summer_School_GPGPU/gh-pages/code/07-architecture/device_diagnostic.cu
$ nvcc device_diagnostic.cu -o device_diagnostic
$ srun --time=5 --gres=gpu:1 ./device_diagnostic
~~~~
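The listing above is truncated by the diff view. At its core, `device_diagnostic.cu` follows the standard CUDA device-query pattern; here is a minimal sketch of that pattern (the fields printed are an illustrative subset of `cudaDeviceProp`, not the full output of the lesson's program):

~~~
#include <stdio.h>
#include <cuda_runtime.h>

int main(void)
{
    int count = 0;
    cudaGetDeviceCount(&count);              /* how many GPUs are visible? */
    for (int dev = 0; dev < count; dev++)
    {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, dev); /* fill prop for this device */
        printf("Device %d: %s\n", dev, prop.name);
        printf("  Compute capability: %d.%d\n", prop.major, prop.minor);
        printf("  Global memory:      %zu MB\n", prop.totalGlobalMem >> 20);
        printf("  Multiprocessors:    %d\n", prop.multiProcessorCount);
    }
    return 0;
}
~~~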
2 changes: 1 addition & 1 deletion _episodes/09-memory-performance.md
@@ -39,7 +39,7 @@ locations consecutive memory addresses (or as close as possible).

> ## Exercise: Memory access patterns
>
-> Is this a a good memory access pattern?
+> Is this a good memory access pattern?
>
> ~~~
> x = blockIdx.x * blockDim.x + threadIdx.x;
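The exercise above hinges on coalescing: an access pattern is efficient when the consecutive threads of a warp touch consecutive memory addresses, so the hardware can service the whole warp with a few wide transactions. An illustrative pair of kernels (not part of the lesson code) contrasting the two cases:

~~~
/* Coalesced: thread k touches element k, so a warp reads one contiguous span. */
__global__ void copy_coalesced(float *out, const float *in, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = in[i];
}

/* Strided: neighbouring threads touch addresses `stride` elements apart,
   scattering each warp's accesses across many memory transactions. */
__global__ void copy_strided(float *out, const float *in, int n, int stride)
{
    int i = (blockIdx.x * blockDim.x + threadIdx.x) * stride;
    if (i < n)
        out[i] = in[i];
}
~~~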
2 changes: 1 addition & 1 deletion _episodes/10-exercise-julia-set.md
@@ -13,7 +13,7 @@ keypoints:
In the first session of the ACENET Summer School we were introduced to the
[Julia set](https://acenet-arc.github.io/ACENET_Summer_School_General/05-performance/index.html#example-generating-an-image-of-a-julia-set) as an example to demonstrate weak and strong scaling.

-At `https://acenet-arc.github.io/ACENET_Summer_School_GPGPU/code/2_julia/julia_cpu.cu` we have an implementation of the Julia set for calculation on CPUs.
+At `https://acenet-arc.github.io/ACENET_Summer_School_GPGPU/code/10-exercise-julia-set/julia_cpu.cu` we have an implementation of the Julia set for calculation on CPUs.

The goal of this exercise is to adapt this file for computation on GPUs.

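For orientation, the heart of any such Julia-set code is the escape-time iteration z ← z² + c over points of the complex plane. A sketch of the per-point test in plain C (the constant c, the escape radius, and the iteration cap shown here are illustrative; the actual values live in `julia_cpu.cu`):

~~~
/* Return 1 if (x, y) appears to belong to the Julia set, i.e. the
   iteration z = z*z + c has not escaped after maxiter steps. */
int julia_point(float x, float y, int maxiter)
{
    const float cr = -0.8f, ci = 0.156f;        /* illustrative constant c */
    float zr = x, zi = y;
    for (int i = 0; i < maxiter; i++)
    {
        float zr_new = zr * zr - zi * zi + cr;  /* Re(z*z + c) */
        zi = 2.0f * zr * zi + ci;               /* Im(z*z + c) */
        zr = zr_new;
        if (zr * zr + zi * zi > 4.0f)           /* |z| > 2: the point escapes */
            return 0;
    }
    return 1;
}
~~~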
6 changes: 6 additions & 0 deletions code/02-hello-world/hello_world.c
@@ -0,0 +1,6 @@
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv) {
    printf("Hello World\n");
}
10 changes: 10 additions & 0 deletions code/02-hello-world/hello_world.cu
@@ -0,0 +1,10 @@
#include <stdio.h>
#include <stdlib.h>

__global__ void mykernel(void) {   /* __global__ marks a function that runs on the GPU */
}

int main(int argc, char **argv) {
    mykernel<<<1,1>>>();           /* launch the kernel: 1 block of 1 thread */
    printf("Hello world\n");
}
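The `<<<1,1>>>` above is the execution configuration: one block of one thread. A hedged variant (not part of this commit) that makes the configuration visible; note that device-side `printf` output is only guaranteed to appear once the host synchronizes with the device:

~~~
#include <stdio.h>

__global__ void hello_from_gpu(void)
{
    printf("Hello from block %d, thread %d\n", blockIdx.x, threadIdx.x);
}

int main(void)
{
    hello_from_gpu<<<2, 4>>>();   /* 2 blocks of 4 threads: 8 greetings */
    cudaDeviceSynchronize();      /* wait so the device printf buffer is flushed */
    return 0;
}
~~~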
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
50 changes: 50 additions & 0 deletions code/05-using-blocks/addvec_blocks_solution.cu
@@ -0,0 +1,50 @@
#include <stdio.h>
#include <stdlib.h>

__global__ void add(int *da, int *db, int *dc)
{
    /* One block per element: block i computes element i. */
    dc[blockIdx.x] = da[blockIdx.x] + db[blockIdx.x];
}

int main(int argc, char **argv)
{
    if (argc < 4)
    {
        printf("Usage:\n %s a b N\n", argv[0]);
        return (-1);
    }
    int a_in = atoi(argv[1]); // first addend
    int b_in = atoi(argv[2]); // second addend
    int N = atoi(argv[3]);    // length of arrays
    int numBlocks = N;        // launch one block per array element

    int *a, *b, *c;
    int *d_a, *d_b, *d_c;
    int size = N * sizeof(int);
    a = (int *)malloc(size);
    b = (int *)malloc(size);
    c = (int *)malloc(size);

    // Initialize the input vectors
    for (int i = 0; i < N; ++i)
    {
        a[i] = a_in;
        b[i] = b_in;
        c[i] = 0;
    }

    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_b, size);
    cudaMalloc((void **)&d_c, size);
    cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);

    add<<<numBlocks, 1>>>(d_a, d_b, d_c);
    cudaDeviceSynchronize();
    cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    printf("%d + %d = %d\n", a[0], b[0], c[0]);
    printf("...\n");
    printf("%d + %d = %d\n", a[N - 1], b[N - 1], c[N - 1]);
    free(a);
    free(b);
    free(c);
}
85 changes: 85 additions & 0 deletions code/06-all-together/addvec_final_solution.cu
@@ -0,0 +1,85 @@
#include <stdio.h>
#include <stdlib.h>

__global__ void add(int N, int *da, int *db, int *dc)
{
    // This is a CUDA idiom called the grid-stride loop.
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < N; i += stride)
        dc[i] = da[i] + db[i];
}

int main(int argc, char **argv)
{
    // Read values from cmd line.
    if (argc < 6)
    {
        printf("Usage:\n %s a b N threads blocks\n", argv[0]);
        return (-1);
    }
    int a_in = atoi(argv[1]);
    int b_in = atoi(argv[2]);
    int N = atoi(argv[3]);
    int numThreads = atoi(argv[4]);
    int numBlocks = atoi(argv[5]);
    // Or to get the block count that covers N elements:
    // int numBlocks = (N + numThreads - 1) / numThreads;

    // Calculate size of arrays in bytes.
    int size = N * sizeof(int);
    // Allocate host storage.
    int *a, *b, *c;
    a = (int *)malloc(size);
    b = (int *)malloc(size);
    c = (int *)malloc(size);
    // Initialize the input vectors.
    for (int i = 0; i < N; ++i)
    {
        a[i] = a_in;
        b[i] = b_in;
        c[i] = 0;
    }

    // Allocate device storage.
    int *da, *db, *dc;
    cudaMalloc((void **)&da, size);
    cudaMalloc((void **)&db, size);
    cudaMalloc((void **)&dc, size);

    // Copy data to GPU.
    cudaMemcpy(da, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(db, b, size, cudaMemcpyHostToDevice);

    // Execute the kernel on the GPU.
    add<<<numBlocks, numThreads>>>(N, da, db, dc);
    cudaDeviceSynchronize();

    // Copy results back from GPU.
    cudaMemcpy(c, dc, size, cudaMemcpyDeviceToHost);

    // Print results from each end of the array.
    printf("%d plus %d equals %d\n", a[0], b[0], c[0]);
    printf(" ...\n");
    printf("%d plus %d equals %d\n", a[N - 1], b[N - 1], c[N - 1]);

    // Check for stray errors somewhere in the middle.
    // We won't check them all; quit after the first error.
    int expected = a_in + b_in;
    for (int i = 0; i < N; ++i)
    {
        if (c[i] != expected)
        {
            printf("Wrong sum %d at element %d!\n", c[i], i);
            break;
        }
    }

    // Free all allocated memory.
    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);
    free(a);
    free(b);
    free(c);
}
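As a concrete check of the grid-stride arithmetic in `add` above: launched with `numBlocks = 2` and `numThreads = 4`, the stride is 2 × 4 = 8, so the thread with `index = 3` (block 0, thread 3) handles elements 3, 11, 19, … until `i` reaches `N`; together the 8 threads cover every element exactly once, whatever N is. The commented-out formula `(N + numThreads - 1) / numThreads` instead sizes the grid so that each thread's loop body runs just once.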
File renamed without changes.
File renamed without changes.
File renamed without changes.
60 changes: 60 additions & 0 deletions code/09-memory-performance/cublas.cu
@@ -0,0 +1,60 @@
#include <cuda.h>      /* CUDA runtime API */
#include <cstdio>
#include <cstdlib>     /* malloc, free, rand, exit */
#include <cublas_v2.h>

int main(int argc, char *argv[])
{
    float *x_host, *y_host;  /* arrays for computation on host */
    float *x_dev, *y_dev;    /* arrays for computation on device */
    int n = 1024 * 1024;
    float alpha = 0.5f;
    size_t memsize;
    int i;

    /* could add device detection here */
    memsize = n * sizeof(float);

    /* allocate arrays on host */
    x_host = (float *)malloc(memsize);
    y_host = (float *)malloc(memsize);

    /* allocate arrays on device */
    cudaMalloc((void **)&x_dev, memsize);
    cudaMalloc((void **)&y_dev, memsize);

    /* initialize arrays on host */
    for (i = 0; i < n; i++)
    {
        x_host[i] = rand() / (float)RAND_MAX;
        y_host[i] = rand() / (float)RAND_MAX;
    }

    /* copy arrays to device memory (synchronous) */
    cudaMemcpy(x_dev, x_host, memsize, cudaMemcpyHostToDevice);
    cudaMemcpy(y_dev, y_host, memsize, cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasStatus_t status;
    status = cublasCreate(&handle);
    /* check if the CUBLAS handle was created successfully */
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        printf("Error initializing CUBLAS\n");
        exit(10);
    }

    /* y_dev = alpha * x_dev + y_dev */
    int stride = 1;
    status = cublasSaxpy(handle, n, &alpha, x_dev, stride, y_dev, stride);

    /* check if cublasSaxpy launched successfully */
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        printf("Error in launching CUBLAS routine\n");
        exit(20);
    }

    status = cublasDestroy(handle);

    /* retrieve results from device (synchronous) */
    cudaMemcpy(y_host, y_dev, memsize, cudaMemcpyDeviceToHost);
    /* ensure synchronization (cudaMemcpy is synchronous in most cases, but not all) */
    cudaDeviceSynchronize();

    /* use data in y_host */

    /* free memory */
    cudaFree(x_dev);
    cudaFree(y_dev);
    free(x_host);
    free(y_host);
    return 0;
}
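For reference, `cublasSaxpy` with increments of 1 performs the BLAS update y[i] ← alpha·x[i] + y[i] for i = 0 … n−1 (the program above must be linked with `-lcublas`). A hedged verification helper one could bolt on; the saved copy `y_old` is an assumption, since the code as written overwrites `y_host` on retrieval without keeping the original:

~~~
#include <cstdio>
#include <cmath>

/* Return the index of the first element where y_new differs from
   alpha * x + y_old beyond a small tolerance, or -1 if all match. */
int check_saxpy(int n, float alpha, const float *x,
                const float *y_old, const float *y_new)
{
    for (int i = 0; i < n; i++)
    {
        float expected = alpha * x[i] + y_old[i];
        if (fabsf(y_new[i] - expected) > 1e-5f)
            return i;
    }
    return -1;
}
~~~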