Commit
reorganized code
ostueker committed Jun 24, 2024
1 parent 910b02d commit 3c06558
Showing 20 changed files with 757 additions and 4 deletions.
4 changes: 2 additions & 2 deletions _episodes/07-architecture.md
@@ -43,7 +43,7 @@ Let's run some device diagnostics on a V100 GPU to print out some of its properties
> ## Device diagnostic code `device_diagnostic.cu`
>
> This is the code for `device_diagnostic.cu` that can also be downloaded from:
-> https://raw.githubusercontent.com/acenet-arc/ACENET_Summer_School_GPGPU/gh-pages/code/device_diagnostic.cu
+> https://raw.githubusercontent.com/acenet-arc/ACENET_Summer_School_GPGPU/gh-pages/code/07-architecture/device_diagnostic.cu
>
> ~~~~
> /*
@@ -110,7 +110,7 @@ Let's run some device diagnostics on a V100 GPU to print out some of its properties
$ cd ~/scratch
$ mkdir diagnostics
$ cd diagnostics
-$ wget https://raw.githubusercontent.com/acenet-arc/ACENET_Summer_School_GPGPU/gh-pages/code/device_diagnostic.cu
+$ wget https://raw.githubusercontent.com/acenet-arc/ACENET_Summer_School_GPGPU/gh-pages/code/07-architecture/device_diagnostic.cu
$ nvcc device_diagnostic.cu -o device_diagnostic
$ srun --time=5 --gres=gpu:1 ./device_diagnostic
~~~~
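The listing above is truncated by the diff view. At its core, `device_diagnostic.cu` follows the standard CUDA device-query pattern; here is a minimal sketch of that pattern (the fields printed are an illustrative subset of `cudaDeviceProp`, not the full output of the lesson's program):

~~~
#include <stdio.h>
#include <cuda_runtime.h>

int main(void)
{
    int count = 0;
    cudaGetDeviceCount(&count);              /* how many GPUs are visible? */
    for (int dev = 0; dev < count; dev++)
    {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, dev); /* fill prop for this device */
        printf("Device %d: %s\n", dev, prop.name);
        printf("  Compute capability: %d.%d\n", prop.major, prop.minor);
        printf("  Global memory:      %zu MB\n", prop.totalGlobalMem >> 20);
        printf("  Multiprocessors:    %d\n", prop.multiProcessorCount);
    }
    return 0;
}
~~~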
2 changes: 1 addition & 1 deletion _episodes/09-memory-performance.md
@@ -39,7 +39,7 @@ locations consecutive memory addresses (or as close as possible).

> ## Exercise: Memory access patterns
>
-> Is this a a good memory access pattern?
+> Is this a good memory access pattern?
>
> ~~~
> x = blockIdx.x * blockDim.x + threadIdx.x;
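The exercise above hinges on coalescing: an access pattern is efficient when the consecutive threads of a warp touch consecutive memory addresses, so the hardware can service the whole warp with a few wide transactions. An illustrative pair of kernels (not part of the lesson code) contrasting the two cases:

~~~
/* Coalesced: thread k touches element k, so a warp reads one contiguous span. */
__global__ void copy_coalesced(float *out, const float *in, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = in[i];
}

/* Strided: neighbouring threads touch addresses `stride` elements apart,
   scattering each warp's accesses across many memory transactions. */
__global__ void copy_strided(float *out, const float *in, int n, int stride)
{
    int i = (blockIdx.x * blockDim.x + threadIdx.x) * stride;
    if (i < n)
        out[i] = in[i];
}
~~~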
2 changes: 1 addition & 1 deletion _episodes/10-exercise-julia-set.md
@@ -13,7 +13,7 @@ keypoints:
In the first session of the ACENET Summer School we were introduced to the
[Julia set](https://acenet-arc.github.io/ACENET_Summer_School_General/05-performance/index.html#example-generating-an-image-of-a-julia-set) as an example to demonstrate weak and strong scaling.

-At `https://acenet-arc.github.io/ACENET_Summer_School_GPGPU/code/2_julia/julia_cpu.cu` we have an implementation of the Julia set for calculation on CPUs.
+At `https://acenet-arc.github.io/ACENET_Summer_School_GPGPU/code/10-exercise-julia-set/julia_cpu.cu` we have an implementation of the Julia set for calculation on CPUs.

The goal of this exercise is to adapt this file for computation on GPUs.

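For orientation, the heart of any such Julia-set code is the escape-time iteration z ← z² + c over points of the complex plane. A sketch of the per-point test in plain C (the constant c, the escape radius, and the iteration cap shown here are illustrative; the actual values live in `julia_cpu.cu`):

~~~
/* Return 1 if (x, y) appears to belong to the Julia set, i.e. the
   iteration z = z*z + c has not escaped after maxiter steps. */
int julia_point(float x, float y, int maxiter)
{
    const float cr = -0.8f, ci = 0.156f;        /* illustrative constant c */
    float zr = x, zi = y;
    for (int i = 0; i < maxiter; i++)
    {
        float zr_new = zr * zr - zi * zi + cr;  /* Re(z*z + c) */
        zi = 2.0f * zr * zi + ci;               /* Im(z*z + c) */
        zr = zr_new;
        if (zr * zr + zi * zi > 4.0f)           /* |z| > 2: the point escapes */
            return 0;
    }
    return 1;
}
~~~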
6 changes: 6 additions & 0 deletions code/02-hello-world/hello_world.c
@@ -0,0 +1,6 @@
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv) {
    printf("Hello World\n");
}
10 changes: 10 additions & 0 deletions code/02-hello-world/hello_world.cu
@@ -0,0 +1,10 @@
#include <stdio.h>
#include <stdlib.h>

__global__ void mykernel(void) {   /* __global__ marks a function that runs on the GPU */
}

int main(int argc, char **argv) {
    mykernel<<<1,1>>>();           /* launch the kernel: 1 block of 1 thread */
    printf("Hello world\n");
}
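The `<<<1,1>>>` above is the execution configuration: one block of one thread. A hedged variant (not part of this commit) that makes the configuration visible; note that device-side `printf` output is only guaranteed to appear once the host synchronizes with the device:

~~~
#include <stdio.h>

__global__ void hello_from_gpu(void)
{
    printf("Hello from block %d, thread %d\n", blockIdx.x, threadIdx.x);
}

int main(void)
{
    hello_from_gpu<<<2, 4>>>();   /* 2 blocks of 4 threads: 8 greetings */
    cudaDeviceSynchronize();      /* wait so the device printf buffer is flushed */
    return 0;
}
~~~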
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
50 changes: 50 additions & 0 deletions code/05-using-blocks/addvec_blocks_solution.cu
@@ -0,0 +1,50 @@
#include <stdio.h>
#include <stdlib.h>

__global__ void add(int *da, int *db, int *dc)
{
    /* One block per element: block i computes element i. */
    dc[blockIdx.x] = da[blockIdx.x] + db[blockIdx.x];
}

int main(int argc, char **argv)
{
    if (argc < 4)
    {
        printf("Usage:\n %s a b N\n", argv[0]);
        return (-1);
    }
    int a_in = atoi(argv[1]); // first addend
    int b_in = atoi(argv[2]); // second addend
    int N = atoi(argv[3]);    // length of arrays
    int numBlocks = N;        // launch one block per array element

    int *a, *b, *c;
    int *d_a, *d_b, *d_c;
    int size = N * sizeof(int);
    a = (int *)malloc(size);
    b = (int *)malloc(size);
    c = (int *)malloc(size);

    // Initialize the input vectors
    for (int i = 0; i < N; ++i)
    {
        a[i] = a_in;
        b[i] = b_in;
        c[i] = 0;
    }

    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_b, size);
    cudaMalloc((void **)&d_c, size);
    cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);

    add<<<numBlocks, 1>>>(d_a, d_b, d_c);
    cudaDeviceSynchronize();
    cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    printf("%d + %d = %d\n", a[0], b[0], c[0]);
    printf("...\n");
    printf("%d + %d = %d\n", a[N - 1], b[N - 1], c[N - 1]);
    free(a);
    free(b);
    free(c);
}
85 changes: 85 additions & 0 deletions code/06-all-together/addvec_final_solution.cu
@@ -0,0 +1,85 @@
#include <stdio.h>
#include <stdlib.h>

__global__ void add(int N, int *da, int *db, int *dc)
{
    // This is a CUDA idiom called the grid-stride loop.
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < N; i += stride)
        dc[i] = da[i] + db[i];
}

int main(int argc, char **argv)
{
    // Read values from cmd line.
    if (argc < 6)
    {
        printf("Usage:\n %s a b N threads blocks\n", argv[0]);
        return (-1);
    }
    int a_in = atoi(argv[1]);
    int b_in = atoi(argv[2]);
    int N = atoi(argv[3]);
    int numThreads = atoi(argv[4]);
    int numBlocks = atoi(argv[5]);
    // Or to get the block count that covers N elements:
    // int numBlocks = (N + numThreads - 1) / numThreads;

    // Calculate size of arrays in bytes.
    int size = N * sizeof(int);
    // Allocate host storage.
    int *a, *b, *c;
    a = (int *)malloc(size);
    b = (int *)malloc(size);
    c = (int *)malloc(size);
    // Initialize the input vectors.
    for (int i = 0; i < N; ++i)
    {
        a[i] = a_in;
        b[i] = b_in;
        c[i] = 0;
    }

    // Allocate device storage.
    int *da, *db, *dc;
    cudaMalloc((void **)&da, size);
    cudaMalloc((void **)&db, size);
    cudaMalloc((void **)&dc, size);

    // Copy data to GPU.
    cudaMemcpy(da, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(db, b, size, cudaMemcpyHostToDevice);

    // Execute the kernel on the GPU.
    add<<<numBlocks, numThreads>>>(N, da, db, dc);
    cudaDeviceSynchronize();

    // Copy results back from GPU.
    cudaMemcpy(c, dc, size, cudaMemcpyDeviceToHost);

    // Print results from each end of the array.
    printf("%d plus %d equals %d\n", a[0], b[0], c[0]);
    printf(" ...\n");
    printf("%d plus %d equals %d\n", a[N - 1], b[N - 1], c[N - 1]);

    // Check for stray errors somewhere in the middle.
    // We won't check them all; quit after the first error.
    int expected = a_in + b_in;
    for (int i = 0; i < N; ++i)
    {
        if (c[i] != expected)
        {
            printf("Wrong sum %d at element %d!\n", c[i], i);
            break;
        }
    }

    // Free all allocated memory.
    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);
    free(a);
    free(b);
    free(c);
}
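As a concrete check of the grid-stride arithmetic in `add` above: launched with `numBlocks = 2` and `numThreads = 4`, the stride is 2 × 4 = 8, so the thread with `index = 3` (block 0, thread 3) handles elements 3, 11, 19, … until `i` reaches `N`; together the 8 threads cover every element exactly once, whatever N is. The commented-out formula `(N + numThreads - 1) / numThreads` instead sizes the grid so that each thread's loop body runs just once.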
File renamed without changes.
File renamed without changes.
File renamed without changes.
60 changes: 60 additions & 0 deletions code/09-memory-performance/cublas.cu
@@ -0,0 +1,60 @@
#include <cuda.h>      /* CUDA runtime API */
#include <cstdio>
#include <cstdlib>     /* malloc, free, rand, exit */
#include <cublas_v2.h>

int main(int argc, char *argv[])
{
    float *x_host, *y_host;  /* arrays for computation on host */
    float *x_dev, *y_dev;    /* arrays for computation on device */
    int n = 1024 * 1024;
    float alpha = 0.5f;
    size_t memsize;
    int i;

    /* could add device detection here */
    memsize = n * sizeof(float);

    /* allocate arrays on host */
    x_host = (float *)malloc(memsize);
    y_host = (float *)malloc(memsize);

    /* allocate arrays on device */
    cudaMalloc((void **)&x_dev, memsize);
    cudaMalloc((void **)&y_dev, memsize);

    /* initialize arrays on host */
    for (i = 0; i < n; i++)
    {
        x_host[i] = rand() / (float)RAND_MAX;
        y_host[i] = rand() / (float)RAND_MAX;
    }

    /* copy arrays to device memory (synchronous) */
    cudaMemcpy(x_dev, x_host, memsize, cudaMemcpyHostToDevice);
    cudaMemcpy(y_dev, y_host, memsize, cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasStatus_t status;
    status = cublasCreate(&handle);
    /* check if the CUBLAS handle was created successfully */
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        printf("Error initializing CUBLAS\n");
        exit(10);
    }

    /* y_dev = alpha * x_dev + y_dev */
    int stride = 1;
    status = cublasSaxpy(handle, n, &alpha, x_dev, stride, y_dev, stride);

    /* check if cublasSaxpy launched successfully */
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        printf("Error in launching CUBLAS routine\n");
        exit(20);
    }

    status = cublasDestroy(handle);

    /* retrieve results from device (synchronous) */
    cudaMemcpy(y_host, y_dev, memsize, cudaMemcpyDeviceToHost);
    /* ensure synchronization (cudaMemcpy is synchronous in most cases, but not all) */
    cudaDeviceSynchronize();

    /* use data in y_host */

    /* free memory */
    cudaFree(x_dev);
    cudaFree(y_dev);
    free(x_host);
    free(y_host);
    return 0;
}
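For reference, `cublasSaxpy` with increments of 1 performs the BLAS update y[i] ← alpha·x[i] + y[i] for i = 0 … n−1 (the program above must be linked with `-lcublas`). A hedged verification helper one could bolt on; the saved copy `y_old` is an assumption, since the code as written overwrites `y_host` on retrieval without keeping the original:

~~~
#include <cstdio>
#include <cmath>

/* Return the index of the first element where y_new differs from
   alpha * x + y_old beyond a small tolerance, or -1 if all match. */
int check_saxpy(int n, float alpha, const float *x,
                const float *y_old, const float *y_new)
{
    for (int i = 0; i < n; i++)
    {
        float expected = alpha * x[i] + y_old[i];
        if (fabsf(y_new[i] - expected) > 1e-5f)
            return i;
    }
    return -1;
}
~~~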