diff --git a/README.md b/README.md
index 0e38ddb..9836f11 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,100 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Lindsay Smith
+  * [LinkedIn](https://www.linkedin.com/in/lindsay-j-smith/), [personal website](https://lindsays-portfolio-d6aa5d.webflow.io/).
+* Tested on: Windows 10, i7-11800H, 16GB RAM, 144Hz display, GeForce RTX 3060, 512GB SSD (Personal Laptop)
 
-### (TODO: Your README)
+For this project I implemented scan (prefix sum) and stream compaction algorithms and compared their runtimes on the GPU and CPU, testing with various array sizes and analyzing how efficient each implementation is.
+
+The implementation includes:
+* CPU Scan + Stream Compaction
+* GPU Naive Scan
+* GPU Work-Efficient Scan
+* GPU Work-Efficient Compaction
+* Thrust Scan (mainly for comparison purposes)
+
+For all of my analysis I use a block size of 256 (128 for the naive scan), which I found to be optimal.
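+
+Throughout this README, "scan" means an *exclusive* prefix sum: element `i` of the output is the sum of every input element before index `i`. For example:
+
+```
+input:          [ 1  5  0  1  2  0  3 ]
+exclusive scan: [ 0  1  6  6  7  9  9 ]
+```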
+
+![](img/ScanTimes.png)
+
+In this graph, lower times are better.
+We can see that the CPU is actually faster than the GPU for most array sizes. There is an exception, however, in the range of roughly 2^16 - 2^18, where the GPU scans are faster. We can also see that the Thrust scan is faster than every implementation I wrote, again with an exception around 2^16.
+I am not sure of the reason for this, but an array size of about 2^16 seems to be a sweet spot for my implementations. If I continue to array sizes in the millions, past what is shown in the graph, the discrepancy in execution times only widens.
+
+It is very clear that Thrust has optimizations far beyond my own implementations. With a small data set the differences are not as apparent, but once the array reaches thousands and then millions of elements, it is obvious how much more efficient the Thrust algorithm is. I assume Thrust makes better use of memory. I know my own work-efficient scan could be made more efficient by mapping the shrinking set of active elements to contiguous thread indices, so that the threads that are not doing work can be retired and reused. With that change, each successive kernel launch would need only half as many threads as the one before it, leaving more of the GPU free to do other work; a sketch of the idea follows.
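+
+As a rough sketch of that optimization (this is not code that exists in this repository, and the kernel name is hypothetical), an up-sweep kernel can derive each strided element position from a densely packed thread index, so level `d` needs only `numObj >> (d + 1)` threads instead of one thread per element:
+
+```
+// Hypothetical sketch: launch numActive = N / (2 * offset) threads per level
+// instead of N, mapping each packed thread index to its strided element.
+__global__ void kernUpSweepPacked(int N, int* data, int offset) {
+    int idx = threadIdx.x + (blockDim.x * blockIdx.x);
+    int stride = 2 * offset;          // 2^(d+1), where offset = 2^d
+    int numActive = N / stride;       // threads that actually do work
+    if (idx >= numActive) {
+        return;
+    }
+    int base = idx * stride;          // packed index -> strided element
+    data[base + stride - 1] += data[base + offset - 1];
+}
+
+// usage per level d, launching numActive threads rather than numObj:
+//   int numActive = numObj >> (d + 1);
+//   dim3 blocks((numActive + blockSize - 1) / blockSize);
+//   kernUpSweepPacked<<<blocks, blockSize>>>(numObj, dev_buf, 1 << d);
+```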
+
+![](img/CompactTimes.png)
+
+In this graph, lower values again mean faster runtimes and are therefore "better".
+It is interesting that although the CPU was consistently faster than the GPU for scan, once we add the compaction step the CPU is no longer faster. The GPU is only slower than the CPU for very small arrays; once we get to arrays with thousands and especially millions of elements, the GPU is definitively faster. This is most likely because the GPU spreads the map, scan, and scatter steps across a very high number of threads, whereas the CPU has to walk every element at each step of the compaction algorithm. A small worked example of those three steps follows.
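+
+For example, compacting the zeros out of a five-element array:
+
+```
+input:   [ 3  0  1  0  2 ]
+bools:   [ 1  0  1  0  1 ]   1 where the element is kept
+scan:    [ 0  1  1  2  2 ]   exclusive scan of bools = output index of each kept element
+output:  [ 3  1  2 ]         scatter: output[scan[i]] = input[i] wherever bools[i] == 1
+```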
+
+Sample output from the project's test program:
+
+```
+****************
+** SCAN TESTS **
+****************
+    [  46  29  32  19  41  43  12  31   4  10   2   6  39 ...   0   0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 0.3919ms    (std::chrono Measured)
+    [   0  46  75 107 126 167 210 222 253 257 267 269 275 ... 6427122 6427122 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 0.3764ms    (std::chrono Measured)
+    [   0  46  75 107 126 167 210 222 253 257 267 269 275 ... 6427059 6427087 ]
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 0.100352ms    (CUDA Measured)
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 0.098304ms    (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 0.206848ms    (CUDA Measured)
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 0.205824ms    (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 0.171008ms    (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 0.17728ms    (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   3   3   0   0   0   3   1   3   2   0   0   2   0 ...   2   0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 0.6072ms    (std::chrono Measured)
+    [   3   3   3   1   3   2   2   3   3   3   3   1   3 ...   1   2 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 0.572ms    (std::chrono Measured)
+    [   3   3   3   1   3   2   2   3   3   3   3   1   3 ...   2   1 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 1.4129ms    (std::chrono Measured)
+    [   3   3   3   1   3   2   2   3   3   3   3   1   3 ...   1   2 ]
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 0.219264ms    (CUDA Measured)
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 0.219136ms    (CUDA Measured)
+    passed
+```
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
diff --git a/img/CompactTimes.png b/img/CompactTimes.png
new file mode 100644
index 0000000..1f34776
Binary files /dev/null and b/img/CompactTimes.png differ
diff --git a/img/ScanTimes.png b/img/ScanTimes.png
new file mode 100644
index 0000000..f10ecb0
Binary files /dev/null and b/img/ScanTimes.png differ
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d63..f1b7a52 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -24,6 +24,16 @@ namespace StreamCompaction {
          */
         __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
             // TODO
+            int idx = threadIdx.x + (blockDim.x * blockIdx.x);
+            if (idx >= n) {
+                return;
+            }
+            // 1 marks an element to keep, 0 an element to discard
+            if (idata[idx] != 0) {
+                bools[idx] = 1;
+            }
+            else {
+                bools[idx] = 0;
+            }
         }
 
         /**
@@ -33,6 +43,13 @@ namespace StreamCompaction {
         __global__ void kernScatter(int n, int *odata, const int *idata,
                 const int *bools, const int *indices) {
             // TODO
+            int idx = threadIdx.x + (blockDim.x * blockIdx.x);
+            if (idx >= n) {
+                return;
+            }
+            // each kept element lands at the index produced by the exclusive scan
+            if (bools[idx] == 1) {
+                odata[indices[idx]] = idata[idx];
+            }
         }
 
     }
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..932e9d6 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -19,7 +19,11 @@ namespace StreamCompaction {
          */
         void scan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+            // TODO (DONE)
+            // exclusive scan: odata[i] holds the sum of idata[0..i-1]
+            odata[0] = 0;
+            for (int i = 0; i < n - 1; i++) {
+                odata[i + 1] = idata[i] + odata[i];
+            }
             timer().endCpuTimer();
         }
 
@@ -30,9 +34,16 @@ namespace StreamCompaction {
          */
         int compactWithoutScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+            // TODO (DONE)
+            // copy every nonzero element forward, tracking the output count
+            int num = 0;
+            for (int i = 0; i < n; i++) {
+                if (idata[i]) {
+                    odata[num] = idata[i];
+                    num++;
+                }
+            }
             timer().endCpuTimer();
-            return -1;
+            return num;
         }
 
         /**
@@ -43,8 +54,27 @@ namespace StreamCompaction {
          */
         int compactWithScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            // map each element to a 0/1 keep flag
+            int *arr = new int[n];
+            for (int i = 0; i < n; i++) {
+                arr[i] = idata[i] ? 1 : 0;
+            }
+
+            // exclusive scan of the flags, inlined rather than calling scan()
+            // so that the CPU timer isn't started a second time
+            int* scanArr = new int[n];
+            scanArr[0] = 0;
+            for (int i = 0; i < n - 1; i++) {
+                scanArr[i + 1] = arr[i] + scanArr[i];
+            }
+
+            // scatter: the scan result is each kept element's output index
+            int num = 0;
+            for (int i = 0; i < n; i++) {
+                if (arr[i]) {
+                    odata[scanArr[i]] = idata[i];
+                    num++;
+                }
+            }
             timer().endCpuTimer();
-            return -1;
+            delete[] arr;
+            delete[] scanArr;
+            return num;
         }
     }
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..92cc5d6 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -12,13 +12,65 @@ namespace StreamCompaction {
             return timer;
         }
 
+        int* dev_buf;
+        int* dev_bools;
+        int* dev_idata;
+        int* dev_odata;
+        int* dev_scanned;
+
+        // up-sweep (parallel reduction): each active index folds its left
+        // partner's partial sum into itself
+        __global__ void kernUpSweep(int N, int* data, int offset) {
+            int idx = threadIdx.x + (blockDim.x * blockIdx.x);
+            if (idx >= N) {
+                return;
+            }
+            if (idx % (2 * offset) == 0) {
+                data[idx + offset * 2 - 1] += data[idx + offset - 1];
+            }
+        }
+
+        // down-sweep: push each node's value to its left child and the sum
+        // to its right child, producing an exclusive scan
+        __global__ void kernDownSweep(int N, int* data, int offset) {
+            int idx = threadIdx.x + (blockDim.x * blockIdx.x);
+            if (idx >= N) {
+                return;
+            }
+            if (idx % (2 * offset) == 0) {
+                int temp = data[idx + offset - 1];
+                data[idx + offset - 1] = data[idx + offset * 2 - 1];
+                data[idx + offset * 2 - 1] += temp;
+            }
+        }
+
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            // TODO (DONE)
+
+            // pad the working size up to the next power of two
+            int blockSize = 256;
+            int max = ilog2ceil(n);
+            int numObj = 1 << max;
+            dim3 numBlocks((numObj + blockSize - 1) / blockSize);
+
+            // malloc, zero the padding, and copy over the n real elements
+            cudaMalloc((void**)&dev_buf, sizeof(int) * numObj);
+            cudaMemset(dev_buf, 0, sizeof(int) * numObj);
+            cudaMemcpy(dev_buf, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
             timer().startGpuTimer();
-            // TODO
+            // call kernels: up-sweep, zero the root, then down-sweep
+            for (int i = 0; i < max; i++) {
+                kernUpSweep<<<numBlocks, blockSize>>>(numObj, dev_buf, 1 << i);
+            }
+            cudaMemset(dev_buf + numObj - 1, 0, sizeof(int));
+            for (int i = max - 1; i >= 0; i--) {
+                kernDownSweep<<<numBlocks, blockSize>>>(numObj, dev_buf, 1 << i);
+            }
             timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_buf, sizeof(int) * n, cudaMemcpyDeviceToHost);
+            cudaFree(dev_buf);
         }
 
         /**
@@ -31,10 +83,53 @@ namespace StreamCompaction {
          * @returns The number of elements remaining after compaction.
          */
         int compact(int n, int *odata, const int *idata) {
+            int blockSize = 256;
+            int max = ilog2ceil(n);
+            int numObj = 1 << max;
+            dim3 numBlocks((numObj + blockSize - 1) / blockSize);
+
+            cudaMalloc((void**)&dev_bools, sizeof(int) * numObj);
+            cudaMalloc((void**)&dev_idata, sizeof(int) * numObj);
+            cudaMalloc((void**)&dev_odata, sizeof(int) * numObj);
+            cudaMalloc((void**)&dev_scanned, sizeof(int) * numObj);
+
+            // zero the padding so the extra slots map to 0 (discard) below
+            cudaMemset(dev_idata, 0, sizeof(int) * numObj);
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
             timer().startGpuTimer();
-            // TODO
+            // TODO (DONE)
+            // map to 0/1 flags, exclusive-scan the flags, then scatter
+            StreamCompaction::Common::kernMapToBoolean<<<numBlocks, blockSize>>>(numObj, dev_bools, dev_idata);
+            cudaMemcpy(dev_scanned, dev_bools, sizeof(int) * numObj, cudaMemcpyDeviceToDevice);
+
+            for (int i = 0; i < max; i++) {
+                kernUpSweep<<<numBlocks, blockSize>>>(numObj, dev_scanned, 1 << i);
+            }
+            cudaMemset(dev_scanned + numObj - 1, 0, sizeof(int));
+            for (int i = max - 1; i >= 0; i--) {
+                kernDownSweep<<<numBlocks, blockSize>>>(numObj, dev_scanned, 1 << i);
+            }
+
+            StreamCompaction::Common::kernScatter<<<numBlocks, blockSize>>>(numObj, dev_odata, dev_idata, dev_bools, dev_scanned);
             timer().endGpuTimer();
-            return -1;
+
+            int* bools = new int[numObj];
+            cudaMemcpy(bools, dev_bools, sizeof(int) * numObj, cudaMemcpyDeviceToHost);
+            cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_idata);
+            cudaFree(dev_bools);
+            cudaFree(dev_scanned);
+            cudaFree(dev_odata);
+
+            // count the kept elements on the host
+            int count = 0;
+            for (int i = 0; i < n; i++) {
+                if (bools[i] == 1) {
+                    count++;
+                }
+            }
+            delete[] bools;
+            return count;
         }
     }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..6aa5c55 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -11,15 +11,56 @@ namespace StreamCompaction {
             static PerformanceTimer timer;
             return timer;
         }
-        // TODO: __global__
+        // TODO: (DONE)
+        __global__ void kernNaiveScan(int N, int* odata, int* idata, int offset) {
+            int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (idx >= N) {
+                return;
+            }
+            // each pass adds in the element `offset` positions to the left
+            if (idx >= offset) {
+                odata[idx] = idata[idx - offset] + idata[idx];
+            }
+            else {
+                odata[idx] = idata[idx];
+            }
+        }
 
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
+            // TODO (DONE)
+            const int blockSize = 128;
+            dim3 blocksPerGrid((n + blockSize - 1) / blockSize);
+            int max = ilog2ceil(n);
+
+            // double buffers so each pass reads one buffer and writes the other
+            int* buf1;
+            int* buf2;
+            cudaMalloc((void**)&buf1, n * sizeof(int));
+            //checkCUDAErrorWithLine("cudaMalloc buf1 failed!");
+            cudaMalloc((void**)&buf2, n * sizeof(int));
+            //checkCUDAErrorWithLine("cudaMalloc buf2 failed!");
+
+            // fill the input buffer
+            cudaMemcpy(buf1, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+
+            // ping-pong the buffers, doubling the offset each pass;
+            // this leaves an inclusive scan in buf1
+            timer().startGpuTimer();
+            for (int i = 1; i <= max; i++) {
+                kernNaiveScan<<<blocksPerGrid, blockSize>>>(n, buf2, buf1, 1 << (i - 1));
+                std::swap(buf1, buf2);
+            }
+            timer().endGpuTimer();
+
+            // shift right by one element to convert the inclusive scan to exclusive
+            odata[0] = 0;
+            cudaMemcpy(odata + 1, buf1, sizeof(int) * (n - 1), cudaMemcpyDeviceToHost);
+
+            cudaFree(buf1);
+            cudaFree(buf2);
        }
    }
}
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..2fbb3c5 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -18,11 +18,13 @@ namespace StreamCompaction {
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            // TODO (DONE)
+            thrust::device_vector<int> dv_in(idata, idata + n);
+            thrust::device_vector<int> dv_out(n);
             timer().startGpuTimer();
-            // TODO use `thrust::exclusive_scan`
-            // example: for device_vectors dv_in and dv_out:
-            // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
             timer().endGpuTimer();
+            thrust::copy(dv_out.begin(), dv_out.end(), odata);
         }
     }
 }