diff --git a/README.md b/README.md index 0e38ddb..f39329a 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,61 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Matt Elser + * [LinkedIn](https://www.linkedin.com/in/matt-elser-97b8151ba/), [twitter](twitter.com/__mattelser__) +* Tested on: Tested on: Ubuntu 20.04, i3-10100F @ 3.6GHz 16GB, GeForce 1660 Super 6GB -### (TODO: Your README) +![timing data table](img/timingData.png) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +### Main Features +This project implements an exclusive scan (performing an operation on (in this case a sum of) all previous +elements along an array) and stream compaction (removing elements from an array based on a condition) using +the CPU, GPU with CUDA, and the CUDA powered library `Thrust`. +- all required algorithms work +- Efficient scan and compaction are faster than CPU implementation for arrays of sufficient size +- Radix sort has been implemented (without tiling or bitonic merge) +### Time Comparison +The test setup in `main.cpp` has been coopted to set up repeated timings to remove noise from measurements. + +![scan comparison plot](img/scanPlot.png) +![compact comparison plot](img/compactPlot.png) + +CPU scan and compact may be faster for smaller arrays, but the efficient GPU algorithms become more efficient +at array size 2^16 for compact and 2^17 for efficient scan. + +### Block Size Comparison +changing the number of threads per blocks did not have a noticeable impact on timing. Minor differences are +can be seen in the graph below, but almost all are around one standard deviation of another, so this may just +be noise. Data was gathered by timing 100 runs of each algorithm with an array of size 2^22, see the bottom of +the readme for the data table. + +![block comparison plot](img/blocksizePlot.png) + +### Known limitations +- [FIXED] The Naive implementation fails for array sizes greater than 2^25. + - Naive was calling an inefficient number of threads, leading to higher-than needed `threadIdx.x` + values. When multiplied to get the `index` this overflowed int and yielded a negative index. + Logic around indices (reasonably) assumed positive values and therefore caused an out of bounds write. +- compact scan fails for array sizes greater than 2^28 due to running out of CPU memory on the (16Gb) test machine. + +### Extra Credit +- Work Efficient GPU algorithms are more efficient than CPU (for large array sizes). This was achieved + by removing divergent threads from `upsweep` and `downsweep` kernel calls. Prior to the discussion in class of + modifying the indexing for optimal thread scheduling, these algorithms were implemented to acheive the same end + iteratively. Each layer is a separate kernel call, and each kernel call only spawns one thread per index being + written to (i.e. n/2 for the first call, n/4 for the next, etc.). This does not have the same benefit of contiguous + memory reads, however. Of note, this iterative method does simplify syncing of threads/kernels. No threads on any + given layer are reading/writing from/to any of the indices being read/written from/to by any other thread. Since + All layers are separate kernel calls (on the default stream and therefore with an implicit join/sync), so no + explicit syncs are needed. +- Radix sort has been implemented, though without tiling or bitonic merge. It sorts correctly for all array sizes (power of two + and non-power of two) up until 2^27, at which point the test machine runs out of memory. Radix sorting has been + validated against `Thrust`'s sort (though the timing of the two are different by several orders of magnitude). + The algorithm has not been optimized to use shared memory or contiguous memory reads, and would fail for arrays with + negative values. Here is a plot comparing the timing of the radix sort implementation with `Thrust`s sort + ![sort runtime comparison](img/sortPlot.png) + + + +![block comparison table](img/blockComparison.png) diff --git a/img/blockComparison.png b/img/blockComparison.png new file mode 100644 index 0000000..21cdd0e Binary files /dev/null and b/img/blockComparison.png differ diff --git a/img/blocksizePlot.png b/img/blocksizePlot.png new file mode 100644 index 0000000..5e63d16 Binary files /dev/null and b/img/blocksizePlot.png differ diff --git a/img/compactPlot.png b/img/compactPlot.png new file mode 100644 index 0000000..ee3b5f5 Binary files /dev/null and b/img/compactPlot.png differ diff --git a/img/scanPlot.png b/img/scanPlot.png new file mode 100644 index 0000000..8aeb53e Binary files /dev/null and b/img/scanPlot.png differ diff --git a/img/sortPlot.png b/img/sortPlot.png new file mode 100644 index 0000000..2af6a2a Binary files /dev/null and b/img/sortPlot.png differ diff --git a/img/timingData.png b/img/timingData.png new file mode 100644 index 0000000..bb80142 Binary files /dev/null and b/img/timingData.png differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..6a65ca2 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -10,10 +10,12 @@ #include #include #include +#include #include +#include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 26; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -54,11 +56,13 @@ int main(int argc, char* argv[]) { //printArray(SIZE, c, true); printCmpResult(SIZE, b, c); - /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan + /* + //For bug-finding only: Array of 1s to help find bugs in stream compaction or scan onesArray(SIZE, c); printDesc("1s array for finding bugs"); StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); */ + printArray(SIZE, c, true); + */ zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); @@ -78,7 +82,6 @@ int main(int argc, char* argv[]) { printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); @@ -147,8 +150,133 @@ int main(int argc, char* argv[]) { //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + // --- repeated timing --- + printf("\n"); + printf("******************************\n"); + printf("** SCAN & COMPACTION TIMING **\n"); + printf("******************************\n"); + + int NUM_TIMINGS = 100; + std::vector data; + float stdDev; + float mean; + + printf(" Data gathered from %i runs with array size %i (2^%i)\n", NUM_TIMINGS, SIZE, ilog2(SIZE)); + printf("--------------------------------------------------------------\n\n"); + printf("------------------------------| mean (ms) |--| stdDev (ms) |--\n"); + printf("------ Scan ------\n"); + + // CPU + for (int i = 0; i < NUM_TIMINGS; i++) { + zeroArray(SIZE, c); + StreamCompaction::CPU::scan(SIZE, c, a); + data.push_back(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation()); + } + tabulate(&data, &mean, &stdDev); + printf("CPU Scan \t%f\t%f\n", mean, stdDev); + data.clear(); + + // Naive + for (int i = 0; i < NUM_TIMINGS; i++) { + zeroArray(SIZE, c); + StreamCompaction::Naive::scan(SIZE, c, a); + data.push_back(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation()); + } + tabulate(&data, &mean, &stdDev); + printf("Naive GPU Scan \t%f\t%f\n", mean, stdDev); + data.clear(); + + // work efficient + for (int i = 0; i < NUM_TIMINGS; i++) { + zeroArray(SIZE, c); + StreamCompaction::Efficient::scan(SIZE, c, a); + data.push_back(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation()); + } + tabulate(&data, &mean, &stdDev); + printf("Work Efficient GPU Scan \t%f\t%f\n", mean, stdDev); + data.clear(); + + // work efficient + for (int i = 0; i < NUM_TIMINGS; i++) { + zeroArray(SIZE, c); + StreamCompaction::Thrust::scan(SIZE, c, a); + data.push_back(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation()); + } + tabulate(&data, &mean, &stdDev); + printf("Thrust Library Scan \t%f\t%f\n", mean, stdDev); + data.clear(); + + printf("----- Compact -----\n"); + + // CPU + for (int i = 0; i < NUM_TIMINGS; i++) { + zeroArray(SIZE, c); + StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + data.push_back(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation()); + } + tabulate(&data, &mean, &stdDev); + printf("CPU compact without Scan \t%f\t%f\n", mean, stdDev); + data.clear(); + + for (int i = 0; i < NUM_TIMINGS; i++) { + zeroArray(SIZE, c); + StreamCompaction::CPU::compactWithScan(SIZE, b, a); + data.push_back(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation()); + } + tabulate(&data, &mean, &stdDev); + printf("CPU compact with Scan \t%f\t%f\n", mean, stdDev); + data.clear(); + + // work efficient + for (int i = 0; i < NUM_TIMINGS; i++) { + zeroArray(SIZE, c); + StreamCompaction::Efficient::compact(SIZE, c, a); + data.push_back(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation()); + } + tabulate(&data, &mean, &stdDev); + printf("Work Efficient GPU compact\t%f\t%f\n", mean, stdDev); + data.clear(); + + // --- Radix Sort --- + + printf("\n"); + printf("******************************\n"); + printf("********* RADIX SORT *********\n"); + printf("******************************\n"); + + genArray(SIZE, a, 50); + printArray(SIZE, a, true); + + zeroArray(SIZE, b); + printDesc("thrust sort, power-of-two"); + StreamCompaction::Thrust::sort(SIZE, b, a); + printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(SIZE, b, true); + + zeroArray(SIZE, c); + printDesc("radix sort, power-of-two"); + StreamCompaction::Radix::sort(SIZE, c, a); + printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(NPOT, b); + printDesc("thrust sort, non-power-of-two"); + StreamCompaction::Thrust::sort(NPOT, b, a); + printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(NPOT, b, true); + + zeroArray(NPOT, c); + printDesc("radix sort, non-power-of-two"); + StreamCompaction::Radix::sort(NPOT, c, a); + printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + + printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + system("pause"); // stop Win32 console from closing on exit delete[] a; delete[] b; delete[] c; } + diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp index 025e94a..feef968 100644 --- a/src/testing_helpers.hpp +++ b/src/testing_helpers.hpp @@ -5,6 +5,7 @@ #include #include #include +#include template int cmpArrays(int n, T *a, T *b) { @@ -74,3 +75,19 @@ void printElapsedTime(T time, std::string note = "") { std::cout << " elapsed time: " << time << "ms " << note << std::endl; } + +void tabulate(std::vector* data, float* mean, float* stdDev) { + float sum = 0; + for (auto i : *data) { + sum += i; + } + *mean = sum / data->size(); + + float variance = 0; + for (auto i : *data) { + variance += (i - *mean) * (i - *mean); + } + variance /= data->size(); + + *stdDev = sqrt(variance); +} diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index 567795b..eb71175 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -4,6 +4,7 @@ set(headers "naive.h" "efficient.h" "thrust.h" + "radix.h" ) set(sources @@ -12,6 +13,7 @@ set(sources "naive.cu" "efficient.cu" "thrust.cu" + "radix.cu" ) list(SORT headers) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..5d3eb74 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,12 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + + bools[index] = idata[index] != 0; } /** @@ -32,7 +37,14 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + + if (bools[index]) { + odata[indices[index]] = idata[index]; + } } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..67730e7 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -19,7 +19,12 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + odata[0] = 0; + + for (int i = 1; i < n; i++) { + odata[i] = idata[i - 1] + odata[i - 1]; + } + timer().endCpuTimer(); } @@ -30,9 +35,15 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int oi = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[oi] = idata[i]; + oi++; + } + } timer().endCpuTimer(); - return -1; + return oi; } /** @@ -42,9 +53,30 @@ namespace StreamCompaction { */ int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + int* shouldInclude = new int[n]; + int* scan = new int[n]; + + for (int i = 0; i < n; i++) { + shouldInclude[i] = (idata[i] != 0) ? 1 : 0; + } + + scan[0] = 0; + for (int i = 1; i < n ; i++) { + scan[i] = shouldInclude[i-1] + scan[i - 1]; + } + + int lastIndex = 0; + for (int i = 0; i < n; i++) { + if (shouldInclude[i] != 0) { + lastIndex = scan[i]; + odata[lastIndex] = idata[i]; + } + } + delete[] shouldInclude; + delete[] scan; timer().endCpuTimer(); - return -1; + return lastIndex+1; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..1759d44 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,6 +3,10 @@ #include "common.h" #include "efficient.h" +#ifndef THREADS_PER_BLOCK +#define THREADS_PER_BLOCK 512 +#endif // !BLOCKSIZE + namespace StreamCompaction { namespace Efficient { using StreamCompaction::Common::PerformanceTimer; @@ -12,13 +16,84 @@ namespace StreamCompaction { return timer; } + __global__ void kernUpSweep(int n, int *odata, int d) { + + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + + int k = index * (1 << (d + 1)); + + odata[k + ((1<<(d+1))-1)] = odata[k + (1 << d) - 1] + odata[k + (1 << (d+1)) - 1]; + //__syncthreads(); + } + + __global__ void kernDownSweep(int n, int *odata, int d) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + + int k = index * (1 << (d + 1)); + int t = odata[k + (1 << d) - 1]; + odata[k + (1 << d) - 1] = odata[k + (1 << (d + 1)) - 1]; + odata[k + (1 << (d + 1)) - 1] += t; + //__syncthreads(); + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + + int* dev_readable; + int* dev_odata; + + // pad to a power of 2 + int paddedN = 1 << ilog2ceil(n); + + cudaMalloc((void**)&dev_odata, paddedN * sizeof(int)); + + // write n items to the GPU array, the total length is `paddedN`, meaning arr[n:paddedN] are 0 + cudaMemcpy(dev_odata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + // The threads/blocks will change per kernel call but declare them here + int numBlocks = ceil( (float)n / THREADS_PER_BLOCK); + int numThreads; + timer().startGpuTimer(); - // TODO + + // --- up sweep --- + for (int d = 0; d < log2(paddedN); d++) { + numThreads = ((paddedN - 1) / (1 << (d + 1))) + 1; + numBlocks = ceil((float)numThreads / THREADS_PER_BLOCK); + kernUpSweep <<>> (numThreads, dev_odata, d); + //cudaDeviceSynchronize(); + checkCUDAErrorFn("upsweep failed", "efficent.cu", 50); + //cudaMemcpy(odata, dev_odata, paddedN * sizeof(int), cudaMemcpyDeviceToHost); + } + + // --- down sweep --- + // insert 0 at the end of the in-progress output + int ZERO = 0; + cudaMemcpy(dev_odata + paddedN - 1, &ZERO, sizeof(int), cudaMemcpyHostToDevice); + checkCUDAErrorFn("writing 0 failed", "efficent.cu", 81); + for (int d = log2(paddedN - 1); d >= 0; d--) { + int numThreads = ((paddedN - 1) / (1 << (d + 1))) + 1; + numBlocks = ceil((float)numThreads / THREADS_PER_BLOCK); + kernDownSweep <<>> (numThreads, dev_odata, d); + //cudaDeviceSynchronize(); + checkCUDAErrorFn("downsweep failed", "efficent.cu", 70); + //cudaMemcpy(odata, dev_odata, paddedN * sizeof(int), cudaMemcpyDeviceToHost); + } + timer().endGpuTimer(); + + // this is an exclusive scan, so the first elem should be 0 + // and we shift everything (except the last elem) one index right + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_odata); } /** @@ -31,10 +106,80 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { + + int paddedN = 1<>>(n, dev_hasElem, dev_idata); + + cudaMemcpy(dev_indices, dev_hasElem, n * sizeof(int), cudaMemcpyDeviceToDevice); + + // --- scan --- + // scanning the `hasElem` "boolean" array yields an array of indices + // idata[i] should be assigned to (if hasElem[i] is truthy). + // why copy (most of the) code from scan() instead of calling it? + // to avoid redundant cudaMalloc/cudaMemcpy calls + + // --- up sweep --- + for (int d = 0; d < log2(paddedN); d++) { + numThreads = ((paddedN - 1) / (1 << (d + 1))) + 1; + numBlocks = ceil((float)numThreads / THREADS_PER_BLOCK); + kernUpSweep <<>> (numThreads, dev_indices, d); + checkCUDAErrorFn("upsweep failed", "efficent.cu", 130); + //cudaDeviceSynchronize(); + //cudaMemcpy(odata, dev_odata, paddedN * sizeof(int), cudaMemcpyDeviceToHost); + } + + // --- down sweep --- + // insert 0 at the end of the in-progress output + int ZERO = 0; + cudaMemcpy(dev_indices + paddedN - 1, &ZERO, sizeof(int), cudaMemcpyHostToDevice); + for (int d = log2(paddedN - 1); d >= 0; d--) { + numThreads = ((paddedN - 1) / (1 << (d + 1))) + 1; + numBlocks = ceil((float)numThreads / THREADS_PER_BLOCK); + kernDownSweep <<>> (numThreads, dev_indices, d); + checkCUDAErrorFn("downsweep failed", "efficent.cu", 151); + //cudaDeviceSynchronize(); + //cudaMemcpy(odata, dev_odata, paddedN * sizeof(int), cudaMemcpyDeviceToHost); + } + + // --- scatter --- + // assign idata -> odata based on the indices calculated by the scan + numBlocks = ceil( (float)n / THREADS_PER_BLOCK); + Common::kernScatter<<>>(n, dev_odata, dev_idata, dev_hasElem, dev_indices); + checkCUDAErrorFn("scatter failed", "efficent.cu", 165); + timer().endGpuTimer(); - return -1; + + // get the max index for return purposes. This will be whatever is at the end + // of our scattered index array + int maxIndex; + cudaMemcpy(&maxIndex, dev_indices + paddedN - 1, sizeof(int), cudaMemcpyDeviceToHost); + + cudaMemcpy(odata, dev_odata, maxIndex * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_odata); + cudaFree(dev_idata); + cudaFree(dev_hasElem); + cudaFree(dev_indices); + + return maxIndex; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..1229fa6 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,6 +3,14 @@ #include "common.h" #include "naive.h" +#ifndef BLOCKSIZE +#define BLOCKSIZE 512 +#endif // !BLOCKSIZE + +#include + + + namespace StreamCompaction { namespace Naive { using StreamCompaction::Common::PerformanceTimer; @@ -11,15 +19,67 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ + __global__ void kernNaiveScan(int n, int *odata, const int *idata, int d) { + long index = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (index >= n) { + return; + } + + if (index >= (1 << (d - 1))) { + odata[index] = idata[index - (1 << (d - 1))] + idata[index]; + } + else { + odata[index] = idata[index]; + } + __syncthreads(); + + return; + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + + int* dev_readable; + int* dev_writeable; + int* swp; // for ping-ponging odata and odata2 + + int paddedN = 1 << ilog2ceil(n); + + cudaMalloc((void**)&dev_readable, paddedN * sizeof(int)); + cudaMalloc((void**)&dev_writeable, paddedN * sizeof(int)); + + cudaMemcpy(dev_readable, idata, paddedN * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAErrorFn("memcopy failed", "naive.cu", 51); + timer().startGpuTimer(); - // TODO + // --- begin iterative all-prefix-sum + + for (int d = 1; d <= log2(paddedN); d++) { + + // --- call scan --- + float numBlocks = ceil((float)n / BLOCKSIZE); + kernNaiveScan <<>> (paddedN, dev_writeable, dev_readable, d); + cudaDeviceSynchronize(); + checkCUDAErrorFn("naiveScan failed", "naive.cu", 63); + + // --- ping pong buffers --- + + swp = dev_writeable; + dev_writeable = dev_readable; + dev_readable = swp; + } + timer().endGpuTimer(); + + // this is an exclusive scan, so the first elem should be 0 + // and we shift everything (except the last elem) one index right + odata[0] = 0; + cudaMemcpy(odata+1, dev_readable, (n-1) * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_readable); + cudaFree(dev_writeable); } } } diff --git a/stream_compaction/radix.cu b/stream_compaction/radix.cu new file mode 100644 index 0000000..3488b5a --- /dev/null +++ b/stream_compaction/radix.cu @@ -0,0 +1,197 @@ +#include +#include +#include "common.h" +#include "radix.h" + +#ifndef THREADS_PER_BLOCK +#define THREADS_PER_BLOCK 128 +#endif // !BLOCKSIZE + +namespace StreamCompaction { + namespace Radix { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + + __global__ void kernUpSweep(int n, int* odata, int d) { + + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + + int k = index * (1 << (d + 1)); + + odata[k + ((1 << (d + 1)) - 1)] = odata[k + (1 << d) - 1] + odata[k + (1 << (d + 1)) - 1]; + } + + __global__ void kernDownSweep(int n, int* odata, int d) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + + int k = index * (1 << (d + 1)); + int t = odata[k + (1 << d) - 1]; + odata[k + (1 << d) - 1] = odata[k + (1 << (d + 1)) - 1]; + odata[k + (1 << (d + 1)) - 1] += t; + } + + __global__ void kernInvertBoolean(int n, int* bools, const int* idata) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + + bools[index] = !idata[index]; + } + + __global__ void kernMapBitToBooleans(int n, int bit, int *boolsZero, int *boolsOne, const int *idata) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + + // adapted from stack overflow answer by Sparky: + // https://stackoverflow.com/questions/8011700/how-do-i-extract-specific-n-bits-of-a-32-bit-unsigned-integer-in-c + int mask = 1 << bit; + int isolatedBit = (idata[index] & mask) >> bit; + + boolsOne[index] = isolatedBit; + boolsZero[index] = !isolatedBit; + } + + __global__ void kernAddConstant(int n, const int incr, int* odata) { + // this was simpler than adding a conditional to up/down sweep, + // had they been implemented differently this could make more sense there + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + + odata[index] += incr; + } + + void scanWithOffset(int n, int* dev_odata, int offset = 0) { + int numThreads; + int numBlocks; + + // --- up sweep --- + for (int d = 0; d < log2(n); d++) { + numThreads = ((n - 1) / (1 << (d + 1))) + 1; + numBlocks = ceil((float)numThreads / THREADS_PER_BLOCK); + kernUpSweep << > > (numThreads, dev_odata, d); + checkCUDAErrorFn("upsweep failed", "radix.cu", 50); + //cudaDeviceSynchronize(); + //cudaMemcpy(odata, dev_odata, paddedN * sizeof(int), cudaMemcpyDeviceToHost); + } + + // --- down sweep --- + // insert 0 at the end of the in-progress output + int ZERO = 0; + cudaMemcpy(dev_odata + n - 1, &ZERO, sizeof(int), cudaMemcpyHostToDevice); + for (int d = log2(n - 1); d >= 0; d--) { + numThreads = ((n - 1) / (1 << (d + 1))) + 1; + numBlocks = ceil((float)numThreads / THREADS_PER_BLOCK); + kernDownSweep << > > (numThreads, dev_odata, d); + checkCUDAErrorFn("downsweep failed", "radix.cu", 65); + //cudaDeviceSynchronize(); + //cudaMemcpy(odata, dev_odata, paddedN * sizeof(int), cudaMemcpyDeviceToHost); + } + + if (offset) { + numBlocks = ceil((float)n / THREADS_PER_BLOCK); + kernAddConstant<<>>(n, offset, dev_odata); + } + } + + /** + * Performs stream compaction on idata, storing the result into odata. + * All zeroes are discarded. + * + * @param n The number of elements in idata. + * @param odata The array into which to store elements. + * @param idata The array of elements to compact. + * @returns The number of elements remaining after compaction. + */ + void sort(int n, int* odata, const int* idata) { + + + int paddedN = 1 << ilog2ceil(n); + int* dev_odata1; + int* dev_odata2; + int* swp; // for ping-ponging odatas + int* dev_bitIsOne; + int* dev_bitIsZero; + int* dev_indicesZero; + int* dev_indicesOne; + + cudaMalloc((void**)&dev_odata1, paddedN * sizeof(int)); + cudaMalloc((void**)&dev_odata2, paddedN * sizeof(int)); + cudaMalloc((void**)&dev_bitIsOne, paddedN * sizeof(int)); + cudaMalloc((void**)&dev_bitIsZero, paddedN * sizeof(int)); + cudaMalloc((void**)&dev_indicesZero, paddedN * sizeof(int)); + cudaMalloc((void**)&dev_indicesOne, paddedN * sizeof(int)); + + cudaMemcpy(dev_odata1, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + timer().startGpuTimer(); + + // The threads/blocks will change per kernel call but declare them here + int numBlocks = ceil((float)paddedN / THREADS_PER_BLOCK); + int numThreads; + for (int bitNum = 0; bitNum < sizeof(int) * 8; bitNum++) { + + // --- determine radix value --- + // in this case we check the `bitNum`-th bit and store whether it's a one or zero + // in two arrays. why two? so we can scan them both and calculate the proper index + // for each element based on its current radix + kernMapBitToBooleans << > > (paddedN, bitNum, dev_bitIsZero, dev_bitIsOne, dev_odata1); + //cudaDeviceSynchronize(); + //checkCUDAErrorFn("mapToBooleans failed", "radix.cu", 150); + + cudaMemcpy(dev_indicesZero, dev_bitIsZero, paddedN * sizeof(int), cudaMemcpyDeviceToDevice); + cudaMemcpy(dev_indicesOne, dev_bitIsOne, paddedN * sizeof(int), cudaMemcpyDeviceToDevice); + + // --- scan --- + + scanWithOffset(paddedN, dev_indicesZero); + int lastZeroIndexValue; + int lastZeroIndexBit; + cudaMemcpy(&lastZeroIndexValue, dev_indicesZero + paddedN - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&lastZeroIndexBit, dev_bitIsZero + paddedN - 1, sizeof(int), cudaMemcpyDeviceToHost); + //cudaDeviceSynchronize(); + //checkCUDAErrorFn("memcopy failed", "radix.cu", 155); + scanWithOffset(paddedN, dev_indicesOne, lastZeroIndexValue + lastZeroIndexBit); + //cudaDeviceSynchronize(); + //checkCUDAErrorFn("scan failed", "radix.cu", 160); + + // --- scatter --- + // assign idata -> odata based on the indices calculated by the scan + + Common::kernScatter << > > (paddedN, dev_odata2, dev_odata1, dev_bitIsZero, dev_indicesZero); + //cudaDeviceSynchronize(); + //checkCUDAErrorFn("scatter failed", "radix.cu", 165); + Common::kernScatter << > > (paddedN, dev_odata2, dev_odata1, dev_bitIsOne, dev_indicesOne); + //cudaDeviceSynchronize(); + //checkCUDAErrorFn("scatter failed", "radix.cu", 165); + + // --- ping pong buffers --- + swp = dev_odata1; + dev_odata1 = dev_odata2; + dev_odata2 = swp; + } + + timer().endGpuTimer(); + + // copy the data back to the host + // offset by (paddedN - n) since any difference between the sizes will + // yield that many zeros at the beginning of the device array + cudaMemcpy(odata, dev_odata1 + (paddedN - n), n * sizeof(int), cudaMemcpyDeviceToHost); + //checkCUDAErrorFn("memcopy failed", "radix.cu", 185); + } + } +} diff --git a/stream_compaction/radix.h b/stream_compaction/radix.h new file mode 100644 index 0000000..aa489d9 --- /dev/null +++ b/stream_compaction/radix.h @@ -0,0 +1,11 @@ +#pragma once + +#include "common.h" + +namespace StreamCompaction { + namespace Radix { + StreamCompaction::Common::PerformanceTimer& timer(); + + void sort(int n, int *odata, const int *idata); + } +} diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..02f8ce4 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,28 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + thrust::device_vector dev_odata(odata, odata + n); + thrust::device_vector dev_idata(idata, idata + n); + + timer().startGpuTimer(); + thrust::exclusive_scan(dev_idata.begin(), dev_idata.end(), dev_odata.begin()); + + timer().endGpuTimer(); + + thrust::copy(dev_odata.begin(), dev_odata.end(), odata); + + } + + void sort(int n, int *odata, const int *idata) { + + std::cout << std::endl; + thrust::copy(idata, idata + n, odata); + timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::sort(odata, odata+n); + timer().endGpuTimer(); + } } } diff --git a/stream_compaction/thrust.h b/stream_compaction/thrust.h index fe98206..864c5f3 100644 --- a/stream_compaction/thrust.h +++ b/stream_compaction/thrust.h @@ -7,5 +7,6 @@ namespace StreamCompaction { StreamCompaction::Common::PerformanceTimer& timer(); void scan(int n, int *odata, const int *idata); + void sort(int n, int *odata, const int *idata); } }