diff --git a/README.md b/README.md
index 0e38ddb..2f78a0c 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,61 @@
-CUDA Stream Compaction
+Project 2: CUDA Stream Compaction
======================
**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
-* (TODO) YOUR NAME HERE
- * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Raymond Yang
+  * [LinkedIn](https://www.linkedin.com/in/raymond-yang-b85b19168)
+  * Tested on: Windows 10, NVIDIA GeForce GTX 1080 Ti (09/21/2021)
+  * Submitted on: 09/21/2021
-### (TODO: Your README)
+## Introduction
+The objective of this assignment was to implement stream compaction on the GPU. Stream compaction involves three main processes:
+* Boolean Mapping: Given an input array `idata`, each element is first mapped to a boolean array `dev_bool` that records whether the value at that index is desirable. Desirable values are kept during Scatter; undesirable values are discarded.
+* Scan: Given the boolean array `dev_bool`, scan outputs its prefix sum as an exclusive scan in `dev_dataPadded`. Each entry of this array is the index at which the corresponding desirable value will be written during Scatter.
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+
+
+
+* Scatter: Given `dev_bool` and `dev_dataPadded`, Scatter writes only the desirable values into the final output array `dev_odata`. A small worked example follows this list.
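+
+For example (hypothetical values), compacting the nonzero elements of a five-element array proceeds as follows:
+
+```
+idata           = [3, 0, 1, 0, 2]
+dev_bool        = [1, 0, 1, 0, 1]
+exclusive scan  = [0, 1, 1, 2, 2]   // dev_dataPadded
+dev_odata       = [3, 1, 2]         // odata[scan[i]] = idata[i] wherever dev_bool[i] == 1
+```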
+
+### CPU Implementation
+`stream_compaction/cpu.cu` implements scan and compaction on the host machine without touching the GPU, using simple for loops over the input data. Its output serves as the baseline for comparing both the timing and the correctness of the Naive and Work-Efficient implementations.
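+
+As a rough sketch (simplified from the code in `stream_compaction/cpu.cu`), the exclusive scan and the scan-free compaction reduce to:
+
+```cpp
+// exclusive scan: running sum, shifted right by one element
+int sum = 0;
+for (int i = 0; i < n; i++) {
+    odata[i] = sum;
+    sum += idata[i];
+}
+
+// compaction without scan: copy nonzero elements forward
+int count = 0;
+for (int i = 0; i < n; i++) {
+    if (idata[i] != 0) {
+        odata[count++] = idata[i];  // count ends up as the compacted length
+    }
+}
+```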
+
+### Naive Implementation
+`stream_compaction/naive.cu` implements a naive parallel approach to scan. Each pass is "embarrassingly parallel" across elements: values a fixed stride apart are paired, summed, and written back, and the stride doubles every pass, so `log n` passes are required. The depth is therefore `O(log n)`, although the total work is `O(n log n)`. A final pass shifts the inclusive result right by one element to produce an exclusive scan. The current implementation does not take advantage of any optimizations (such as shared memory) to improve performance.
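+
+Each pass can be sketched as a kernel like the one below. This is a simplified rendering of `kernScanNaive` in `naive.cu`: `in` and `out` stand for the two ping-ponged device buffers, and `offset` doubles after every launch until it reaches the padded array size.
+
+```cpp
+__global__ void naiveScanPass(int n, int offset, const int* in, int* out) {
+    int i = threadIdx.x + blockIdx.x * blockDim.x;
+    if (i >= n) return;
+    // elements with no partner at this stride are copied through unchanged
+    out[i] = (i >= offset) ? in[i - offset] + in[i] : in[i];
+}
+```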
+
+
+
+
+### Work Efficient Implementation
+`stream_compaction/efficient.cu` implements a work-efficient approach to scan and compact. The scan process is broken into two parts:
+* Upsweep: a parallel reduction performed in place over a balanced binary tree. At depth `d`, elements a stride of `2^(d+1)` apart are summed, so after `log n` passes the last element holds the sum of the entire array.
+
+
+
+* Downsweep: the last element is zeroed, then the tree is traversed back down. At each level a node's value is passed to its left child while the right child receives the sum of the node and its old left child, producing an exclusive scan. (A condensed sketch of both phases follows this list.)
+
+
+
+
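+A condensed serial sketch of the two phases is below, assuming the array length `n` is already padded to a power of two and `log2n = log2(n)`. The kernels in `efficient.cu` instead launch one thread per element and keep only the threads with `index % offset == 0` active.
+
+```cpp
+// upsweep: in-place reduction; after log2(n) passes the last element
+// holds the sum of the entire array
+for (int d = 0; d < log2n; d++) {
+    int offset = 1 << (d + 1);
+    for (int k = 0; k < n; k += offset) {
+        data[k + offset - 1] += data[k + (offset >> 1) - 1];
+    }
+}
+
+// downsweep: zero the root, then push partial sums back down the tree
+data[n - 1] = 0;
+for (int d = log2n - 1; d >= 0; d--) {
+    int offset = 1 << (d + 1);
+    for (int k = 0; k < n; k += offset) {
+        int left = data[k + (offset >> 1) - 1];
+        data[k + (offset >> 1) - 1] = data[k + offset - 1];  // parent moves to left child
+        data[k + offset - 1] += left;                        // right child = parent + old left child
+    }
+}
+// data[] now holds the exclusive scan of the original input
+```
+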
+## Data Analysis
+Benchmarks were run on the three implementations (CPU, Naive, Work-Efficient) and on the Thrust API. Times were recorded in milliseconds over 10 successive runs. Benchmarks were run on two data sets: the first (PoT = Power of Two) is an array of size 2^20 populated with random values; the second (NPoT = Not Power of Two) is an array of size 2^20 - 3 populated with random values.
+
+| SCAN | CPU (ms) | Naive (ms) | Work-Eff (ms) | Thrust (ms) |
+|------|----------|------------|---------------|-------------|
+| PoT | 0.613 | 2.026 | 2.884 | 0.272 |
+| NPoT | 0.514 | 2.030 | 2.876 | 0.299 |
+
+
+
+
+
+## Limitations of the Current Design
+The implementations do not take advantage of the additional optimizations suggested in the project instructions. These include:
+* [Optimizing thread and block usage](https://github.com/CIS565-Fall-2021/Project2-Stream-Compaction/blob/main/INSTRUCTION.md#part-5-why-is-my-gpu-approach-so-slow-extra-credit-5).
+* [Utilizing shared memory to drastically reduce memory read and write time](https://github.com/CIS565-Fall-2021/Project2-Stream-Compaction/blob/main/INSTRUCTION.md#part-7-gpu-scan-using-shared-memory--hardware-optimizationextra-credit-10).
\ No newline at end of file
diff --git a/img/ss0.PNG b/img/ss0.PNG
new file mode 100644
index 0000000..85bfa0f
Binary files /dev/null and b/img/ss0.PNG differ
diff --git a/img/ss1.PNG b/img/ss1.PNG
new file mode 100644
index 0000000..bc68011
Binary files /dev/null and b/img/ss1.PNG differ
diff --git a/img/ss2.PNG b/img/ss2.PNG
new file mode 100644
index 0000000..9102fbe
Binary files /dev/null and b/img/ss2.PNG differ
diff --git a/img/ss3.PNG b/img/ss3.PNG
new file mode 100644
index 0000000..c28a2f3
Binary files /dev/null and b/img/ss3.PNG differ
diff --git a/img/ss4.PNG b/img/ss4.PNG
new file mode 100644
index 0000000..fbe1661
Binary files /dev/null and b/img/ss4.PNG differ
diff --git a/img/ss5.PNG b/img/ss5.PNG
new file mode 100644
index 0000000..e991756
Binary files /dev/null and b/img/ss5.PNG differ
diff --git a/src/main.cpp b/src/main.cpp
index 896ac2b..a1ef996 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1,154 +1,211 @@
-/**
- * @file main.cpp
- * @brief Stream compaction test program
- * @authors Kai Ninomiya
- * @date 2015
- * @copyright University of Pennsylvania
- */
-
-#include <cstdio>
-#include <stream_compaction/cpu.h>
-#include <stream_compaction/naive.h>
-#include <stream_compaction/efficient.h>
-#include <stream_compaction/thrust.h>
-#include "testing_helpers.hpp"
-
-const int SIZE = 1 << 8; // feel free to change the size of array
-const int NPOT = SIZE - 3; // Non-Power-Of-Two
-int *a = new int[SIZE];
-int *b = new int[SIZE];
-int *c = new int[SIZE];
-
-int main(int argc, char* argv[]) {
- // Scan tests
-
- printf("\n");
- printf("****************\n");
- printf("** SCAN TESTS **\n");
- printf("****************\n");
-
- genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case
- a[SIZE - 1] = 0;
- printArray(SIZE, a, true);
-
- // initialize b using StreamCompaction::CPU::scan you implement
- // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct.
- // At first all cases passed because b && c are all zeroes.
- zeroArray(SIZE, b);
- printDesc("cpu scan, power-of-two");
- StreamCompaction::CPU::scan(SIZE, b, a);
- printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
- printArray(SIZE, b, true);
-
- zeroArray(SIZE, c);
- printDesc("cpu scan, non-power-of-two");
- StreamCompaction::CPU::scan(NPOT, c, a);
- printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
- printArray(NPOT, b, true);
- printCmpResult(NPOT, b, c);
-
- zeroArray(SIZE, c);
- printDesc("naive scan, power-of-two");
- StreamCompaction::Naive::scan(SIZE, c, a);
- printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
- //printArray(SIZE, c, true);
- printCmpResult(SIZE, b, c);
-
- /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
- onesArray(SIZE, c);
- printDesc("1s array for finding bugs");
- StreamCompaction::Naive::scan(SIZE, c, a);
- printArray(SIZE, c, true); */
-
- zeroArray(SIZE, c);
- printDesc("naive scan, non-power-of-two");
- StreamCompaction::Naive::scan(NPOT, c, a);
- printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
- //printArray(SIZE, c, true);
- printCmpResult(NPOT, b, c);
-
- zeroArray(SIZE, c);
- printDesc("work-efficient scan, power-of-two");
- StreamCompaction::Efficient::scan(SIZE, c, a);
- printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
- //printArray(SIZE, c, true);
- printCmpResult(SIZE, b, c);
-
- zeroArray(SIZE, c);
- printDesc("work-efficient scan, non-power-of-two");
- StreamCompaction::Efficient::scan(NPOT, c, a);
- printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
- //printArray(NPOT, c, true);
- printCmpResult(NPOT, b, c);
-
- zeroArray(SIZE, c);
- printDesc("thrust scan, power-of-two");
- StreamCompaction::Thrust::scan(SIZE, c, a);
- printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
- //printArray(SIZE, c, true);
- printCmpResult(SIZE, b, c);
-
- zeroArray(SIZE, c);
- printDesc("thrust scan, non-power-of-two");
- StreamCompaction::Thrust::scan(NPOT, c, a);
- printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
- //printArray(NPOT, c, true);
- printCmpResult(NPOT, b, c);
-
- printf("\n");
- printf("*****************************\n");
- printf("** STREAM COMPACTION TESTS **\n");
- printf("*****************************\n");
-
- // Compaction tests
-
- genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case
- a[SIZE - 1] = 0;
- printArray(SIZE, a, true);
-
- int count, expectedCount, expectedNPOT;
-
- // initialize b using StreamCompaction::CPU::compactWithoutScan you implement
- // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct.
- zeroArray(SIZE, b);
- printDesc("cpu compact without scan, power-of-two");
- count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
- printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
- expectedCount = count;
- printArray(count, b, true);
- printCmpLenResult(count, expectedCount, b, b);
-
- zeroArray(SIZE, c);
- printDesc("cpu compact without scan, non-power-of-two");
- count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
- printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
- expectedNPOT = count;
- printArray(count, c, true);
- printCmpLenResult(count, expectedNPOT, b, c);
-
- zeroArray(SIZE, c);
- printDesc("cpu compact with scan");
- count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
- printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
- printArray(count, c, true);
- printCmpLenResult(count, expectedCount, b, c);
-
- zeroArray(SIZE, c);
- printDesc("work-efficient compact, power-of-two");
- count = StreamCompaction::Efficient::compact(SIZE, c, a);
- printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
- //printArray(count, c, true);
- printCmpLenResult(count, expectedCount, b, c);
-
- zeroArray(SIZE, c);
- printDesc("work-efficient compact, non-power-of-two");
- count = StreamCompaction::Efficient::compact(NPOT, c, a);
- printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
- //printArray(count, c, true);
- printCmpLenResult(count, expectedNPOT, b, c);
-
- system("pause"); // stop Win32 console from closing on exit
- delete[] a;
- delete[] b;
- delete[] c;
-}
+/**
+ * @file main.cpp
+ * @brief Stream compaction test program
+ * @authors Kai Ninomiya
+ * @date 2015
+ * @copyright University of Pennsylvania
+ */
+
+// 4 - ones
+// 3 - ascending values 1 .. SIZE
+// 2 - descending values SIZE .. 1
+// 1 - ascending values 0 .. SIZE-1
+// 0 - default, randomized
+#define arrTestType 0
+
+#include <cstdio>
+#include <stream_compaction/cpu.h>
+#include <stream_compaction/naive.h>
+#include <stream_compaction/efficient.h>
+#include <stream_compaction/thrust.h>
+#include "testing_helpers.hpp"
+
+#if arrTestType
+const int SIZE = 1 << 3; // feel free to change the size of array
+#else
+const int SIZE = 1 << 20; // feel free to change the size of array
+#endif
+
+const int NPOT = SIZE - 3; // Non-Power-Of-Two
+int *a = new int[SIZE];
+int *b = new int[SIZE];
+int *c = new int[SIZE];
+
+int main(int argc, char* argv[]) {
+ // Scan tests
+
+ printf("\n");
+ printf("****************\n");
+ printf("** SCAN TESTS **\n");
+ printf("****************\n\n");
+
+#if arrTestType == 4
+ onesArray(SIZE, a);
+#elif arrTestType == 3
+ for (int i = 0; i < SIZE; i++) {
+ a[i] = i + 1;
+ }
+#elif arrTestType == 2
+ for (int i = 0; i < SIZE; i++) {
+ a[i] = SIZE - i;
+ }
+#elif arrTestType == 1
+ for (int i = 0; i < SIZE; i++) {
+ a[i] = i;
+ }
+#else
+ genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case
+ a[SIZE - 1] = 0;
+#endif
+
+ printArray(SIZE, a, true);
+
+ // initialize b using StreamCompaction::CPU::scan you implement
+ // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct.
+ // At first all cases passed because b && c are all zeroes.
+ zeroArray(SIZE, b);
+ printDesc("cpu scan, power-of-two");
+ StreamCompaction::CPU::scan(SIZE, b, a);
+ printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+ //printArray(SIZE, b, true);
+
+ zeroArray(SIZE, c);
+ printDesc("cpu scan, non-power-of-two");
+ StreamCompaction::CPU::scan(NPOT, c, a);
+ printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+ //printArray(NPOT, b, true);
+ printCmpResult(NPOT, b, c);
+
+ zeroArray(SIZE, c);
+ printDesc("naive scan, power-of-two");
+
+ /*printDesc("Array a = ");
+ printArray(SIZE, a, false);
+ printDesc("Array b = ");
+ printArray(SIZE, b, false);
+ printDesc("Array c = ");
+ printArray(SIZE, c, false);*/
+
+ StreamCompaction::Naive::scan(SIZE, c, a);
+
+ /*printDesc("Final c = ");
+ printArray(SIZE, c, false); */
+
+ printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ //printArray(SIZE, c, true);
+ printCmpResult(SIZE, b, c);
+
+ /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
+ onesArray(SIZE, c);
+ printDesc("1s array for finding bugs");
+ StreamCompaction::Naive::scan(SIZE, c, a);
+ printArray(SIZE, c, true); */
+
+ zeroArray(SIZE, c);
+ printDesc("naive scan, non-power-of-two");
+ StreamCompaction::Naive::scan(NPOT, c, a);
+ printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ //printArray(SIZE, c, true);
+ printCmpResult(NPOT, b, c);
+
+ zeroArray(SIZE, c);
+ printDesc("work-efficient scan, power-of-two");
+ StreamCompaction::Efficient::scan(SIZE, c, a);
+ printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ //printArray(SIZE, c, false);
+ printCmpResult(SIZE, b, c);
+
+ zeroArray(SIZE, c);
+ printDesc("work-efficient scan, non-power-of-two");
+ StreamCompaction::Efficient::scan(NPOT, c, a);
+ printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ //printArray(NPOT, c, false);
+ printCmpResult(NPOT, b, c);
+
+ zeroArray(SIZE, c);
+ printDesc("thrust scan, power-of-two");
+ StreamCompaction::Thrust::scan(SIZE, c, a);
+ printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ //printArray(SIZE, c, true);
+ printCmpResult(SIZE, b, c);
+
+ zeroArray(SIZE, c);
+ printDesc("thrust scan, non-power-of-two");
+ StreamCompaction::Thrust::scan(NPOT, c, a);
+ printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ //printArray(NPOT, c, true);
+ printCmpResult(NPOT, b, c);
+
+ printf("\n");
+ printf("*****************************\n");
+ printf("** STREAM COMPACTION TESTS **\n");
+ printf("*****************************\n");
+
+ // Compaction tests
+
+#if arrTestType == 4
+ onesArray(SIZE, a);
+#elif arrTestType == 3
+ for (int i = 0; i < SIZE; i++) {
+ a[i] = i + 1;
+ }
+#elif arrTestType == 2
+ for (int i = 0; i < SIZE; i++) {
+ a[i] = SIZE - i;
+ }
+#elif arrTestType == 1
+ for (int i = 0; i < SIZE; i++) {
+ a[i] = i;
+ }
+#else
+ genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case
+ a[SIZE - 1] = 0;
+#endif
+ printArray(SIZE, a, true);
+
+ int count, expectedCount, expectedNPOT;
+
+ // initialize b using StreamCompaction::CPU::compactWithoutScan you implement
+ // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct.
+ zeroArray(SIZE, b);
+ printDesc("cpu compact without scan, power-of-two");
+ count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
+ printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+ expectedCount = count;
+ //printArray(count, b, true);
+ printCmpLenResult(count, expectedCount, b, b);
+
+ zeroArray(SIZE, c);
+ printDesc("cpu compact without scan, non-power-of-two");
+ count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
+ printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+ expectedNPOT = count;
+ //printArray(count, c, false);
+ printCmpLenResult(count, expectedNPOT, b, c);
+
+ zeroArray(SIZE, c);
+ printDesc("cpu compact with scan");
+ count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
+ printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+ //printArray(count, c, false);
+ printCmpLenResult(count, expectedCount, b, c);
+
+ zeroArray(SIZE, c);
+ printDesc("work-efficient compact, power-of-two");
+ count = StreamCompaction::Efficient::compact(SIZE, c, a);
+ printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ //printArray(count, c, false);
+ printCmpLenResult(count, expectedCount, b, c);
+
+ zeroArray(SIZE, c);
+ printDesc("work-efficient compact, non-power-of-two");
+ count = StreamCompaction::Efficient::compact(NPOT, c, a);
+ printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ //printArray(count, c, false);
+ printCmpLenResult(count, expectedNPOT, b, c);
+
+ system("pause"); // stop Win32 console from closing on exit
+ delete[] a;
+ delete[] b;
+ delete[] c;
+}
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d63..689f78a 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -22,8 +22,16 @@ namespace StreamCompaction {
* Maps an array to an array of 0s and 1s for stream compaction. Elements
* which map to 0 will be removed, and elements which map to 1 will be kept.
*/
- __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
+ __global__ void kernMapToBoolean(int n, int *bools1, int *bools2, const int *idata) {
// TODO
+ int index = threadIdx.x + (blockIdx.x * blockDim.x);
+ if (index >= n) {
+ return;
+ }
+
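+            // write the flag to two buffers: one is kept for the scatter step, the other is scanned in place to produce the write indices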
+ int result = idata[index] != 0;
+ bools1[index] = result;
+ bools2[index] = result;
}
/**
@@ -33,7 +41,14 @@ namespace StreamCompaction {
__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices) {
// TODO
- }
+ int index = threadIdx.x + (blockIdx.x * blockDim.x);
+ if (index >= n) {
+ return;
+ }
+ if (bools[index]) {
+ odata[indices[index]] = idata[index];
+ }
+ }
}
}
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index d2c1fed..52a0d0b 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -13,6 +13,9 @@
#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
+#define blockSize 128
+#define useCommon 1
+
/**
* Check for CUDA errors; print and exit if there was a problem.
*/
@@ -32,7 +35,7 @@ inline int ilog2ceil(int x) {
namespace StreamCompaction {
namespace Common {
- __global__ void kernMapToBoolean(int n, int *bools, const int *idata);
+ __global__ void kernMapToBoolean(int n, int *bools1, int *bools2, const int *idata);
__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices);
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..1520007 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -20,6 +20,12 @@ namespace StreamCompaction {
void scan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
+ // tally a running sum of input data
+ int sum = 0;
+ for (int i = 0; i < n; i++) {
+ odata[i] = sum;
+ sum += idata[i];
+ }
timer().endCpuTimer();
}
@@ -31,8 +37,15 @@ namespace StreamCompaction {
int compactWithoutScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
+ // if condition is met, scan and scatter at the same time (sort of)
+ int index = 0;
+ for (int i = 0; i < n; i++) {
+ if (idata[i]) {
+ odata[index++] = idata[i];
+ }
+ }
timer().endCpuTimer();
- return -1;
+ return index;
}
/**
@@ -43,8 +56,38 @@ namespace StreamCompaction {
int compactWithScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
+
+ if (n < 1) { return 0; }
+
+ // boolean buffer
+ int* ibool = (int*) malloc(n * sizeof(int));
+
+ // map input array to boolean
+ for (int i = 0; i < n; i++) {
+ ibool[i] = idata[i] != 0;
+ }
+
+ // scan boolean buffer
+ // memory error is thrown when calling StreamCompaction::CPU::scan()
+ int iboolScan = 0;
+ for (int i = 0; i < n; i++) {
+ odata[i] = iboolScan;
+ iboolScan += ibool[i];
+ }
+
+ int numElements = odata[n - 1] + ibool[n - 1];
+
+ // scatter
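+            // in-place scatter into odata is safe: the write index odata[i] is at most i, so scan entries still to be read are never overwritten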
+ for (int i = 0; i < n; i++) {
+ if (ibool[i]) {
+ odata[odata[i]] = idata[i];
+ }
+ }
+
+ free(ibool);
+
timer().endCpuTimer();
- return -1;
+ return numElements;
}
}
}
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..69b35fd 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -12,13 +12,96 @@ namespace StreamCompaction {
return timer;
}
+ __global__ void kernUpSweepIter(int nPadded, int depth, int* dataPadded) {
+ int index = threadIdx.x + (blockIdx.x * blockDim.x);
+ if (index >= nPadded) {
+ return;
+ }
+
+ int offset = 1 << (depth + 1);
+
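+            // at depth d, the thread at each 2^(d+1)-aligned position adds the left half's partial sum into the right half (in-place reduction)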
+ if (index % offset == 0) {
+ dataPadded[index + offset - 1] += dataPadded[index + (offset >> 1) - 1];
+ }
+ }
+
+ __global__ void kernDownSweepIter(int nPadded, int depth, int* dataPadded) {
+ int index = threadIdx.x + (blockIdx.x * blockDim.x);
+ if (index >= nPadded) {
+ return;
+ }
+
+ int offset = 1 << (depth + 1);
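+            // walk back down the tree: the parent's value moves to the left child and the right child becomes parent + old left child, building the exclusive scan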
+ if (index % offset == 0) {
+ int temp = dataPadded[index + (offset >> 1) - 1];
+ dataPadded[index + (offset >> 1) - 1] = dataPadded[index + offset - 1];
+ dataPadded[index + offset - 1] += temp;
+ }
+ }
+
+
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
+
+ if (n < 1) { return; }
+
+ // allocate a buffer padded to a power of 2.
+ int depth = ilog2ceil(n);
+ int nPadded = 1 << depth;
+
+ int* dev_dataPadded;
+ cudaMalloc((void**)&dev_dataPadded, nPadded * sizeof(int));
+ checkCUDAError("cudaMalloc dev_dataPadded failed!");
+
+ // set blocks and threads
+ dim3 threadsPerBlock(blockSize);
+ dim3 fullBlocksPerGrid(std::ceil((double)nPadded / blockSize));
+
+ // copy idata to device memory
+ cudaMemset(dev_dataPadded, 0, nPadded * sizeof(int));
+ checkCUDAError("cudaMemset dev_dataPadded failed!");
+ cudaMemcpy(dev_dataPadded, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+ checkCUDAError("cudaMemcpy dev_dataPadded failed!");
+
timer().startGpuTimer();
// TODO
+ // perform upsweep on idata
+ for (int i = 0; i < depth; i++) {
+                kernUpSweepIter<<<fullBlocksPerGrid, threadsPerBlock>>>(nPadded, i, dev_dataPadded);
+ checkCUDAError("kernUpSweepIter failed!");
+ }
+
+ // perform downsweep on idata
+ cudaMemset(dev_dataPadded + nPadded - 1, 0, sizeof(int));
+ checkCUDAError("cudaMemset dev_dataPadded + nPadded - 1 failed!");
+ for (int i = depth - 1; i >= 0; i--) {
+                kernDownSweepIter<<<fullBlocksPerGrid, threadsPerBlock>>>(nPadded, i, dev_dataPadded);
+ checkCUDAError("kernDownSweepIter failed!");
+ }
+
+ cudaDeviceSynchronize();
+ checkCUDAError("cudaDeviceSynchronize failed!");
timer().endGpuTimer();
+
+ // copy scan back to host
+ cudaMemcpy(odata, dev_dataPadded, n * sizeof(int), cudaMemcpyDeviceToHost);
+ checkCUDAError("cudaMemcpy dev_dataPadded failed!");
+
+ cudaFree(dev_dataPadded);
+ checkCUDAError("cudaFree dev_dataPadded failed!");
+ }
+
+ __global__ void kernScatter(int nPadded, const int* idata, int* odata, const int* dataPadded) {
+ int index = threadIdx.x + (blockIdx.x * blockDim.x);
+ if (index >= nPadded) {
+ return;
+ }
+
+ if (idata[index]) {
+ odata[dataPadded[index]] = idata[index];
+ }
}
/**
@@ -31,10 +114,95 @@ namespace StreamCompaction {
* @returns The number of elements remaining after compaction.
*/
int compact(int n, int *odata, const int *idata) {
- timer().startGpuTimer();
// TODO
+
+ if (n < 1) { return -1; }
+
+ // allocate a buffer padded to a power of 2.
+ int depth = ilog2ceil(n);
+ int nPadded = 1 << depth;
+
+ // calling kernels means we cannot directly index into idata. Need to have a device copy
+ int* dev_dataPadded;
+ cudaMalloc((void**)&dev_dataPadded, n * sizeof(int));
+ checkCUDAError("cudaMalloc dev_dataPadded failed!");
+ cudaMemset(dev_dataPadded, 0, n * sizeof(int));
+ checkCUDAError("cudaMemset dev_dataPadded failed!");
+ cudaMemcpy(dev_dataPadded, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+ checkCUDAError("cudaMemcpy dev_dataPadded failed!");
+
+ // mapping of true and false for idata
+ int* dev_bools;
+ cudaMalloc((void**)&dev_bools, n * sizeof(int));
+ checkCUDAError("cudaMalloc dev_bools failed!");
+
+ // array that will be scanned into
+ int* dev_index;
+ cudaMalloc((void**)&dev_index, nPadded * sizeof(int));
+ checkCUDAError("cudaMalloc dev_index failed!");
+ cudaMemset(dev_index, 0, nPadded * sizeof(int));
+ checkCUDAError("cudaMemset dev_index failed!");
+
+ int* dev_odata;
+ cudaMalloc((void**)&dev_odata, n * sizeof(int));
+ checkCUDAError("cudaMalloc dev_odata failed!");
+
+ // set blocks and threads
+ dim3 threadsPerBlock(blockSize);
+ dim3 fullBlocksPerGrid(std::ceil((double) nPadded / blockSize));
+
+ timer().startGpuTimer();
+
+ // SCAN
+            StreamCompaction::Common::kernMapToBoolean<<<fullBlocksPerGrid, threadsPerBlock>>>(n, dev_bools, dev_index, dev_dataPadded);
+ checkCUDAError("kernMapToBoolean failed!");
+
+
+
+ // perform upsweep on idata
+ for (int i = 0; i < depth; i++) {
+                kernUpSweepIter<<<fullBlocksPerGrid, threadsPerBlock>>>(nPadded, i, dev_index);
+ checkCUDAError("kernUpSweepIter failed!");
+ }
+
+ // perform downsweep on idata
+ cudaMemset(dev_index + nPadded - 1, 0, sizeof(int));
+ checkCUDAError("cudaMemset dev_dataPadded + nPadded - 1 failed!");
+ for (int i = depth - 1; i >= 0; i--) {
+                kernDownSweepIter<<<fullBlocksPerGrid, threadsPerBlock>>>(nPadded, i, dev_index);
+ checkCUDAError("kernDownSweepIter failed!");
+ }
+
+ // SCATTER
+            StreamCompaction::Common::kernScatter<<<fullBlocksPerGrid, threadsPerBlock>>>(n, dev_odata, dev_dataPadded, dev_bools, dev_index);
+ checkCUDAError("kernScatter failed!");
+ cudaDeviceSynchronize();
+ checkCUDAError("cudaDeviceSynchronize failed!");
+
timer().endGpuTimer();
- return -1;
+
+ // return compact to odata
+ cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+ checkCUDAError("cudaMemcpy dev_bools failed!");
+
+ // return final index and bool to host to calculate number of elements
+ int idx, val;
+ cudaMemcpy((void*)&idx, dev_index + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+ checkCUDAError("cudaMemcpy idx failed!");
+ cudaMemcpy((void*)&val, dev_bools + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+ checkCUDAError("cudaMemcpy val failed!");
+
+ // free
+ cudaFree(dev_dataPadded);
+ checkCUDAError("cudaFree dev_dataPadded failed!");
+ cudaFree(dev_bools);
+ checkCUDAError("cudaFree dev_bools failed!");
+ cudaFree(dev_index);
+ checkCUDAError("cudaFree dev_index failed!");
+ cudaFree(dev_odata);
+ checkCUDAError("cudaFree dev_odata failed!");
+
+ return idx + val;
}
}
}
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..f4922ee 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -13,13 +13,82 @@ namespace StreamCompaction {
}
// TODO: __global__
+ __global__ void kernExclusive(int nPadded, const int* dataPadded1, int* dataPadded2) {
+ int index = threadIdx.x + (blockIdx.x * blockDim.x);
+ if (index >= nPadded) {
+ return;
+ }
+
+ // if first element, pad with identity. Otherwise copy left element
+ dataPadded2[index] = (index) ? dataPadded1[index - 1] : 0;
+ }
+
+ __global__ void kernScanNaive(int nPadded, int depth, const int* dataPadded1, int* dataPadded2) {
+ int index = threadIdx.x + (blockIdx.x * blockDim.x);
+ if (index >= nPadded) {
+ return;
+ }
+
+ // copy old values that won't be computed
+ if (index < depth) {
+ dataPadded2[index] = dataPadded1[index];
+ return;
+ }
+
+ // compute new values
+ dataPadded2[index] = dataPadded1[index - depth] + dataPadded1[index];
+ }
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
- timer().startGpuTimer();
// TODO
+ if (n < 1) { return; }
+
+ // allocate two buffers padded to a power of 2.
+ int depth = ilog2ceil(n);
+ int nPadded = 1 << depth;
+
+ int* dev_dataPadded1; int* dev_dataPadded2;
+ cudaMalloc((void**)&dev_dataPadded1, nPadded * sizeof(int));
+ checkCUDAError("cudaMalloc dev_dataExtended1 failed!");
+ cudaMalloc((void**)&dev_dataPadded2, nPadded * sizeof(int));
+ checkCUDAError("cudaMalloc dev_dataExtended2 failed!");
+
+ // set blocks and threads
+ dim3 threadsPerBlock(blockSize);
+ dim3 fullBlocksPerGrid(std::ceil((double) nPadded / blockSize));
+
+ // copy idata to device memory
+ cudaMemset(dev_dataPadded1, 0, nPadded * sizeof(int));
+ checkCUDAError("cudaMemset dev_dataPadded1 failed!");
+ cudaMemcpy(dev_dataPadded1, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+ checkCUDAError("cudaMemcpy dev_dataPadded1 failed!");
+
+ // begin scan process
+ timer().startGpuTimer();
+ for (int i = 1; i < nPadded; i <<= 1) {
+ // perform partial scan on depth i
+                kernScanNaive<<<fullBlocksPerGrid, threadsPerBlock>>>(nPadded, i, dev_dataPadded1, dev_dataPadded2);
+ // swap to avoid race conditions
+ std::swap(dev_dataPadded1, dev_dataPadded2);
+ }
+
+ // make scan exclusive
+            kernExclusive<<<fullBlocksPerGrid, threadsPerBlock>>>(nPadded, dev_dataPadded1, dev_dataPadded2);
+ cudaDeviceSynchronize();
+ checkCUDAError("cudaDeviceSynchronize failed!");
timer().endGpuTimer();
+
+ // copy scan back to host
+ cudaMemcpy(odata, dev_dataPadded2, n * sizeof(int), cudaMemcpyDeviceToHost);
+ checkCUDAError("cudaMemcpy dev_dataPadded2 failed!");
+
+ // free local buffers
+ cudaFree(dev_dataPadded1);
+ checkCUDAError("cudaFree dev_dataPadded1 failed!");
+ cudaFree(dev_dataPadded2);
+ checkCUDAError("cudaFree dev_dataPadded2 failed!");
}
}
}
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..143851c 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -18,11 +18,18 @@ namespace StreamCompaction {
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
+
+            thrust::host_vector<int> h_thrust_vec(idata, idata + n);
+            thrust::device_vector<int> dev_thrust_vec(h_thrust_vec);
+            thrust::device_vector<int> dv_out(n);
+
timer().startGpuTimer();
// TODO use `thrust::exclusive_scan`
// example: for device_vectors dv_in and dv_out:
// thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+ thrust::exclusive_scan(dev_thrust_vec.begin(), dev_thrust_vec.end(), dv_out.begin());
timer().endGpuTimer();
+ thrust::copy(dv_out.begin(), dv_out.end(), odata);
}
}
}