Finish basic algorithm; has error when n >= 1 << 13

CIS565-Fall-2020 · EvsChen · Sep 18, 2020 · Sep 19, 2020 · Sep 22, 2020 · Sep 23, 2020
commit ebfd7c3f7da53d3cd2b5a96de89ee22eef4d3bd0
diff --git a/src/main.cpp b/src/main.cpp
@@ -13,7 +13,7 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 12; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];

diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
@@ -21,7 +21,7 @@ namespace StreamCompaction {
             timer().startCpuTimer();
             odata[0] = 0;
             for (int i = 1; i < n; i++) {
-              odata[i] = odata[i] + idata[n - 1];
+              odata[i] = odata[i - 1] + idata[i - 1];
             }
             timer().endCpuTimer();
         }
@@ -55,7 +55,10 @@ namespace StreamCompaction {
             for (int i = 0; i < n; i++) {
               temp[i] = idata[i] == 0 ? 0 : 1;
             }
-            scan(n, tempSum, temp);
+            tempSum[0] = 0;
+            for (int i = 1; i < n; i++) {
+                tempSum[i] = tempSum[i - 1] + temp[i - 1];
+            }
             int cnt = 0;
             for (int i = 0; i < n; i++) {
               if (temp[i] == 1) {

diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
@@ -14,16 +14,16 @@ namespace StreamCompaction {
 
         __global__ void upSweep(int *data, int d) {
           int idx = threadIdx.x + (blockIdx.x * blockDim.x);
-          int interval = 2 << d;
+          int interval = 1 << d;
           int mapped = interval * idx + interval - 1;
           data[mapped] += data[mapped - (interval >> 1)];
         }
 
         __global__ void downSweep(int *data, int d) {
           int idx = threadIdx.x + (blockIdx.x * blockDim.x);
-          int interval = 2 << d;
+          int interval = 1 << d;
           int node = interval * idx + interval - 1;
-          int left = node / 2;
+          int left = node - (interval >> 1);
           int temp = data[left];
           data[left] = data[node];
           data[node] += temp;
@@ -32,33 +32,50 @@ namespace StreamCompaction {
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
-        void scan(int n, int *dev_odata, const int *dev_idata) {
+        void scan(int n, int *dev_odata, const int *dev_idata, bool callFromMain) {
           int iterations = ilog2ceil(n);
-          int nextN = 2 << iterations;
+          int nextN = 1 << iterations;
           int *dev_idata_temp;
           cudaMalloc((void **) &dev_idata_temp, nextN * sizeof(int));
+          checkCUDAError("cudaMalloc dev_idata_temp failed");
           cudaMemset(dev_idata_temp, 0, nextN *sizeof(int));
-          cudaMemcpy(dev_idata_temp, dev_idata, sizeof(int) * n, cudaMemcpyDeviceToDevice);
-          timer().startGpuTimer();
+          checkCUDAError("cudaMemset dev_idata_temp failed");
+          if (callFromMain) {
+              cudaMemcpy(dev_idata_temp, dev_idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+              timer().startGpuTimer();
+          }
+          else {
+              cudaMemcpy(dev_idata_temp, dev_idata, sizeof(int) * n, cudaMemcpyDeviceToDevice);
+          }
+          checkCUDAError("cudaMemcpy dev_idata_temp failed");
 
           // Up-sweep
           for (int d = 1; d <= iterations; d++) {
-            int numThreads = 2 << (iterations - d);
+            int numThreads = 1 << (iterations - d);
             dim3 blocks((numThreads + blockSize - 1) / blockSize);
             upSweep<<<blocks, blockSize>>>(dev_idata_temp, d);
+            checkCUDAError("upSweep failed");
           }
 
           // Down-sweep
           // Set the "root" to 0
-          cudaMemset(dev_idata + n - 1, 0, sizeof(int));
+          cudaMemset(&dev_idata_temp[nextN - 1], 0, sizeof(int));
           for (int d = iterations; d >= 1; d--) {
-            int numThreads = 2 << (iterations - d);
+            int numThreads = 1 << (iterations - d);
             dim3 blocks((numThreads + blockSize - 1) / blockSize);
             downSweep<<<blocks, blockSize>>>(dev_idata_temp, d);
+            checkCUDAError("downSweep failed");
           }
 
-          timer().endGpuTimer();
-          cudaMemcpy(dev_odata, dev_idata_temp, sizeof(int) * n, cudaMemcpyDeviceToDevice);
+          if (callFromMain) {
+              timer().endGpuTimer();
+              cudaMemcpy(dev_odata, dev_idata_temp, sizeof(int) * n, cudaMemcpyDeviceToHost);
+          }
+          else {
+              cudaMemcpy(dev_odata, dev_idata_temp, sizeof(int) * n, cudaMemcpyDeviceToDevice);
+          }
+          checkCUDAError("cudaMemcpy dev_odata failed");
+
           cudaFree(dev_idata_temp);
         }
 
@@ -72,7 +89,7 @@ namespace StreamCompaction {
          * @returns      The number of elements remaining after compaction.
          */
         int compact(int n, int *odata, const int *idata) {
-          int *bools, *indices, dev_idata, dev_odata;
+          int *bools, *indices, *dev_idata, *dev_odata;
           cudaMalloc((void**) &bools, sizeof(int) * n);
           cudaMalloc((void**) &indices, sizeof(int) * n);
           cudaMalloc((void**) &dev_idata, sizeof(int) * n);
@@ -83,17 +100,21 @@ namespace StreamCompaction {
 
           dim3 blocks((n + blockSize - 1) / blockSize);
           Common::kernMapToBoolean<<<blocks, blockSize>>>(n, bools, dev_idata);
-          scan(n, indices, bools);
+          scan(n, indices, bools, false);
           Common::kernScatter<<<blocks, blockSize>>>(n, dev_odata, dev_idata, bools, indices);
 
           timer().endGpuTimer();
-
+
+          int cnt, lastBool;
           cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost);
+          // Copy the count back
+          cudaMemcpy(&cnt, &indices[n - 1], sizeof(int), cudaMemcpyDeviceToHost);
+          cudaMemcpy(&lastBool, &bools[n - 1], sizeof(int), cudaMemcpyDeviceToHost);
           cudaFree(bools);
           cudaFree(indices);
           cudaFree(dev_idata);
           cudaFree(dev_odata);
-          return -1;
+          return lastBool ? cnt + 1 : cnt;
         }
     }
 }
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h
@@ -6,7 +6,7 @@ namespace StreamCompaction {
     namespace Efficient {
         StreamCompaction::Common::PerformanceTimer& timer();
 
-        void scan(int n, int *odata, const int *idata);
+        void scan(int n, int *odata, const int *idata, bool useTimer = true);
 
         int compact(int n, int *odata, const int *idata);
     }

diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
@@ -13,9 +13,9 @@ namespace StreamCompaction {
         }
         __global__ void addPrev(int n, int *idata, int *odata, int d) {
           int idx = threadIdx.x + (blockIdx.x * blockDim.x);
-          int base = 2 << (d - 1);
-          if (base + idx >= n) return;
-          odata[base + idx] = idata[base + idx] + idata[idx];
+          if (idx >= n) return;
+          int base = 1 << (d - 1);
+          odata[idx] = idx >= base ? idata[idx - base] + idata[idx] : idata[idx];
         }
 
         /**
@@ -24,23 +24,30 @@ namespace StreamCompaction {
         void scan(int n, int *odata, const int *idata) {
             int *dev_idata, *dev_odata;
             cudaMalloc((void **) &dev_idata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_idata failed");
             cudaMalloc((void **) &dev_odata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_odata failed");
             cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
-            cudaMemcpy(dev_odata, odata, sizeof(int) * n, cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy dev_idata failed");
 
             timer().startGpuTimer();
             int iterations = ilog2ceil(n);
 
+            dim3 blocks((n + blockSize - 1) / blockSize);
             for (int d = 1; d <= iterations; d++) {
-              int base = 2 << (d - 1);
-              int numThreads = n - base;
-              dim3 blocks((numThreads + blockSize - 1) / blockSize);
-              addPrev<<<blocks, blockSize>>>(n, dev_idata, dev_odata, d);
-              std::swap(dev_idata, dev_odata);
+                if (d % 2 == 1) {
+                    addPrev << <blocks, blockSize >> > (n, dev_idata, dev_odata, d);
+                }
+                else {
+                    addPrev << <blocks, blockSize >> > (n, dev_odata, dev_idata, d);
+                }
+              checkCUDAError("addPrev failed");
             }
 
             timer().endGpuTimer();
-            cudaMemcpy(odata, (iterations % 2 == 0) ? dev_odata : dev_idata, sizeof(int) * n, cudaMemcpyDeviceToHost);
+            odata[0] = 0;
+            cudaMemcpy(odata + 1, (iterations % 2 == 1) ? dev_odata : dev_idata, sizeof(int) * (n - 1), cudaMemcpyDeviceToHost);
+            checkCUDAError("cudaMemcpy odata failed");
             cudaFree(dev_idata);
             cudaFree(dev_odata);
         }