Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Project 2: Shenyue Chen #17

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Finish basic algorithm; has error when n >= 1 << 13
EvsChen committed Sep 19, 2020

Verified

This commit was signed with the committer’s verified signature.
ptoupas Petros Toupas
commit ebfd7c3f7da53d3cd2b5a96de89ee22eef4d3bd0
2 changes: 1 addition & 1 deletion src/main.cpp
Original file line number Diff line number Diff line change
@@ -13,7 +13,7 @@
#include <stream_compaction/thrust.h>
#include "testing_helpers.hpp"

const int SIZE = 1 << 8; // feel free to change the size of array
const int SIZE = 1 << 12; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
int *a = new int[SIZE];
int *b = new int[SIZE];
7 changes: 5 additions & 2 deletions stream_compaction/cpu.cu
Original file line number Diff line number Diff line change
@@ -21,7 +21,7 @@ namespace StreamCompaction {
timer().startCpuTimer();
odata[0] = 0;
for (int i = 1; i < n; i++) {
odata[i] = odata[i] + idata[n - 1];
odata[i] = odata[i - 1] + idata[i - 1];
}
timer().endCpuTimer();
}
@@ -55,7 +55,10 @@ namespace StreamCompaction {
for (int i = 0; i < n; i++) {
temp[i] = idata[i] == 0 ? 0 : 1;
}
scan(n, tempSum, temp);
tempSum[0] = 0;
for (int i = 1; i < n; i++) {
tempSum[i] = tempSum[i - 1] + temp[i - 1];
}
int cnt = 0;
for (int i = 0; i < n; i++) {
if (temp[i] == 1) {
53 changes: 37 additions & 16 deletions stream_compaction/efficient.cu
Original file line number Diff line number Diff line change
@@ -14,16 +14,16 @@ namespace StreamCompaction {

__global__ void upSweep(int *data, int d) {
int idx = threadIdx.x + (blockIdx.x * blockDim.x);
int interval = 2 << d;
int interval = 1 << d;
int mapped = interval * idx + interval - 1;
data[mapped] += data[mapped - (interval >> 1)];
}

__global__ void downSweep(int *data, int d) {
int idx = threadIdx.x + (blockIdx.x * blockDim.x);
int interval = 2 << d;
int interval = 1 << d;
int node = interval * idx + interval - 1;
int left = node / 2;
int left = node - (interval >> 1);
int temp = data[left];
data[left] = data[node];
data[node] += temp;
@@ -32,33 +32,50 @@ namespace StreamCompaction {
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *dev_odata, const int *dev_idata) {
void scan(int n, int *dev_odata, const int *dev_idata, bool callFromMain) {
int iterations = ilog2ceil(n);
int nextN = 2 << iterations;
int nextN = 1 << iterations;
int *dev_idata_temp;
cudaMalloc((void **) &dev_idata_temp, nextN * sizeof(int));
checkCUDAError("cudaMalloc dev_idata_temp failed");
cudaMemset(dev_idata_temp, 0, nextN *sizeof(int));
cudaMemcpy(dev_idata_temp, dev_idata, sizeof(int) * n, cudaMemcpyDeviceToDevice);
timer().startGpuTimer();
checkCUDAError("cudaMemset dev_idata_temp failed");
if (callFromMain) {
cudaMemcpy(dev_idata_temp, dev_idata, sizeof(int) * n, cudaMemcpyHostToDevice);
timer().startGpuTimer();
}
else {
cudaMemcpy(dev_idata_temp, dev_idata, sizeof(int) * n, cudaMemcpyDeviceToDevice);
}
checkCUDAError("cudaMemcpy dev_idata_temp failed");

// Up-sweep
for (int d = 1; d <= iterations; d++) {
int numThreads = 2 << (iterations - d);
int numThreads = 1 << (iterations - d);
dim3 blocks((numThreads + blockSize - 1) / blockSize);
upSweep<<<blocks, blockSize>>>(dev_idata_temp, d);
checkCUDAError("upSweep failed");
}

// Down-sweep
// Set the "root" to 0
cudaMemset(dev_idata + n - 1, 0, sizeof(int));
cudaMemset(&dev_idata_temp[nextN - 1], 0, sizeof(int));
for (int d = iterations; d >= 1; d--) {
int numThreads = 2 << (iterations - d);
int numThreads = 1 << (iterations - d);
dim3 blocks((numThreads + blockSize - 1) / blockSize);
downSweep<<<blocks, blockSize>>>(dev_idata_temp, d);
checkCUDAError("downSweep failed");
}

timer().endGpuTimer();
cudaMemcpy(dev_odata, dev_idata_temp, sizeof(int) * n, cudaMemcpyDeviceToDevice);
if (callFromMain) {
timer().endGpuTimer();
cudaMemcpy(dev_odata, dev_idata_temp, sizeof(int) * n, cudaMemcpyDeviceToHost);
}
else {
cudaMemcpy(dev_odata, dev_idata_temp, sizeof(int) * n, cudaMemcpyDeviceToDevice);
}
checkCUDAError("cudaMemcpy dev_odata failed");

cudaFree(dev_idata_temp);
}

@@ -72,7 +89,7 @@ namespace StreamCompaction {
* @returns The number of elements remaining after compaction.
*/
int compact(int n, int *odata, const int *idata) {
int *bools, *indices, dev_idata, dev_odata;
int *bools, *indices, *dev_idata, *dev_odata;
cudaMalloc((void**) &bools, sizeof(int) * n);
cudaMalloc((void**) &indices, sizeof(int) * n);
cudaMalloc((void**) &dev_idata, sizeof(int) * n);
@@ -83,17 +100,21 @@ namespace StreamCompaction {

dim3 blocks((n + blockSize - 1) / blockSize);
Common::kernMapToBoolean<<<blocks, blockSize>>>(n, bools, dev_idata);
scan(n, indices, bools);
scan(n, indices, bools, false);
Common::kernScatter<<<blocks, blockSize>>>(n, dev_odata, dev_idata, bools, indices);

timer().endGpuTimer();


int cnt, lastBool;
cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost);
// Copy the count back
cudaMemcpy(&cnt, &indices[n - 1], sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&lastBool, &bools[n - 1], sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(bools);
cudaFree(indices);
cudaFree(dev_idata);
cudaFree(dev_odata);
return -1;
return lastBool ? cnt + 1 : cnt;
}
}
}
2 changes: 1 addition & 1 deletion stream_compaction/efficient.h
Original file line number Diff line number Diff line change
@@ -6,7 +6,7 @@ namespace StreamCompaction {
namespace Efficient {
StreamCompaction::Common::PerformanceTimer& timer();

void scan(int n, int *odata, const int *idata);
void scan(int n, int *odata, const int *idata, bool useTimer = true);

int compact(int n, int *odata, const int *idata);
}
27 changes: 17 additions & 10 deletions stream_compaction/naive.cu
Original file line number Diff line number Diff line change
@@ -13,9 +13,9 @@ namespace StreamCompaction {
}
__global__ void addPrev(int n, int *idata, int *odata, int d) {
int idx = threadIdx.x + (blockIdx.x * blockDim.x);
int base = 2 << (d - 1);
if (base + idx >= n) return;
odata[base + idx] = idata[base + idx] + idata[idx];
if (idx >= n) return;
int base = 1 << (d - 1);
odata[idx] = idx >= base ? idata[idx - base] + idata[idx] : idata[idx];
}

/**
@@ -24,23 +24,30 @@ namespace StreamCompaction {
void scan(int n, int *odata, const int *idata) {
int *dev_idata, *dev_odata;
cudaMalloc((void **) &dev_idata, n * sizeof(int));
checkCUDAError("cudaMalloc dev_idata failed");
cudaMalloc((void **) &dev_odata, n * sizeof(int));
checkCUDAError("cudaMalloc dev_odata failed");
cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
cudaMemcpy(dev_odata, odata, sizeof(int) * n, cudaMemcpyHostToDevice);
checkCUDAError("cudaMemcpy dev_idata failed");

timer().startGpuTimer();
int iterations = ilog2ceil(n);

dim3 blocks((n + blockSize - 1) / blockSize);
for (int d = 1; d <= iterations; d++) {
int base = 2 << (d - 1);
int numThreads = n - base;
dim3 blocks((numThreads + blockSize - 1) / blockSize);
addPrev<<<blocks, blockSize>>>(n, dev_idata, dev_odata, d);
std::swap(dev_idata, dev_odata);
if (d % 2 == 1) {
addPrev << <blocks, blockSize >> > (n, dev_idata, dev_odata, d);
}
else {
addPrev << <blocks, blockSize >> > (n, dev_odata, dev_idata, d);
}
checkCUDAError("addPrev failed");
}

timer().endGpuTimer();
cudaMemcpy(odata, (iterations % 2 == 0) ? dev_odata : dev_idata, sizeof(int) * n, cudaMemcpyDeviceToHost);
odata[0] = 0;
cudaMemcpy(odata + 1, (iterations % 2 == 1) ? dev_odata : dev_idata, sizeof(int) * (n - 1), cudaMemcpyDeviceToHost);
checkCUDAError("cudaMemcpy odata failed");
cudaFree(dev_idata);
cudaFree(dev_odata);
}