diff --git a/img/bloopers/cornell.2020-09-26_19-39-25z.17samp.png b/img/bloopers/cornell.2020-09-26_19-39-25z.17samp.png
new file mode 100644
index 0000000..e928ab6
Binary files /dev/null and b/img/bloopers/cornell.2020-09-26_19-39-25z.17samp.png differ
diff --git a/img/bloopers/cornell.2020-09-27_14-32-37z.8samp.png b/img/bloopers/cornell.2020-09-27_14-32-37z.8samp.png
new file mode 100644
index 0000000..af6cab5
Binary files /dev/null and b/img/bloopers/cornell.2020-09-27_14-32-37z.8samp.png differ
diff --git a/src/interactions.h b/src/interactions.h
index 5ce3628..c8ff1a7 100644
--- a/src/interactions.h
+++ b/src/interactions.h
@@ -1,6 +1,8 @@
 #pragma once
 
 #include "intersections.h"
+#include "glm/glm.hpp"
+#include "glm/gtx/norm.hpp"
 
 // CHECKITOUT
 /**
@@ -9,7 +11,7 @@
  */
 __host__ __device__
 glm::vec3 calculateRandomDirectionInHemisphere(
-    glm::vec3 normal, thrust::default_random_engine &rng) {
+    glm::vec3 normal, thrust::default_random_engine& rng) {
     thrust::uniform_real_distribution<float> u01(0, 1);
 
     float up = sqrt(u01(rng)); // cos(theta)
@@ -24,9 +26,11 @@ glm::vec3 calculateRandomDirectionInHemisphere(
     glm::vec3 directionNotNormal;
     if (abs(normal.x) < SQRT_OF_ONE_THIRD) {
         directionNotNormal = glm::vec3(1, 0, 0);
-    } else if (abs(normal.y) < SQRT_OF_ONE_THIRD) {
+    }
+    else if (abs(normal.y) < SQRT_OF_ONE_THIRD) {
         directionNotNormal = glm::vec3(0, 1, 0);
-    } else {
+    }
+    else {
         directionNotNormal = glm::vec3(0, 0, 1);
     }
 
@@ -47,11 +51,11 @@ glm::vec3 calculateRandomDirectionInHemisphere(
  * A perfect specular surface scatters in the reflected ray direction.
  * In order to apply multiple effects to one surface, probabilistically choose
  * between them.
- * 
+ *
  * The visual effect you want is to straight-up add the diffuse and specular
  * components. You can do this in a few ways. This logic also applies to
  * combining other types of materials (such as refractive).
- * 
+ *
 * - Always take an even (50/50) split between each effect (a diffuse bounce
 *   and a specular bounce), but divide the resulting color of either branch
 *   by its probability (0.5), to counteract the chance (0.5) of the branch
@@ -62,18 +66,32 @@ glm::vec3 calculateRandomDirectionInHemisphere(
 *   branch result by that branch's probability (whatever probability you use).
 *
 * This method applies its changes to the Ray parameter `ray` in place.
 * It also modifies the color `color` of the ray in place.
 *
 * You may need to change the parameter list for your purposes!
 */
 __host__ __device__
 void scatterRay(
-    PathSegment & pathSegment,
-    glm::vec3 intersect,
-    glm::vec3 normal,
-    const Material &m,
-    thrust::default_random_engine &rng) {
+    PathSegment& pathSegment,
+    glm::vec3 intersect,
+    glm::vec3 normal,
+    const Material& m,
+    thrust::default_random_engine& rng) {
     // TODO: implement this.
     // A basic implementation of pure-diffuse shading will just call the
     // calculateRandomDirectionInHemisphere defined above.
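To illustrate the probability-weighted split that the comment above describes, here is a minimal sketch. It is not part of this change; it reuses the same `m`, `normal`, `rng`, and `pathSegment` parameters as `scatterRay` below. One branch is chosen at random per bounce, and the surviving branch's contribution is divided by its selection probability:

    thrust::uniform_real_distribution<float> u01(0, 1);
    if (m.hasReflective > 0) {
        // pick one of the two effects with probability 0.5 each
        if (u01(rng) < 0.5f) {
            pathSegment.ray.direction = glm::reflect(pathSegment.ray.direction, normal);
            pathSegment.color *= m.specular.color / 0.5f; // undo the 0.5 selection chance
        }
        else {
            pathSegment.ray.direction = calculateRandomDirectionInHemisphere(normal, rng);
            pathSegment.color *= m.color / 0.5f;
        }
    }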
+
+    //specular
+    if (m.hasReflective > 0) {
+        pathSegment.color *= m.specular.color;
+        // nudge the origin off the surface to avoid self-intersection
+        pathSegment.ray.origin = intersect + 0.001f * normal;
+        pathSegment.ray.direction = glm::reflect(pathSegment.ray.direction, normal);
+    }
+    //diffuse
+    else {
+        glm::vec3 rand_dir = calculateRandomDirectionInHemisphere(normal, rng);
+        pathSegment.color *= m.color;
+        pathSegment.ray.origin = intersect + 0.001f * normal;
+        pathSegment.ray.direction = rand_dir;
+    }
 }
diff --git a/src/pathtrace.cu b/src/pathtrace.cu
index c1ec122..df7761b 100644
--- a/src/pathtrace.cu
+++ b/src/pathtrace.cu
@@ -4,6 +4,8 @@
 #include <cstdio>
 #include <cuda.h>
 #include <cmath>
+#include <thrust/execution_policy.h>
+#include <thrust/partition.h>
 
 #include "sceneStructs.h"
 #include "scene.h"
@@ -18,7 +20,7 @@
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
 #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
 
-void checkCUDAErrorFn(const char *msg, const char *file, int line) {
+void checkCUDAErrorFn(const char* msg, const char* file, int line) {
 #if ERRORCHECK
     cudaDeviceSynchronize();
     cudaError_t err = cudaGetLastError();
@@ -46,7 +48,7 @@ thrust::default_random_engine makeSeededRandomEngine(int iter, int index, int depth) {
 
 //Kernel that writes the image to the OpenGL PBO directly.
 __global__ void sendImageToPBO(uchar4* pbo, glm::ivec2 resolution,
-        int iter, glm::vec3* image) {
+    int iter, glm::vec3* image) {
     int x = (blockIdx.x * blockDim.x) + threadIdx.x;
     int y = (blockIdx.y * blockDim.y) + threadIdx.y;
 
@@ -55,9 +57,9 @@ __global__ void sendImageToPBO(uchar4* pbo, glm::ivec2 resolution,
         glm::vec3 pix = image[index];
 
         glm::ivec3 color;
-        color.x = glm::clamp((int) (pix.x / iter * 255.0), 0, 255);
-        color.y = glm::clamp((int) (pix.y / iter * 255.0), 0, 255);
-        color.z = glm::clamp((int) (pix.z / iter * 255.0), 0, 255);
+        color.x = glm::clamp((int)(pix.x / iter * 255.0), 0, 255);
+        color.y = glm::clamp((int)(pix.y / iter * 255.0), 0, 255);
+        color.z = glm::clamp((int)(pix.z / iter * 255.0), 0, 255);
 
         // Each thread writes one pixel location in the texture (textel)
         pbo[index].w = 0;
@@ -67,33 +69,33 @@ __global__ void sendImageToPBO(uchar4* pbo, glm::ivec2 resolution,
     }
 }
 
-static Scene * hst_scene = NULL;
-static glm::vec3 * dev_image = NULL;
-static Geom * dev_geoms = NULL;
-static Material * dev_materials = NULL;
-static PathSegment * dev_paths = NULL;
-static ShadeableIntersection * dev_intersections = NULL;
+static Scene* hst_scene = NULL;
+static glm::vec3* dev_image = NULL;
+static Geom* dev_geoms = NULL;
+static Material* dev_materials = NULL;
+static PathSegment* dev_paths = NULL;
+static ShadeableIntersection* dev_intersections = NULL;
 // TODO: static variables for device memory, any extra info you need, etc
 // ...
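For that last TODO, any extra device buffer would follow the same pattern as the existing ones. A hypothetical sketch (the `dev_material_keys` name is illustrative, not part of this change):

    static int* dev_material_keys = NULL; // e.g. one material ID per path, for sorting

    // in pathtraceInit(): cudaMalloc(&dev_material_keys, pixelcount * sizeof(int));
    // in pathtraceFree(): cudaFree(dev_material_keys);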
-void pathtraceInit(Scene *scene) {
+void pathtraceInit(Scene* scene) {
     hst_scene = scene;
-    const Camera &cam = hst_scene->state.camera;
+    const Camera& cam = hst_scene->state.camera;
     const int pixelcount = cam.resolution.x * cam.resolution.y;
 
     cudaMalloc(&dev_image, pixelcount * sizeof(glm::vec3));
     cudaMemset(dev_image, 0, pixelcount * sizeof(glm::vec3));
 
-    cudaMalloc(&dev_paths, pixelcount * sizeof(PathSegment));
+    cudaMalloc(&dev_paths, pixelcount * sizeof(PathSegment));
 
-    cudaMalloc(&dev_geoms, scene->geoms.size() * sizeof(Geom));
-    cudaMemcpy(dev_geoms, scene->geoms.data(), scene->geoms.size() * sizeof(Geom), cudaMemcpyHostToDevice);
+    cudaMalloc(&dev_geoms, scene->geoms.size() * sizeof(Geom));
+    cudaMemcpy(dev_geoms, scene->geoms.data(), scene->geoms.size() * sizeof(Geom), cudaMemcpyHostToDevice);
 
-    cudaMalloc(&dev_materials, scene->materials.size() * sizeof(Material));
-    cudaMemcpy(dev_materials, scene->materials.data(), scene->materials.size() * sizeof(Material), cudaMemcpyHostToDevice);
+    cudaMalloc(&dev_materials, scene->materials.size() * sizeof(Material));
+    cudaMemcpy(dev_materials, scene->materials.data(), scene->materials.size() * sizeof(Material), cudaMemcpyHostToDevice);
 
-    cudaMalloc(&dev_intersections, pixelcount * sizeof(ShadeableIntersection));
-    cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
+    cudaMalloc(&dev_intersections, pixelcount * sizeof(ShadeableIntersection));
+    cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
 
     // TODO: initialize any extra device memory you need
 
@@ -102,10 +104,10 @@ void pathtraceInit(Scene *scene) {
 
 void pathtraceFree() {
     cudaFree(dev_image);  // no-op if dev_image is null
-    cudaFree(dev_paths);
-    cudaFree(dev_geoms);
-    cudaFree(dev_materials);
-    cudaFree(dev_intersections);
+    cudaFree(dev_paths);
+    cudaFree(dev_geoms);
+    cudaFree(dev_materials);
+    cudaFree(dev_intersections);
     // TODO: clean up any extra device memory you created
 
     checkCUDAError("pathtraceFree");
@@ -121,25 +123,25 @@ void pathtraceFree()
  */
 __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, PathSegment* pathSegments)
 {
-    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
 
-    if (x < cam.resolution.x && y < cam.resolution.y) {
-        int index = x + (y * cam.resolution.x);
-        PathSegment & segment = pathSegments[index];
+    if (x < cam.resolution.x && y < cam.resolution.y) {
+        int index = x + (y * cam.resolution.x);
+        PathSegment& segment = pathSegments[index];
 
-        segment.ray.origin = cam.position;
-        segment.color = glm::vec3(1.0f, 1.0f, 1.0f);
+        segment.ray.origin = cam.position;
+        segment.color = glm::vec3(1.0f, 1.0f, 1.0f);
 
-        // TODO: implement antialiasing by jittering the ray
-        segment.ray.direction = glm::normalize(cam.view
-            - cam.right * cam.pixelLength.x * ((float)x - (float)cam.resolution.x * 0.5f)
-            - cam.up * cam.pixelLength.y * ((float)y - (float)cam.resolution.y * 0.5f)
-        );
+        // TODO: implement antialiasing by jittering the ray
+        segment.ray.direction = glm::normalize(cam.view
+            - cam.right * cam.pixelLength.x * ((float)x - (float)cam.resolution.x * 0.5f)
+            - cam.up * cam.pixelLength.y * ((float)y - (float)cam.resolution.y * 0.5f)
+        );
 
-        segment.pixelIndex = index;
-        segment.remainingBounces = traceDepth;
-    }
+        segment.pixelIndex = index;
+        segment.remainingBounces = traceDepth;
+    }
 }
 
 // TODO:
@@ -147,153 +149,148 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, PathSegment* pathSegments)
 // Generating new rays is handled in your shader(s).
 // Feel free to modify the code below.
 __global__ void computeIntersections(
-    int depth
-    , int num_paths
-    , PathSegment * pathSegments
-    , Geom * geoms
-    , int geoms_size
-    , ShadeableIntersection * intersections
-    )
+    int depth
+    , int num_paths
+    , PathSegment* pathSegments
+    , Geom* geoms
+    , int geoms_size
+    , ShadeableIntersection* intersections
+)
 {
-    int path_index = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (path_index < num_paths)
-    {
-        PathSegment pathSegment = pathSegments[path_index];
-
-        float t;
-        glm::vec3 intersect_point;
-        glm::vec3 normal;
-        float t_min = FLT_MAX;
-        int hit_geom_index = -1;
-        bool outside = true;
-
-        glm::vec3 tmp_intersect;
-        glm::vec3 tmp_normal;
-
-        // naive parse through global geoms
-
-        for (int i = 0; i < geoms_size; i++)
-        {
-            Geom & geom = geoms[i];
-
-            if (geom.type == CUBE)
-            {
-                t = boxIntersectionTest(geom, pathSegment.ray, tmp_intersect, tmp_normal, outside);
-            }
-            else if (geom.type == SPHERE)
-            {
-                t = sphereIntersectionTest(geom, pathSegment.ray, tmp_intersect, tmp_normal, outside);
-            }
-            // TODO: add more intersection tests here... triangle? metaball? CSG?
-
-            // Compute the minimum t from the intersection tests to determine what
-            // scene geometry object was hit first.
-            if (t > 0.0f && t_min > t)
-            {
-                t_min = t;
-                hit_geom_index = i;
-                intersect_point = tmp_intersect;
-                normal = tmp_normal;
-            }
-        }
-
-        if (hit_geom_index == -1)
-        {
-            intersections[path_index].t = -1.0f;
-        }
-        else
-        {
-            //The ray hits something
-            intersections[path_index].t = t_min;
-            intersections[path_index].materialId = geoms[hit_geom_index].materialid;
-            intersections[path_index].surfaceNormal = normal;
-        }
-    }
+    int path_index = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (path_index < num_paths)
+    {
+        PathSegment pathSegment = pathSegments[path_index];
+
+        float t;
+        glm::vec3 intersect_point;
+        glm::vec3 normal;
+        float t_min = FLT_MAX;
+        int hit_geom_index = -1;
+        bool outside = true;
+
+        glm::vec3 tmp_intersect;
+        glm::vec3 tmp_normal;
+
+        // naive parse through global geoms
+
+        for (int i = 0; i < geoms_size; i++)
+        {
+            Geom& geom = geoms[i];
+
+            if (geom.type == CUBE)
+            {
+                t = boxIntersectionTest(geom, pathSegment.ray, tmp_intersect, tmp_normal, outside);
+            }
+            else if (geom.type == SPHERE)
+            {
+                t = sphereIntersectionTest(geom, pathSegment.ray, tmp_intersect, tmp_normal, outside);
+            }
+            // TODO: add more intersection tests here... triangle? metaball? CSG?
+
+            // Compute the minimum t from the intersection tests to determine what
+            // scene geometry object was hit first.
+            if (t > 0.0f && t_min > t)
+            {
+                t_min = t;
+                hit_geom_index = i;
+                intersect_point = tmp_intersect;
+                normal = tmp_normal;
+            }
+        }
+
+        if (hit_geom_index == -1)
+        {
+            intersections[path_index].t = -1.0f;
+        }
+        else
+        {
+            //The ray hits something
+            intersections[path_index].t = t_min;
+            intersections[path_index].materialId = geoms[hit_geom_index].materialid;
+            intersections[path_index].surfaceNormal = normal;
+        }
+    }
 }
 
-// LOOK: "fake" shader demonstrating what you might do with the info in
-// a ShadeableIntersection, as well as how to use thrust's random number
-// generator. Observe that since the thrust random number generator basically
-// adds "noise" to the iteration, the image should start off noisy and get
-// cleaner as more iterations are computed.
-//
-// Note that this shader does NOT do a BSDF evaluation!
-// Your shaders should handle that - this can allow techniques such as
-// bump mapping.
-__global__ void shadeFakeMaterial (
-    int iter
-    , int num_paths
-    , ShadeableIntersection * shadeableIntersections
-    , PathSegment * pathSegments
-    , Material * materials
-    )
+__global__ void shadeMaterial(
+    int iter
+    , int num_paths
+    , ShadeableIntersection* shadeableIntersections
+    , PathSegment* pathSegments
+    , Material* materials
+)
 {
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx < num_paths)
-    {
-        ShadeableIntersection intersection = shadeableIntersections[idx];
-        if (intersection.t > 0.0f) { // if the intersection exists...
-            // Set up the RNG
-            // LOOK: this is how you use thrust's RNG! Please look at
-            // makeSeededRandomEngine as well.
-            thrust::default_random_engine rng = makeSeededRandomEngine(iter, idx, 0);
-            thrust::uniform_real_distribution<float> u01(0, 1);
-
-            Material material = materials[intersection.materialId];
-            glm::vec3 materialColor = material.color;
-
-            // If the material indicates that the object was a light, "light" the ray
-            if (material.emittance > 0.0f) {
-                pathSegments[idx].color *= (materialColor * material.emittance);
-            }
-            // Otherwise, do some pseudo-lighting computation. This is actually more
-            // like what you would expect from shading in a rasterizer like OpenGL.
-            // TODO: replace this! you should be able to start with basically a one-liner
-            else {
-                float lightTerm = glm::dot(intersection.surfaceNormal, glm::vec3(0.0f, 1.0f, 0.0f));
-                pathSegments[idx].color *= (materialColor * lightTerm) * 0.3f + ((1.0f - intersection.t * 0.02f) * materialColor) * 0.7f;
-                pathSegments[idx].color *= u01(rng); // apply some noise because why not
-            }
-            // If there was no intersection, color the ray black.
-            // Lots of renderers use 4 channel color, RGBA, where A = alpha, often
-            // used for opacity, in which case they can indicate "no opacity".
-            // This can be useful for post-processing and image compositing.
-        } else {
-            pathSegments[idx].color = glm::vec3(0.0f);
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < num_paths)
+    {
+        ShadeableIntersection intersection = shadeableIntersections[idx];
+        if (intersection.t > 0.0f) { // if the intersection exists...
+            thrust::default_random_engine rng = makeSeededRandomEngine(iter, idx, 0);
+            thrust::uniform_real_distribution<float> u01(0, 1);
+
+            Material material = materials[intersection.materialId];
+            glm::vec3 materialColor = material.color;
+
+            // If the material indicates that the object was a light, "light" the ray
+            if (material.emittance > 0.0f) {
+                pathSegments[idx].color *= (materialColor * material.emittance);
+                pathSegments[idx].remainingBounces = -1;
+            }
+            else {
+                scatterRay(pathSegments[idx], pathSegments[idx].ray.origin + pathSegments[idx].ray.direction * intersection.t,
+                    intersection.surfaceNormal, material, rng);
+                pathSegments[idx].remainingBounces -= 1;
+            }
+        }
+        else {
+            pathSegments[idx].color = glm::vec3(0.0f);
+            pathSegments[idx].remainingBounces = -1;
+        }
+
+    }
-    }
 }
 
 // Add the current iteration's output to the overall image
-__global__ void finalGather(int nPaths, glm::vec3 * image, PathSegment * iterationPaths)
+__global__ void finalGather(int nPaths, glm::vec3* image, PathSegment* iterationPaths)
 {
-    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
 
-    if (index < nPaths)
-    {
-        PathSegment iterationPath = iterationPaths[index];
-        image[iterationPath.pixelIndex] += iterationPath.color;
-    }
+    if (index < nPaths)
+    {
+        PathSegment iterationPath = iterationPaths[index];
+        image[iterationPath.pixelIndex] += iterationPath.color;
+    }
 }
 
+// Predicate for thrust::stable_partition below. Note: despite the name, it
+// returns true for paths that should KEEP bouncing, so surviving paths are
+// packed at the front of the array.
+struct should_terminate
+{
+    __host__ __device__
+    bool operator()(const PathSegment& pathSegment)
+    {
+        return (pathSegment.remainingBounces > 0);
+    }
+};
+
 /**
  * Wrapper for the __global__ call that sets up the kernel calls and does a ton
  * of memory management
  */
-void pathtrace(uchar4 *pbo, int frame, int iter) {
+void pathtrace(uchar4* pbo, int frame, int iter) {
     const int traceDepth = hst_scene->state.traceDepth;
-    const Camera &cam = hst_scene->state.camera;
+    //const int traceDepth = 200;
+    const Camera& cam = hst_scene->state.camera;
     const int pixelcount = cam.resolution.x * cam.resolution.y;
 
-    // 2D block for generating ray from camera
+    // 2D block for generating ray from camera
     const dim3 blockSize2d(8, 8);
     const dim3 blocksPerGrid2d(
-        (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
-        (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
+        (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
+        (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
 
-    // 1D block for path tracing
-    const int blockSize1d = 128;
+    // 1D block for path tracing
+    const int blockSize1d = 128;
 
     ///////////////////////////////////////////////////////////////////////////
 
@@ -326,68 +323,69 @@ void pathtrace(uchar4 *pbo, int frame, int iter) {
 
     // TODO: perform one iteration of path tracing
 
-    generateRayFromCamera <<<blocksPerGrid2d, blockSize2d>>>(cam, iter, traceDepth, dev_paths);
-    checkCUDAError("generate camera ray");
-
-    int depth = 0;
-    PathSegment* dev_path_end = dev_paths + pixelcount;
-    int num_paths = dev_path_end - dev_paths;
-
-    // --- PathSegment Tracing Stage ---
-    // Shoot ray into scene, bounce between objects, push shading chunks
-
-    bool iterationComplete = false;
-    while (!iterationComplete) {
-
-        // clean shading chunks
-        cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
-
-        // tracing
-        dim3 numblocksPathSegmentTracing = (num_paths + blockSize1d - 1) / blockSize1d;
-        computeIntersections <<<numblocksPathSegmentTracing, blockSize1d>>> (
-            depth
-            , num_paths
-            , dev_paths
-            , dev_geoms
-            , hst_scene->geoms.size()
-            , dev_intersections
-            );
-        checkCUDAError("trace one bounce");
-        cudaDeviceSynchronize();
-        depth++;
-
-
-        // TODO:
-        // --- Shading Stage ---
-        // Shade path segments based on intersections and generate new rays by
-        // evaluating the BSDF.
-        // Start off with just a big kernel that handles all the different
-        // materials you have in the scenefile.
-        // TODO: compare between directly shading the path segments and shading
-        // path segments that have been reshuffled to be contiguous in memory.
-
-        shadeFakeMaterial<<<numblocksPathSegmentTracing, blockSize1d>>> (
-            iter,
-            num_paths,
-            dev_intersections,
-            dev_paths,
-            dev_materials
-        );
-        iterationComplete = true; // TODO: should be based off stream compaction results.
-    }
-
-    // Assemble this iteration and apply it to the image
-    dim3 numBlocksPixels = (pixelcount + blockSize1d - 1) / blockSize1d;
-    finalGather<<<numBlocksPixels, blockSize1d>>>(num_paths, dev_image, dev_paths);
+    generateRayFromCamera<<<blocksPerGrid2d, blockSize2d>>>(cam, iter, traceDepth, dev_paths);
+    checkCUDAError("generate camera ray");
+
+    int depth = 0;
+    PathSegment* dev_path_end = dev_paths + pixelcount;
+    int num_paths = dev_path_end - dev_paths;
+    int num_paths_start = num_paths;
+
+    // --- PathSegment Tracing Stage ---
+    // Shoot ray into scene, bounce between objects, push shading chunks
+
+    bool iterationComplete = false;
+    while (!iterationComplete) {
+
+        // clean shading chunks
+        cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
+
+        // tracing
+        dim3 numblocksPathSegmentTracing = (num_paths + blockSize1d - 1) / blockSize1d;
+        computeIntersections<<<numblocksPathSegmentTracing, blockSize1d>>>(depth, num_paths, dev_paths, dev_geoms
+            , hst_scene->geoms.size(), dev_intersections);
+        checkCUDAError("trace one bounce");
+        cudaDeviceSynchronize();
+        depth++;
+
+
+        // TODO:
+        // --- Shading Stage ---
+        // Shade path segments based on intersections and generate new rays by
+        // evaluating the BSDF.
+        // Start off with just a big kernel that handles all the different
+        // materials you have in the scenefile.
+        // TODO: compare between directly shading the path segments and shading
+        // path segments that have been reshuffled to be contiguous in memory.
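One way to approach that reshuffling TODO, sketched here as an assumption rather than part of this change: sort the intersections (keys) and path segments (values) by material ID before shading, so threads in a warp take the same material branch. This assumes `#include <thrust/sort.h>`; the comparator name is illustrative:

    struct material_id_less {
        __host__ __device__
        bool operator()(const ShadeableIntersection& a, const ShadeableIntersection& b) {
            return a.materialId < b.materialId;
        }
    };

    // before launching shadeMaterial:
    thrust::sort_by_key(thrust::device, dev_intersections, dev_intersections + num_paths,
        dev_paths, material_id_less());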
+
+        shadeMaterial<<<numblocksPathSegmentTracing, blockSize1d>>>(
+            iter,
+            num_paths,
+            dev_intersections,
+            dev_paths,
+            dev_materials
+        );
+
+        PathSegment* dev_path_end = thrust::stable_partition(thrust::device, dev_paths, dev_paths + num_paths, should_terminate());
+        num_paths = dev_path_end - dev_paths;
+
+        if (num_paths == 0) {
+            iterationComplete = true;
+        }
+    }
+
+    // Assemble this iteration and apply it to the image
+    dim3 numBlocksPixels = (pixelcount + blockSize1d - 1) / blockSize1d;
+    finalGather<<<numBlocksPixels, blockSize1d>>>(num_paths_start, dev_image, dev_paths);
 
     ///////////////////////////////////////////////////////////////////////////
 
     // Send results to OpenGL buffer for rendering
-    sendImageToPBO<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, iter, dev_image);
+    sendImageToPBO<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, iter, dev_image);
 
     // Retrieve image from GPU
     cudaMemcpy(hst_scene->state.image.data(), dev_image,
-            pixelcount * sizeof(glm::vec3), cudaMemcpyDeviceToHost);
+        pixelcount * sizeof(glm::vec3), cudaMemcpyDeviceToHost);
 
     checkCUDAError("pathtrace");
 }
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index 4538f04..567795b 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -1,6 +1,23 @@
-set(SOURCE_FILES
+set(headers
+    "common.h"
+    "cpu.h"
+    "naive.h"
+    "efficient.h"
+    "thrust.h"
     )
 
-cuda_add_library(stream_compaction
-    ${SOURCE_FILES}
+set(sources
+    "common.cu"
+    "cpu.cu"
+    "naive.cu"
+    "efficient.cu"
+    "thrust.cu"
     )
+
+list(SORT headers)
+list(SORT sources)
+
+source_group(Headers FILES ${headers})
+source_group(Sources FILES ${sources})
+
+cuda_add_library(stream_compaction ${sources} ${headers})
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
new file mode 100644
index 0000000..e77300b
--- /dev/null
+++ b/stream_compaction/common.cu
@@ -0,0 +1,51 @@
+#include "common.h"
+
+void checkCUDAErrorFn(const char* msg, const char* file, int line) {
+    cudaError_t err = cudaGetLastError();
+    if (cudaSuccess == err) {
+        return;
+    }
+
+    fprintf(stderr, "CUDA error");
+    if (file) {
+        fprintf(stderr, " (%s:%d)", file, line);
+    }
+    fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err));
+    exit(EXIT_FAILURE);
+}
+
+
+namespace StreamCompaction {
+    namespace Common {
+
+        /**
+         * Maps an array to an array of 0s and 1s for stream compaction. Elements
+         * which map to 0 will be removed, and elements which map to 1 will be kept.
+         */
+        __global__ void kernMapToBoolean(int n, int* bools, const int* idata) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+
+            bools[index] = idata[index] == 0 ? 0 : 1;
+        }
+
+        /**
+         * Performs scatter on an array. That is, for each element in idata,
+         * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
+         */
+        __global__ void kernScatter(int n, int* odata,
+            const int* idata, const int* bools, const int* indices) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+
+            if (bools[index] == 1) {
+                odata[indices[index]] = idata[index];
+            }
+        }
+
+    }
+}
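Concretely, these two kernels implement the map and scatter halves of compaction. A small example, worked by hand rather than taken from the diff:

    idata   = { 3, 0, 1, 0, 2 }
    bools   = { 1, 0, 1, 0, 1 }   // kernMapToBoolean
    indices = { 0, 1, 1, 2, 2 }   // exclusive scan of bools
    odata   = { 3, 1, 2 }         // kernScatter: odata[indices[i]] = idata[i] where bools[i] == 1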
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
new file mode 100644
index 0000000..d2c1fed
--- /dev/null
+++ b/stream_compaction/common.h
@@ -0,0 +1,132 @@
+#pragma once
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <cstdio>
+#include <cstring>
+#include <cmath>
+#include <algorithm>
+#include <chrono>
+#include <stdexcept>
+
+#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
+#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
+
+/**
+ * Check for CUDA errors; print and exit if there was a problem.
+ */
+void checkCUDAErrorFn(const char *msg, const char *file = NULL, int line = -1);
+
+inline int ilog2(int x) {
+    int lg = 0;
+    while (x >>= 1) {
+        ++lg;
+    }
+    return lg;
+}
+
+inline int ilog2ceil(int x) {
+    return x == 1 ? 0 : ilog2(x - 1) + 1;
+}
+
+namespace StreamCompaction {
+    namespace Common {
+        __global__ void kernMapToBoolean(int n, int *bools, const int *idata);
+
+        __global__ void kernScatter(int n, int *odata,
+                const int *idata, const int *bools, const int *indices);
+
+        /**
+         * This class is used for timing the performance
+         * Uncopyable and unmovable
+         *
+         * Adapted from WindyDarian(https://github.com/WindyDarian)
+         */
+        class PerformanceTimer
+        {
+        public:
+            PerformanceTimer()
+            {
+                cudaEventCreate(&event_start);
+                cudaEventCreate(&event_end);
+            }
+
+            ~PerformanceTimer()
+            {
+                cudaEventDestroy(event_start);
+                cudaEventDestroy(event_end);
+            }
+
+            void startCpuTimer()
+            {
+                if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); }
+                cpu_timer_started = true;
+
+                time_start_cpu = std::chrono::high_resolution_clock::now();
+            }
+
+            void endCpuTimer()
+            {
+                time_end_cpu = std::chrono::high_resolution_clock::now();
+
+                if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); }
+
+                std::chrono::duration<double, std::milli> duro = time_end_cpu - time_start_cpu;
+                prev_elapsed_time_cpu_milliseconds =
+                    static_cast<decltype(prev_elapsed_time_cpu_milliseconds)>(duro.count());
+
+                cpu_timer_started = false;
+            }
+
+            void startGpuTimer()
+            {
+                if (gpu_timer_started) { throw std::runtime_error("GPU timer already started"); }
+                gpu_timer_started = true;
+
+                cudaEventRecord(event_start);
+            }
+
+            void endGpuTimer()
+            {
+                cudaEventRecord(event_end);
+                cudaEventSynchronize(event_end);
+
+                if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); }
+
+                cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end);
+                gpu_timer_started = false;
+            }
+
+            float getCpuElapsedTimeForPreviousOperation() //noexcept //(damn I need VS 2015
+            {
+                return prev_elapsed_time_cpu_milliseconds;
+            }
+
+            float getGpuElapsedTimeForPreviousOperation() //noexcept
+            {
+                return prev_elapsed_time_gpu_milliseconds;
+            }
+
+            // remove copy and move functions
+            PerformanceTimer(const PerformanceTimer&) = delete;
+            PerformanceTimer(PerformanceTimer&&) = delete;
+            PerformanceTimer& operator=(const PerformanceTimer&) = delete;
+            PerformanceTimer& operator=(PerformanceTimer&&) = delete;
+
+        private:
+            cudaEvent_t event_start = nullptr;
+            cudaEvent_t event_end = nullptr;
+
+            using time_point_t = std::chrono::high_resolution_clock::time_point;
+            time_point_t time_start_cpu;
+            time_point_t time_end_cpu;
+
+            bool cpu_timer_started = false;
+            bool gpu_timer_started = false;
+
+            float prev_elapsed_time_cpu_milliseconds = 0.f;
+            float prev_elapsed_time_gpu_milliseconds = 0.f;
+        };
+    }
+}
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
new file mode 100644
index 0000000..4f4bdaa
--- /dev/null
+++ b/stream_compaction/cpu.cu
@@ -0,0 +1,75 @@
+#include <cstdio>
+#include "cpu.h"
+
+#include "common.h"
+
+namespace StreamCompaction {
+    namespace CPU {
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
+        }
+
+        /**
+         * CPU scan (prefix sum).
+         * For performance analysis, this is supposed to be a simple for loop.
+         * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
+         */
+        void scan(int n, int *odata, const int *idata) {
+            timer().startCpuTimer();
+            odata[0] = 0;
+            for (int i = 1; i < n; i++) {
+                odata[i] = idata[i - 1] + odata[i - 1];
+            }
+            timer().endCpuTimer();
+        }
+
+        /**
+         * CPU stream compaction without using the scan function.
+         *
+         * @returns the number of elements remaining after compaction.
+         */
+        int compactWithoutScan(int n, int *odata, const int *idata) {
+            timer().startCpuTimer();
+            int index = 0;
+            for (int i = 0; i < n; i++) {
+                if (idata[i] != 0) {
+                    odata[index] = idata[i];
+                    index++;
+                }
+            }
+            timer().endCpuTimer();
+            return index;
+        }
+
+        /**
+         * CPU stream compaction using scan and scatter, like the parallel version.
+         *
+         * @returns the number of elements remaining after compaction.
+         */
+        int compactWithScan(int n, int *odata, const int *idata) {
+            //timer().startCpuTimer();
+            // map the input array to zeros and ones
+            int* zerosAndOnes = new int[n];
+            for (int i = 0; i < n; i++) {
+                zerosAndOnes[i] = (idata[i] == 0) ? 0 : 1;
+            }
+
+            // scan the boolean array
+            int* scannedArray = new int[n];
+            scan(n, scannedArray, zerosAndOnes);
+
+            // scatter
+            for (int i = 0; i < n; i++) {
+                if (zerosAndOnes[i] == 1) {
+                    odata[scannedArray[i]] = idata[i];
+                }
+            }
+
+            // the exclusive scan at n-1 excludes the last element, so the
+            // count must include the final boolean as well
+            int count = scannedArray[n - 1] + zerosAndOnes[n - 1];
+
+            delete[] zerosAndOnes;
+            delete[] scannedArray;
+
+            //timer().endCpuTimer();
+            return count;
+        }
+    }
+}
diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h
new file mode 100644
index 0000000..873c047
--- /dev/null
+++ b/stream_compaction/cpu.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include "common.h"
+
+namespace StreamCompaction {
+    namespace CPU {
+        StreamCompaction::Common::PerformanceTimer& timer();
+
+        void scan(int n, int *odata, const int *idata);
+
+        int compactWithoutScan(int n, int *odata, const int *idata);
+
+        int compactWithScan(int n, int *odata, const int *idata);
+    }
+}
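A short usage sketch for the CPU reference paths (array values illustrative, worked by hand):

    int idata[8] = { 1, 0, 2, 0, 3, 0, 4, 5 };
    int odata[8];
    // both variants should produce odata = { 1, 2, 3, 4, 5 } and return 5
    int a = StreamCompaction::CPU::compactWithoutScan(8, odata, idata);
    int b = StreamCompaction::CPU::compactWithScan(8, odata, idata);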
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
new file mode 100644
index 0000000..42105a3
--- /dev/null
+++ b/stream_compaction/efficient.cu
@@ -0,0 +1,204 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cmath>
+#include <memory>
+#include "common.h"
+#include "efficient.h"
+
+#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__)
+
+
+namespace StreamCompaction {
+    namespace Efficient {
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
+        }
+
+        // up sweep
+        __global__ void upSweep(int n, int d, int* data, int dist, int distHalf) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+            if (index >= n || index % dist != 0) {
+                return;
+            }
+
+            int toUpdate = index + dist - 1;
+            int toGet = index + distHalf - 1;
+
+            data[toUpdate] += data[toGet];
+        }
+
+        // up sweep efficient
+        __global__ void upSweepEfficient(int n, int d, int* data, int stride, int offset) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n || index >= n / stride) {
+                return;
+            }
+
+            int toUpdate = ((index + 1) * stride) - 1;
+            int toGet = toUpdate - offset;
+
+            data[toUpdate] += data[toGet];
+        }
+
+        // down sweep
+        __global__ void downSweep(int n, int d, int* data, int dist, int distHalf) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n || index % dist != 0) {
+                return;
+            }
+
+            int t_index = index + distHalf - 1;
+            int replace_index = index + dist - 1;
+
+            int t = data[t_index];
+            data[t_index] = data[replace_index];
+            data[replace_index] += t;
+        }
+
+        // down sweep efficient
+        __global__ void downSweepEfficient(int n, int d, int* data, int stride, int offset) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n || index >= n / stride) {
+                return;
+            }
+
+            int replace_index = n - 1 - (index * stride);
+            int t_index = replace_index - offset;
+
+
+            int t = data[t_index];
+            data[t_index] = data[replace_index];
+            data[replace_index] += t;
+        }
+
+        // zero the tail of the tree, from index n-1 through the root at
+        // power_of_2 - 1; those leaves only feed outputs past n, which are
+        // discarded, so this is safe and also clears the root for down-sweep
+        __global__ void setZeros(int n, int power_of_2, int* data) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index < power_of_2 && index >= n - 1) {
+                data[index] = 0;
+            }
+        }
+
+        /**
+         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+         */
+        void scan(int n, int* odata, const int* idata) {
+            int power_of_2 = 1;
+            while (power_of_2 < n) {
+                power_of_2 *= 2;
+            }
+
+            // create array of size power of 2
+            int* data;
+
+            cudaMalloc((void**)&data, power_of_2 * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc data failed!");
+
+            // fill array and pad end with 0's
+            std::unique_ptr<int[]> padded_array{ new int[power_of_2] };
+            cudaMemcpy(padded_array.get(), idata, sizeof(int) * n, cudaMemcpyHostToHost);
+            for (int i = n; i < power_of_2; i++) {
+                padded_array[i] = 0;
+            }
+
+            cudaMemcpy(data, padded_array.get(), sizeof(int) * power_of_2, cudaMemcpyHostToDevice);
+
+            // kernel values
+            int blockSize = 128;
+            dim3 fullBlocksPerGrid((power_of_2 + blockSize - 1) / blockSize);
+
+            timer().startGpuTimer();
+            // up-sweep
+            for (int d = 0; d <= ilog2(power_of_2) - 1; d++) {
+                int dist = pow(2, d + 1);
+                int distHalf = pow(2, d);
+                upSweep<<<fullBlocksPerGrid, blockSize>>>(power_of_2, d, data, dist, distHalf);
+                /*int stride = pow(2, d + 1);
+                int offset = pow(2, d);
+                upSweepEfficient<<<fullBlocksPerGrid, blockSize>>>(power_of_2, d, data, stride, offset);*/
+            }
+
+
+            // zero the root and the padded tail before the down-sweep
+            setZeros<<<fullBlocksPerGrid, blockSize>>>(n, power_of_2, data);
+
+            // down-sweep
+            for (int d = ilog2(power_of_2) - 1; d >= 0; d--) {
+                int dist = pow(2, d + 1);
+                int distHalf = pow(2, d);
+                downSweep<<<fullBlocksPerGrid, blockSize>>>(power_of_2, d, data, dist, distHalf);
+                /*int stride = pow(2, d + 1);
+                int offset = pow(2, d);
+                downSweepEfficient<<<fullBlocksPerGrid, blockSize>>>(power_of_2, d, data, stride, offset);*/
+            }
+            timer().endGpuTimer();
+
+            // set the out data to the scanned data
+            cudaMemcpy(odata, data, sizeof(int) * n, cudaMemcpyDeviceToHost);
+
+            // free memory
+            cudaFree(data);
+        }
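A hedged usage example for the work-efficient scan (values worked by hand, not from the diff):

    int idata[8] = { 3, 1, 7, 0, 4, 1, 6, 3 };
    int odata[8];
    StreamCompaction::Efficient::scan(8, odata, idata);
    // exclusive scan: odata = { 0, 3, 4, 11, 11, 15, 16, 22 }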
+
+        /**
+         * Performs stream compaction on idata, storing the result into odata.
+         * All zeroes are discarded.
+         *
+         * @param n The number of elements in idata.
+         * @param odata The array into which to store elements.
+         * @param idata The array of elements to compact.
+         * @returns The number of elements remaining after compaction.
+         */
+        int compact(int n, int* odata, const int* idata) {
+            // malloc necessary space on the GPU
+            int* gpu_idata;
+            int* bools;
+            int* scanned_data;
+            int* scattered_data;
+
+            cudaMalloc((void**)&gpu_idata, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc gpu_idata failed!");
+            cudaMemcpy(gpu_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+
+            cudaMalloc((void**)&bools, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc bools failed!");
+
+            cudaMalloc((void**)&scanned_data, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc scanned_data failed!");
+
+            cudaMalloc((void**)&scattered_data, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc scattered_data failed!");
+
+            int blockSize = 128;
+            dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+            //timer().startGpuTimer();
+            // map to zeros and ones
+            Common::kernMapToBoolean<<<fullBlocksPerGrid, blockSize>>>(n, bools, gpu_idata);
+
+            // exclusive scan of the boolean array
+            scan(n, scanned_data, bools);
+
+            // scatter
+            Common::kernScatter<<<fullBlocksPerGrid, blockSize>>>(n, scattered_data, gpu_idata, bools, scanned_data);
+
+            // the compacted count is the last scan value plus the last boolean
+            int lastScan, lastBool;
+            cudaMemcpy(&lastScan, scanned_data + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            cudaMemcpy(&lastBool, bools + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            int num = lastScan + lastBool;
+
+            cudaMemcpy(odata, scattered_data, sizeof(int) * num, cudaMemcpyDeviceToHost);
+            //timer().endGpuTimer();
+
+            // free memory
+            cudaFree(gpu_idata);
+            cudaFree(bools);
+            cudaFree(scanned_data);
+            cudaFree(scattered_data);
+
+            return num;
+        }
+    }
+}
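The `lastScan + lastBool` count works because the exclusive scan at index n-1 has summed bools[0..n-2]; adding the final boolean completes the total. For example (illustrative values):

    int idata[6] = { 0, 5, 0, 2, 0, 9 };
    int odata[6];
    int count = StreamCompaction::Efficient::compact(6, odata, idata);
    // bools = {0,1,0,1,0,1}, scan = {0,0,1,1,2,2}
    // count = 2 + 1 = 3, odata = { 5, 2, 9 }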
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h
new file mode 100644
index 0000000..803cb4f
--- /dev/null
+++ b/stream_compaction/efficient.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include "common.h"
+
+namespace StreamCompaction {
+    namespace Efficient {
+        StreamCompaction::Common::PerformanceTimer& timer();
+
+        void scan(int n, int *odata, const int *idata);
+
+        int compact(int n, int *odata, const int *idata);
+    }
+}
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
new file mode 100644
index 0000000..65446fe
--- /dev/null
+++ b/stream_compaction/naive.cu
@@ -0,0 +1,92 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "common.h"
+#include "naive.h"
+
+#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__)
+
+namespace StreamCompaction {
+    namespace Naive {
+
+
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
+        }
+
+        // one iteration of inclusive scan
+        __global__ void iteration(int n, int d, const int* idata, int* odata, int offset) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+            if (index >= offset) {
+                odata[index] = idata[index - offset] + idata[index];
+            } else {
+                odata[index] = idata[index];
+            }
+        }
+
+        // turns an inclusive scan into an exclusive scan
+        __global__ void inclusiveToExclusive(int n, const int* idata, int* odata) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index + 1 >= n) {
+                return;
+            }
+            if (index == 0) {
+                odata[0] = 0;
+            }
+            odata[index + 1] = idata[index];
+        }
+
+        /**
+         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+         */
+        void scan(int n, int* odata, const int* idata) {
+            int power_of_2 = 1;
+            while (power_of_2 < n) {
+                power_of_2 *= 2;
+            }
+
+            // create arrays of size power of 2
+            int* data_1;
+            int* data_2;
+
+            cudaMalloc((void**)&data_1, power_of_2 * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc data_1 failed!");
+            cudaMalloc((void**)&data_2, power_of_2 * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc data_2 failed!");
+
+            // fill array and pad end with 0's
+            cudaMemcpy(data_1, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+            cudaMemset(data_1 + n, 0, sizeof(int) * (power_of_2 - n));
+
+            // call kernel
+            int blockSize = 128;
+            dim3 fullBlocksPerGrid((power_of_2 + blockSize - 1) / blockSize);
+
+            //timer().startGpuTimer();
+            for (int d = 1; d <= ilog2ceil(n); d++) {
+                int offset = pow(2, d - 1);
+                iteration<<<fullBlocksPerGrid, blockSize>>>(power_of_2, d, data_1, data_2, offset);
+                int* temp = data_1;
+                data_1 = data_2;
+                data_2 = temp;
+            }
+
+            inclusiveToExclusive<<<fullBlocksPerGrid, blockSize>>>(power_of_2, data_1, data_2);
+            int* temp = data_1;
+            data_1 = data_2;
+            data_2 = temp;
+            //timer().endGpuTimer();
+
+            // set the out data to the scanned data
+            cudaMemcpy(odata, data_1, sizeof(int) * n, cudaMemcpyDeviceToHost);
+
+            // free memory
+            cudaFree(data_1);
+            cudaFree(data_2);
+        }
+    }
+}
diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h
new file mode 100644
index 0000000..37dcb06
--- /dev/null
+++ b/stream_compaction/naive.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include "common.h"
+
+namespace StreamCompaction {
+    namespace Naive {
+        StreamCompaction::Common::PerformanceTimer& timer();
+
+        void scan(int n, int *odata, const int *idata);
+    }
+}
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
new file mode 100644
index 0000000..3ceb705
--- /dev/null
+++ b/stream_compaction/thrust.cu
@@ -0,0 +1,38 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <thrust/scan.h>
+#include "common.h"
+#include "thrust.h"
+
+namespace StreamCompaction {
+    namespace Thrust {
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
+        }
+        /**
+         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+         */
+        void scan(int n, int *odata, const int *idata) {
+            // copy the input into a host vector, then mirror it on the device
+            thrust::host_vector<int> hv_in(idata, idata + n);
+            thrust::device_vector<int> dv_in = hv_in;
+
+            thrust::device_vector<int> dv_out(n);
+
+            timer().startGpuTimer();
+            thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            timer().endGpuTimer();
+
+            for (int i = 0; i < n; i++) {
+                odata[i] = dv_out[i];
+            }
+        }
+    }
+}
diff --git a/stream_compaction/thrust.h b/stream_compaction/thrust.h
new file mode 100644
index 0000000..fe98206
--- /dev/null
+++ b/stream_compaction/thrust.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include "common.h"
+
+namespace StreamCompaction {
+    namespace Thrust {
+        StreamCompaction::Common::PerformanceTimer& timer();
+
+        void scan(int n, int *odata, const int *idata);
+    }
+}
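Finally, a small usage sketch for the Thrust wrapper and its timer (values illustrative, worked by hand):

    int idata[5] = { 4, 7, 0, 2, 1 };
    int odata[5];
    StreamCompaction::Thrust::scan(5, odata, idata);
    // exclusive scan: odata = { 0, 4, 11, 11, 13 }
    float ms = StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation();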