diff --git a/img/bloopers/cornell.2020-09-26_19-39-25z.17samp.png b/img/bloopers/cornell.2020-09-26_19-39-25z.17samp.png
new file mode 100644
index 0000000..e928ab6
Binary files /dev/null and b/img/bloopers/cornell.2020-09-26_19-39-25z.17samp.png differ
diff --git a/img/bloopers/cornell.2020-09-27_14-32-37z.8samp.png b/img/bloopers/cornell.2020-09-27_14-32-37z.8samp.png
new file mode 100644
index 0000000..af6cab5
Binary files /dev/null and b/img/bloopers/cornell.2020-09-27_14-32-37z.8samp.png differ
diff --git a/src/interactions.h b/src/interactions.h
index 5ce3628..c8ff1a7 100644
--- a/src/interactions.h
+++ b/src/interactions.h
@@ -1,6 +1,8 @@
 #pragma once
 
 #include "intersections.h"
+#include "glm/glm.hpp"
+#include "glm/gtx/norm.hpp"
 
 // CHECKITOUT
 /**
@@ -9,7 +11,7 @@
  */
 __host__ __device__
 glm::vec3 calculateRandomDirectionInHemisphere(
-    glm::vec3 normal, thrust::default_random_engine &rng) {
+    glm::vec3 normal, thrust::default_random_engine& rng) {
     thrust::uniform_real_distribution<float> u01(0, 1);
 
     float up = sqrt(u01(rng)); // cos(theta)
@@ -24,9 +26,11 @@ glm::vec3 calculateRandomDirectionInHemisphere(
     glm::vec3 directionNotNormal;
     if (abs(normal.x) < SQRT_OF_ONE_THIRD) {
         directionNotNormal = glm::vec3(1, 0, 0);
-    } else if (abs(normal.y) < SQRT_OF_ONE_THIRD) {
+    }
+    else if (abs(normal.y) < SQRT_OF_ONE_THIRD) {
         directionNotNormal = glm::vec3(0, 1, 0);
-    } else {
+    }
+    else {
         directionNotNormal = glm::vec3(0, 0, 1);
     }
 
@@ -47,11 +51,11 @@ glm::vec3 calculateRandomDirectionInHemisphere(
  * A perfect specular surface scatters in the reflected ray direction.
  * In order to apply multiple effects to one surface, probabilistically choose
  * between them.
- * 
+ *
  * The visual effect you want is to straight-up add the diffuse and specular
  * components. You can do this in a few ways. This logic also applies to
  * combining other types of materials (such as refractive).
- * 
+ *
 * - Always take an even (50/50) split between each effect (a diffuse bounce
 *   and a specular bounce), but divide the resulting color of either branch
 *   by its probability (0.5), to counteract the chance (0.5) of the branch
@@ -62,18 +66,32 @@ glm::vec3 calculateRandomDirectionInHemisphere(
 *   branch result by that branch's probability (whatever probability you use).
 *
 * This method applies its changes to the Ray parameter `ray` in place.
 * It also modifies the color `color` of the ray in place.
 *
 * You may need to change the parameter list for your purposes!
 */
 __host__ __device__
 void scatterRay(
-    PathSegment & pathSegment,
-    glm::vec3 intersect,
-    glm::vec3 normal,
-    const Material &m,
-    thrust::default_random_engine &rng) {
+    PathSegment& pathSegment,
+    glm::vec3 intersect,
+    glm::vec3 normal,
+    const Material& m,
+    thrust::default_random_engine& rng) {
     // TODO: implement this.
     // A basic implementation of pure-diffuse shading will just call the
     // calculateRandomDirectionInHemisphere defined above.
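To illustrate the probability-weighted split that the comment above describes, here is a minimal sketch. It is not part of this change; it reuses the same `m`, `normal`, `rng`, and `pathSegment` parameters as `scatterRay` below. One branch is chosen at random per bounce, and the surviving branch's contribution is divided by its selection probability:

    thrust::uniform_real_distribution<float> u01(0, 1);
    if (m.hasReflective > 0) {
        // pick one of the two effects with probability 0.5 each
        if (u01(rng) < 0.5f) {
            pathSegment.ray.direction = glm::reflect(pathSegment.ray.direction, normal);
            pathSegment.color *= m.specular.color / 0.5f; // undo the 0.5 selection chance
        }
        else {
            pathSegment.ray.direction = calculateRandomDirectionInHemisphere(normal, rng);
            pathSegment.color *= m.color / 0.5f;
        }
    }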
+
+    //specular
+    if (m.hasReflective > 0) {
+        pathSegment.color *= m.specular.color;
+        // nudge the origin off the surface to avoid self-intersection
+        pathSegment.ray.origin = intersect + 0.001f * normal;
+        pathSegment.ray.direction = glm::reflect(pathSegment.ray.direction, normal);
+    }
+    //diffuse
+    else {
+        glm::vec3 rand_dir = calculateRandomDirectionInHemisphere(normal, rng);
+        pathSegment.color *= m.color;
+        pathSegment.ray.origin = intersect + 0.001f * normal;
+        pathSegment.ray.direction = rand_dir;
+    }
 }
diff --git a/src/pathtrace.cu b/src/pathtrace.cu
index c1ec122..df7761b 100644
--- a/src/pathtrace.cu
+++ b/src/pathtrace.cu
@@ -4,6 +4,8 @@
 #include <cstdio>
 #include <cuda.h>
 #include <cmath>
+#include <thrust/execution_policy.h>
+#include <thrust/partition.h>
 
 #include "sceneStructs.h"
 #include "scene.h"
@@ -18,7 +20,7 @@
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
 #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
 
-void checkCUDAErrorFn(const char *msg, const char *file, int line) {
+void checkCUDAErrorFn(const char* msg, const char* file, int line) {
 #if ERRORCHECK
     cudaDeviceSynchronize();
     cudaError_t err = cudaGetLastError();
@@ -46,7 +48,7 @@ thrust::default_random_engine makeSeededRandomEngine(int iter, int index, int depth) {
 
 //Kernel that writes the image to the OpenGL PBO directly.
 __global__ void sendImageToPBO(uchar4* pbo, glm::ivec2 resolution,
-        int iter, glm::vec3* image) {
+    int iter, glm::vec3* image) {
     int x = (blockIdx.x * blockDim.x) + threadIdx.x;
     int y = (blockIdx.y * blockDim.y) + threadIdx.y;
 
@@ -55,9 +57,9 @@ __global__ void sendImageToPBO(uchar4* pbo, glm::ivec2 resolution,
         glm::vec3 pix = image[index];
 
         glm::ivec3 color;
-        color.x = glm::clamp((int) (pix.x / iter * 255.0), 0, 255);
-        color.y = glm::clamp((int) (pix.y / iter * 255.0), 0, 255);
-        color.z = glm::clamp((int) (pix.z / iter * 255.0), 0, 255);
+        color.x = glm::clamp((int)(pix.x / iter * 255.0), 0, 255);
+        color.y = glm::clamp((int)(pix.y / iter * 255.0), 0, 255);
+        color.z = glm::clamp((int)(pix.z / iter * 255.0), 0, 255);
 
         // Each thread writes one pixel location in the texture (textel)
         pbo[index].w = 0;
@@ -67,33 +69,33 @@ __global__ void sendImageToPBO(uchar4* pbo, glm::ivec2 resolution,
     }
 }
 
-static Scene * hst_scene = NULL;
-static glm::vec3 * dev_image = NULL;
-static Geom * dev_geoms = NULL;
-static Material * dev_materials = NULL;
-static PathSegment * dev_paths = NULL;
-static ShadeableIntersection * dev_intersections = NULL;
+static Scene* hst_scene = NULL;
+static glm::vec3* dev_image = NULL;
+static Geom* dev_geoms = NULL;
+static Material* dev_materials = NULL;
+static PathSegment* dev_paths = NULL;
+static ShadeableIntersection* dev_intersections = NULL;
 // TODO: static variables for device memory, any extra info you need, etc
 // ...
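For that last TODO, any extra device buffer would follow the same pattern as the existing ones. A hypothetical sketch (the `dev_material_keys` name is illustrative, not part of this change):

    static int* dev_material_keys = NULL; // e.g. one material ID per path, for sorting

    // in pathtraceInit(): cudaMalloc(&dev_material_keys, pixelcount * sizeof(int));
    // in pathtraceFree(): cudaFree(dev_material_keys);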
-void pathtraceInit(Scene *scene) {
+void pathtraceInit(Scene* scene) {
     hst_scene = scene;
-    const Camera &cam = hst_scene->state.camera;
+    const Camera& cam = hst_scene->state.camera;
     const int pixelcount = cam.resolution.x * cam.resolution.y;
 
     cudaMalloc(&dev_image, pixelcount * sizeof(glm::vec3));
     cudaMemset(dev_image, 0, pixelcount * sizeof(glm::vec3));
 
-    cudaMalloc(&dev_paths, pixelcount * sizeof(PathSegment));
+    cudaMalloc(&dev_paths, pixelcount * sizeof(PathSegment));
 
-    cudaMalloc(&dev_geoms, scene->geoms.size() * sizeof(Geom));
-    cudaMemcpy(dev_geoms, scene->geoms.data(), scene->geoms.size() * sizeof(Geom), cudaMemcpyHostToDevice);
+    cudaMalloc(&dev_geoms, scene->geoms.size() * sizeof(Geom));
+    cudaMemcpy(dev_geoms, scene->geoms.data(), scene->geoms.size() * sizeof(Geom), cudaMemcpyHostToDevice);
 
-    cudaMalloc(&dev_materials, scene->materials.size() * sizeof(Material));
-    cudaMemcpy(dev_materials, scene->materials.data(), scene->materials.size() * sizeof(Material), cudaMemcpyHostToDevice);
+    cudaMalloc(&dev_materials, scene->materials.size() * sizeof(Material));
+    cudaMemcpy(dev_materials, scene->materials.data(), scene->materials.size() * sizeof(Material), cudaMemcpyHostToDevice);
 
-    cudaMalloc(&dev_intersections, pixelcount * sizeof(ShadeableIntersection));
-    cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
+    cudaMalloc(&dev_intersections, pixelcount * sizeof(ShadeableIntersection));
+    cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
 
     // TODO: initialize any extra device memory you need
 
@@ -102,10 +104,10 @@ void pathtraceInit(Scene *scene) {
 
 void pathtraceFree() {
     cudaFree(dev_image);  // no-op if dev_image is null
-    cudaFree(dev_paths);
-    cudaFree(dev_geoms);
-    cudaFree(dev_materials);
-    cudaFree(dev_intersections);
+    cudaFree(dev_paths);
+    cudaFree(dev_geoms);
+    cudaFree(dev_materials);
+    cudaFree(dev_intersections);
     // TODO: clean up any extra device memory you created
 
     checkCUDAError("pathtraceFree");
@@ -121,25 +123,25 @@ void pathtraceFree()
  */
 __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, PathSegment* pathSegments)
 {
-    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
 
-    if (x < cam.resolution.x && y < cam.resolution.y) {
-        int index = x + (y * cam.resolution.x);
-        PathSegment & segment = pathSegments[index];
+    if (x < cam.resolution.x && y < cam.resolution.y) {
+        int index = x + (y * cam.resolution.x);
+        PathSegment& segment = pathSegments[index];
 
-        segment.ray.origin = cam.position;
-        segment.color = glm::vec3(1.0f, 1.0f, 1.0f);
+        segment.ray.origin = cam.position;
+        segment.color = glm::vec3(1.0f, 1.0f, 1.0f);
 
-        // TODO: implement antialiasing by jittering the ray
-        segment.ray.direction = glm::normalize(cam.view
-            - cam.right * cam.pixelLength.x * ((float)x - (float)cam.resolution.x * 0.5f)
-            - cam.up * cam.pixelLength.y * ((float)y - (float)cam.resolution.y * 0.5f)
-        );
+        // TODO: implement antialiasing by jittering the ray
+        segment.ray.direction = glm::normalize(cam.view
+            - cam.right * cam.pixelLength.x * ((float)x - (float)cam.resolution.x * 0.5f)
+            - cam.up * cam.pixelLength.y * ((float)y - (float)cam.resolution.y * 0.5f)
+        );
 
-        segment.pixelIndex = index;
-        segment.remainingBounces = traceDepth;
-    }
+        segment.pixelIndex = index;
+        segment.remainingBounces = traceDepth;
+    }
 }
 
 // TODO:
@@ -147,153 +149,148 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, PathSegment* pathSegments)
 // Generating new rays is handled in your shader(s).
 // Feel free to modify the code below.
 __global__ void computeIntersections(
-    int depth
-    , int num_paths
-    , PathSegment * pathSegments
-    , Geom * geoms
-    , int geoms_size
-    , ShadeableIntersection * intersections
-    )
+    int depth
+    , int num_paths
+    , PathSegment* pathSegments
+    , Geom* geoms
+    , int geoms_size
+    , ShadeableIntersection* intersections
+)
 {
-    int path_index = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (path_index < num_paths)
-    {
-        PathSegment pathSegment = pathSegments[path_index];
-
-        float t;
-        glm::vec3 intersect_point;
-        glm::vec3 normal;
-        float t_min = FLT_MAX;
-        int hit_geom_index = -1;
-        bool outside = true;
-
-        glm::vec3 tmp_intersect;
-        glm::vec3 tmp_normal;
-
-        // naive parse through global geoms
-
-        for (int i = 0; i < geoms_size; i++)
-        {
-            Geom & geom = geoms[i];
-
-            if (geom.type == CUBE)
-            {
-                t = boxIntersectionTest(geom, pathSegment.ray, tmp_intersect, tmp_normal, outside);
-            }
-            else if (geom.type == SPHERE)
-            {
-                t = sphereIntersectionTest(geom, pathSegment.ray, tmp_intersect, tmp_normal, outside);
-            }
-            // TODO: add more intersection tests here... triangle? metaball? CSG?
-
-            // Compute the minimum t from the intersection tests to determine what
-            // scene geometry object was hit first.
-            if (t > 0.0f && t_min > t)
-            {
-                t_min = t;
-                hit_geom_index = i;
-                intersect_point = tmp_intersect;
-                normal = tmp_normal;
-            }
-        }
-
-        if (hit_geom_index == -1)
-        {
-            intersections[path_index].t = -1.0f;
-        }
-        else
-        {
-            //The ray hits something
-            intersections[path_index].t = t_min;
-            intersections[path_index].materialId = geoms[hit_geom_index].materialid;
-            intersections[path_index].surfaceNormal = normal;
-        }
-    }
+    int path_index = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (path_index < num_paths)
+    {
+        PathSegment pathSegment = pathSegments[path_index];
+
+        float t;
+        glm::vec3 intersect_point;
+        glm::vec3 normal;
+        float t_min = FLT_MAX;
+        int hit_geom_index = -1;
+        bool outside = true;
+
+        glm::vec3 tmp_intersect;
+        glm::vec3 tmp_normal;
+
+        // naive parse through global geoms
+
+        for (int i = 0; i < geoms_size; i++)
+        {
+            Geom& geom = geoms[i];
+
+            if (geom.type == CUBE)
+            {
+                t = boxIntersectionTest(geom, pathSegment.ray, tmp_intersect, tmp_normal, outside);
+            }
+            else if (geom.type == SPHERE)
+            {
+                t = sphereIntersectionTest(geom, pathSegment.ray, tmp_intersect, tmp_normal, outside);
+            }
+            // TODO: add more intersection tests here... triangle? metaball? CSG?
+
+            // Compute the minimum t from the intersection tests to determine what
+            // scene geometry object was hit first.
+            if (t > 0.0f && t_min > t)
+            {
+                t_min = t;
+                hit_geom_index = i;
+                intersect_point = tmp_intersect;
+                normal = tmp_normal;
+            }
+        }
+
+        if (hit_geom_index == -1)
+        {
+            intersections[path_index].t = -1.0f;
+        }
+        else
+        {
+            //The ray hits something
+            intersections[path_index].t = t_min;
+            intersections[path_index].materialId = geoms[hit_geom_index].materialid;
+            intersections[path_index].surfaceNormal = normal;
+        }
+    }
 }
 
-// LOOK: "fake" shader demonstrating what you might do with the info in
-// a ShadeableIntersection, as well as how to use thrust's random number
-// generator. Observe that since the thrust random number generator basically
-// adds "noise" to the iteration, the image should start off noisy and get
-// cleaner as more iterations are computed.
-//
-// Note that this shader does NOT do a BSDF evaluation!
-// Your shaders should handle that - this can allow techniques such as
-// bump mapping.
-__global__ void shadeFakeMaterial (
-    int iter
-    , int num_paths
-    , ShadeableIntersection * shadeableIntersections
-    , PathSegment * pathSegments
-    , Material * materials
-    )
+__global__ void shadeMaterial(
+    int iter
+    , int num_paths
+    , ShadeableIntersection* shadeableIntersections
+    , PathSegment* pathSegments
+    , Material* materials
+)
 {
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx < num_paths)
-    {
-        ShadeableIntersection intersection = shadeableIntersections[idx];
-        if (intersection.t > 0.0f) { // if the intersection exists...
-            // Set up the RNG
-            // LOOK: this is how you use thrust's RNG! Please look at
-            // makeSeededRandomEngine as well.
-            thrust::default_random_engine rng = makeSeededRandomEngine(iter, idx, 0);
-            thrust::uniform_real_distribution<float> u01(0, 1);
-
-            Material material = materials[intersection.materialId];
-            glm::vec3 materialColor = material.color;
-
-            // If the material indicates that the object was a light, "light" the ray
-            if (material.emittance > 0.0f) {
-                pathSegments[idx].color *= (materialColor * material.emittance);
-            }
-            // Otherwise, do some pseudo-lighting computation. This is actually more
-            // like what you would expect from shading in a rasterizer like OpenGL.
-            // TODO: replace this! you should be able to start with basically a one-liner
-            else {
-                float lightTerm = glm::dot(intersection.surfaceNormal, glm::vec3(0.0f, 1.0f, 0.0f));
-                pathSegments[idx].color *= (materialColor * lightTerm) * 0.3f + ((1.0f - intersection.t * 0.02f) * materialColor) * 0.7f;
-                pathSegments[idx].color *= u01(rng); // apply some noise because why not
-            }
-            // If there was no intersection, color the ray black.
-            // Lots of renderers use 4 channel color, RGBA, where A = alpha, often
-            // used for opacity, in which case they can indicate "no opacity".
-            // This can be useful for post-processing and image compositing.
-        } else {
-            pathSegments[idx].color = glm::vec3(0.0f);
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < num_paths)
+    {
+        ShadeableIntersection intersection = shadeableIntersections[idx];
+        if (intersection.t > 0.0f) { // if the intersection exists...
+            thrust::default_random_engine rng = makeSeededRandomEngine(iter, idx, 0);
+            thrust::uniform_real_distribution<float> u01(0, 1);
+
+            Material material = materials[intersection.materialId];
+            glm::vec3 materialColor = material.color;
+
+            // If the material indicates that the object was a light, "light" the ray
+            if (material.emittance > 0.0f) {
+                pathSegments[idx].color *= (materialColor * material.emittance);
+                pathSegments[idx].remainingBounces = -1;
+            }
+            else {
+                scatterRay(pathSegments[idx], pathSegments[idx].ray.origin + pathSegments[idx].ray.direction * intersection.t,
+                    intersection.surfaceNormal, material, rng);
+                pathSegments[idx].remainingBounces -= 1;
+            }
+        }
+        else {
+            pathSegments[idx].color = glm::vec3(0.0f);
+            pathSegments[idx].remainingBounces = -1;
+        }
+
+    }
-    }
 }
 
 // Add the current iteration's output to the overall image
-__global__ void finalGather(int nPaths, glm::vec3 * image, PathSegment * iterationPaths)
+__global__ void finalGather(int nPaths, glm::vec3* image, PathSegment* iterationPaths)
 {
-    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
 
-    if (index < nPaths)
-    {
-        PathSegment iterationPath = iterationPaths[index];
-        image[iterationPath.pixelIndex] += iterationPath.color;
-    }
+    if (index < nPaths)
+    {
+        PathSegment iterationPath = iterationPaths[index];
+        image[iterationPath.pixelIndex] += iterationPath.color;
+    }
 }
 
+// Predicate for thrust::stable_partition below. Note: despite the name, it
+// returns true for paths that should KEEP bouncing, so surviving paths are
+// packed at the front of the array.
+struct should_terminate
+{
+    __host__ __device__
+    bool operator()(const PathSegment& pathSegment)
+    {
+        return (pathSegment.remainingBounces > 0);
+    }
+};
+
 /**
  * Wrapper for the __global__ call that sets up the kernel calls and does a ton
  * of memory management
  */
-void pathtrace(uchar4 *pbo, int frame, int iter) {
+void pathtrace(uchar4* pbo, int frame, int iter) {
     const int traceDepth = hst_scene->state.traceDepth;
-    const Camera &cam = hst_scene->state.camera;
+    //const int traceDepth = 200;
+    const Camera& cam = hst_scene->state.camera;
     const int pixelcount = cam.resolution.x * cam.resolution.y;
 
-    // 2D block for generating ray from camera
+    // 2D block for generating ray from camera
     const dim3 blockSize2d(8, 8);
     const dim3 blocksPerGrid2d(
-        (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
-        (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
+        (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
+        (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
 
-    // 1D block for path tracing
-    const int blockSize1d = 128;
+    // 1D block for path tracing
+    const int blockSize1d = 128;
 
     ///////////////////////////////////////////////////////////////////////////
 
@@ -326,68 +323,69 @@ void pathtrace(uchar4 *pbo, int frame, int iter) {
 
     // TODO: perform one iteration of path tracing
 
-    generateRayFromCamera <<<blocksPerGrid2d, blockSize2d>>>(cam, iter, traceDepth, dev_paths);
-    checkCUDAError("generate camera ray");
-
-    int depth = 0;
-    PathSegment* dev_path_end = dev_paths + pixelcount;
-    int num_paths = dev_path_end - dev_paths;
-
-    // --- PathSegment Tracing Stage ---
-    // Shoot ray into scene, bounce between objects, push shading chunks
-
-    bool iterationComplete = false;
-    while (!iterationComplete) {
-
-        // clean shading chunks
-        cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
-
-        // tracing
-        dim3 numblocksPathSegmentTracing = (num_paths + blockSize1d - 1) / blockSize1d;
-        computeIntersections <<<numblocksPathSegmentTracing, blockSize1d>>> (
-            depth
-            , num_paths
-            , dev_paths
-            , dev_geoms
-            , hst_scene->geoms.size()
-            , dev_intersections
-            );
-        checkCUDAError("trace one bounce");
-        cudaDeviceSynchronize();
-        depth++;
-
-
-        // TODO:
-        // --- Shading Stage ---
-        // Shade path segments based on intersections and generate new rays by
-        // evaluating the BSDF.
-        // Start off with just a big kernel that handles all the different
-        // materials you have in the scenefile.
-        // TODO: compare between directly shading the path segments and shading
-        // path segments that have been reshuffled to be contiguous in memory.
-
-        shadeFakeMaterial<<<numblocksPathSegmentTracing, blockSize1d>>> (
-            iter,
-            num_paths,
-            dev_intersections,
-            dev_paths,
-            dev_materials
-        );
-        iterationComplete = true; // TODO: should be based off stream compaction results.
-    }
-
-    // Assemble this iteration and apply it to the image
-    dim3 numBlocksPixels = (pixelcount + blockSize1d - 1) / blockSize1d;
-    finalGather<<<numBlocksPixels, blockSize1d>>>(num_paths, dev_image, dev_paths);
+    generateRayFromCamera<<<blocksPerGrid2d, blockSize2d>>>(cam, iter, traceDepth, dev_paths);
+    checkCUDAError("generate camera ray");
+
+    int depth = 0;
+    PathSegment* dev_path_end = dev_paths + pixelcount;
+    int num_paths = dev_path_end - dev_paths;
+    int num_paths_start = num_paths;
+
+    // --- PathSegment Tracing Stage ---
+    // Shoot ray into scene, bounce between objects, push shading chunks
+
+    bool iterationComplete = false;
+    while (!iterationComplete) {
+
+        // clean shading chunks
+        cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
+
+        // tracing
+        dim3 numblocksPathSegmentTracing = (num_paths + blockSize1d - 1) / blockSize1d;
+        computeIntersections<<<numblocksPathSegmentTracing, blockSize1d>>>(depth, num_paths, dev_paths, dev_geoms
+            , hst_scene->geoms.size(), dev_intersections);
+        checkCUDAError("trace one bounce");
+        cudaDeviceSynchronize();
+        depth++;
+
+
+        // TODO:
+        // --- Shading Stage ---
+        // Shade path segments based on intersections and generate new rays by
+        // evaluating the BSDF.
+        // Start off with just a big kernel that handles all the different
+        // materials you have in the scenefile.
+        // TODO: compare between directly shading the path segments and shading
+        // path segments that have been reshuffled to be contiguous in memory.
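One way to approach that reshuffling TODO, sketched here as an assumption rather than part of this change: sort the intersections (keys) and path segments (values) by material ID before shading, so threads in a warp take the same material branch. This assumes `#include <thrust/sort.h>`; the comparator name is illustrative:

    struct material_id_less {
        __host__ __device__
        bool operator()(const ShadeableIntersection& a, const ShadeableIntersection& b) {
            return a.materialId < b.materialId;
        }
    };

    // before launching shadeMaterial:
    thrust::sort_by_key(thrust::device, dev_intersections, dev_intersections + num_paths,
        dev_paths, material_id_less());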
+
+        shadeMaterial<<<numblocksPathSegmentTracing, blockSize1d>>>(
+            iter,
+            num_paths,
+            dev_intersections,
+            dev_paths,
+            dev_materials
+        );
+
+        PathSegment* dev_path_end = thrust::stable_partition(thrust::device, dev_paths, dev_paths + num_paths, should_terminate());
+        num_paths = dev_path_end - dev_paths;
+
+        if (num_paths == 0) {
+            iterationComplete = true;
+        }
+    }
+
+    // Assemble this iteration and apply it to the image
+    dim3 numBlocksPixels = (pixelcount + blockSize1d - 1) / blockSize1d;
+    finalGather<<<numBlocksPixels, blockSize1d>>>(num_paths_start, dev_image, dev_paths);
 
     ///////////////////////////////////////////////////////////////////////////
 
     // Send results to OpenGL buffer for rendering
-    sendImageToPBO<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, iter, dev_image);
+    sendImageToPBO<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, iter, dev_image);
 
     // Retrieve image from GPU
     cudaMemcpy(hst_scene->state.image.data(), dev_image,
-            pixelcount * sizeof(glm::vec3), cudaMemcpyDeviceToHost);
+        pixelcount * sizeof(glm::vec3), cudaMemcpyDeviceToHost);
 
     checkCUDAError("pathtrace");
 }
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index 4538f04..567795b 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -1,6 +1,23 @@
-set(SOURCE_FILES
+set(headers
+    "common.h"
+    "cpu.h"
+    "naive.h"
+    "efficient.h"
+    "thrust.h"
     )
 
-cuda_add_library(stream_compaction
-    ${SOURCE_FILES}
+set(sources
+    "common.cu"
+    "cpu.cu"
+    "naive.cu"
+    "efficient.cu"
+    "thrust.cu"
     )
+
+list(SORT headers)
+list(SORT sources)
+
+source_group(Headers FILES ${headers})
+source_group(Sources FILES ${sources})
+
+cuda_add_library(stream_compaction ${sources} ${headers})
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
new file mode 100644
index 0000000..e77300b
--- /dev/null
+++ b/stream_compaction/common.cu
@@ -0,0 +1,51 @@
+#include "common.h"
+
+void checkCUDAErrorFn(const char* msg, const char* file, int line) {
+    cudaError_t err = cudaGetLastError();
+    if (cudaSuccess == err) {
+        return;
+    }
+
+    fprintf(stderr, "CUDA error");
+    if (file) {
+        fprintf(stderr, " (%s:%d)", file, line);
+    }
+    fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err));
+    exit(EXIT_FAILURE);
+}
+
+
+namespace StreamCompaction {
+    namespace Common {
+
+        /**
+         * Maps an array to an array of 0s and 1s for stream compaction. Elements
+         * which map to 0 will be removed, and elements which map to 1 will be kept.
+         */
+        __global__ void kernMapToBoolean(int n, int* bools, const int* idata) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+
+            bools[index] = idata[index] == 0 ? 0 : 1;
+        }
+
+        /**
+         * Performs scatter on an array. That is, for each element in idata,
+         * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
+         */
+        __global__ void kernScatter(int n, int* odata,
+            const int* idata, const int* bools, const int* indices) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+
+            if (bools[index] == 1) {
+                odata[indices[index]] = idata[index];
+            }
+        }
+
+    }
+}
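Concretely, these two kernels implement the map and scatter halves of compaction. A small example, worked by hand rather than taken from the diff:

    idata   = { 3, 0, 1, 0, 2 }
    bools   = { 1, 0, 1, 0, 1 }   // kernMapToBoolean
    indices = { 0, 1, 1, 2, 2 }   // exclusive scan of bools
    odata   = { 3, 1, 2 }         // kernScatter: odata[indices[i]] = idata[i] where bools[i] == 1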
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
new file mode 100644
index 0000000..d2c1fed
--- /dev/null
+++ b/stream_compaction/common.h
@@ -0,0 +1,132 @@
+#pragma once
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <cstdio>
+#include <cstring>
+#include <cmath>
+#include <algorithm>
+#include <chrono>
+#include <stdexcept>
+
+#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
+#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
+
+/**
+ * Check for CUDA errors; print and exit if there was a problem.
+ */
+void checkCUDAErrorFn(const char *msg, const char *file = NULL, int line = -1);
+
+inline int ilog2(int x) {
+    int lg = 0;
+    while (x >>= 1) {
+        ++lg;
+    }
+    return lg;
+}
+
+inline int ilog2ceil(int x) {
+    return x == 1 ? 0 : ilog2(x - 1) + 1;
+}
+
+namespace StreamCompaction {
+    namespace Common {
+        __global__ void kernMapToBoolean(int n, int *bools, const int *idata);
+
+        __global__ void kernScatter(int n, int *odata,
+                const int *idata, const int *bools, const int *indices);
+
+        /**
+         * This class is used for timing the performance
+         * Uncopyable and unmovable
+         *
+         * Adapted from WindyDarian(https://github.com/WindyDarian)
+         */
+        class PerformanceTimer
+        {
+        public:
+            PerformanceTimer()
+            {
+                cudaEventCreate(&event_start);
+                cudaEventCreate(&event_end);
+            }
+
+            ~PerformanceTimer()
+            {
+                cudaEventDestroy(event_start);
+                cudaEventDestroy(event_end);
+            }
+
+            void startCpuTimer()
+            {
+                if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); }
+                cpu_timer_started = true;
+
+                time_start_cpu = std::chrono::high_resolution_clock::now();
+            }
+
+            void endCpuTimer()
+            {
+                time_end_cpu = std::chrono::high_resolution_clock::now();
+
+                if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); }
+
+                std::chrono::duration<double, std::milli> duro = time_end_cpu - time_start_cpu;
+                prev_elapsed_time_cpu_milliseconds =
+                    static_cast<decltype(prev_elapsed_time_cpu_milliseconds)>(duro.count());
+
+                cpu_timer_started = false;
+            }
+
+            void startGpuTimer()
+            {
+                if (gpu_timer_started) { throw std::runtime_error("GPU timer already started"); }
+                gpu_timer_started = true;
+
+                cudaEventRecord(event_start);
+            }
+
+            void endGpuTimer()
+            {
+                cudaEventRecord(event_end);
+                cudaEventSynchronize(event_end);
+
+                if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); }
+
+                cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end);
+                gpu_timer_started = false;
+            }
+
+            float getCpuElapsedTimeForPreviousOperation() //noexcept //(damn I need VS 2015
+            {
+                return prev_elapsed_time_cpu_milliseconds;
+            }
+
+            float getGpuElapsedTimeForPreviousOperation() //noexcept
+            {
+                return prev_elapsed_time_gpu_milliseconds;
+            }
+
+            // remove copy and move functions
+            PerformanceTimer(const PerformanceTimer&) = delete;
+            PerformanceTimer(PerformanceTimer&&) = delete;
+            PerformanceTimer& operator=(const PerformanceTimer&) = delete;
+            PerformanceTimer& operator=(PerformanceTimer&&) = delete;
+
+        private:
+            cudaEvent_t event_start = nullptr;
+            cudaEvent_t event_end = nullptr;
+
+            using time_point_t = std::chrono::high_resolution_clock::time_point;
+            time_point_t time_start_cpu;
+            time_point_t time_end_cpu;
+
+            bool cpu_timer_started = false;
+            bool gpu_timer_started = false;
+
+            float prev_elapsed_time_cpu_milliseconds = 0.f;
+            float prev_elapsed_time_gpu_milliseconds = 0.f;
+        };
+    }
+}
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
new file mode 100644
index 0000000..4f4bdaa
--- /dev/null
+++ b/stream_compaction/cpu.cu
@@ -0,0 +1,75 @@
+#include <cstdio>
+#include "cpu.h"
+
+#include "common.h"
+
+namespace StreamCompaction {
+    namespace CPU {
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
+        }
+
+        /**
+         * CPU scan (prefix sum).
+         * For performance analysis, this is supposed to be a simple for loop.
+         * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
+         */
+        void scan(int n, int *odata, const int *idata) {
+            timer().startCpuTimer();
+            odata[0] = 0;
+            for (int i = 1; i < n; i++) {
+                odata[i] = idata[i - 1] + odata[i - 1];
+            }
+            timer().endCpuTimer();
+        }
+
+        /**
+         * CPU stream compaction without using the scan function.
+         *
+         * @returns the number of elements remaining after compaction.
+         */
+        int compactWithoutScan(int n, int *odata, const int *idata) {
+            timer().startCpuTimer();
+            int index = 0;
+            for (int i = 0; i < n; i++) {
+                if (idata[i] != 0) {
+                    odata[index] = idata[i];
+                    index++;
+                }
+            }
+            timer().endCpuTimer();
+            return index;
+        }
+
+        /**
+         * CPU stream compaction using scan and scatter, like the parallel version.
+         *
+         * @returns the number of elements remaining after compaction.
+         */
+        int compactWithScan(int n, int *odata, const int *idata) {
+            //timer().startCpuTimer();
+            // map the input array to zeros and ones
+            int* zerosAndOnes = new int[n];
+            for (int i = 0; i < n; i++) {
+                zerosAndOnes[i] = (idata[i] == 0) ? 0 : 1;
+            }
+
+            // scan the boolean array
+            int* scannedArray = new int[n];
+            scan(n, scannedArray, zerosAndOnes);
+
+            // scatter
+            for (int i = 0; i < n; i++) {
+                if (zerosAndOnes[i] == 1) {
+                    odata[scannedArray[i]] = idata[i];
+                }
+            }
+
+            // the exclusive scan at n-1 excludes the last element, so the
+            // count must include the final boolean as well
+            int count = scannedArray[n - 1] + zerosAndOnes[n - 1];
+
+            delete[] zerosAndOnes;
+            delete[] scannedArray;
+
+            //timer().endCpuTimer();
+            return count;
+        }
+    }
+}
diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h
new file mode 100644
index 0000000..873c047
--- /dev/null
+++ b/stream_compaction/cpu.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include "common.h"
+
+namespace StreamCompaction {
+    namespace CPU {
+        StreamCompaction::Common::PerformanceTimer& timer();
+
+        void scan(int n, int *odata, const int *idata);
+
+        int compactWithoutScan(int n, int *odata, const int *idata);
+
+        int compactWithScan(int n, int *odata, const int *idata);
+    }
+}
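A short usage sketch for the CPU reference paths (array values illustrative, worked by hand):

    int idata[8] = { 1, 0, 2, 0, 3, 0, 4, 5 };
    int odata[8];
    // both variants should produce odata = { 1, 2, 3, 4, 5 } and return 5
    int a = StreamCompaction::CPU::compactWithoutScan(8, odata, idata);
    int b = StreamCompaction::CPU::compactWithScan(8, odata, idata);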
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
new file mode 100644
index 0000000..42105a3
--- /dev/null
+++ b/stream_compaction/efficient.cu
@@ -0,0 +1,204 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cmath>
+#include <memory>
+#include "common.h"
+#include "efficient.h"
+
+#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__)
+
+
+namespace StreamCompaction {
+    namespace Efficient {
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
+        }
+
+        // up sweep
+        __global__ void upSweep(int n, int d, int* data, int dist, int distHalf) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+            if (index >= n || index % dist != 0) {
+                return;
+            }
+
+            int toUpdate = index + dist - 1;
+            int toGet = index + distHalf - 1;
+
+            data[toUpdate] += data[toGet];
+        }
+
+        // up sweep efficient
+        __global__ void upSweepEfficient(int n, int d, int* data, int stride, int offset) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n || index >= n / stride) {
+                return;
+            }
+
+            int toUpdate = ((index + 1) * stride) - 1;
+            int toGet = toUpdate - offset;
+
+            data[toUpdate] += data[toGet];
+        }
+
+        // down sweep
+        __global__ void downSweep(int n, int d, int* data, int dist, int distHalf) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n || index % dist != 0) {
+                return;
+            }
+
+            int t_index = index + distHalf - 1;
+            int replace_index = index + dist - 1;
+
+            int t = data[t_index];
+            data[t_index] = data[replace_index];
+            data[replace_index] += t;
+        }
+
+        // down sweep efficient
+        __global__ void downSweepEfficient(int n, int d, int* data, int stride, int offset) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n || index >= n / stride) {
+                return;
+            }
+
+            int replace_index = n - 1 - (index * stride);
+            int t_index = replace_index - offset;
+
+
+            int t = data[t_index];
+            data[t_index] = data[replace_index];
+            data[replace_index] += t;
+        }
+
+        // zero the tail of the tree, from index n-1 through the root at
+        // power_of_2 - 1; those leaves only feed outputs past n, which are
+        // discarded, so this is safe and also clears the root for down-sweep
+        __global__ void setZeros(int n, int power_of_2, int* data) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index < power_of_2 && index >= n - 1) {
+                data[index] = 0;
+            }
+        }
+
+        /**
+         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+         */
+        void scan(int n, int* odata, const int* idata) {
+            int power_of_2 = 1;
+            while (power_of_2 < n) {
+                power_of_2 *= 2;
+            }
+
+            // create array of size power of 2
+            int* data;
+
+            cudaMalloc((void**)&data, power_of_2 * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc data failed!");
+
+            // fill array and pad end with 0's
+            std::unique_ptr<int[]> padded_array{ new int[power_of_2] };
+            cudaMemcpy(padded_array.get(), idata, sizeof(int) * n, cudaMemcpyHostToHost);
+            for (int i = n; i < power_of_2; i++) {
+                padded_array[i] = 0;
+            }
+
+            cudaMemcpy(data, padded_array.get(), sizeof(int) * power_of_2, cudaMemcpyHostToDevice);
+
+            // kernel values
+            int blockSize = 128;
+            dim3 fullBlocksPerGrid((power_of_2 + blockSize - 1) / blockSize);
+
+            timer().startGpuTimer();
+            // up-sweep
+            for (int d = 0; d <= ilog2(power_of_2) - 1; d++) {
+                int dist = pow(2, d + 1);
+                int distHalf = pow(2, d);
+                upSweep<<<fullBlocksPerGrid, blockSize>>>(power_of_2, d, data, dist, distHalf);
+                /*int stride = pow(2, d + 1);
+                int offset = pow(2, d);
+                upSweepEfficient<<<fullBlocksPerGrid, blockSize>>>(power_of_2, d, data, stride, offset);*/
+            }
+
+
+            // zero the root and the padded tail before the down-sweep
+            setZeros<<<fullBlocksPerGrid, blockSize>>>(n, power_of_2, data);
+
+            // down-sweep
+            for (int d = ilog2(power_of_2) - 1; d >= 0; d--) {
+                int dist = pow(2, d + 1);
+                int distHalf = pow(2, d);
+                downSweep<<<fullBlocksPerGrid, blockSize>>>(power_of_2, d, data, dist, distHalf);
+                /*int stride = pow(2, d + 1);
+                int offset = pow(2, d);
+                downSweepEfficient<<<fullBlocksPerGrid, blockSize>>>(power_of_2, d, data, stride, offset);*/
+            }
+            timer().endGpuTimer();
+
+            // set the out data to the scanned data
+            cudaMemcpy(odata, data, sizeof(int) * n, cudaMemcpyDeviceToHost);
+
+            // free memory
+            cudaFree(data);
+        }
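A hedged usage example for the work-efficient scan (values worked by hand, not from the diff):

    int idata[8] = { 3, 1, 7, 0, 4, 1, 6, 3 };
    int odata[8];
    StreamCompaction::Efficient::scan(8, odata, idata);
    // exclusive scan: odata = { 0, 3, 4, 11, 11, 15, 16, 22 }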
+
+        /**
+         * Performs stream compaction on idata, storing the result into odata.
+         * All zeroes are discarded.
+         *
+         * @param n The number of elements in idata.
+         * @param odata The array into which to store elements.
+         * @param idata The array of elements to compact.
+         * @returns The number of elements remaining after compaction.
+         */
+        int compact(int n, int* odata, const int* idata) {
+            // malloc necessary space on the GPU
+            int* gpu_idata;
+            int* bools;
+            int* scanned_data;
+            int* scattered_data;
+
+            cudaMalloc((void**)&gpu_idata, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc gpu_idata failed!");
+            cudaMemcpy(gpu_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+
+            cudaMalloc((void**)&bools, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc bools failed!");
+
+            cudaMalloc((void**)&scanned_data, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc scanned_data failed!");
+
+            cudaMalloc((void**)&scattered_data, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc scattered_data failed!");
+
+            int blockSize = 128;
+            dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+            //timer().startGpuTimer();
+            // map to zeros and ones
+            Common::kernMapToBoolean<<<fullBlocksPerGrid, blockSize>>>(n, bools, gpu_idata);
+
+            // exclusive scan of the boolean array
+            scan(n, scanned_data, bools);
+
+            // scatter
+            Common::kernScatter<<<fullBlocksPerGrid, blockSize>>>(n, scattered_data, gpu_idata, bools, scanned_data);
+
+            // the compacted count is the last scan value plus the last boolean
+            int lastScan, lastBool;
+            cudaMemcpy(&lastScan, scanned_data + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            cudaMemcpy(&lastBool, bools + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            int num = lastScan + lastBool;
+
+            cudaMemcpy(odata, scattered_data, sizeof(int) * num, cudaMemcpyDeviceToHost);
+            //timer().endGpuTimer();
+
+            // free memory
+            cudaFree(gpu_idata);
+            cudaFree(bools);
+            cudaFree(scanned_data);
+            cudaFree(scattered_data);
+
+            return num;
+        }
+    }
+}
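The `lastScan + lastBool` count works because the exclusive scan at index n-1 has summed bools[0..n-2]; adding the final boolean completes the total. For example (illustrative values):

    int idata[6] = { 0, 5, 0, 2, 0, 9 };
    int odata[6];
    int count = StreamCompaction::Efficient::compact(6, odata, idata);
    // bools = {0,1,0,1,0,1}, scan = {0,0,1,1,2,2}
    // count = 2 + 1 = 3, odata = { 5, 2, 9 }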
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h
new file mode 100644
index 0000000..803cb4f
--- /dev/null
+++ b/stream_compaction/efficient.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include "common.h"
+
+namespace StreamCompaction {
+    namespace Efficient {
+        StreamCompaction::Common::PerformanceTimer& timer();
+
+        void scan(int n, int *odata, const int *idata);
+
+        int compact(int n, int *odata, const int *idata);
+    }
+}
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
new file mode 100644
index 0000000..65446fe
--- /dev/null
+++ b/stream_compaction/naive.cu
@@ -0,0 +1,92 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "common.h"
+#include "naive.h"
+
+#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__)
+
+namespace StreamCompaction {
+    namespace Naive {
+
+
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
+        }
+
+        // one iteration of inclusive scan
+        __global__ void iteration(int n, int d, const int* idata, int* odata, int offset) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+            if (index >= offset) {
+                odata[index] = idata[index - offset] + idata[index];
+            } else {
+                odata[index] = idata[index];
+            }
+        }
+
+        // turns an inclusive scan into an exclusive scan
+        __global__ void inclusiveToExclusive(int n, const int* idata, int* odata) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index + 1 >= n) {
+                return;
+            }
+            if (index == 0) {
+                odata[0] = 0;
+            }
+            odata[index + 1] = idata[index];
+        }
+
+        /**
+         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+         */
+        void scan(int n, int* odata, const int* idata) {
+            int power_of_2 = 1;
+            while (power_of_2 < n) {
+                power_of_2 *= 2;
+            }
+
+            // create arrays of size power of 2
+            int* data_1;
+            int* data_2;
+
+            cudaMalloc((void**)&data_1, power_of_2 * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc data_1 failed!");
+            cudaMalloc((void**)&data_2, power_of_2 * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc data_2 failed!");
+
+            // fill array and pad end with 0's
+            cudaMemcpy(data_1, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+            cudaMemset(data_1 + n, 0, sizeof(int) * (power_of_2 - n));
+
+            // call kernel
+            int blockSize = 128;
+            dim3 fullBlocksPerGrid((power_of_2 + blockSize - 1) / blockSize);
+
+            //timer().startGpuTimer();
+            for (int d = 1; d <= ilog2ceil(n); d++) {
+                int offset = pow(2, d - 1);
+                iteration<<<fullBlocksPerGrid, blockSize>>>(power_of_2, d, data_1, data_2, offset);
+                int* temp = data_1;
+                data_1 = data_2;
+                data_2 = temp;
+            }
+
+            inclusiveToExclusive<<<fullBlocksPerGrid, blockSize>>>(power_of_2, data_1, data_2);
+            int* temp = data_1;
+            data_1 = data_2;
+            data_2 = temp;
+            //timer().endGpuTimer();
+
+            // set the out data to the scanned data
+            cudaMemcpy(odata, data_1, sizeof(int) * n, cudaMemcpyDeviceToHost);
+
+            // free memory
+            cudaFree(data_1);
+            cudaFree(data_2);
+        }
+    }
+}
diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h
new file mode 100644
index 0000000..37dcb06
--- /dev/null
+++ b/stream_compaction/naive.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include "common.h"
+
+namespace StreamCompaction {
+    namespace Naive {
+        StreamCompaction::Common::PerformanceTimer& timer();
+
+        void scan(int n, int *odata, const int *idata);
+    }
+}
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
new file mode 100644
index 0000000..3ceb705
--- /dev/null
+++ b/stream_compaction/thrust.cu
@@ -0,0 +1,38 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <thrust/scan.h>
+#include "common.h"
+#include "thrust.h"
+
+namespace StreamCompaction {
+    namespace Thrust {
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
+        }
+        /**
+         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+         */
+        void scan(int n, int *odata, const int *idata) {
+            // copy the input into a host vector, then mirror it on the device
+            thrust::host_vector<int> hv_in(idata, idata + n);
+            thrust::device_vector<int> dv_in = hv_in;
+
+            thrust::device_vector<int> dv_out(n);
+
+            timer().startGpuTimer();
+            thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            timer().endGpuTimer();
+
+            for (int i = 0; i < n; i++) {
+                odata[i] = dv_out[i];
+            }
+        }
+    }
+}
diff --git a/stream_compaction/thrust.h b/stream_compaction/thrust.h
new file mode 100644
index 0000000..fe98206
--- /dev/null
+++ b/stream_compaction/thrust.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include "common.h"
+
+namespace StreamCompaction {
+    namespace Thrust {
+        StreamCompaction::Common::PerformanceTimer& timer();
+
+        void scan(int n, int *odata, const int *idata);
+    }
+}
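Finally, a small usage sketch for the Thrust wrapper and its timer (values illustrative, worked by hand):

    int idata[5] = { 4, 7, 0, 2, 1 };
    int odata[5];
    StreamCompaction::Thrust::scan(5, odata, idata);
    // exclusive scan: odata = { 0, 4, 11, 11, 13 }
    float ms = StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation();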