-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkernel.cu
579 lines (468 loc) · 20.9 KB
/
kernel.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "math_constants.h"
#include <time.h>
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string>
#include <iostream>
#include "bmpwriter.h"
#include <vector>
#include <fstream>
#include <sstream>
// macros for sizes
#define num_triangles 36
#define scr_w 512
#define scr_h 512
#define triangles_per_load 256
// number of triangle batches searched per bounce; parenthesized so it expands safely inside larger expressions
#define passes_needed (num_triangles / triangles_per_load + 1)
#define fov 0.003f
// tracer related macros
#define num_bounces 8
#define num_frames 1000
#define blur 0.01f
#define smoothing_constant 0.3
// macros for gpu params
#define threads_main 512
#define blocks_main (scr_w * scr_h / threads_main)
// macros to replace functions
// every argument is wrapped in parentheses so that expression arguments (e.g. `a - b`) expand correctly
#define dot(vec3_v1, vec3_v2) ((vec3_v1).x * (vec3_v2).x + (vec3_v1).y * (vec3_v2).y + (vec3_v1).z * (vec3_v2).z)
// 2x2 determinant | a b ; c d |
#define matrix2D_eval(float_a , float_b, float_c, float_d) ((float_a) * (float_d) - (float_b) * (float_c))
// Euclidean length of a vector with x/y/z members
#define matgnitude(vec3_a) (sqrtf(dot(vec3_a, vec3_a)))
// Cheap stand-in for a real RNG (cuRAND is not set up here).
// Applies a single xorshift round (shifts 13 / 17 / 5) to `seed` and returns the
// scrambled 32-bit word converted to float.
inline __host__ __device__ float xorRand(unsigned int seed) {
    unsigned int word = seed;
    word = word ^ (word << 13);
    word = word ^ (word >> 17);
    word = word ^ (word << 5);
    return static_cast<float>(word);
}
// Device-side variant of the xorshift scramble that maps the result into [0, 1]:
// the scrambled word is reduced modulo 1000 and divided by 999, so only 1000
// distinct values (including exactly 0 and exactly 1) can be produced.
inline __device__ float xorRandf(unsigned int seed) {
    unsigned int word = seed;
    word = word ^ (word << 13);
    word = word ^ (word >> 17);
    word = word ^ (word << 5);
    const unsigned int bucket = word % 1000;
    return bucket / 999.0f;
}
// Draws one approximately normally distributed value from `state` using the
// Box-Muller transform: sqrt(-2 ln u1) * cos(2 pi u2), with u1 and u2 taken from xorRandf.
//   state: seed; only its low 32 bits are used (it is converted to unsigned int).
// Returns a finite float for every seed.
inline __device__ float randomValNormalDistribution(const long int state) {
    const unsigned int seed = static_cast<unsigned int>(state);
    // xorRandf can return exactly 0; log(0) = -inf would turn the sample into inf/NaN,
    // so the first uniform is clamped away from zero.
    const float u1 = fmaxf(xorRandf(seed), 1.0e-4f);
    const float u2 = xorRandf(seed ^ 1023012u);
    // float literals keep the whole expression in single precision (2*pi = 6.2831853f)
    return __fsqrt_rn(-2.0f * __logf(u1)) * __cosf(6.2831853f * u2);
}
// Define the vec3 struct
// Three-component float vector usable from both host and device code.
struct vec3 {
    float x, y, z;
    // Zero vector.
    __host__ __device__ vec3() : x(0), y(0), z(0) {}
    __host__ __device__ vec3(float x, float y, float z) : x(x), y(y), z(z) {}
    // Component-wise sum.
    inline __host__ __device__ vec3 operator+(const vec3& f) const {
        return vec3(x + f.x, y + f.y, z + f.z);
    }
    // Component-wise difference.
    inline __host__ __device__ vec3 operator-(const vec3& f) const {
        return vec3(x - f.x, y - f.y, z - f.z);
    }
    // Uniform scale.
    inline __host__ __device__ vec3 operator*(const float scalar) const {
        return vec3(x * scalar, y * scalar, z * scalar);
    }
    // Returns a copy scaled to unit length; the object itself is not modified, so the
    // method is const-qualified and can be called on const vectors and temporaries alike.
    // A zero-length vector still divides by zero and yields non-finite components.
    inline __host__ __device__ vec3 normalize() const {
        const float scl = sqrtf(x * x + y * y + z * z);
        return vec3(x / scl, y / scl, z / scl);
    }
};
// cross is more logical as its own function
// Right-handed cross product v1 x v2.
// The y component is v1.z * v2.x - v1.x * v2.z; the previous version computed it with
// the opposite sign, so the result was not perpendicular to the inputs in general.
inline __host__ __device__ vec3 cross(const vec3 v1, const vec3 v2) {
    return vec3(v1.y * v2.z - v1.z * v2.y,
                v1.z * v2.x - v1.x * v2.z,
                v1.x * v2.y - v1.y * v2.x);
}
// structs
// RGB triple stored as three floats; usable from host and device code.
struct color {
    float r, g, b;
    __host__ __device__ color(float R, float G, float B) : r(R), g(G), b(B) {}
    // Channel-wise addition.
    inline __host__ __device__ color operator+(const color& rhs) const {
        const float sum_r = r + rhs.r;
        const float sum_g = g + rhs.g;
        const float sum_b = b + rhs.b;
        return color(sum_r, sum_g, sum_b);
    }
    // Scales every channel by the same factor.
    inline __host__ __device__ color operator*(const float factor) const {
        const float scaled_r = r * factor;
        const float scaled_g = g * factor;
        const float scaled_b = b * factor;
        return color(scaled_r, scaled_g, scaled_b);
    }
    // Channel-wise product of two colors.
    inline __host__ __device__ color operator*(const color tint) const {
        const float mixed_r = r * tint.r;
        const float mixed_g = g * tint.g;
        const float mixed_b = b * tint.b;
        return color(mixed_r, mixed_g, mixed_b);
    }
};
// Surface description attached to a triangle.
//   c          : color multiplied into the path color on every hit
//   brightness : emission strength; the kernel stops a path when it is > 0
//   roughness  : passed to reflect_ray as the weight of the random perturbation
struct material {
    color c;
    float brightness, roughness;
    // Default: black, non-emissive, perfectly smooth. All members are initialized so a
    // default-constructed material never carries indeterminate values.
    __host__ __device__ material() : c(color(0.0f, 0.0f, 0.0f)), brightness(0.0f), roughness(0.0f) {}
    __host__ __device__ material(color C, float B, float rough) : c(C), brightness(B), roughness(rough) {}
};
// A ray described by its start point and its direction vector.
struct ray {
    vec3 origin, direction;
    // Both members start as the zero vector (vec3's default constructor).
    __host__ __device__ ray() : origin(), direction() {}
    __host__ __device__ ray(vec3 origin, vec3 direction) : origin(origin), direction(direction) {}
};
// Triangle with values precomputed for the intersection test.
//   p1, p2, p3 : vertices
//   nv         : unit normal, cross(p2 - p1, p3 - p1) normalized
//   sb21, sb31 : edge vectors p2 - p1 and p3 - p1
//   dot....    : dot products of the edge vectors used by the barycentric test
//   unbounded  : when true the intersection routine does not reject hits outside the triangle
struct triangle {
    vec3 p1, p2, p3;
    vec3 nv;
    vec3 sb21, sb31;
    float dot2121, dot2131, dot3131;
    bool unbounded;
    // Default: a fully zeroed, bounded triangle. Every member is initialized so that a
    // default-constructed triangle copied to the device contains no indeterminate values.
    // Note that setting p1/p2/p3 afterwards does NOT refresh the derived fields; use the
    // four-argument constructor to build a usable triangle.
    __host__ __device__ triangle()
        : p1(), p2(), p3(), nv(), sb21(), sb31(),
          dot2121(0.0f), dot2131(0.0f), dot3131(0.0f), unbounded(false) {}
    // Builds the triangle and all derived quantities from its three vertices.
    __host__ __device__ triangle(vec3 P1, vec3 P2, vec3 P3, bool u) {
        p1 = P1;
        p2 = P2;
        p3 = P3;
        sb21 = p2 - p1;
        sb31 = p3 - p1;
        dot2121 = dot(sb21, sb21);
        dot2131 = dot(sb21, sb31);
        dot3131 = dot(sb31, sb31);
        // a degenerate (zero-area) triangle produces a non-finite normal here
        nv = cross(sb21, sb31).normalize();
        unbounded = u;
    }
};
// global device arrs
// init as chars to bypass restrictions on dynamic initialization
// The buffers are reinterpreted as triangle / material / color arrays by the code that uses
// them. A plain char array carries no alignment guarantee for those casts, so each buffer is
// explicitly aligned to 16 bytes.
__device__ __align__(16) char triangles[num_triangles * sizeof(triangle)]; // all triangles(on global mem, so slow access)
__device__ __align__(16) char triangle_materials[num_triangles * sizeof(material)]; // materials corresponding to triangles
__device__ __align__(16) char screen_buffer[scr_w * scr_h * sizeof(color)]; // one accumulated color per pixel
// Result record produced by the ray/triangle search.
struct intersect_return {
    vec3 intersect;          // intersection point written by the search routine
    int triangle_index;      // index of the accepted triangle within the searched batch, -1 when nothing was hit
    float dist_from_origin;  // distance from the ray origin to the accepted hit (-1.0f when nothing was hit)
    vec3 nv;                 // normal of the accepted triangle
};
// Overlays a float and an unsigned int on the same storage so the float's bit pattern
// can be accessed through `s`.
// NOTE(review): reading the member that was not written last relies on the compiler
// supporting union-based type punning - confirm for the toolchain in use.
union fbitwise {
float f; // the value viewed as a float
unsigned int s; // the same storage viewed as an unsigned integer
};
// intersect funcs that will be put in kernel
// Finds the closest triangle of a batch that is hit by ray `r`.
//   triangles_loaded : first triangle of the batch to search
//   r                : ray to test
//   tris_read        : number of valid triangles in the batch (a value <= 0 searches nothing)
// Returns an intersect_return whose triangle_index is the batch-local index of the closest
// accepted triangle (or -1 when nothing was hit), together with the hit point, the distance
// from the ray origin and that triangle's normal. On a miss dist_from_origin is -1.0f and the
// vectors are zero.
inline __device__ intersect_return find_closest_int(const triangle triangles_loaded[triangles_per_load], const ray r, const int tris_read) {
    intersect_return best;
    best.triangle_index = -1;
    best.dist_from_origin = -1.0f;
    best.intersect = vec3();
    best.nv = vec3();
    // signed index: comparing an unsigned index against a negative count would loop over garbage
    for (int t = 0; t < tris_read; t++) {
        const triangle& tri = triangles_loaded[t];
        const float denom = dot(r.direction, tri.nv);
        // Skip planes (nearly) parallel to the ray. The previous `disc.s && 0x7FFFFFFF` was a
        // logical AND that evaluated to 0 or 1 instead of |denom|. Written as a negated `>`
        // so that a NaN normal (degenerate triangle) is skipped as well.
        if (!(fabsf(denom) > FLT_EPSILON)) {
            continue;
        }
        // vector from the ray origin to the point where the ray meets the triangle's plane
        const vec3 origin_to_p1 = tri.p1 - r.origin;
        const vec3 to_plane = r.direction * __fdividef(dot(tri.nv, origin_to_p1), denom);
        const vec3 hit = r.origin + to_plane;
        // barycentric coordinates of the plane hit with respect to the edges sb21 / sb31
        const vec3 rel = hit - tri.p1;
        const float dot02 = dot(tri.sb21, rel);
        const float dot12 = dot(tri.sb31, rel);
        const float det = tri.dot2121 * tri.dot3131 - tri.dot2131 * tri.dot2131;
        if (det == 0.0f) { continue; } // degenerate triangle
        const float inv_det = __fdividef(1.0f, det);
        const float u = __fmul_rn(__fmaf_rn(tri.dot3131, dot02, -tri.dot2131 * dot12), inv_det);
        const float v = __fmul_rn(__fmaf_rn(tri.dot2121, dot12, -tri.dot2131 * dot02), inv_det);
        // outside the triangle, or the plane lies behind the ray origin
        const bool rejected = (u < 0) || (v < 0) || (u + v > 1) || (dot(to_plane, r.direction) < 0.0f);
        // NOTE(review): as before, an `unbounded` triangle skips this whole test, including the
        // behind-the-origin check - confirm that is intended.
        if (rejected && !tri.unbounded) { continue; }
        const float dist = matgnitude(to_plane);
        if (best.triangle_index == -1 || dist < best.dist_from_origin) {
            best.triangle_index = t;
            best.dist_from_origin = dist;
            // store the hit point only for the accepted triangle; it used to be overwritten by
            // every triangle tested, so the returned point belonged to the last plane tried
            best.intersect = hit;
            best.nv = tri.nv;
        }
    }
    return best;
}
// other stuff for kernel organization
// Constant-memory staging buffer. Device code must not store into __constant__ memory (it can
// only be filled from the host, e.g. with cudaMemcpyToSymbol), so the device code below no
// longer writes to it; the symbol is kept so its declaration remains available.
__constant__ char triangle_loader[triangles_per_load * sizeof(triangle)];
// Searches batch number `pass` (triangles_per_load triangles per batch) of the global
// triangle array for the closest hit of ray `r`.
// The batch is read directly from the global `triangles` buffer. The previous version copied
// it into `triangle_loader` from a handful of threads without any synchronization, which is
// both an invalid store to constant memory and a race with the threads reading it.
// Returns the result of find_closest_int for that batch; triangle_index is batch-local and
// is -1 when the batch is empty or nothing was hit.
inline __device__ intersect_return get_closest_intersect_in_load(const int pass, const ray r) {
    const int first = pass * triangles_per_load;
    const int remaining = num_triangles - first;
    if (remaining <= 0) {
        // nothing left in this batch (happens when num_triangles is a multiple of the batch size)
        intersect_return none;
        none.triangle_index = -1;
        none.dist_from_origin = -1.0f;
        return none;
    }
    const int count = remaining < triangles_per_load ? remaining : triangles_per_load;
    const triangle* batch = ((const triangle*)triangles) + first;
    return find_closest_int(batch, r, count);
}
// Bounces ray `r` off a surface.
//   r               : incoming ray
//   nv              : surface normal at the hit (either orientation)
//   intersect       : hit point; becomes the origin of the returned ray
//   random_strength : 0 gives a pure mirror reflection, larger values blend in a random unit vector
//   iteration       : frame number, mixed into the random seed
// Returns the bounced ray: a unit direction leaving the surface on the side the incoming ray
// arrived from, starting slightly above the hit point.
__device__ ray reflect_ray(ray r, vec3 nv, const vec3 intersect, const float random_strength, const unsigned int iteration) {
    // offset applied to the new origin so the bounced ray does not immediately re-hit its own surface
    const float surface_bias = 1.0e-2f;
    // Specular reflection
    const vec3 incoming = r.direction;
    const float in_dot_n = dot(incoming, nv);
    const vec3 dir = incoming - nv * (2 * in_dot_n);
    // +1 keeps the seed non-zero for thread 0 (the xorshift scramble maps 0 to 0)
    const unsigned int state = (threadIdx.x + blockIdx.x * blockDim.x + 1u) * (iteration + 1u);
    vec3 bounced = dir;
    if (random_strength > 0.0f) {
        const vec3 ran = vec3(randomValNormalDistribution(state), randomValNormalDistribution(state * 2), randomValNormalDistribution(state * 3)).normalize();
        const vec3 mixed = (dir * (1.0f - random_strength)) + (ran * random_strength);
        const float mixed_len2 = dot(mixed, mixed);
        // fall back to the mirror direction when the perturbed vector is zero or non-finite
        if (mixed_len2 > 0.0f && isfinite(mixed_len2)) {
            bounced = mixed;
        }
    }
    bounced = bounced.normalize();
    // orient the normal towards the side the incoming ray came from
    const vec3 facing = (in_dot_n < 0.0f) ? nv : nv * -1.0f;
    // a perturbed direction that points into the surface is mirrored back out of it
    const float out_dot_n = dot(bounced, facing);
    if (out_dot_n < 0.0f) {
        bounced = bounced - facing * (2 * out_dot_n);
    }
    // the bounced ray starts at the hit point; previously `intersect` was ignored and every
    // bounce was traced again from the original ray origin
    r.origin = intersect + facing * surface_bias;
    r.direction = bounced;
    return r;
}
// Builds the primary ray for pixel `idx` in frame `iteration`.
// The ray starts at (x, y, 0), where x/y are the pixel coordinates relative to the screen
// centre, and points along normalize(x * fov, y * fov, 1) plus a per-frame jitter of at most
// +-0.5 * blur in x and y. The returned direction is not re-normalized after the jitter is added.
inline __device__ ray initialize_rays(const int idx, const int iteration) {
    const float x = idx % scr_w - scr_w / 2;
    const float y = idx / scr_w - scr_h / 2;
    // Integer seed that differs per pixel and per frame. The previous code seeded with
    // idx * iteration (always 0 on frame 0 and for pixel 0) and converted a float holding a
    // full 32-bit value to int, which is out of range for roughly half of all outputs.
    const unsigned int seed = (unsigned int)idx * 1973u + (unsigned int)iteration * 9277u + 26699u;
    // xorRandf already returns (word % 1000) / 999.0f, i.e. the [0, 1] value computed here before
    const float jitter_x = xorRandf(seed) - 0.5f;
    const float jitter_y = xorRandf(seed ^ 0x9E3779B9u) - 0.5f;
    const vec3 jitter = vec3(jitter_x, jitter_y, 0.0f) * blur;
    const vec3 rv = jitter + vec3(x * fov, y * fov, 1.0f).normalize();
    return ray(vec3(x, y, 0.0f), rv);
}
// reused file write func
// Opens `filename` for writing in text mode (truncating any existing file).
// Returns the stream, or NULL after printing a message when the file cannot be opened.
inline FILE* open_file(const char* filename) {
    FILE* ret = fopen(filename, "w");
    if (ret == NULL) {
        // the format string used to be passed as a "%s" argument, so the literal text
        // "error opening file %s" was printed and the filename was never shown
        printf("error opening file %s\n", filename);
        return NULL;
    }
    return ret;
}
// Dumps the scr_w * scr_h colors in `color_buffer` to "colorReturnFile.txt",
// one "r,g,b" line per pixel. Does nothing when the buffer is NULL or the file
// cannot be opened (open_file reports the failure).
void write_pixel_data_to_txt(const color* color_buffer) {
    if (color_buffer == NULL) {
        return;
    }
    FILE* f = open_file("colorReturnFile.txt");
    if (f == NULL) {
        // previously the NULL stream was passed straight to fprintf/fclose
        return;
    }
    for (int l = 0; l < scr_w * scr_h; l++) {
        fprintf(f, "%f,%f,%f\n", color_buffer[l].r, color_buffer[l].g, color_buffer[l].b);
    }
    fclose(f);
}
// color stuff
// Adds `c` to the color stored at position `index` of the screen buffer.
inline __device__ void add_color(const int index, const color c) {
    color* const pixels = (color*)screen_buffer;
    const color summed = pixels[index] + c;
    pixels[index] = summed;
}
// Multiplies the color stored at position `index` of the screen buffer by `f`.
inline __device__ void scale_color(const int index, const float f) {
    color* const pixels = (color*)screen_buffer;
    const color scaled = pixels[index] * f;
    pixels[index] = scaled;
}
// kernel!
// Traces one light path for the pixel owned by this thread and adds its contribution to
// screen_buffer.
// Launch layout: 1D grid with one thread per pixel; threads whose global index is not a valid
// pixel (>= scr_w * scr_h) exit immediately, so the grid may be rounded up.
//   iteration : frame number, forwarded to ray setup and bounce randomisation
__global__ void updateKernel(const int iteration) {
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= scr_w * scr_h) { return; }
    // init ray at starting point
    ray r = initialize_rays(idx, iteration);
    intersect_return ret;
    // a flag instead of an 8-bit counter: the counter could wrap back to 0 after 256 hits
    bool any_hit = false;
    color c = color(1.0f, 1.0f, 1.0f);
    material m;
    for (int b = 0; b < num_bounces; b++) {
        int tri_id = -1;
        float cd = -1.0f;
        // get closest intersect from all triangles(no bvh for now)
        for (int p = 0; p < passes_needed; p++) {
            const intersect_return temp = get_closest_intersect_in_load(p, r);
            if (temp.triangle_index != -1 && (cd < 0.0f || temp.dist_from_origin < cd)) {
                ret = temp;
                cd = ret.dist_from_origin;
                // convert the batch-local index into an index into the global arrays
                tri_id = temp.triangle_index + p * triangles_per_load;
                any_hit = true;
            }
        }
        if (tri_id == -1) {
            // the ray left the scene
            break;
        }
        // bounce the ray and fold the surface color into the path color
        m = ((material*)triangle_materials)[tri_id];
        r = reflect_ray(r, ret.nv, ret.intersect, m.roughness, iteration);
        c = m.c * c;
        if (m.brightness > 0.0f) {
            // reached an emissive surface: the path ends here
            break;
        }
    }
    if (!any_hit) { return; }
    // accumulate the path color weighted by the brightness of the last surface hit
    // (non-emissive surfaces have brightness 0 in the scenes built by this file)
    color* const pixels = (color*)screen_buffer;
    pixels[idx] = pixels[idx] + c * m.brightness;
}
// copying func
// Copies the device-side screen buffer (scr_w * scr_h colors) into host memory at `c`.
// `c` must point to at least scr_w * scr_h colors. A NULL destination or a failed copy is
// reported on stderr; the destination is left untouched in that case.
void copyScreenBuffer(color* c) {
    if (c == NULL) {
        fprintf(stderr, "copyScreenBuffer: destination buffer is NULL\n");
        return;
    }
    const cudaError_t status = cudaMemcpyFromSymbol(c, screen_buffer, sizeof(color) * scr_w * scr_h);
    if (status != cudaSuccess) {
        fprintf(stderr, "copyScreenBuffer: cudaMemcpyFromSymbol failed: %s\n", cudaGetErrorString(status));
    }
}
// tri cpu
// Uploads triangle `t` and its material `m` into slot `idx` of the device-side scene arrays.
// Slots outside [0, num_triangles) are rejected with a message instead of being handed to
// the CUDA runtime; copy failures are reported on stderr.
void add_triangle(triangle t, int idx, material m) {
    if (idx < 0 || idx >= num_triangles) {
        fprintf(stderr, "add_triangle: index %d is outside the scene capacity of %d triangles\n", idx, (int)num_triangles);
        return;
    }
    cudaError_t status = cudaMemcpyToSymbol(triangles, &t, sizeof(triangle), idx * sizeof(triangle));
    if (status != cudaSuccess) {
        fprintf(stderr, "add_triangle: triangle upload failed: %s\n", cudaGetErrorString(status));
    }
    status = cudaMemcpyToSymbol(triangle_materials, &m, sizeof(material), idx * sizeof(material));
    if (status != cudaSuccess) {
        fprintf(stderr, "add_triangle: material upload failed: %s\n", cudaGetErrorString(status));
    }
}
// Uploads an axis-aligned cube as 12 triangles, all using material `m`.
//   side_length : edge length
//   origin      : corner with the smallest x, y and z
//   idx         : first scene slot; slots idx .. idx + 11 are written
void initialize_cube(float side_length, vec3 origin, material m, int idx) {
    const float x0 = origin.x, x1 = origin.x + side_length;
    const float y0 = origin.y, y1 = origin.y + side_length;
    const float z0 = origin.z, z1 = origin.z + side_length;
    // corners 0-3 lie in the z0 plane, corners 4-7 in the z1 plane
    const vec3 corner[8] = {
        vec3(x0, y0, z0), vec3(x1, y0, z0), vec3(x1, y1, z0), vec3(x0, y1, z0),
        vec3(x0, y0, z1), vec3(x1, y0, z1), vec3(x1, y1, z1), vec3(x0, y1, z1)
    };
    // corner indices of each triangle, in upload order
    static const int tri[12][3] = {
        {0, 1, 2}, {0, 2, 3},
        {4, 5, 6}, {4, 6, 7},
        {0, 1, 5}, {0, 5, 4},
        {2, 3, 7}, {2, 7, 6},
        {0, 3, 7}, {0, 7, 4},
        {1, 2, 6}, {1, 6, 5}
    };
    for (int k = 0; k < 12; ++k) {
        add_triangle(triangle(corner[tri[k][0]], corner[tri[k][1]], corner[tri[k][2]], false), idx + k, m);
    }
}
// Uploads an axis-aligned cube as 12 triangles where one face uses a second material.
//   side_length : edge length
//   origin      : corner with the smallest x, y and z
//   m1          : material for every triangle except the two listed below
//   m2          : material for triangles idx + 4 and idx + 5 (the face through corners 0, 1, 5, 4,
//                 i.e. the face lying in the y = origin.y plane)
//   idx         : first scene slot; slots idx .. idx + 11 are written
void initialize_cubeLight(float side_length, vec3 origin, material m1, material m2, int idx) {
    const float x0 = origin.x, x1 = origin.x + side_length;
    const float y0 = origin.y, y1 = origin.y + side_length;
    const float z0 = origin.z, z1 = origin.z + side_length;
    // corners 0-3 lie in the z0 plane, corners 4-7 in the z1 plane
    const vec3 corner[8] = {
        vec3(x0, y0, z0), vec3(x1, y0, z0), vec3(x1, y1, z0), vec3(x0, y1, z0),
        vec3(x0, y0, z1), vec3(x1, y0, z1), vec3(x1, y1, z1), vec3(x0, y1, z1)
    };
    // corner indices of each triangle, in upload order
    static const int tri[12][3] = {
        {0, 1, 2}, {0, 2, 3},
        {3, 2, 6}, {3, 6, 7},
        {0, 1, 5}, {0, 5, 4},
        {2, 1, 5}, {2, 5, 6},
        {0, 3, 7}, {0, 7, 4},
        {4, 5, 6}, {4, 6, 7}
    };
    for (int k = 0; k < 12; ++k) {
        const bool uses_second = (k == 4) || (k == 5);
        const triangle face(corner[tri[k][0]], corner[tri[k][1]], corner[tri[k][2]], false);
        if (uses_second) {
            add_triangle(face, idx + k, m2);
        } else {
            add_triangle(face, idx + k, m1);
        }
    }
}
// Uploads an axis-aligned cube as 12 triangles with a fixed material per face
// (every material has roughness 0.65):
//   triangles 0-1   black,  brightness 0
//   triangles 2-3   white,  brightness 0
//   triangles 4-5   white,  brightness 2 (the emissive face)
//   triangles 6-7   green,  brightness 0
//   triangles 8-9   red,    brightness 0
//   triangles 10-11 blue,   brightness 0
//   side_length : edge length
//   origin      : corner with the smallest x, y and z
//   idx         : first scene slot; slots idx .. idx + 11 are written
void initialize_cubeCornell(float side_length, vec3 origin, int idx) {
    const float x0 = origin.x, x1 = origin.x + side_length;
    const float y0 = origin.y, y1 = origin.y + side_length;
    const float z0 = origin.z, z1 = origin.z + side_length;
    // corners 0-3 lie in the z0 plane, corners 4-7 in the z1 plane
    const vec3 corner[8] = {
        vec3(x0, y0, z0), vec3(x1, y0, z0), vec3(x1, y1, z0), vec3(x0, y1, z0),
        vec3(x0, y0, z1), vec3(x1, y0, z1), vec3(x1, y1, z1), vec3(x0, y1, z1)
    };
    // corner indices of each triangle, in upload order; consecutive pairs form one face
    static const int tri[12][3] = {
        {0, 1, 2}, {0, 2, 3},
        {3, 2, 6}, {3, 6, 7},
        {0, 1, 5}, {0, 5, 4},
        {2, 1, 5}, {2, 5, 6},
        {0, 3, 7}, {0, 7, 4},
        {4, 5, 6}, {4, 6, 7}
    };
    // one material per face (pair of triangles)
    const material face_material[6] = {
        material(color(0.0f, 0.0f, 0.0f), 0.0f, 0.65f),
        material(color(1.0f, 1.0f, 1.0f), 0.0f, 0.65f),
        material(color(1.0f, 1.0f, 1.0f), 2.0f, 0.65f),
        material(color(0.0f, 1.0f, 0.0f), 0.0f, 0.65f),
        material(color(1.0f, 0.0f, 0.0f), 0.0f, 0.65f),
        material(color(0.0f, 0.0f, 1.0f), 0.0f, 0.65f)
    };
    for (int k = 0; k < 12; ++k) {
        add_triangle(triangle(corner[tri[k][0]], corner[tri[k][1]], corner[tri[k][2]], false), idx + k, face_material[k / 2]);
    }
}
// Loads a binary STL file and uploads its triangles, all with material `m`, into scene
// slots start_idx, start_idx + 1, ...
// Layout read: 80-byte header (skipped), 32-bit triangle count, then per triangle
// 12 floats (normal followed by three vertices) and a 16-bit attribute field.
// Returns true when every triangle in the file was uploaded; returns false when the file
// cannot be opened, is truncated, or holds more triangles than fit into the scene
// (the triangles that fit are still uploaded in that case).
bool read_stl(const std::string& filename, material m, int start_idx) {
    std::ifstream file(filename, std::ios::binary);
    if (!file) {
        std::cerr << "Failed to open file: " << filename << std::endl;
        return false;
    }
    // Read the 80-byte header (skip it)
    file.seekg(80, std::ios::beg);
    // Read the number of triangles in the STL file
    unsigned int numTriangles = 0;
    if (!file.read(reinterpret_cast<char*>(&numTriangles), sizeof(numTriangles))) {
        std::cerr << "Truncated STL header in file: " << filename << std::endl;
        return false;
    }
    for (unsigned int i = 0; i < numTriangles; ++i) {
        const long slot = static_cast<long>(start_idx) + static_cast<long>(i);
        if (slot < 0 || slot >= num_triangles) {
            std::cerr << "STL file " << filename << " does not fit into the scene (capacity "
                      << num_triangles << " triangles); stopped at triangle " << i << std::endl;
            return false;
        }
        // record[0..2] is the stored normal (ignored, it is recomputed), record[3..11] the vertices
        float record[12];
        unsigned short attribute;
        if (!file.read(reinterpret_cast<char*>(record), sizeof(record)) ||
            !file.read(reinterpret_cast<char*>(&attribute), sizeof(attribute))) {
            std::cerr << "Truncated triangle data in file: " << filename << std::endl;
            return false;
        }
        // Build the triangle through the full constructor so the normal, edge vectors and dot
        // products are computed; filling p1/p2/p3 of a default-constructed triangle left them unset.
        const triangle t(vec3(record[3], record[4], record[5]),
                         vec3(record[6], record[7], record[8]),
                         vec3(record[9], record[10], record[11]),
                         false);
        add_triangle(t, static_cast<int>(slot), m);
    }
    return true;
}
// Function to read an ASCII STL file and add triangles
// Every "facet normal" line is expected to be followed by one loop-opening line, three
// vertex lines ("<keyword> x y z") and one loop-closing line. Each facet is uploaded with
// material `m` into scene slots start_idx, start_idx + 1, ...
// Returns true when the whole file was processed; returns false when the file cannot be
// opened, a facet is malformed, or the scene has no room left for a facet.
bool read_ascii_stl(const std::string& filename, material m, int start_idx) {
    std::ifstream file(filename);
    if (!file) {
        std::cerr << "Failed to open file: " << filename << std::endl;
        return false;
    }
    std::string line;
    int idx = start_idx;
    while (std::getline(file, line)) {
        if (line.find("facet normal") == std::string::npos) {
            continue;
        }
        // Skip the line that follows the facet header (the loop opener)
        std::getline(file, line);
        // Read the 3 vertices; each one goes into its own slot (they all used to be parsed
        // into p1, leaving p2 and p3 untouched)
        vec3 vertex[3];
        for (int i = 0; i < 3; ++i) {
            if (!std::getline(file, line)) {
                std::cerr << "Unexpected end of file inside a facet: " << filename << std::endl;
                return false;
            }
            std::stringstream ss(line);
            std::string keyword;
            if (!(ss >> keyword >> vertex[i].x >> vertex[i].y >> vertex[i].z)) {
                std::cerr << "Malformed vertex line in file: " << filename << std::endl;
                return false;
            }
        }
        // Skip the line that follows the vertices (the loop closer)
        std::getline(file, line);
        if (idx < 0 || idx >= num_triangles) {
            std::cerr << "STL file " << filename << " does not fit into the scene (capacity "
                      << num_triangles << " triangles)" << std::endl;
            return false;
        }
        // the full constructor computes the normal, edge vectors and dot products
        add_triangle(triangle(vertex[0], vertex[1], vertex[2], false), idx++, m);
    }
    return true;
}
// The wrapper function to determine the file type (binary or ASCII) and process the STL file accordingly
// Detection: a file that does not start with "solid" is treated as binary. A file that does
// start with "solid" is treated as ASCII unless its size is exactly 84 + 50 * n bytes, where
// n is the 32-bit count stored at offset 80 - that is the exact size of a binary STL with n
// triangles, and some binary files carry "solid" in their header.
// Returns the result of the selected parser, or false when the file cannot be opened or is
// shorter than 5 bytes.
bool process_stl_file(const std::string& filename, material m, int start_idx) {
    std::ifstream file(filename, std::ios::binary);
    if (!file) {
        std::cerr << "Failed to open file: " << filename << std::endl;
        return false;
    }
    char header[5] = {0, 0, 0, 0, 0};
    file.read(header, sizeof(header));
    if (file.gcount() != static_cast<std::streamsize>(sizeof(header))) {
        // previously an unreadable header was compared while still uninitialized
        std::cerr << "File too short to be an STL file: " << filename << std::endl;
        return false;
    }
    const bool starts_with_solid = (std::string(header, sizeof(header)) == "solid");
    bool is_binary = !starts_with_solid;
    if (starts_with_solid) {
        file.clear();
        file.seekg(0, std::ios::end);
        const std::streamoff file_size = file.tellg();
        unsigned int count = 0;
        file.seekg(80, std::ios::beg);
        if (file_size >= 84 && file.read(reinterpret_cast<char*>(&count), sizeof(count))) {
            is_binary = (file_size == 84 + 50 * static_cast<std::streamoff>(count));
        }
    }
    file.close();
    if (is_binary) {
        // Binary STL, so use the binary parser
        return read_stl(filename, m, start_idx);
    }
    // ASCII STL, so use the ASCII parser
    return read_ascii_stl(filename, m, start_idx);
}
// zero kernel
// Sets every pixel of screen_buffer to black.
// Launch layout: 1D grid, one thread per pixel; threads beyond scr_w * scr_h do nothing,
// so the grid size may be rounded up.
__global__ void zeroBuffer() {
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= scr_w * scr_h) { return; }
    ((color*)screen_buffer)[idx] = color(0.0f, 0.0f, 0.0f);
}
// div colors stuff
// Converts the accumulated sums in screen_buffer into an average over num_frames frames
// and clamps each channel to at most 1.0.
// Launch layout: 1D grid, one thread per pixel; threads beyond scr_w * scr_h do nothing,
// so the grid size may be rounded up.
__global__ void divBuffer() {
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= scr_w * scr_h) { return; }
    color c = ((color*)screen_buffer)[idx] * (1.0f/num_frames);
    c.r = c.r > 1.0f ? 1.0f : c.r;
    c.g = c.g > 1.0f ? 1.0f : c.g;
    c.b = c.b > 1.0f ? 1.0f : c.b;
    ((color*)screen_buffer)[idx] = c;
}
// Builds the scene, renders num_frames accumulated frames, averages them and writes the
// result to "out.bmp". Prints the time spent in the render kernels and the first CUDA error
// encountered (or "no error"). Returns 0 on success and 1 when anything failed.
int main() {
    const int pixel_count = scr_w * scr_h;
    const int aux_threads = 256;
    // launches are written as <<<blocks, threads>>>; the grid is rounded up to cover every pixel
    const int aux_blocks = (pixel_count + aux_threads - 1) / aux_threads;
    const int main_blocks = (pixel_count + threads_main - 1) / threads_main;

    // remembers the first CUDA failure and reports each one where it happens
    cudaError_t e = cudaSuccess;
    auto note = [&e](cudaError_t status, const char* what) {
        if (status != cudaSuccess) {
            fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
            if (e == cudaSuccess) { e = status; }
        }
    };

    note(cudaFuncSetCacheConfig(updateKernel, cudaFuncCachePreferL1), "cudaFuncSetCacheConfig");
    zeroBuffer << <aux_blocks, aux_threads >> > ();
    note(cudaGetLastError(), "zeroBuffer launch");
    initialize_cube(512.0f, vec3(-256.0f, -256.0f, -10.0f), material(color(0.1f, 0.1f, 1.0f), 0.0f, 0.7f), 0);
    initialize_cube(50.0f, vec3(-25.0f, -270.0f, 100.0f), material(color(1.0f, 1.0f, 1.0f), 100.0f, 0.0f), 12);
    initialize_cube(100.0f, vec3(-100.0f, 150.0f, 150.0f), material(color(1.0f, 0.1f, 0.1f), 0.0f, 0.7f), 24);
    //initialize_cube(60.0f, vec3(-100.0f, 100.0f, 60.0f), material(color(1.0f, 0.0f, 0.0f), 0.0f, 1.0f), 24);
    //add_triangle(triangle(vec3(-11.0f, 82.0f, 3.0f), vec3(-12.0f, 80.0f, 7.0f), vec3(-3.0f, 88.0f, 7.0f), false), 13, material(color(1.0f, 1.0f, 1.0f), 1.0f, 0.0f));
    //readStlModelAndAddTriangles(, material(color(1.0f, 1.0f, 1.0f), 1.0f, 0.0f));
    //process_stl_file("C:\\Users\\david\\Downloads\\pythonAndModels\\Knight.stl", material(color(1.0f, 1.0f, 1.0f), 1.0f, 0.0f), 0);

    cudaEvent_t start, stop;
    note(cudaEventCreate(&start), "cudaEventCreate(start)");
    note(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    note(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    for (int f = 0; f < num_frames; f++) {
        updateKernel << <main_blocks, threads_main >> > (f);
    }
    note(cudaGetLastError(), "updateKernel launch");
    note(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    note(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    // surfaces errors raised while the kernels were executing
    note(cudaDeviceSynchronize(), "render kernels");

    divBuffer << <aux_blocks, aux_threads >> > ();
    note(cudaGetLastError(), "divBuffer launch");
    note(cudaDeviceSynchronize(), "divBuffer");

    color* sc = (color*)malloc(sizeof(color) * pixel_count);
    if (sc == NULL) {
        fprintf(stderr, "out of host memory for the %d pixel image\n", pixel_count);
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
        return 1;
    }
    copyScreenBuffer(sc);
    note(cudaGetLastError(), "copyScreenBuffer");
    saveBMP("out.bmp", scr_w, scr_h, sc);

    float elapsedTime = 0.0f;
    note(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");
    printf("kernel calls took %f miliseconds\n", elapsedTime);
    printf("Kernel exited with error: %s\n", cudaGetErrorString(e));

    free(sc);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    // screen_buffer is a statically allocated __device__ symbol, not a cudaMalloc allocation,
    // so it must not be passed to cudaFree (the previous call was invalid)
    return e == cudaSuccess ? 0 : 1;
}