diff --git a/mmclab/example/demo_photon_sharing.m b/mmclab/example/demo_photon_sharing.m index c8d5dc8..15c5960 100644 --- a/mmclab/example/demo_photon_sharing.m +++ b/mmclab/example/demo_photon_sharing.m @@ -16,7 +16,7 @@ cfg.tstart = 0; cfg.tend = 5e-9; cfg.tstep = 1e-10; -cfg.gpuid = -1; +cfg.gpuid = 1; cfg.method = 'elem'; cfg.debuglevel = 'TP'; @@ -30,7 +30,7 @@ pat(16:25, :, 5) = 1; pat(:, 16:25, 5) = 1; % pattern with a bright cross -cfg.srcpattern = pat; +cfg.srcpattern = permute(pat, [3 1 2]); %% run the simulation diff --git a/mmclab/mmclab.m b/mmclab/mmclab.m index f6781e5..22c6538 100644 --- a/mmclab/mmclab.m +++ b/mmclab/mmclab.m @@ -105,8 +105,8 @@ % by srcpos, srcpos+srcparam1(1:3) and srcpos+srcparam2(1:3) % 'pattern' - a 3D quadrilateral pattern illumination, same as above, except % srcparam1(4) and srcparam2(4) specify the pattern array x/y dimensions, -% and srcpattern is a floating-point pattern array, with values between [0-1]. -% if cfg.srcnum>1, srcpattern must be a floating-point array with +% and srcpattern is a double-precision pattern array, with values between [0-1]. +% if cfg.srcnum>1, srcpattern must be a 3-D double-precision array with % a dimension of [srcnum srcparam1(4) srcparam2(4)] % Example: % 'fourier' - spatial frequency domain source, similar to 'planar', except diff --git a/src/mmc_cl_host.c b/src/mmc_cl_host.c index cc23c7f..4879f26 100644 --- a/src/mmc_cl_host.c +++ b/src/mmc_cl_host.c @@ -92,7 +92,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { cl_mem* gweight, *gdref, *gdetphoton, *gseed, *genergy, *greporter, *gdebugdata; /*read-write buffers*/ cl_mem* gprogress = NULL, *gdetected = NULL, *gphotonseed = NULL; /*write-only buffers*/ - cl_uint meshlen = ((cfg->method == rtBLBadouelGrid) ? cfg->crop0.z : mesh->ne) << cfg->nbuffer; // use 4 copies to reduce racing + cl_uint meshlen = ((cfg->method == rtBLBadouelGrid) ? cfg->crop0.z : mesh->ne) * cfg->srcnum; cfg->crop0.w = meshlen * cfg->maxgate; // offset for the second buffer cl_float* field, *dref = NULL; @@ -103,6 +103,8 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { char opt[MAX_PATH_LENGTH + 1] = {'\0'}; cl_uint detreclen = (2 + ((cfg->ismomentum) > 0)) * mesh->prop + (cfg->issaveexit > 0) * 6 + 1; cl_uint hostdetreclen = detreclen + 1; + int sharedmemsize = 0; + GPUInfo* gpu = NULL; float4* propdet; @@ -122,7 +124,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { mesh->nn, mesh->ne, mesh->nf, {{mesh->nmin.x, mesh->nmin.y, mesh->nmin.z}}, cfg->nout, cfg->roulettesize, cfg->srcnum, {{cfg->crop0.x, cfg->crop0.y, cfg->crop0.z, cfg->crop0.w}}, mesh->srcelemlen, {{cfg->bary0.x, cfg->bary0.y, cfg->bary0.z, cfg->bary0.w}}, - cfg->e0, cfg->isextdet, meshlen, cfg->nbuffer, (mesh->prop + 1 + cfg->isextdet) + cfg->detnum, + cfg->e0, cfg->isextdet, meshlen, (mesh->prop + 1 + cfg->isextdet) + cfg->detnum, (MIN((MAX_PROP - param.maxpropdet), ((mesh->ne) << 2)) >> 2), /*max count of elem normal data in const mem*/ cfg->issaveseed, cfg->seed, cfg->maxjumpdebug }; @@ -138,6 +140,16 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { mcx_error(-99, (char*)("Unable to find devices!"), __FILE__, __LINE__); } + if (cfg->issavedet) { + sharedmemsize = sizeof(cl_float) * detreclen; + } + + param.reclen = sharedmemsize / sizeof(float); + + if (cfg->srctype == stPattern && cfg->srcnum > 1) { + sharedmemsize += sizeof(cl_float) * cfg->srcnum; + } + cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0}; /* Use NULL for backward compatibility */ @@ -324,9 +336,9 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { OCL_ASSERT(((greporter[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(MCXReporter), &reporter, &status), status))); if (cfg->srctype == MCX_SRC_PATTERN) { - OCL_ASSERT(((gsrcpattern[i] = clCreateBuffer(mcxcontext, RO_MEM, sizeof(float) * (int)(cfg->srcparam1.w * cfg->srcparam2.w), cfg->srcpattern, &status), status))); + OCL_ASSERT(((gsrcpattern[i] = clCreateBuffer(mcxcontext, RO_MEM, sizeof(float) * (int)(cfg->srcparam1.w * cfg->srcparam2.w * cfg->srcnum), cfg->srcpattern, &status), status))); } else if (cfg->srctype == MCX_SRC_PATTERN3D) { - OCL_ASSERT(((gsrcpattern[i] = clCreateBuffer(mcxcontext, RO_MEM, sizeof(float) * (int)(cfg->srcparam1.x * cfg->srcparam1.y * cfg->srcparam1.z), cfg->srcpattern, &status), status))); + OCL_ASSERT(((gsrcpattern[i] = clCreateBuffer(mcxcontext, RO_MEM, sizeof(float) * (int)(cfg->srcparam1.x * cfg->srcparam1.y * cfg->srcparam1.z * cfg->srcnum), cfg->srcpattern, &status), status))); } else { gsrcpattern[i] = NULL; } @@ -417,6 +429,10 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { sprintf(opt + strlen(opt), " -DUSE_BLBADOUEL"); } + if (cfg->srctype == stPattern && cfg->srcnum > 1) { + sprintf(opt + strlen(opt), " -DUSE_PHOTON_SHARING"); + } + if (gpu[0].vendor == dvNVIDIA) { sprintf(opt + strlen(opt), " -DUSE_NVIDIA_GPU"); } @@ -497,7 +513,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) { OCL_ASSERT((clSetKernelArg(mcxkernel[i], 0, sizeof(cl_uint), (void*)&threadphoton))); OCL_ASSERT((clSetKernelArg(mcxkernel[i], 1, sizeof(cl_uint), (void*)&oddphotons))); //OCL_ASSERT((clSetKernelArg(mcxkernel[i], 2, sizeof(cl_mem), (void*)(gparam+i)))); - OCL_ASSERT((clSetKernelArg(mcxkernel[i], 3, cfg->issavedet ? sizeof(cl_float) * ((int)gpu[i].autoblock)*detreclen : sizeof(int), NULL))); + OCL_ASSERT((clSetKernelArg(mcxkernel[i], 3, sharedmemsize * (int)gpu[i].autothread, NULL))); OCL_ASSERT((clSetKernelArg(mcxkernel[i], 4, sizeof(cl_mem), (void*)(gproperty + i)))); OCL_ASSERT((clSetKernelArg(mcxkernel[i], 5, sizeof(cl_mem), (void*)(gnode + i)))); OCL_ASSERT((clSetKernelArg(mcxkernel[i], 6, sizeof(cl_mem), (void*)(gelem + i)))); @@ -708,7 +724,7 @@ is more than what your have specified (%d), please use the -H option to specify mcx_fflush(cfg->flog); for (i = 0; i < fieldlen; i++) { //accumulate field, can be done in the GPU - field[(i >> cfg->nbuffer)] += rawfield[i] + rawfield[i + fieldlen]; //+rawfield[i+fieldlen]; + field[i] += rawfield[i] + rawfield[i + fieldlen]; //+rawfield[i+fieldlen]; } free(rawfield); @@ -732,9 +748,6 @@ is more than what your have specified (%d), please use the -H option to specify }// iteration }// time gates - fieldlen = (fieldlen >> cfg->nbuffer); - field = realloc(field, sizeof(field[0]) * fieldlen); - if (cfg->exportfield) { if (cfg->basisorder == 0 || cfg->method == rtBLBadouelGrid) { for (i = 0; i < fieldlen; i++) { diff --git a/src/mmc_cl_host.h b/src/mmc_cl_host.h index 2736392..e1367ef 100644 --- a/src/mmc_cl_host.h +++ b/src/mmc_cl_host.h @@ -96,7 +96,6 @@ typedef struct PRE_ALIGN(32) GPU_mcconfig { cl_int e0; cl_int isextdet; cl_int framelen; - cl_uint nbuffer; cl_uint maxpropdet; cl_uint normbuf; cl_int issaveseed; diff --git a/src/mmc_core.cl b/src/mmc_core.cl index ef8d641..975c09a 100644 --- a/src/mmc_core.cl +++ b/src/mmc_core.cl @@ -239,6 +239,7 @@ inline __device__ __host__ float4 convert_float4_rte(float4 v) { #define R_C0 3.335640951981520e-12f //1/C0 in s/mm #define JUST_ABOVE_ONE 1.0001f //test for boundary +#define JUST_BELOW_ONE 0.9998f /**< test for boundary */ #define SAME_VOXEL -9999.f //scatter within a voxel #define NO_LAUNCH 9999 //when fail to launch, for debug @@ -283,8 +284,8 @@ typedef struct MMC_Ray { uint oldidx; float oldweight; unsigned int posidx; /**< launch position index of the photon for pattern source type */ - //int nexteid; /**< the index to the neighboring tet to be moved into */ - //float4 bary0; /**< the Barycentric coordinate of the intersection with the tet */ + //int nexteid; /**< the index to the neighboring tet to be moved into */ + //float4 bary0; /**< the Barycentric coordinate of the intersection with the tet */ float slen0; /**< initial unitless scattering length = length*mus */ unsigned int photonid; /**< index of the current photon */ } ray __attribute__ ((aligned (16))); @@ -301,13 +302,13 @@ typedef struct MMC_Parameter { uint maxmedia; uint detnum; int voidtime; - int srctype; /**< type of the source */ - float4 srcparam1; /**< source parameters set 1 */ - float4 srcparam2; /**< source parameters set 2 */ - uint issaveref; /**<1 save diffuse reflectance at the boundary voxels, 0 do not save*/ + int srctype; /**< type of the source */ + float4 srcparam1; /**< source parameters set 1 */ + float4 srcparam2; /**< source parameters set 2 */ + uint issaveref; /**<1 save diffuse reflectance at the boundary voxels, 0 do not save*/ uint maxgate; - uint debuglevel; /**< debug flags */ - int reclen; /**< record (4-byte per record) number per detected photon */ + uint debuglevel; /**< debug flags */ + int reclen; /**< record (4-byte per record) number per detected photon, does not include srcnum buffer for photon-sharing */ int outputtype; int elemlen; int mcmethod; @@ -325,12 +326,11 @@ typedef struct MMC_Parameter { int e0; int isextdet; int framelen; - uint nbuffer; uint maxpropdet; uint normbuf; int issaveseed; int seed; - uint maxjumpdebug; /**< max number of positions to be saved to save photon trajectory when -D M is used */ + uint maxjumpdebug; /**< max number of positions to be saved to save photon trajectory when -D M is used */ } MCXParam __attribute__ ((aligned (16))); typedef struct MMC_Reporter { @@ -339,10 +339,10 @@ typedef struct MMC_Reporter { } MCXReporter __attribute__ ((aligned (16))); typedef struct MCX_medium { - float mua; /**x; - n_det[baseaddr++] = p0->y; - n_det[baseaddr++] = p0->z; - n_det[baseaddr++] = v->x; - n_det[baseaddr++] = v->y; - n_det[baseaddr++] = v->z; + n_det[baseaddr++] = r->p0.x; + n_det[baseaddr++] = r->p0.y; + n_det[baseaddr++] = r->p0.z; + n_det[baseaddr++] = r->v.x; + n_det[baseaddr++] = r->v.y; + n_det[baseaddr++] = r->v.z; } n_det[baseaddr++] = ppath[GPU_PARAM(gcfg, reclen) - 1]; // save partial pathlength to the memory @@ -600,7 +582,7 @@ __device__ void savedebugdata(ray* r, uint id, __global MCXReporter* reporter, _ * \param[out] visit: statistics counters of this thread */ -__device__ float branchless_badouel_raytet(ray* r, __constant MCXParam* gcfg, __global int* elem, __global float* weight, +__device__ float branchless_badouel_raytet(ray* r, __constant MCXParam* gcfg, __local float* ppath, __global int* elem, __global float* weight, int type, __global int* facenb, __global float4* normal, __constant Medium* gmed, __global float* replayweight, __global float* replaytime) { float Lmin; @@ -719,20 +701,41 @@ __device__ float branchless_badouel_raytet(ray* r, __constant MCXParam* gcfg, __ #ifndef DO_NOT_SAVE if (r->oldweight > 0.f) { + if (GPU_PARAM(gcfg, srctype) != stPattern || GPU_PARAM(gcfg, srcnum) == 1) { #ifdef USE_ATOMIC - float oldval = atomicadd(weight + r->oldidx, r->oldweight); - - if (oldval > MAX_ACCUM) { - if (atomicadd(weight + r->oldidx, -oldval) < 0.0f) { - atomicadd(weight + r->oldidx, oldval); - } else { - atomicadd(weight + r->oldidx + gcfg->crop0.w, oldval); + float oldval = atomicadd(weight + r->oldidx, r->oldweight); + + if (oldval > MAX_ACCUM) { + if (atomicadd(weight + r->oldidx, -oldval) < 0.0f) { + atomicadd(weight + r->oldidx, oldval); + } else { + atomicadd(weight + r->oldidx + gcfg->crop0.w, oldval); + } } - } #else - weight[r->oldidx] += r->oldweight; + weight[r->oldidx] += r->oldweight; +#endif + } else if (GPU_PARAM(gcfg, srctype) == stPattern) { + + for (int pidx = 0; pidx < GPU_PARAM(gcfg, srcnum); pidx++) { +#ifdef USE_ATOMIC + float oldval = atomicadd(weight + r->oldidx * GPU_PARAM(gcfg, srcnum) + pidx, r->oldweight * ppath[GPU_PARAM(gcfg, reclen) + pidx]); + + if (oldval > MAX_ACCUM) { + if (atomicadd(weight + r->oldidx * GPU_PARAM(gcfg, srcnum) + pidx, -oldval) < 0.0f) { + atomicadd(weight + r->oldidx * GPU_PARAM(gcfg, srcnum) + pidx, oldval); + } else { + atomicadd(weight + r->oldidx * GPU_PARAM(gcfg, srcnum) + pidx + gcfg->crop0.w, oldval); + } + } + +#else + weight[r->oldidx * GPU_PARAM(gcfg, srcnum) + pidx] += r->oldweight * ppath[GPU_PARAM(gcfg, reclen) + pidx]; #endif + } + + } } #endif @@ -745,29 +748,54 @@ __device__ float branchless_badouel_raytet(ray* r, __constant MCXParam* gcfg, __ #ifndef DO_NOT_SAVE if (r->faceid == -2 || !r->isend) { + + if (GPU_PARAM(gcfg, srctype) != stPattern || GPU_PARAM(gcfg, srcnum) == 1) { + #ifdef USE_ATOMIC - float oldval = atomicadd(weight + newidx, r->oldweight); + float oldval = atomicadd(weight + newidx, r->oldweight); - if (oldval > MAX_ACCUM) { - if (atomicadd(weight + r->oldidx, -oldval) < 0.0f) { - atomicadd(weight + r->oldidx, oldval); - } else { - atomicadd(weight + r->oldidx + gcfg->crop0.w, oldval); + if (oldval > MAX_ACCUM) { + if (atomicadd(weight + newidx, -oldval) < 0.0f) { + atomicadd(weight + newidx, oldval); + } else { + atomicadd(weight + newidx + gcfg->crop0.w, oldval); + } } - } #else - weight[newidx] += r->oldweight; + weight[newidx] += r->oldweight; +#endif + } else if (GPU_PARAM(gcfg, srctype) == stPattern) { + + for (int pidx = 0; pidx < GPU_PARAM(gcfg, srcnum); pidx++) { +#ifdef USE_ATOMIC + float oldval = atomicadd(weight + newidx * GPU_PARAM(gcfg, srcnum) + pidx, r->oldweight * ppath[GPU_PARAM(gcfg, reclen) + pidx]); + + if (oldval > MAX_ACCUM) { + if (atomicadd(weight + newidx * GPU_PARAM(gcfg, srcnum) + pidx, -oldval) < 0.0f) { + atomicadd(weight + newidx * GPU_PARAM(gcfg, srcnum) + pidx, oldval); + } else { + atomicadd(weight + newidx * GPU_PARAM(gcfg, srcnum) + pidx + gcfg->crop0.w, oldval); + } + } + +#else + weight[newidx * GPU_PARAM(gcfg, srcnum) + pidx] += r->oldweight * ppath[GPU_PARAM(gcfg, reclen) + pidx]; #endif + } + + } + r->oldweight = 0.f; } -#endif +#endif // for ifdef DO_NOT_SAVE #ifdef __NVCC__ } -#endif -#endif +#endif // for if (GPU_PARAM(gcfg, method) == rtBLBadouel +#endif // for ifdef USE_BLBADOUEL + #ifdef USE_DMMC #ifdef __NVCC__ @@ -801,21 +829,45 @@ __device__ float branchless_badouel_raytet(ray* r, __constant MCXParam* gcfg, __ if (newidx != r->oldidx) { #ifndef DO_NOT_SAVE -#ifdef USE_ATOMIC - float oldval = atomicadd(weight + r->oldidx, r->oldweight); - if (oldval > MAX_ACCUM) { - if (atomicadd(weight + r->oldidx, -oldval) < 0.0f) { - atomicadd(weight + r->oldidx, oldval); - } else { - atomicadd(weight + r->oldidx + gcfg->crop0.w, oldval); + if (GPU_PARAM(gcfg, srctype) != stPattern || GPU_PARAM(gcfg, srcnum) == 1) { + +#ifdef USE_ATOMIC + float oldval = atomicadd(weight + r->oldidx, r->oldweight); + + if (oldval > MAX_ACCUM) { + if (atomicadd(weight + r->oldidx, -oldval) < 0.0f) { + atomicadd(weight + r->oldidx, oldval); + } else { + atomicadd(weight + r->oldidx + gcfg->crop0.w, oldval); + } } - } #else - weight[r->oldidx] += r->oldweight; + weight[r->oldidx] += r->oldweight; #endif + } else if (GPU_PARAM(gcfg, srctype) == stPattern) { + + for (int pidx = 0; pidx < GPU_PARAM(gcfg, srcnum); pidx++) { +#ifdef USE_ATOMIC + float oldval = atomicadd(weight + r->oldidx * GPU_PARAM(gcfg, srcnum) + pidx, r->oldweight * ppath[GPU_PARAM(gcfg, reclen) + pidx]); + + if (oldval > MAX_ACCUM) { + if (atomicadd(weight + r->oldidx * GPU_PARAM(gcfg, srcnum) + pidx, -oldval) < 0.0f) { + atomicadd(weight + r->oldidx * GPU_PARAM(gcfg, srcnum) + pidx, oldval); + } else { + atomicadd(weight + r->oldidx * GPU_PARAM(gcfg, srcnum) + pidx + gcfg->crop0.w, oldval); + } + } + +#else + weight[r->oldidx * GPU_PARAM(gcfg, srcnum) + pidx] += r->oldweight * ppath[GPU_PARAM(gcfg, reclen) + pidx]; #endif + } + + } + +#endif // for ifdef DO_NOT_SAVE r->oldidx = newidx; r->oldweight = S.w * totalloss; } else { @@ -825,24 +877,49 @@ __device__ float branchless_badouel_raytet(ray* r, __constant MCXParam* gcfg, __ #ifndef DO_NOT_SAVE if (r->faceid == -2 || !r->isend) { -#ifdef USE_ATOMIC - float oldval = atomicadd(weight + r->oldidx, r->oldweight); - if (oldval > MAX_ACCUM) { - if (atomicadd(weight + r->oldidx, -oldval) < 0.0f) { - atomicadd(weight + r->oldidx, oldval); - } else { - atomicadd(weight + r->oldidx + gcfg->crop0.w, oldval); + if (GPU_PARAM(gcfg, srctype) != stPattern || GPU_PARAM(gcfg, srcnum) == 1) { + +#ifdef USE_ATOMIC + float oldval = atomicadd(weight + newidx, r->oldweight); + + if (oldval > MAX_ACCUM) { + if (atomicadd(weight + newidx, -oldval) < 0.0f) { + atomicadd(weight + newidx, oldval); + } else { + atomicadd(weight + newidx + gcfg->crop0.w, oldval); + } } - } #else - weight[newidx] += r->oldweight; + weight[newidx] += r->oldweight; #endif + } else if (GPU_PARAM(gcfg, srctype) == stPattern) { + + for (int pidx = 0; pidx < GPU_PARAM(gcfg, srcnum); pidx++) { +#ifdef USE_ATOMIC + float oldval = atomicadd(weight + newidx * GPU_PARAM(gcfg, srcnum) + pidx, r->oldweight * ppath[GPU_PARAM(gcfg, reclen) + pidx]); + + if (oldval > MAX_ACCUM) { + if (atomicadd(weight + newidx * GPU_PARAM(gcfg, srcnum) + pidx, -oldval) < 0.0f) { + atomicadd(weight + newidx * GPU_PARAM(gcfg, srcnum) + pidx, oldval); + } else { + atomicadd(weight + newidx * GPU_PARAM(gcfg, srcnum) + pidx + gcfg->crop0.w, oldval); + } + } + +#else + weight[newidx * GPU_PARAM(gcfg, srcnum) + pidx] += r->oldweight * ppath[GPU_PARAM(gcfg, reclen) + pidx]; +#endif + } + + } + r->oldweight = 0.f; } -#endif +#endif // for ifdef DO_NOT_SAVE + #ifndef __NVCC__ S.w *= T.w; S.xyz += T.xyz; @@ -855,8 +932,8 @@ __device__ float branchless_badouel_raytet(ray* r, __constant MCXParam* gcfg, __ } #endif -#endif -#endif +#endif // for USE_DMMC +#endif // for MCX_SKIP_VOLUME } r->p0 = r->p0 + FL3(r->Lmove) * r->vec; @@ -1045,7 +1122,7 @@ __device__ void fixphoton(float3* p, __global float3* nodes, __global int* ee) { * \param[in,out] ran: the random number generator states */ -__device__ void launchphoton(__constant MCXParam* gcfg, ray* r, __global float3* node, __global int* elem, __global int* srcelem, __private RandType* ran, __global float* srcpattern) { +__device__ void launchnewphoton(__constant MCXParam* gcfg, ray* r, __global float3* node, __global int* elem, __global int* srcelem, __private RandType* ran, __global float* srcpattern) { int canfocus = 1; float3 origin = r->p0; @@ -1078,8 +1155,9 @@ __device__ void launchphoton(__constant MCXParam* gcfg, ray* r, __global float3* #endif int xsize = (int)gcfg->srcparam1.w; int ysize = (int)gcfg->srcparam2.w; - r->posidx = MIN((int)(ry * ysize), ysize - 1) * xsize + MIN((int)(rx * xsize), xsize - 1); - r->weight = srcpattern[MIN( (int)(ry * gcfg->srcparam2.w), (int)gcfg->srcparam2.w - 1 ) * (int)(gcfg->srcparam1.w) + MIN( (int)(rx * gcfg->srcparam1.w), (int)gcfg->srcparam1.w - 1 )]; + r->posidx = MIN((int)(ry * JUST_BELOW_ONE * ysize), ysize - 1) * xsize + MIN((int)(rx * JUST_BELOW_ONE * xsize), xsize - 1); + r->weight = (GPU_PARAM(gcfg, srcnum) > 1) ? 1.f : srcpattern[r->posidx]; + #endif #if defined(__NVCC__) || defined(MCX_SRC_FOURIER) // need to prevent rx/ry=1 here #ifdef __NVCC__ @@ -1385,17 +1463,28 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP #endif /*initialize the photon parameters*/ - launchphoton(gcfg, &r, node, elem, srcelem, ran, srcpattern); + launchnewphoton(gcfg, &r, node, elem, srcelem, ran, srcpattern); *energytot += r.weight; #ifdef MCX_SAVE_DETECTORS if (GPU_PARAM(gcfg, issavedet)) { clearpath(ppath, GPU_PARAM(gcfg, reclen)); /*clear shared memory for saving history of a new photon*/ - ppath[GPU_PARAM(gcfg, reclen) - 1] = r.weight; /*last record in partialpath is the initial photon weight*/ + + if (GPU_PARAM(gcfg, srctype) != stPattern || GPU_PARAM(gcfg, srcnum) == 1) { + ppath[GPU_PARAM(gcfg, reclen) - 1] = r.weight; /*last record in partialpath is the initial photon weight*/ + } else if (GPU_PARAM(gcfg, srctype) == stPattern) { + *((uint*)(ppath + GPU_PARAM(gcfg, reclen) - 1)) = r.posidx; + } } #endif + if (GPU_PARAM(gcfg, srctype) == stPattern && GPU_PARAM(gcfg, srcnum) > 1) { + for (oldeid = 0; oldeid > GPU_PARAM(gcfg, srcnum); oldeid++) { + ppath[GPU_PARAM(gcfg, reclen) + oldeid] = srcpattern[r.posidx * GPU_PARAM(gcfg, srcnum) + oldeid]; + } + } + if (GPU_PARAM(gcfg, debuglevel) & dlTraj) { savedebugdata(&r, id, reporter, gdebugdata, gcfg); } @@ -1404,7 +1493,7 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP /*http://stackoverflow.com/questions/2148149/how-to-sum-a-large-number-of-float-number*/ while (1) { /*propagate a photon until exit*/ - r.slen = branchless_badouel_raytet(&r, gcfg, elem, weight, type[r.eid - 1], facenb, normal, gmed, replayweight, replaytime); + r.slen = branchless_badouel_raytet(&r, gcfg, ppath, elem, weight, type[r.eid - 1], facenb, normal, gmed, replayweight, replaytime); (*raytet)++; if (r.pout.x == MMC_UNDEFINED) { @@ -1477,7 +1566,7 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP GPUDEBUG(("P %f %f %f %d %u %e\n", r.pout.x, r.pout.y, r.pout.z, r.eid, id, r.slen)); } - r.slen = branchless_badouel_raytet(&r, gcfg, elem, weight, type[r.eid - 1], facenb, normal, gmed, replayweight, replaytime); + r.slen = branchless_badouel_raytet(&r, gcfg, ppath, elem, weight, type[r.eid - 1], facenb, normal, gmed, replayweight, replaytime); (*raytet)++; #ifdef MCX_SAVE_DETECTORS @@ -1495,7 +1584,7 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP while (r.pout.x == MMC_UNDEFINED && fixcount++ < MAX_TRIAL) { fixphoton(&r.p0, node, (__global int*)(elem + (r.eid - 1)*GPU_PARAM(gcfg, elemlen))); - r.slen = branchless_badouel_raytet(&r, gcfg, elem, weight, type[r.eid - 1], facenb, normal, gmed, replayweight, replaytime); + r.slen = branchless_badouel_raytet(&r, gcfg, ppath, elem, weight, type[r.eid - 1], facenb, normal, gmed, replayweight, replaytime); (*raytet)++; #ifdef MCX_SAVE_DETECTORS @@ -1550,17 +1639,17 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP #ifdef MCX_SAVE_SEED if (GPU_PARAM(gcfg, isextdet) && type[oldeid - 1] == GPU_PARAM(gcfg, maxmedia) + 1) { - savedetphoton(n_det, detectedphoton, ppath, &(r.p0), &(r.vec), gmed, oldeid, gcfg, photonseed, initseed); + savedetphoton(n_det, detectedphoton, ppath, &r, gmed, oldeid, gcfg, photonseed, initseed); } else { - savedetphoton(n_det, detectedphoton, ppath, &(r.p0), &(r.vec), gmed, -1, gcfg, photonseed, initseed); + savedetphoton(n_det, detectedphoton, ppath, &r, gmed, -1, gcfg, photonseed, initseed); } #else if (GPU_PARAM(gcfg, isextdet) && type[oldeid - 1] == GPU_PARAM(gcfg, maxmedia) + 1) { - savedetphoton(n_det, detectedphoton, ppath, &(r.p0), &(r.vec), gmed, oldeid, gcfg, photonseed, NULL); + savedetphoton(n_det, detectedphoton, ppath, &r, gmed, oldeid, gcfg, photonseed, NULL); } else { - savedetphoton(n_det, detectedphoton, ppath, &(r.p0), &(r.vec), gmed, -1, gcfg, photonseed, NULL); + savedetphoton(n_det, detectedphoton, ppath, &r, gmed, -1, gcfg, photonseed, NULL); } #endif @@ -1643,7 +1732,8 @@ __kernel void mmc_main_loop(const int nphoton, const int ophoton, t[j] = replayseed[(idx * nphoton + MIN(idx, ophoton) + i) * RAND_BUF_LEN + j]; } - onephoton(idx * nphoton + MIN(idx, ophoton) + i, sharedmem + get_local_id(0)*GPU_PARAM(gcfg, reclen), gcfg, node, elem, + onephoton(idx * nphoton + MIN(idx, ophoton) + i, sharedmem + get_local_id(0) * + (GPU_PARAM(gcfg, reclen) + (GPU_PARAM(gcfg, srcnum) > 1) * GPU_PARAM(gcfg, srcnum)), gcfg, node, elem, weight, dref, type, facenb, srcelem, normal, gmed, n_det, detectedphoton, &energytot, &energyesc, t, &raytet, srcpattern, replayweight, replaytime, photonseed, reporter, gdebugdata); } diff --git a/src/mmc_cu_host.cu b/src/mmc_cu_host.cu index c4b7b7d..e50a1e6 100644 --- a/src/mmc_cu_host.cu +++ b/src/mmc_cu_host.cu @@ -224,8 +224,7 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo float* greplayweight = NULL, *greplaytime = NULL; MCXReporter* greporter; - uint meshlen = ((cfg->method == rtBLBadouelGrid) ? cfg->crop0.z : mesh->ne) - << cfg->nbuffer; // use 4 copies to reduce racing + uint meshlen = ((cfg->method == rtBLBadouelGrid) ? cfg->crop0.z : mesh->ne) * cfg->srcnum; cfg->crop0.w = meshlen * cfg->maxgate; // offset for the second buffer float* field, *dref = NULL; @@ -237,6 +236,8 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo uint detreclen = (2 + ((cfg->ismomentum) > 0)) * mesh->prop + (cfg->issaveexit > 0) * 6 + 1; uint hostdetreclen = detreclen + 1; + // launch mcxkernel + size_t sharedmemsize = 0; MCXParam param = {make_float3(cfg->srcpos.x, cfg->srcpos.y, cfg->srcpos.z), make_float3(cfg->srcdir.x, cfg->srcdir.y, cfg->srcdir.z), @@ -280,7 +281,6 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo cfg->e0, cfg->isextdet, (int)meshlen, - cfg->nbuffer, (uint)(mesh->prop + 1 + cfg->isextdet) + cfg->detnum, (uint)(MIN((MAX_PROP - (mesh->prop + 1 + cfg->isextdet) - cfg->detnum), ((mesh->ne) << 2)) >> 2), /*max count of elem normal data in const mem*/ cfg->issaveseed, @@ -290,6 +290,17 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo MCXReporter reporter = {0.f, 0}; + if (cfg->issavedet) { + sharedmemsize = sizeof(float) * detreclen; + } + + param.reclen = sharedmemsize / sizeof(float); //< the shared memory buffer length associated with detected photon + + if (cfg->srctype == stPattern && cfg->srcnum > 1) { + sharedmemsize += sizeof(float) * cfg->srcnum; + } + + sharedmemsize *= ((int)gpu[gpuid].autoblock); #ifdef _OPENMP threadid = omp_get_thread_num(); @@ -502,17 +513,17 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo if (cfg->srctype == MCX_SRC_PATTERN) { CUDA_ASSERT(cudaMalloc((void**)&gsrcpattern, - sizeof(float) * (int)(cfg->srcparam1.w * cfg->srcparam2.w))); + sizeof(float) * (int)(cfg->srcparam1.w * cfg->srcparam2.w * cfg->srcnum))); CUDA_ASSERT(cudaMemcpy(gsrcpattern, cfg->srcpattern, - sizeof(float) * (int)(cfg->srcparam1.w * cfg->srcparam2.w), + sizeof(float) * (int)(cfg->srcparam1.w * cfg->srcparam2.w * cfg->srcnum), cudaMemcpyHostToDevice)); } else if (cfg->srctype == MCX_SRC_PATTERN3D) { CUDA_ASSERT(cudaMalloc((void**)&gsrcpattern, sizeof(float) * (int)(cfg->srcparam1.x * cfg->srcparam1.y* - cfg->srcparam1.z))); + cfg->srcparam1.z * cfg->srcnum))); CUDA_ASSERT(cudaMemcpy(gsrcpattern, cfg->srcpattern, sizeof(float) * (int)(cfg->srcparam1.x * cfg->srcparam1.y* - cfg->srcparam1.z), + cfg->srcparam1.z * cfg->srcnum), cudaMemcpyHostToDevice)); } else { gsrcpattern = NULL; @@ -591,14 +602,8 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo param.tstart = twindow0; param.tend = twindow1; - // launch mcxkernel - size_t sharedMemSize = sizeof(int); - - if (cfg->issavedet) { - sharedMemSize = sizeof(float) * ((int)gpu[gpuid].autoblock) * detreclen; - } - mmc_main_loop <<< mcgrid, mcblock, sharedMemSize>>>( + mmc_main_loop <<< mcgrid, mcblock, sharedmemsize>>>( threadphoton, oddphotons, gnode, (int*)gelem, gweight, gdref, gtype, (int*)gfacenb, gsrcelem, gnormal, gdetphoton, gdetected, gseed, (int*)gprogress, genergy, greporter, @@ -752,7 +757,7 @@ are more than what your have specified (%d), please use the --maxjumpdebug optio mcx_fflush(cfg->flog); for (i = 0; i < fieldlen; i++) { // accumulate field, can be done in the GPU - field[(i >> cfg->nbuffer)] += rawfield[i] + rawfield[i + fieldlen]; //+rawfield[i+fieldlen]; + field[i] += rawfield[i] + rawfield[i + fieldlen]; //+rawfield[i+fieldlen]; } free(rawfield); @@ -780,8 +785,6 @@ are more than what your have specified (%d), please use the --maxjumpdebug optio #pragma omp master { int i, j; - fieldlen = (fieldlen >> cfg->nbuffer); - field = (float*)realloc(field, sizeof(field[0]) * fieldlen); if (cfg->exportfield) { if (cfg->basisorder == 0 || cfg->method == rtBLBadouelGrid) { diff --git a/src/mmc_raytrace.c b/src/mmc_raytrace.c index b0bea51..1a0e5d4 100644 --- a/src/mmc_raytrace.c +++ b/src/mmc_raytrace.c @@ -386,12 +386,11 @@ float plucker_raytet(ray* r, raytracer* tracer, mcconfig* cfg, visitor* visit) { #pragma omp atomic tracer->mesh->weight[eid + tshift] += ww; } else if (cfg->srctype == stPattern) { // must be pattern and srcnum more than 1 - int psize = (int)cfg->srcparam1.w * (int)cfg->srcparam2.w; // total number of pixels in each pattern int pidx; // pattern index for (pidx = 0; pidx < cfg->srcnum; pidx++) { #pragma omp atomic - tracer->mesh->weight[(eid + tshift)*cfg->srcnum + pidx] += ww * cfg->srcpattern[pidx * psize + r->posidx]; + tracer->mesh->weight[(eid + tshift)*cfg->srcnum + pidx] += ww * cfg->srcpattern[r->posidx * cfg->srcnum + pidx]; } } } @@ -437,13 +436,12 @@ float plucker_raytet(ray* r, raytracer* tracer, mcconfig* cfg, visitor* visit) { tracer->mesh->weight[ee[i] - 1 + tshift] += ww * (baryp0[i] + baryout[i]); } } else if (cfg->srctype == stPattern) { // must be pattern and srcnum more than 1 - int psize = (int)cfg->srcparam1.w * (int)cfg->srcparam2.w; // total number of pixels in each pattern int pidx; // pattern index for (pidx = 0; pidx < cfg->srcnum; pidx++) { for (i = 0; i < 4; i++) { #pragma omp atomic - tracer->mesh->weight[(ee[i] - 1 + tshift)*cfg->srcnum + pidx] += ww * cfg->srcpattern[pidx * psize + r->posidx] * (baryp0[i] + baryout[i]); + tracer->mesh->weight[(ee[i] - 1 + tshift)*cfg->srcnum + pidx] += ww * cfg->srcpattern[r->posidx * cfg->srcnum + pidx] * (baryp0[i] + baryout[i]); } } } @@ -716,12 +714,11 @@ float havel_raytet(ray* r, raytracer* tracer, mcconfig* cfg, visitor* visit) { #pragma omp atomic tracer->mesh->weight[eid + tshift] += ww; } else if (cfg->srctype == stPattern) { // must be pattern and srcnum more than 1 - int psize = (int)cfg->srcparam1.w * (int)cfg->srcparam2.w; // total number of pixels in each pattern int pidx; // pattern index for (pidx = 0; pidx < cfg->srcnum; pidx++) { #pragma omp atomic - tracer->mesh->weight[(eid + tshift)*cfg->srcnum + pidx] += ww * cfg->srcpattern[pidx * psize + r->posidx]; + tracer->mesh->weight[(eid + tshift)*cfg->srcnum + pidx] += ww * cfg->srcpattern[r->posidx * cfg->srcnum + pidx]; } } } else { @@ -734,13 +731,12 @@ float havel_raytet(ray* r, raytracer* tracer, mcconfig* cfg, visitor* visit) { tracer->mesh->weight[ee[j] - 1 + tshift] += barypout[j]; } } else if (cfg->srctype == stPattern) { - int psize = (int)cfg->srcparam1.w * (int)cfg->srcparam2.w; // total number of pixels in each pattern int pidx; // pattern index for (pidx = 0; pidx < cfg->srcnum; pidx++) { for (j = 0; j < 4; j++) { #pragma omp atomic - tracer->mesh->weight[(ee[j] - 1 + tshift)*cfg->srcnum + pidx] += barypout[j] * cfg->srcpattern[pidx * psize + r->posidx]; + tracer->mesh->weight[(ee[j] - 1 + tshift)*cfg->srcnum + pidx] += barypout[j] * cfg->srcpattern[r->posidx * cfg->srcnum + pidx]; } } } @@ -1405,7 +1401,6 @@ float branchless_badouel_raytet(ray* r, raytracer* tracer, mcconfig* cfg, visito medium* prop; int* enb, *ee = (int*)(tracer->mesh->elem + eid * tracer->mesh->elemlen); float mus; - int psize = (int)cfg->srcparam1.w * (int)cfg->srcparam2.w; // total number of pixels in each pattern int pidx; // pattern index if (cfg->implicit == 1 && r->inroi && tracer->mesh->edgeroi && fabs(tracer->mesh->edgeroi[eid * 6]) < EPS) { @@ -1526,7 +1521,7 @@ float branchless_badouel_raytet(ray* r, raytracer* tracer, mcconfig* cfg, visito } else if (cfg->srctype == stPattern) { for (pidx = 0; pidx < cfg->srcnum; pidx++) { #pragma omp atomic - tracer->mesh->weight[r->oldidx * cfg->srcnum + pidx] += r->oldweight * cfg->srcpattern[pidx * psize + r->posidx]; + tracer->mesh->weight[r->oldidx * cfg->srcnum + pidx] += r->oldweight * cfg->srcpattern[r->posidx * cfg->srcnum + pidx]; } } @@ -1543,7 +1538,7 @@ float branchless_badouel_raytet(ray* r, raytracer* tracer, mcconfig* cfg, visito } else if (cfg->srctype == stPattern) { for (pidx = 0; pidx < cfg->srcnum; pidx++) { #pragma omp atomic - tracer->mesh->weight[newidx * cfg->srcnum + pidx] += r->oldweight * cfg->srcpattern[pidx * psize + r->posidx]; + tracer->mesh->weight[newidx * cfg->srcnum + pidx] += r->oldweight * cfg->srcpattern[r->posidx * cfg->srcnum + pidx]; } } @@ -1577,7 +1572,7 @@ float branchless_badouel_raytet(ray* r, raytracer* tracer, mcconfig* cfg, visito for (pidx = 0; pidx < cfg->srcnum; pidx++) { for (pidx = 0; pidx < cfg->srcnum; pidx++) { #pragma omp atomic - tracer->mesh->weight[r->oldidx * cfg->srcnum + pidx] += r->oldweight * cfg->srcpattern[pidx * psize + r->posidx]; + tracer->mesh->weight[r->oldidx * cfg->srcnum + pidx] += r->oldweight * cfg->srcpattern[r->posidx * cfg->srcnum + pidx]; } } } @@ -1596,7 +1591,7 @@ float branchless_badouel_raytet(ray* r, raytracer* tracer, mcconfig* cfg, visito for (pidx = 0; pidx < cfg->srcnum; pidx++) { for (pidx = 0; pidx < cfg->srcnum; pidx++) { #pragma omp atomic - tracer->mesh->weight[newidx * cfg->srcnum + pidx] += r->oldweight * cfg->srcpattern[pidx * psize + r->posidx]; + tracer->mesh->weight[newidx * cfg->srcnum + pidx] += r->oldweight * cfg->srcpattern[r->posidx * cfg->srcnum + pidx]; } } } @@ -1621,7 +1616,7 @@ float branchless_badouel_raytet(ray* r, raytracer* tracer, mcconfig* cfg, visito for (pidx = 0; pidx < cfg->srcnum; pidx++) { for (i = 0; i < 3; i++) { #pragma omp atomic - tracer->mesh->weight[(ee[out[faceidx][i]] - 1 + tshift)*cfg->srcnum + pidx] += ww * cfg->srcpattern[pidx * psize + r->posidx]; + tracer->mesh->weight[(ee[out[faceidx][i]] - 1 + tshift)*cfg->srcnum + pidx] += ww * cfg->srcpattern[r->posidx * cfg->srcnum + pidx]; } } } @@ -1761,13 +1756,11 @@ void onephoton(size_t id, raytracer* tracer, tetmesh* mesh, mcconfig* cfg, } else if (cfg->srctype == stPattern) { *((int*)(r.partialpath + visit->reclen - 2)) = r.posidx; - int psize = (int)cfg->srcparam1.w * (int)cfg->srcparam2.w; - for (pidx = 0; pidx < cfg->srcnum; pidx++) { if (cfg->seed == SEED_FROM_FILE && (cfg->outputtype == otWL || cfg->outputtype == otWP)) { kahany = cfg->replayweight[r.photonid] - visit->kahanc0[pidx]; /* when replay mode, accumulate detected photon weight */ } else { - kahany = r.weight * cfg->srcpattern[pidx * psize + r.posidx] - visit->kahanc0[pidx]; + kahany = r.weight * cfg->srcpattern[r.posidx * cfg->srcnum + pidx] - visit->kahanc0[pidx]; } kahant = visit->launchweight[pidx] + kahany; @@ -2033,10 +2026,8 @@ void onephoton(size_t id, raytracer* tracer, tetmesh* mesh, mcconfig* cfg, visit->kahanc1[0] = (kahant - visit->absorbweight[0]) - kahany; visit->absorbweight[0] = kahant; } else if (cfg->srctype == stPattern) { - int psize = (int)cfg->srcparam1.w * (int)cfg->srcparam2.w; - for (pidx = 0; pidx < cfg->srcnum; pidx++) { - kahany = r.Eabsorb * cfg->srcpattern[pidx * psize + r.posidx] - visit->kahanc1[pidx]; + kahany = r.Eabsorb * cfg->srcpattern[r.posidx * cfg->srcnum + pidx] - visit->kahanc1[pidx]; kahant = visit->absorbweight[pidx] + kahany; visit->kahanc1[pidx] = (kahant - visit->absorbweight[pidx]) - kahany; visit->absorbweight[pidx] = kahant; diff --git a/src/mmc_utils.c b/src/mmc_utils.c index 9a8192d..aab0d51 100644 --- a/src/mmc_utils.c +++ b/src/mmc_utils.c @@ -326,7 +326,6 @@ void mcx_initcfg(mcconfig* cfg) { cfg->energyesc = 0.f; cfg->runtime = 0; cfg->autopilot = 1; - cfg->nbuffer = 0; cfg->gpuid = 0; cfg->maxjumpdebug = 10000000; cfg->exportdebugdata = NULL; @@ -3216,8 +3215,6 @@ void mcx_parsecmd(int argc, char* argv[], mcconfig* cfg) { i = mcx_readarg(argc, argv, i, &(cfg->debugphoton), "int"); } else if (strcmp(argv[i] + 2, "maxjumpdebug") == 0) { i = mcx_readarg(argc, argv, i, &(cfg->maxjumpdebug), "int"); - } else if (strcmp(argv[i] + 2, "buffer") == 0) { - i = mcx_readarg(argc, argv, i, &(cfg->nbuffer), "int"); } else if (strcmp(argv[i] + 2, "gridsize") == 0) { i = mcx_readarg(argc, argv, i, &(cfg->steps.x), "float"); cfg->steps.y = cfg->steps.x; diff --git a/src/mmc_utils.h b/src/mmc_utils.h index 6086cd4..0e0487a 100644 --- a/src/mmc_utils.h +++ b/src/mmc_utils.h @@ -281,7 +281,6 @@ typedef struct MMC_config { unsigned int runtime; /**photonseed != NULL) { return; @@ -778,25 +777,26 @@ void mmc_set_field(const mxArray* root, const mxArray* item, int idx, mcconfig* printf("mmc.session='%s';\n", cfg->session); } else if (strcmp(name, "srcpattern") == 0) { arraydim = mxGetDimensions(item); + dimtype dimz = 1; + + if (mxGetNumberOfDimensions(item) == 3) { + dimz = arraydim[2]; + cfg->srcnum = arraydim[0]; + } + double* val = mxGetPr(item); if (cfg->srcpattern) { free(cfg->srcpattern); } - if (mxGetNumberOfDimensions(item) == 3) { - cfg->srcnum = arraydim[2]; - } else { - cfg->srcnum = 1; - } - - cfg->srcpattern = (float*)malloc(arraydim[0] * arraydim[1] * cfg->srcnum * sizeof(float)); + cfg->srcpattern = (float*)malloc(arraydim[0] * arraydim[1] * dimz * sizeof(float)); - for (k = 0; k < arraydim[0]*arraydim[1]*cfg->srcnum; k++) { - cfg->srcpattern[k] = val[k]; + for (i = 0; i < arraydim[0]*arraydim[1]*dimz; i++) { + cfg->srcpattern[i] = val[i]; } - printf("mmc.srcpattern=[%d %d %d];\n", (int)arraydim[0], (int)arraydim[1], cfg->srcnum); + printf("mmc.srcpattern=[%ld %ld %ld];\n", arraydim[0], arraydim[1], dimz); } else if (strcmp(name, "method") == 0) { int len = mxGetNumberOfElements(item); const char* methods[] = {"plucker", "havel", "badouel", "elem", "grid", ""};