diff --git a/mmclab/example/demo_photon_sharing.m b/mmclab/example/demo_photon_sharing.m
index c8d5dc8..15c5960 100644
--- a/mmclab/example/demo_photon_sharing.m
+++ b/mmclab/example/demo_photon_sharing.m
@@ -16,7 +16,7 @@
 cfg.tstart = 0;
 cfg.tend = 5e-9;
 cfg.tstep = 1e-10;
-cfg.gpuid = -1;
+cfg.gpuid = 1;
 cfg.method = 'elem';
 cfg.debuglevel = 'TP';
 
@@ -30,7 +30,7 @@
 pat(16:25, :, 5) = 1;
 pat(:, 16:25, 5) = 1;     % pattern with a bright cross
 
-cfg.srcpattern = pat;
+cfg.srcpattern = permute(pat, [3 1 2]);
 
 %% run the simulation
 
diff --git a/mmclab/mmclab.m b/mmclab/mmclab.m
index f6781e5..22c6538 100644
--- a/mmclab/mmclab.m
+++ b/mmclab/mmclab.m
@@ -105,8 +105,8 @@
 %                                by srcpos, srcpos+srcparam1(1:3) and srcpos+srcparam2(1:3)
 %                      'pattern' - a 3D quadrilateral pattern illumination, same as above, except
 %                                srcparam1(4) and srcparam2(4) specify the pattern array x/y dimensions,
-%                                and srcpattern is a floating-point pattern array, with values between [0-1].
-%                                if cfg.srcnum>1, srcpattern must be a floating-point array with
+%                                and srcpattern is a double-precision pattern array, with values between [0-1].
+%                                if cfg.srcnum>1, srcpattern must be a 3-D double-precision array with
 %                                a dimension of [srcnum srcparam1(4) srcparam2(4)]
 %                                Example: <demo_photon_sharing.m>
 %                      'fourier' - spatial frequency domain source, similar to 'planar', except
diff --git a/src/mmc_cl_host.c b/src/mmc_cl_host.c
index cc23c7f..4879f26 100644
--- a/src/mmc_cl_host.c
+++ b/src/mmc_cl_host.c
@@ -92,7 +92,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
     cl_mem* gweight, *gdref, *gdetphoton, *gseed, *genergy, *greporter, *gdebugdata;     /*read-write buffers*/
     cl_mem* gprogress = NULL, *gdetected = NULL, *gphotonseed = NULL; /*write-only buffers*/
 
-    cl_uint meshlen = ((cfg->method == rtBLBadouelGrid) ? cfg->crop0.z : mesh->ne) << cfg->nbuffer; // use 4 copies to reduce racing
+    cl_uint meshlen = ((cfg->method == rtBLBadouelGrid) ? cfg->crop0.z : mesh->ne) * cfg->srcnum;
     cfg->crop0.w = meshlen * cfg->maxgate; // offset for the second buffer
 
     cl_float*  field, *dref = NULL;
@@ -103,6 +103,8 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
     char opt[MAX_PATH_LENGTH + 1] = {'\0'};
     cl_uint detreclen = (2 + ((cfg->ismomentum) > 0)) * mesh->prop + (cfg->issaveexit > 0) * 6 + 1;
     cl_uint hostdetreclen = detreclen + 1;
+    int sharedmemsize = 0;
+
     GPUInfo* gpu = NULL;
     float4* propdet;
 
@@ -122,7 +124,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
         mesh->nn, mesh->ne, mesh->nf, {{mesh->nmin.x, mesh->nmin.y, mesh->nmin.z}}, cfg->nout,
         cfg->roulettesize, cfg->srcnum, {{cfg->crop0.x, cfg->crop0.y, cfg->crop0.z, cfg->crop0.w}},
         mesh->srcelemlen, {{cfg->bary0.x, cfg->bary0.y, cfg->bary0.z, cfg->bary0.w}},
-        cfg->e0, cfg->isextdet, meshlen, cfg->nbuffer, (mesh->prop + 1 + cfg->isextdet) + cfg->detnum,
+        cfg->e0, cfg->isextdet, meshlen, (mesh->prop + 1 + cfg->isextdet) + cfg->detnum,
         (MIN((MAX_PROP - param.maxpropdet), ((mesh->ne) << 2)) >> 2), /*max count of elem normal data in const mem*/
         cfg->issaveseed, cfg->seed, cfg->maxjumpdebug
     };
@@ -138,6 +140,16 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
         mcx_error(-99, (char*)("Unable to find devices!"), __FILE__, __LINE__);
     }
 
+    if (cfg->issavedet) {
+        sharedmemsize = sizeof(cl_float) * detreclen;
+    }
+
+    param.reclen = sharedmemsize / sizeof(float);
+
+    if (cfg->srctype == stPattern && cfg->srcnum > 1) {
+        sharedmemsize += sizeof(cl_float) * cfg->srcnum;
+    }
+
     cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};
 
     /* Use NULL for backward compatibility */
@@ -324,9 +336,9 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
         OCL_ASSERT(((greporter[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(MCXReporter), &reporter, &status), status)));
 
         if (cfg->srctype == MCX_SRC_PATTERN) {
-            OCL_ASSERT(((gsrcpattern[i] = clCreateBuffer(mcxcontext, RO_MEM, sizeof(float) * (int)(cfg->srcparam1.w * cfg->srcparam2.w), cfg->srcpattern, &status), status)));
+            OCL_ASSERT(((gsrcpattern[i] = clCreateBuffer(mcxcontext, RO_MEM, sizeof(float) * (int)(cfg->srcparam1.w * cfg->srcparam2.w * cfg->srcnum), cfg->srcpattern, &status), status)));
         } else if (cfg->srctype == MCX_SRC_PATTERN3D) {
-            OCL_ASSERT(((gsrcpattern[i] = clCreateBuffer(mcxcontext, RO_MEM, sizeof(float) * (int)(cfg->srcparam1.x * cfg->srcparam1.y * cfg->srcparam1.z), cfg->srcpattern, &status), status)));
+            OCL_ASSERT(((gsrcpattern[i] = clCreateBuffer(mcxcontext, RO_MEM, sizeof(float) * (int)(cfg->srcparam1.x * cfg->srcparam1.y * cfg->srcparam1.z * cfg->srcnum), cfg->srcpattern, &status), status)));
         } else {
             gsrcpattern[i] = NULL;
         }
@@ -417,6 +429,10 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
         sprintf(opt + strlen(opt), " -DUSE_BLBADOUEL");
     }
 
+    if (cfg->srctype == stPattern && cfg->srcnum > 1) {
+        sprintf(opt + strlen(opt), " -DUSE_PHOTON_SHARING");
+    }
+
     if (gpu[0].vendor == dvNVIDIA) {
         sprintf(opt + strlen(opt), " -DUSE_NVIDIA_GPU");
     }
@@ -497,7 +513,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
         OCL_ASSERT((clSetKernelArg(mcxkernel[i], 0, sizeof(cl_uint), (void*)&threadphoton)));
         OCL_ASSERT((clSetKernelArg(mcxkernel[i], 1, sizeof(cl_uint), (void*)&oddphotons)));
         //OCL_ASSERT((clSetKernelArg(mcxkernel[i], 2, sizeof(cl_mem), (void*)(gparam+i))));
-        OCL_ASSERT((clSetKernelArg(mcxkernel[i], 3, cfg->issavedet ? sizeof(cl_float) * ((int)gpu[i].autoblock)*detreclen : sizeof(int), NULL)));
+        OCL_ASSERT((clSetKernelArg(mcxkernel[i], 3, sharedmemsize * (int)gpu[i].autothread, NULL)));
         OCL_ASSERT((clSetKernelArg(mcxkernel[i], 4, sizeof(cl_mem), (void*)(gproperty + i))));
         OCL_ASSERT((clSetKernelArg(mcxkernel[i], 5, sizeof(cl_mem), (void*)(gnode + i))));
         OCL_ASSERT((clSetKernelArg(mcxkernel[i], 6, sizeof(cl_mem), (void*)(gelem + i))));
@@ -708,7 +724,7 @@ is more than what your have specified (%d), please use the -H option to specify
                     mcx_fflush(cfg->flog);
 
                     for (i = 0; i < fieldlen; i++) { //accumulate field, can be done in the GPU
-                        field[(i >> cfg->nbuffer)] += rawfield[i] + rawfield[i + fieldlen];    //+rawfield[i+fieldlen];
+                        field[i] += rawfield[i] + rawfield[i + fieldlen];    //+rawfield[i+fieldlen];
                     }
 
                     free(rawfield);
@@ -732,9 +748,6 @@ is more than what your have specified (%d), please use the -H option to specify
         }// iteration
     }// time gates
 
-    fieldlen = (fieldlen >> cfg->nbuffer);
-    field = realloc(field, sizeof(field[0]) * fieldlen);
-
     if (cfg->exportfield) {
         if (cfg->basisorder == 0 || cfg->method == rtBLBadouelGrid) {
             for (i = 0; i < fieldlen; i++) {
diff --git a/src/mmc_cl_host.h b/src/mmc_cl_host.h
index 2736392..e1367ef 100644
--- a/src/mmc_cl_host.h
+++ b/src/mmc_cl_host.h
@@ -96,7 +96,6 @@ typedef struct PRE_ALIGN(32) GPU_mcconfig {
     cl_int    e0;
     cl_int    isextdet;
     cl_int    framelen;
-    cl_uint   nbuffer;
     cl_uint   maxpropdet;
     cl_uint   normbuf;
     cl_int    issaveseed;
diff --git a/src/mmc_core.cl b/src/mmc_core.cl
index ef8d641..975c09a 100644
--- a/src/mmc_core.cl
+++ b/src/mmc_core.cl
@@ -239,6 +239,7 @@ inline __device__ __host__ float4 convert_float4_rte(float4 v) {
 #define R_C0               3.335640951981520e-12f  //1/C0 in s/mm
 
 #define JUST_ABOVE_ONE     1.0001f                 //test for boundary
+#define JUST_BELOW_ONE     0.9998f                 /**< test for boundary */
 #define SAME_VOXEL         -9999.f                 //scatter within a voxel
 #define NO_LAUNCH          9999                    //when fail to launch, for debug
 
@@ -283,8 +284,8 @@ typedef struct MMC_Ray {
     uint oldidx;
     float oldweight;
     unsigned int posidx;          /**< launch position index of the photon for pattern source type */
-    //int nexteid;                  /**< the index to the neighboring tet to be moved into */
-    //float4 bary0;                 /**< the Barycentric coordinate of the intersection with the tet */
+    //int nexteid;                /**< the index to the neighboring tet to be moved into */
+    //float4 bary0;               /**< the Barycentric coordinate of the intersection with the tet */
     float slen0;                  /**< initial unitless scattering length = length*mus */
     unsigned int photonid;        /**< index of the current photon */
 } ray __attribute__ ((aligned (16)));
@@ -301,13 +302,13 @@ typedef struct MMC_Parameter {
     uint   maxmedia;
     uint   detnum;
     int    voidtime;
-    int    srctype;                    /**< type of the source */
-    float4 srcparam1;                  /**< source parameters set 1 */
-    float4 srcparam2;                  /**< source parameters set 2 */
-    uint   issaveref;     /**<1 save diffuse reflectance at the boundary voxels, 0 do not save*/
+    int    srctype;               /**< type of the source */
+    float4 srcparam1;             /**< source parameters set 1 */
+    float4 srcparam2;             /**< source parameters set 2 */
+    uint   issaveref;             /**<1 save diffuse reflectance at the boundary voxels, 0 do not save*/
     uint   maxgate;
-    uint   debuglevel;           /**< debug flags */
-    int    reclen;                 /**< record (4-byte per record) number per detected photon */
+    uint   debuglevel;            /**< debug flags */
+    int    reclen;                /**< record (4-byte per record) number per detected photon, does not include srcnum buffer for photon-sharing */
     int    outputtype;
     int    elemlen;
     int    mcmethod;
@@ -325,12 +326,11 @@ typedef struct MMC_Parameter {
     int    e0;
     int    isextdet;
     int    framelen;
-    uint   nbuffer;
     uint   maxpropdet;
     uint   normbuf;
     int    issaveseed;
     int    seed;
-    uint   maxjumpdebug;         /**< max number of positions to be saved to save photon trajectory when -D M is used */
+    uint   maxjumpdebug;          /**< max number of positions to be saved to save photon trajectory when -D M is used */
 } MCXParam __attribute__ ((aligned (16)));
 
 typedef struct MMC_Reporter {
@@ -339,10 +339,10 @@ typedef struct MMC_Reporter {
 } MCXReporter  __attribute__ ((aligned (16)));
 
 typedef struct MCX_medium {
-    float mua;                     /**<absorption coeff in 1/mm unit*/
-    float mus;                     /**<scattering coeff in 1/mm unit*/
-    float g;                       /**<anisotropy*/
-    float n;                       /**<refractive index*/
+    float mua;                    /**<absorption coeff in 1/mm unit*/
+    float mus;                    /**<scattering coeff in 1/mm unit*/
+    float g;                      /**<anisotropy*/
+    float n;                      /**<refractive index*/
 } Medium __attribute__ ((aligned (16)));
 
 #ifdef __NVCC__
@@ -476,24 +476,6 @@ __device__ inline float atomicadd(volatile __global float* address, const float
 }
 #endif
 
-/*
-
-#pragma OPENCL EXTENSION cl_khr_fp64: enable
-#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
-
-inline double atomicadd(__global double *val, const double delta){
-  union {
-  double f;
-  ulong  i;
-  } old, new;
-
-  do{
-     old.f = *val;
-     new.f = old.f + delta;
-  } while (atom_cmpxchg((volatile __global ulong *)val, old.i, new.i) != old.i);
-  return old.f;
-}
-*/
 #endif
 
 #endif
@@ -522,7 +504,7 @@ __device__ uint finddetector(float3* p0, __constant float4* gmed, __constant MCX
 }
 
 __device__ void savedetphoton(__global float* n_det, __global uint* detectedphoton,
-                              __local float* ppath, float3* p0, float3* v, __constant Medium* gmed,
+                              __local float* ppath, ray* r, __constant Medium* gmed,
                               int extdetid, __constant MCXParam* gcfg, __global RandType* photonseed, RandType* initseed) {
     uint detid = (extdetid < 0) ? finddetector(p0, (__constant float4*)gmed, gcfg) : extdetid;
 
@@ -550,12 +532,12 @@ __device__ void savedetphoton(__global float* n_det, __global uint* detectedphot
             }
 
             if (GPU_PARAM(gcfg, issaveexit)) {
-                n_det[baseaddr++] = p0->x;
-                n_det[baseaddr++] = p0->y;
-                n_det[baseaddr++] = p0->z;
-                n_det[baseaddr++] = v->x;
-                n_det[baseaddr++] = v->y;
-                n_det[baseaddr++] = v->z;
+                n_det[baseaddr++] = r->p0.x;
+                n_det[baseaddr++] = r->p0.y;
+                n_det[baseaddr++] = r->p0.z;
+                n_det[baseaddr++] = r->v.x;
+                n_det[baseaddr++] = r->v.y;
+                n_det[baseaddr++] = r->v.z;
             }
 
             n_det[baseaddr++] = ppath[GPU_PARAM(gcfg, reclen) - 1]; // save partial pathlength to the memory
@@ -600,7 +582,7 @@ __device__ void savedebugdata(ray* r, uint id, __global MCXReporter* reporter, _
  * \param[out] visit: statistics counters of this thread
  */
 
-__device__ float branchless_badouel_raytet(ray* r, __constant MCXParam* gcfg, __global int* elem, __global float* weight,
+__device__ float branchless_badouel_raytet(ray* r, __constant MCXParam* gcfg, __local float* ppath, __global int* elem, __global float* weight,
         int type, __global int* facenb, __global float4* normal, __constant Medium* gmed, __global float* replayweight, __global float* replaytime) {
 
     float Lmin;
@@ -719,20 +701,41 @@ __device__ float branchless_badouel_raytet(ray* r, __constant MCXParam* gcfg, __
 #ifndef DO_NOT_SAVE
 
                     if (r->oldweight > 0.f) {
+                        if (GPU_PARAM(gcfg, srctype) != stPattern || GPU_PARAM(gcfg, srcnum) == 1) {
 #ifdef USE_ATOMIC
-                        float oldval = atomicadd(weight + r->oldidx, r->oldweight);
-
-                        if (oldval > MAX_ACCUM) {
-                            if (atomicadd(weight + r->oldidx, -oldval) < 0.0f) {
-                                atomicadd(weight + r->oldidx, oldval);
-                            } else {
-                                atomicadd(weight + r->oldidx + gcfg->crop0.w, oldval);
+                            float oldval = atomicadd(weight + r->oldidx, r->oldweight);
+
+                            if (oldval > MAX_ACCUM) {
+                                if (atomicadd(weight + r->oldidx, -oldval) < 0.0f) {
+                                    atomicadd(weight + r->oldidx, oldval);
+                                } else {
+                                    atomicadd(weight + r->oldidx + gcfg->crop0.w, oldval);
+                                }
                             }
-                        }
 
 #else
-                        weight[r->oldidx] += r->oldweight;
+                            weight[r->oldidx] += r->oldweight;
+#endif
+                        } else if (GPU_PARAM(gcfg, srctype) == stPattern) {
+
+                            for (int pidx = 0; pidx < GPU_PARAM(gcfg, srcnum); pidx++) {
+#ifdef USE_ATOMIC
+                                float oldval = atomicadd(weight + r->oldidx * GPU_PARAM(gcfg, srcnum) + pidx, r->oldweight * ppath[GPU_PARAM(gcfg, reclen) + pidx]);
+
+                                if (oldval > MAX_ACCUM) {
+                                    if (atomicadd(weight + r->oldidx * GPU_PARAM(gcfg, srcnum) + pidx, -oldval) < 0.0f) {
+                                        atomicadd(weight + r->oldidx * GPU_PARAM(gcfg, srcnum) + pidx, oldval);
+                                    } else {
+                                        atomicadd(weight + r->oldidx * GPU_PARAM(gcfg, srcnum) + pidx + gcfg->crop0.w, oldval);
+                                    }
+                                }
+
+#else
+                                weight[r->oldidx * GPU_PARAM(gcfg, srcnum) + pidx] += r->oldweight * ppath[GPU_PARAM(gcfg, reclen) + pidx];
 #endif
+                            }
+
+                        }
                     }
 
 #endif
@@ -745,29 +748,54 @@ __device__ float branchless_badouel_raytet(ray* r, __constant MCXParam* gcfg, __
 #ifndef DO_NOT_SAVE
 
                 if (r->faceid == -2 || !r->isend) {
+
+                    if (GPU_PARAM(gcfg, srctype) != stPattern || GPU_PARAM(gcfg, srcnum) == 1) {
+
 #ifdef USE_ATOMIC
-                    float oldval = atomicadd(weight + newidx, r->oldweight);
+                        float oldval = atomicadd(weight + newidx, r->oldweight);
 
-                    if (oldval > MAX_ACCUM) {
-                        if (atomicadd(weight + r->oldidx, -oldval) < 0.0f) {
-                            atomicadd(weight + r->oldidx, oldval);
-                        } else {
-                            atomicadd(weight + r->oldidx + gcfg->crop0.w, oldval);
+                        if (oldval > MAX_ACCUM) {
+                            if (atomicadd(weight + newidx, -oldval) < 0.0f) {
+                                atomicadd(weight + newidx, oldval);
+                            } else {
+                                atomicadd(weight + newidx + gcfg->crop0.w, oldval);
+                            }
                         }
-                    }
 
 #else
-                    weight[newidx] += r->oldweight;
+                        weight[newidx] += r->oldweight;
+#endif
+                    } else if (GPU_PARAM(gcfg, srctype) == stPattern) {
+
+                        for (int pidx = 0; pidx < GPU_PARAM(gcfg, srcnum); pidx++) {
+#ifdef USE_ATOMIC
+                            float oldval = atomicadd(weight + newidx * GPU_PARAM(gcfg, srcnum) + pidx, r->oldweight * ppath[GPU_PARAM(gcfg, reclen) + pidx]);
+
+                            if (oldval > MAX_ACCUM) {
+                                if (atomicadd(weight + newidx * GPU_PARAM(gcfg, srcnum) + pidx, -oldval) < 0.0f) {
+                                    atomicadd(weight + newidx * GPU_PARAM(gcfg, srcnum) + pidx, oldval);
+                                } else {
+                                    atomicadd(weight + newidx * GPU_PARAM(gcfg, srcnum) + pidx + gcfg->crop0.w, oldval);
+                                }
+                            }
+
+#else
+                            weight[newidx * GPU_PARAM(gcfg, srcnum) + pidx] += r->oldweight * ppath[GPU_PARAM(gcfg, reclen) + pidx];
 #endif
+                        }
+
+                    }
+
                     r->oldweight = 0.f;
                 }
 
-#endif
+#endif // for ifdef DO_NOT_SAVE
 #ifdef __NVCC__
             }
 
-#endif
-#endif
+#endif // for if (GPU_PARAM(gcfg, method) == rtBLBadouel
+#endif // for ifdef USE_BLBADOUEL
+
 #ifdef USE_DMMC
 #ifdef __NVCC__
 
@@ -801,21 +829,45 @@ __device__ float branchless_badouel_raytet(ray* r, __constant MCXParam* gcfg, __
 
                     if (newidx != r->oldidx) {
 #ifndef DO_NOT_SAVE
-#ifdef USE_ATOMIC
-                        float oldval = atomicadd(weight + r->oldidx, r->oldweight);
 
-                        if (oldval > MAX_ACCUM) {
-                            if (atomicadd(weight + r->oldidx, -oldval) < 0.0f) {
-                                atomicadd(weight + r->oldidx, oldval);
-                            } else {
-                                atomicadd(weight + r->oldidx + gcfg->crop0.w, oldval);
+                        if (GPU_PARAM(gcfg, srctype) != stPattern || GPU_PARAM(gcfg, srcnum) == 1) {
+
+#ifdef USE_ATOMIC
+                            float oldval = atomicadd(weight + r->oldidx, r->oldweight);
+
+                            if (oldval > MAX_ACCUM) {
+                                if (atomicadd(weight + r->oldidx, -oldval) < 0.0f) {
+                                    atomicadd(weight + r->oldidx, oldval);
+                                } else {
+                                    atomicadd(weight + r->oldidx + gcfg->crop0.w, oldval);
+                                }
                             }
-                        }
 
 #else
-                        weight[r->oldidx] += r->oldweight;
+                            weight[r->oldidx] += r->oldweight;
 #endif
+                        } else if (GPU_PARAM(gcfg, srctype) == stPattern) {
+
+                            for (int pidx = 0; pidx < GPU_PARAM(gcfg, srcnum); pidx++) {
+#ifdef USE_ATOMIC
+                                float oldval = atomicadd(weight + r->oldidx * GPU_PARAM(gcfg, srcnum) + pidx, r->oldweight * ppath[GPU_PARAM(gcfg, reclen) + pidx]);
+
+                                if (oldval > MAX_ACCUM) {
+                                    if (atomicadd(weight + r->oldidx * GPU_PARAM(gcfg, srcnum) + pidx, -oldval) < 0.0f) {
+                                        atomicadd(weight + r->oldidx * GPU_PARAM(gcfg, srcnum) + pidx, oldval);
+                                    } else {
+                                        atomicadd(weight + r->oldidx * GPU_PARAM(gcfg, srcnum) + pidx + gcfg->crop0.w, oldval);
+                                    }
+                                }
+
+#else
+                                weight[r->oldidx * GPU_PARAM(gcfg, srcnum) + pidx] += r->oldweight * ppath[GPU_PARAM(gcfg, reclen) + pidx];
 #endif
+                            }
+
+                        }
+
+#endif // for ifdef DO_NOT_SAVE
                         r->oldidx = newidx;
                         r->oldweight = S.w * totalloss;
                     } else {
@@ -825,24 +877,49 @@ __device__ float branchless_badouel_raytet(ray* r, __constant MCXParam* gcfg, __
 #ifndef DO_NOT_SAVE
 
                     if (r->faceid == -2 || !r->isend) {
-#ifdef USE_ATOMIC
-                        float oldval = atomicadd(weight + r->oldidx, r->oldweight);
 
-                        if (oldval > MAX_ACCUM) {
-                            if (atomicadd(weight + r->oldidx, -oldval) < 0.0f) {
-                                atomicadd(weight + r->oldidx, oldval);
-                            } else {
-                                atomicadd(weight + r->oldidx + gcfg->crop0.w, oldval);
+                        if (GPU_PARAM(gcfg, srctype) != stPattern || GPU_PARAM(gcfg, srcnum) == 1) {
+
+#ifdef USE_ATOMIC
+                            float oldval = atomicadd(weight + newidx, r->oldweight);
+
+                            if (oldval > MAX_ACCUM) {
+                                if (atomicadd(weight + newidx, -oldval) < 0.0f) {
+                                    atomicadd(weight + newidx, oldval);
+                                } else {
+                                    atomicadd(weight + newidx + gcfg->crop0.w, oldval);
+                                }
                             }
-                        }
 
 #else
-                        weight[newidx] += r->oldweight;
+                            weight[newidx] += r->oldweight;
 #endif
+                        } else if (GPU_PARAM(gcfg, srctype) == stPattern) {
+
+                            for (int pidx = 0; pidx < GPU_PARAM(gcfg, srcnum); pidx++) {
+#ifdef USE_ATOMIC
+                                float oldval = atomicadd(weight + newidx * GPU_PARAM(gcfg, srcnum) + pidx, r->oldweight * ppath[GPU_PARAM(gcfg, reclen) + pidx]);
+
+                                if (oldval > MAX_ACCUM) {
+                                if (atomicadd(weight + newidx * GPU_PARAM(gcfg, srcnum) + pidx, -oldval) < 0.0f) {
+                                        atomicadd(weight + newidx * GPU_PARAM(gcfg, srcnum) + pidx, oldval);
+                                    } else {
+                                        atomicadd(weight + newidx * GPU_PARAM(gcfg, srcnum) + pidx + gcfg->crop0.w, oldval);
+                                    }
+                                }
+
+#else
+                                weight[newidx * GPU_PARAM(gcfg, srcnum) + pidx] += r->oldweight * ppath[GPU_PARAM(gcfg, reclen) + pidx];
+#endif
+                            }
+
+                        }
+
                         r->oldweight = 0.f;
                     }
 
-#endif
+#endif // for ifdef DO_NOT_SAVE
+
 #ifndef __NVCC__
                     S.w *= T.w;
                     S.xyz += T.xyz;
@@ -855,8 +932,8 @@ __device__ float branchless_badouel_raytet(ray* r, __constant MCXParam* gcfg, __
             }
 
 #endif
-#endif
-#endif
+#endif // for USE_DMMC
+#endif // for MCX_SKIP_VOLUME
         }
 
         r->p0 = r->p0 + FL3(r->Lmove) * r->vec;
@@ -1045,7 +1122,7 @@ __device__ void fixphoton(float3* p, __global float3* nodes, __global int* ee) {
  * \param[in,out] ran: the random number generator states
  */
 
-__device__ void launchphoton(__constant MCXParam* gcfg, ray* r, __global float3* node, __global int* elem, __global int* srcelem, __private RandType* ran, __global float* srcpattern) {
+__device__ void launchnewphoton(__constant MCXParam* gcfg, ray* r, __global float3* node, __global int* elem, __global int* srcelem, __private RandType* ran, __global float* srcpattern) {
     int canfocus = 1;
     float3 origin = r->p0;
 
@@ -1078,8 +1155,9 @@ __device__ void launchphoton(__constant MCXParam* gcfg, ray* r, __global float3*
 #endif
             int xsize = (int)gcfg->srcparam1.w;
             int ysize = (int)gcfg->srcparam2.w;
-            r->posidx = MIN((int)(ry * ysize), ysize - 1) * xsize + MIN((int)(rx * xsize), xsize - 1);
-            r->weight = srcpattern[MIN( (int)(ry * gcfg->srcparam2.w), (int)gcfg->srcparam2.w - 1 ) * (int)(gcfg->srcparam1.w) + MIN( (int)(rx * gcfg->srcparam1.w), (int)gcfg->srcparam1.w - 1 )];
+            r->posidx = MIN((int)(ry * JUST_BELOW_ONE * ysize), ysize - 1) * xsize + MIN((int)(rx * JUST_BELOW_ONE * xsize), xsize - 1);
+            r->weight = (GPU_PARAM(gcfg, srcnum) > 1) ? 1.f : srcpattern[r->posidx];
+
 #endif
 #if defined(__NVCC__) || defined(MCX_SRC_FOURIER)  // need to prevent rx/ry=1 here
 #ifdef __NVCC__
@@ -1385,17 +1463,28 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP
 #endif
 
     /*initialize the photon parameters*/
-    launchphoton(gcfg, &r, node, elem, srcelem, ran, srcpattern);
+    launchnewphoton(gcfg, &r, node, elem, srcelem, ran, srcpattern);
     *energytot += r.weight;
 #ifdef MCX_SAVE_DETECTORS
 
     if (GPU_PARAM(gcfg, issavedet)) {
         clearpath(ppath, GPU_PARAM(gcfg, reclen));  /*clear shared memory for saving history of a new photon*/
-        ppath[GPU_PARAM(gcfg, reclen) - 1] = r.weight; /*last record in partialpath is the initial photon weight*/
+
+        if (GPU_PARAM(gcfg, srctype) != stPattern || GPU_PARAM(gcfg, srcnum) == 1) {
+            ppath[GPU_PARAM(gcfg, reclen) - 1] = r.weight; /*last record in partialpath is the initial photon weight*/
+        } else if (GPU_PARAM(gcfg, srctype) == stPattern) {
+            *((uint*)(ppath + GPU_PARAM(gcfg, reclen) - 1)) = r.posidx;
+        }
     }
 
 #endif
 
+    if (GPU_PARAM(gcfg, srctype) == stPattern && GPU_PARAM(gcfg, srcnum) > 1) {
+        for (oldeid = 0; oldeid > GPU_PARAM(gcfg, srcnum); oldeid++) {
+            ppath[GPU_PARAM(gcfg, reclen) + oldeid] = srcpattern[r.posidx * GPU_PARAM(gcfg, srcnum) + oldeid];
+        }
+    }
+
     if (GPU_PARAM(gcfg, debuglevel) & dlTraj) {
         savedebugdata(&r, id, reporter, gdebugdata, gcfg);
     }
@@ -1404,7 +1493,7 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP
     /*http://stackoverflow.com/questions/2148149/how-to-sum-a-large-number-of-float-number*/
 
     while (1) { /*propagate a photon until exit*/
-        r.slen = branchless_badouel_raytet(&r, gcfg, elem, weight, type[r.eid - 1], facenb, normal, gmed, replayweight, replaytime);
+        r.slen = branchless_badouel_raytet(&r, gcfg, ppath, elem, weight, type[r.eid - 1], facenb, normal, gmed, replayweight, replaytime);
         (*raytet)++;
 
         if (r.pout.x == MMC_UNDEFINED) {
@@ -1477,7 +1566,7 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP
                 GPUDEBUG(("P %f %f %f %d %u %e\n", r.pout.x, r.pout.y, r.pout.z, r.eid, id, r.slen));
             }
 
-            r.slen = branchless_badouel_raytet(&r, gcfg, elem, weight, type[r.eid - 1], facenb, normal, gmed, replayweight, replaytime);
+            r.slen = branchless_badouel_raytet(&r, gcfg, ppath, elem, weight, type[r.eid - 1], facenb, normal, gmed, replayweight, replaytime);
             (*raytet)++;
 #ifdef MCX_SAVE_DETECTORS
 
@@ -1495,7 +1584,7 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP
 
             while (r.pout.x == MMC_UNDEFINED && fixcount++ < MAX_TRIAL) {
                 fixphoton(&r.p0, node, (__global int*)(elem + (r.eid - 1)*GPU_PARAM(gcfg, elemlen)));
-                r.slen = branchless_badouel_raytet(&r, gcfg, elem, weight, type[r.eid - 1], facenb, normal, gmed, replayweight, replaytime);
+                r.slen = branchless_badouel_raytet(&r, gcfg, ppath, elem, weight, type[r.eid - 1], facenb, normal, gmed, replayweight, replaytime);
                 (*raytet)++;
 #ifdef MCX_SAVE_DETECTORS
 
@@ -1550,17 +1639,17 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP
 #ifdef MCX_SAVE_SEED
 
                     if (GPU_PARAM(gcfg, isextdet) && type[oldeid - 1] == GPU_PARAM(gcfg, maxmedia) + 1) {
-                        savedetphoton(n_det, detectedphoton, ppath, &(r.p0), &(r.vec), gmed, oldeid, gcfg, photonseed, initseed);
+                        savedetphoton(n_det, detectedphoton, ppath, &r, gmed, oldeid, gcfg, photonseed, initseed);
                     } else {
-                        savedetphoton(n_det, detectedphoton, ppath, &(r.p0), &(r.vec), gmed, -1, gcfg, photonseed, initseed);
+                        savedetphoton(n_det, detectedphoton, ppath, &r, gmed, -1, gcfg, photonseed, initseed);
                     }
 
 #else
 
                     if (GPU_PARAM(gcfg, isextdet) && type[oldeid - 1] == GPU_PARAM(gcfg, maxmedia) + 1) {
-                        savedetphoton(n_det, detectedphoton, ppath, &(r.p0), &(r.vec), gmed, oldeid, gcfg, photonseed, NULL);
+                        savedetphoton(n_det, detectedphoton, ppath, &r, gmed, oldeid, gcfg, photonseed, NULL);
                     } else {
-                        savedetphoton(n_det, detectedphoton, ppath, &(r.p0), &(r.vec), gmed, -1, gcfg, photonseed, NULL);
+                        savedetphoton(n_det, detectedphoton, ppath, &r, gmed, -1, gcfg, photonseed, NULL);
                     }
 
 #endif
@@ -1643,7 +1732,8 @@ __kernel void mmc_main_loop(const int nphoton, const int ophoton,
                 t[j] = replayseed[(idx * nphoton + MIN(idx, ophoton) + i) * RAND_BUF_LEN + j];
             }
 
-        onephoton(idx * nphoton + MIN(idx, ophoton) + i, sharedmem + get_local_id(0)*GPU_PARAM(gcfg, reclen), gcfg, node, elem,
+        onephoton(idx * nphoton + MIN(idx, ophoton) + i, sharedmem + get_local_id(0) *
+                  (GPU_PARAM(gcfg, reclen) + (GPU_PARAM(gcfg, srcnum) > 1) * GPU_PARAM(gcfg, srcnum)), gcfg, node, elem,
                   weight, dref, type, facenb, srcelem, normal, gmed, n_det, detectedphoton, &energytot, &energyesc, t, &raytet,
                   srcpattern, replayweight, replaytime, photonseed, reporter, gdebugdata);
     }
diff --git a/src/mmc_cu_host.cu b/src/mmc_cu_host.cu
index c4b7b7d..e50a1e6 100644
--- a/src/mmc_cu_host.cu
+++ b/src/mmc_cu_host.cu
@@ -224,8 +224,7 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo
     float*  greplayweight = NULL, *greplaytime = NULL;
 
     MCXReporter* greporter;
-    uint meshlen = ((cfg->method == rtBLBadouelGrid) ? cfg->crop0.z : mesh->ne)
-                   << cfg->nbuffer; // use 4 copies to reduce racing
+    uint meshlen = ((cfg->method == rtBLBadouelGrid) ? cfg->crop0.z : mesh->ne) * cfg->srcnum;
     cfg->crop0.w = meshlen * cfg->maxgate; // offset for the second buffer
 
     float* field, *dref = NULL;
@@ -237,6 +236,8 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo
     uint detreclen = (2 + ((cfg->ismomentum) > 0)) * mesh->prop +
                      (cfg->issaveexit > 0) * 6 + 1;
     uint hostdetreclen = detreclen + 1;
+    // launch mcxkernel
+    size_t sharedmemsize = 0;
 
     MCXParam param = {make_float3(cfg->srcpos.x, cfg->srcpos.y, cfg->srcpos.z),
                       make_float3(cfg->srcdir.x, cfg->srcdir.y, cfg->srcdir.z),
@@ -280,7 +281,6 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo
                       cfg->e0,
                       cfg->isextdet,
                       (int)meshlen,
-                      cfg->nbuffer,
                       (uint)(mesh->prop + 1 + cfg->isextdet) + cfg->detnum,
                       (uint)(MIN((MAX_PROP - (mesh->prop + 1 + cfg->isextdet) - cfg->detnum), ((mesh->ne) << 2)) >> 2), /*max count of elem normal data in const mem*/
                       cfg->issaveseed,
@@ -290,6 +290,17 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo
 
     MCXReporter reporter = {0.f, 0};
 
+    if (cfg->issavedet) {
+        sharedmemsize = sizeof(float) * detreclen;
+    }
+
+    param.reclen = sharedmemsize / sizeof(float);  //< the shared memory buffer length associated with detected photon
+
+    if (cfg->srctype == stPattern && cfg->srcnum > 1) {
+        sharedmemsize += sizeof(float) * cfg->srcnum;
+    }
+
+    sharedmemsize *= ((int)gpu[gpuid].autoblock);
 
 #ifdef _OPENMP
     threadid = omp_get_thread_num();
@@ -502,17 +513,17 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo
 
     if (cfg->srctype == MCX_SRC_PATTERN) {
         CUDA_ASSERT(cudaMalloc((void**)&gsrcpattern,
-                               sizeof(float) * (int)(cfg->srcparam1.w * cfg->srcparam2.w)));
+                               sizeof(float) * (int)(cfg->srcparam1.w * cfg->srcparam2.w * cfg->srcnum)));
         CUDA_ASSERT(cudaMemcpy(gsrcpattern, cfg->srcpattern,
-                               sizeof(float) * (int)(cfg->srcparam1.w * cfg->srcparam2.w),
+                               sizeof(float) * (int)(cfg->srcparam1.w * cfg->srcparam2.w * cfg->srcnum),
                                cudaMemcpyHostToDevice));
     } else if (cfg->srctype == MCX_SRC_PATTERN3D) {
         CUDA_ASSERT(cudaMalloc((void**)&gsrcpattern,
                                sizeof(float) * (int)(cfg->srcparam1.x * cfg->srcparam1.y*
-                                       cfg->srcparam1.z)));
+                                       cfg->srcparam1.z * cfg->srcnum)));
         CUDA_ASSERT(cudaMemcpy(gsrcpattern, cfg->srcpattern,
                                sizeof(float) * (int)(cfg->srcparam1.x * cfg->srcparam1.y*
-                                       cfg->srcparam1.z),
+                                       cfg->srcparam1.z * cfg->srcnum),
                                cudaMemcpyHostToDevice));
     } else {
         gsrcpattern = NULL;
@@ -591,14 +602,8 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo
             param.tstart = twindow0;
             param.tend = twindow1;
 
-            // launch mcxkernel
-            size_t sharedMemSize = sizeof(int);
-
-            if (cfg->issavedet) {
-                sharedMemSize = sizeof(float) * ((int)gpu[gpuid].autoblock) * detreclen;
-            }
 
-            mmc_main_loop <<< mcgrid, mcblock, sharedMemSize>>>(
+            mmc_main_loop <<< mcgrid, mcblock, sharedmemsize>>>(
                 threadphoton, oddphotons, gnode, (int*)gelem, gweight, gdref,
                 gtype, (int*)gfacenb, gsrcelem, gnormal,
                 gdetphoton, gdetected, gseed, (int*)gprogress, genergy, greporter,
@@ -752,7 +757,7 @@ are more than what your have specified (%d), please use the --maxjumpdebug optio
                 mcx_fflush(cfg->flog);
 
                 for (i = 0; i < fieldlen; i++) { // accumulate field, can be done in the GPU
-                    field[(i >> cfg->nbuffer)] += rawfield[i] + rawfield[i + fieldlen];    //+rawfield[i+fieldlen];
+                    field[i] += rawfield[i] + rawfield[i + fieldlen];    //+rawfield[i+fieldlen];
                 }
 
                 free(rawfield);
@@ -780,8 +785,6 @@ are more than what your have specified (%d), please use the --maxjumpdebug optio
     #pragma omp master
     {
         int i, j;
-        fieldlen = (fieldlen >> cfg->nbuffer);
-        field = (float*)realloc(field, sizeof(field[0]) * fieldlen);
 
         if (cfg->exportfield) {
             if (cfg->basisorder == 0 || cfg->method == rtBLBadouelGrid) {
diff --git a/src/mmc_raytrace.c b/src/mmc_raytrace.c
index b0bea51..1a0e5d4 100644
--- a/src/mmc_raytrace.c
+++ b/src/mmc_raytrace.c
@@ -386,12 +386,11 @@ float plucker_raytet(ray* r, raytracer* tracer, mcconfig* cfg, visitor* visit) {
                     #pragma omp atomic
                     tracer->mesh->weight[eid + tshift] += ww;
                 } else if (cfg->srctype == stPattern) { // must be pattern and srcnum more than 1
-                    int psize = (int)cfg->srcparam1.w * (int)cfg->srcparam2.w; // total number of pixels in each pattern
                     int pidx; // pattern index
 
                     for (pidx = 0; pidx < cfg->srcnum; pidx++) {
                         #pragma omp atomic
-                        tracer->mesh->weight[(eid + tshift)*cfg->srcnum + pidx] += ww * cfg->srcpattern[pidx * psize + r->posidx];
+                        tracer->mesh->weight[(eid + tshift)*cfg->srcnum + pidx] += ww * cfg->srcpattern[r->posidx * cfg->srcnum + pidx];
                     }
                 }
             }
@@ -437,13 +436,12 @@ float plucker_raytet(ray* r, raytracer* tracer, mcconfig* cfg, visitor* visit) {
                                 tracer->mesh->weight[ee[i] - 1 + tshift] += ww * (baryp0[i] + baryout[i]);
                             }
                         } else if (cfg->srctype == stPattern) { // must be pattern and srcnum more than 1
-                            int psize = (int)cfg->srcparam1.w * (int)cfg->srcparam2.w; // total number of pixels in each pattern
                             int pidx; // pattern index
 
                             for (pidx = 0; pidx < cfg->srcnum; pidx++) {
                                 for (i = 0; i < 4; i++) {
                                     #pragma omp atomic
-                                    tracer->mesh->weight[(ee[i] - 1 + tshift)*cfg->srcnum + pidx] += ww * cfg->srcpattern[pidx * psize + r->posidx] * (baryp0[i] + baryout[i]);
+                                    tracer->mesh->weight[(ee[i] - 1 + tshift)*cfg->srcnum + pidx] += ww * cfg->srcpattern[r->posidx * cfg->srcnum + pidx] * (baryp0[i] + baryout[i]);
                                 }
                             }
                         }
@@ -716,12 +714,11 @@ float havel_raytet(ray* r, raytracer* tracer, mcconfig* cfg, visitor* visit) {
                         #pragma omp atomic
                         tracer->mesh->weight[eid + tshift] += ww;
                     } else if (cfg->srctype == stPattern) { // must be pattern and srcnum more than 1
-                        int psize = (int)cfg->srcparam1.w * (int)cfg->srcparam2.w; // total number of pixels in each pattern
                         int pidx; // pattern index
 
                         for (pidx = 0; pidx < cfg->srcnum; pidx++) {
                             #pragma omp atomic
-                            tracer->mesh->weight[(eid + tshift)*cfg->srcnum + pidx] += ww * cfg->srcpattern[pidx * psize + r->posidx];
+                            tracer->mesh->weight[(eid + tshift)*cfg->srcnum + pidx] += ww * cfg->srcpattern[r->posidx * cfg->srcnum + pidx];
                         }
                     }
                 } else {
@@ -734,13 +731,12 @@ float havel_raytet(ray* r, raytracer* tracer, mcconfig* cfg, visitor* visit) {
                             tracer->mesh->weight[ee[j] - 1 + tshift] += barypout[j];
                         }
                     } else if (cfg->srctype == stPattern) {
-                        int psize = (int)cfg->srcparam1.w * (int)cfg->srcparam2.w; // total number of pixels in each pattern
                         int pidx; // pattern index
 
                         for (pidx = 0; pidx < cfg->srcnum; pidx++) {
                             for (j = 0; j < 4; j++) {
                                 #pragma omp atomic
-                                tracer->mesh->weight[(ee[j] - 1 + tshift)*cfg->srcnum + pidx] += barypout[j] * cfg->srcpattern[pidx * psize + r->posidx];
+                                tracer->mesh->weight[(ee[j] - 1 + tshift)*cfg->srcnum + pidx] += barypout[j] * cfg->srcpattern[r->posidx * cfg->srcnum + pidx];
                             }
                         }
                     }
@@ -1405,7 +1401,6 @@ float branchless_badouel_raytet(ray* r, raytracer* tracer, mcconfig* cfg, visito
         medium* prop;
         int* enb, *ee = (int*)(tracer->mesh->elem + eid * tracer->mesh->elemlen);
         float mus;
-        int psize = (int)cfg->srcparam1.w * (int)cfg->srcparam2.w; // total number of pixels in each pattern
         int pidx; // pattern index
 
         if (cfg->implicit == 1 && r->inroi && tracer->mesh->edgeroi && fabs(tracer->mesh->edgeroi[eid * 6]) < EPS) {
@@ -1526,7 +1521,7 @@ float branchless_badouel_raytet(ray* r, raytracer* tracer, mcconfig* cfg, visito
                             } else if (cfg->srctype == stPattern) {
                                 for (pidx = 0; pidx < cfg->srcnum; pidx++) {
                                     #pragma omp atomic
-                                    tracer->mesh->weight[r->oldidx * cfg->srcnum + pidx] += r->oldweight * cfg->srcpattern[pidx * psize + r->posidx];
+                                    tracer->mesh->weight[r->oldidx * cfg->srcnum + pidx] += r->oldweight * cfg->srcpattern[r->posidx * cfg->srcnum + pidx];
                                 }
                             }
 
@@ -1543,7 +1538,7 @@ float branchless_badouel_raytet(ray* r, raytracer* tracer, mcconfig* cfg, visito
                             } else if (cfg->srctype == stPattern) {
                                 for (pidx = 0; pidx < cfg->srcnum; pidx++) {
                                     #pragma omp atomic
-                                    tracer->mesh->weight[newidx * cfg->srcnum + pidx] += r->oldweight * cfg->srcpattern[pidx * psize + r->posidx];
+                                    tracer->mesh->weight[newidx * cfg->srcnum + pidx] += r->oldweight * cfg->srcpattern[r->posidx * cfg->srcnum + pidx];
                                 }
                             }
 
@@ -1577,7 +1572,7 @@ float branchless_badouel_raytet(ray* r, raytracer* tracer, mcconfig* cfg, visito
                                     for (pidx = 0; pidx < cfg->srcnum; pidx++) {
                                         for (pidx = 0; pidx < cfg->srcnum; pidx++) {
                                             #pragma omp atomic
-                                            tracer->mesh->weight[r->oldidx * cfg->srcnum + pidx] += r->oldweight * cfg->srcpattern[pidx * psize + r->posidx];
+                                            tracer->mesh->weight[r->oldidx * cfg->srcnum + pidx] += r->oldweight * cfg->srcpattern[r->posidx * cfg->srcnum + pidx];
                                         }
                                     }
                                 }
@@ -1596,7 +1591,7 @@ float branchless_badouel_raytet(ray* r, raytracer* tracer, mcconfig* cfg, visito
                                     for (pidx = 0; pidx < cfg->srcnum; pidx++) {
                                         for (pidx = 0; pidx < cfg->srcnum; pidx++) {
                                             #pragma omp atomic
-                                            tracer->mesh->weight[newidx * cfg->srcnum + pidx] += r->oldweight * cfg->srcpattern[pidx * psize + r->posidx];
+                                            tracer->mesh->weight[newidx * cfg->srcnum + pidx] += r->oldweight * cfg->srcpattern[r->posidx * cfg->srcnum + pidx];
                                         }
                                     }
                                 }
@@ -1621,7 +1616,7 @@ float branchless_badouel_raytet(ray* r, raytracer* tracer, mcconfig* cfg, visito
                         for (pidx = 0; pidx < cfg->srcnum; pidx++) {
                             for (i = 0; i < 3; i++) {
                                 #pragma omp atomic
-                                tracer->mesh->weight[(ee[out[faceidx][i]] - 1 + tshift)*cfg->srcnum + pidx] += ww * cfg->srcpattern[pidx * psize + r->posidx];
+                                tracer->mesh->weight[(ee[out[faceidx][i]] - 1 + tshift)*cfg->srcnum + pidx] += ww * cfg->srcpattern[r->posidx * cfg->srcnum + pidx];
                             }
                         }
                     }
@@ -1761,13 +1756,11 @@ void onephoton(size_t id, raytracer* tracer, tetmesh* mesh, mcconfig* cfg,
     } else if (cfg->srctype == stPattern) {
         *((int*)(r.partialpath + visit->reclen - 2)) = r.posidx;
 
-        int psize = (int)cfg->srcparam1.w * (int)cfg->srcparam2.w;
-
         for (pidx = 0; pidx < cfg->srcnum; pidx++) {
             if (cfg->seed == SEED_FROM_FILE && (cfg->outputtype == otWL || cfg->outputtype == otWP)) {
                 kahany = cfg->replayweight[r.photonid] - visit->kahanc0[pidx];    /* when replay mode, accumulate detected photon weight */
             } else {
-                kahany = r.weight * cfg->srcpattern[pidx * psize + r.posidx] - visit->kahanc0[pidx];
+                kahany = r.weight * cfg->srcpattern[r.posidx * cfg->srcnum + pidx] - visit->kahanc0[pidx];
             }
 
             kahant = visit->launchweight[pidx] + kahany;
@@ -2033,10 +2026,8 @@ void onephoton(size_t id, raytracer* tracer, tetmesh* mesh, mcconfig* cfg,
         visit->kahanc1[0] = (kahant - visit->absorbweight[0]) - kahany;
         visit->absorbweight[0] = kahant;
     } else if (cfg->srctype == stPattern) {
-        int psize = (int)cfg->srcparam1.w * (int)cfg->srcparam2.w;
-
         for (pidx = 0; pidx < cfg->srcnum; pidx++) {
-            kahany = r.Eabsorb * cfg->srcpattern[pidx * psize + r.posidx] - visit->kahanc1[pidx];
+            kahany = r.Eabsorb * cfg->srcpattern[r.posidx * cfg->srcnum + pidx] - visit->kahanc1[pidx];
             kahant = visit->absorbweight[pidx] + kahany;
             visit->kahanc1[pidx] = (kahant - visit->absorbweight[pidx]) - kahany;
             visit->absorbweight[pidx] = kahant;
diff --git a/src/mmc_utils.c b/src/mmc_utils.c
index 9a8192d..aab0d51 100644
--- a/src/mmc_utils.c
+++ b/src/mmc_utils.c
@@ -326,7 +326,6 @@ void mcx_initcfg(mcconfig* cfg) {
     cfg->energyesc = 0.f;
     cfg->runtime = 0;
     cfg->autopilot = 1;
-    cfg->nbuffer = 0;
     cfg->gpuid = 0;
     cfg->maxjumpdebug = 10000000;
     cfg->exportdebugdata = NULL;
@@ -3216,8 +3215,6 @@ void mcx_parsecmd(int argc, char* argv[], mcconfig* cfg) {
                         i = mcx_readarg(argc, argv, i, &(cfg->debugphoton), "int");
                     } else if (strcmp(argv[i] + 2, "maxjumpdebug") == 0) {
                         i = mcx_readarg(argc, argv, i, &(cfg->maxjumpdebug), "int");
-                    } else if (strcmp(argv[i] + 2, "buffer") == 0) {
-                        i = mcx_readarg(argc, argv, i, &(cfg->nbuffer), "int");
                     } else if (strcmp(argv[i] + 2, "gridsize") == 0) {
                         i = mcx_readarg(argc, argv, i, &(cfg->steps.x), "float");
                         cfg->steps.y = cfg->steps.x;
diff --git a/src/mmc_utils.h b/src/mmc_utils.h
index 6086cd4..0e0487a 100644
--- a/src/mmc_utils.h
+++ b/src/mmc_utils.h
@@ -281,7 +281,6 @@ typedef struct MMC_config {
     unsigned int runtime;          /**<total simulation runtime in ms*/
     char autopilot;                /**<1 optimal setting for dedicated card, 2, for non dedicated card*/
     float normalizer;              /**<normalization factor*/
-    unsigned int nbuffer;          /**<2^nbuffer is the number of buffers for accummulation*/
     unsigned int gpuid;            /**<positive integer denotes the 1st/2nd/... OpenCL or CUDA devices, 0xFFFFFFFF for CPU only*/
     int compute;                   /**<0: sse, 1: opencl or 2: cuda*/
     char isdumpjson;               /**<1 to save json */
diff --git a/src/mmclab.cpp b/src/mmclab.cpp
index 1c5ff43..662b34f 100644
--- a/src/mmclab.cpp
+++ b/src/mmclab.cpp
@@ -443,7 +443,6 @@ void mmc_set_field(const mxArray* root, const mxArray* item, int idx, mcconfig*
     const dimtype* arraydim;
     char* jsonshapes = NULL;
     int i, j;
-    dimtype k;
 
     if (strcmp(name, "nphoton") == 0 && cfg->photonseed != NULL) {
         return;
@@ -778,25 +777,26 @@ void mmc_set_field(const mxArray* root, const mxArray* item, int idx, mcconfig*
         printf("mmc.session='%s';\n", cfg->session);
     } else if (strcmp(name, "srcpattern") == 0) {
         arraydim = mxGetDimensions(item);
+        dimtype dimz = 1;
+
+        if (mxGetNumberOfDimensions(item) == 3) {
+            dimz = arraydim[2];
+            cfg->srcnum = arraydim[0];
+        }
+
         double* val = mxGetPr(item);
 
         if (cfg->srcpattern) {
             free(cfg->srcpattern);
         }
 
-        if (mxGetNumberOfDimensions(item) == 3) {
-            cfg->srcnum = arraydim[2];
-        } else {
-            cfg->srcnum = 1;
-        }
-
-        cfg->srcpattern = (float*)malloc(arraydim[0] * arraydim[1] * cfg->srcnum * sizeof(float));
+        cfg->srcpattern = (float*)malloc(arraydim[0] * arraydim[1] * dimz * sizeof(float));
 
-        for (k = 0; k < arraydim[0]*arraydim[1]*cfg->srcnum; k++) {
-            cfg->srcpattern[k] = val[k];
+        for (i = 0; i < arraydim[0]*arraydim[1]*dimz; i++) {
+            cfg->srcpattern[i] = val[i];
         }
 
-        printf("mmc.srcpattern=[%d %d %d];\n", (int)arraydim[0], (int)arraydim[1], cfg->srcnum);
+        printf("mmc.srcpattern=[%ld %ld %ld];\n", arraydim[0], arraydim[1], dimz);
     } else if (strcmp(name, "method") == 0) {
         int len = mxGetNumberOfElements(item);
         const char* methods[] = {"plucker", "havel", "badouel", "elem", "grid", ""};