diff --git a/SDL/Kernels.h b/SDL/Kernels.h index 909ee561..4975c202 100644 --- a/SDL/Kernels.h +++ b/SDL/Kernels.h @@ -12,8 +12,9 @@ namespace SDL { ALPAKA_FN_ACC ALPAKA_FN_INLINE void rmQuintupletFromMemory(struct SDL::quintuplets& quintupletsInGPU, - unsigned int quintupletIndex) { - quintupletsInGPU.isDup[quintupletIndex] = true; + unsigned int quintupletIndex, + bool secondpass = false) { + quintupletsInGPU.isDup[quintupletIndex] |= 1 + secondpass; }; ALPAKA_FN_ACC ALPAKA_FN_INLINE void rmPixelTripletFromMemory(struct SDL::pixelTriplets& pixelTripletsInGPU, @@ -212,10 +213,8 @@ namespace SDL { float phi1 = __H2F(quintupletsInGPU.phi[ix]); float score_rphisum1 = __H2F(quintupletsInGPU.score_rphisum[ix]); - for (unsigned int jx1 = globalThreadIdx[2]; jx1 < nQuintuplets_lowmod; jx1 += gridThreadExtent[2]) { + for (unsigned int jx1 = globalThreadIdx[2] + ix1 + 1; jx1 < nQuintuplets_lowmod; jx1 += gridThreadExtent[2]) { unsigned int jx = quintupletModuleIndices_lowmod + jx1; - if (ix == jx) - continue; float eta2 = __H2F(quintupletsInGPU.eta[jx]); float phi2 = __H2F(quintupletsInGPU.phi[jx]); @@ -231,10 +230,10 @@ namespace SDL { int nMatched = checkHitsT5(ix, jx, quintupletsInGPU); if (nMatched >= 7) { - if (score_rphisum1 > score_rphisum2) { - rmQuintupletFromMemory(quintupletsInGPU, ix); - } else if ((score_rphisum1 == score_rphisum2) && (ix < jx)) { + if (score_rphisum1 >= score_rphisum2) { rmQuintupletFromMemory(quintupletsInGPU, ix); + } else { + rmQuintupletFromMemory(quintupletsInGPU, jx); } } } @@ -260,7 +259,7 @@ namespace SDL { unsigned int quintupletModuleIndices_lowmod1 = rangesInGPU.quintupletModuleIndices[lowmod1]; - for (unsigned int lowmodIdx2 = globalThreadIdx[2]; lowmodIdx2 < *(rangesInGPU.nEligibleT5Modules); + for (unsigned int lowmodIdx2 = globalThreadIdx[2] + lowmodIdx1; lowmodIdx2 < *(rangesInGPU.nEligibleT5Modules); lowmodIdx2 += gridThreadExtent[2]) { uint16_t lowmod2 = rangesInGPU.indicesOfEligibleT5Modules[lowmodIdx2]; unsigned int nQuintuplets_lowmod2 = quintupletsInGPU.nQuintuplets[lowmod2]; @@ -271,7 +270,7 @@ namespace SDL { for (unsigned int ix1 = 0; ix1 < nQuintuplets_lowmod1; ix1 += 1) { unsigned int ix = quintupletModuleIndices_lowmod1 + ix1; - if (quintupletsInGPU.partOfPT5[ix]) + if (quintupletsInGPU.partOfPT5[ix] || (quintupletsInGPU.isDup[ix] & 1)) continue; for (unsigned int jx1 = 0; jx1 < nQuintuplets_lowmod2; jx1++) { @@ -279,7 +278,7 @@ namespace SDL { if (ix == jx) continue; - if (quintupletsInGPU.partOfPT5[jx]) + if (quintupletsInGPU.partOfPT5[jx] || (quintupletsInGPU.isDup[jx] & 1)) continue; float eta1 = __H2F(quintupletsInGPU.eta[ix]); @@ -303,12 +302,11 @@ namespace SDL { int nMatched = checkHitsT5(ix, jx, quintupletsInGPU); if (dR2 < 0.001f || nMatched >= 5) { if (score_rphisum1 > score_rphisum2) { - rmQuintupletFromMemory(quintupletsInGPU, ix); - continue; - } - if ((score_rphisum1 == score_rphisum2) && (ix < jx)) { - rmQuintupletFromMemory(quintupletsInGPU, ix); - continue; + rmQuintupletFromMemory(quintupletsInGPU, ix, true); + } else if (score_rphisum1 < score_rphisum2) { + rmQuintupletFromMemory(quintupletsInGPU, jx, true); + } else { + rmQuintupletFromMemory(quintupletsInGPU, (ix < jx ? ix : jx), true); } } } diff --git a/SDL/Quintuplet.h b/SDL/Quintuplet.h index 13e34d68..a4259c39 100644 --- a/SDL/Quintuplet.h +++ b/SDL/Quintuplet.h @@ -26,7 +26,7 @@ namespace SDL { FPX* phi; FPX* score_rphisum; uint8_t* layer; - bool* isDup; + char* isDup; bool* TightCutFlag; bool* partOfPT5; @@ -85,7 +85,7 @@ namespace SDL { Buf phi_buf; Buf score_rphisum_buf; Buf layer_buf; - Buf isDup_buf; + Buf isDup_buf; Buf TightCutFlag_buf; Buf partOfPT5_buf; @@ -114,7 +114,7 @@ namespace SDL { phi_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), score_rphisum_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), layer_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), - isDup_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), + isDup_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), TightCutFlag_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), partOfPT5_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), regressionRadius_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), @@ -127,7 +127,7 @@ namespace SDL { nonAnchorChiSquared_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)) { alpaka::memset(queue, nQuintuplets_buf, 0u); alpaka::memset(queue, totOccupancyQuintuplets_buf, 0u); - alpaka::memset(queue, isDup_buf, false); + alpaka::memset(queue, isDup_buf, 0u); alpaka::memset(queue, TightCutFlag_buf, false); alpaka::memset(queue, partOfPT5_buf, false); alpaka::wait(queue); @@ -181,7 +181,7 @@ namespace SDL { quintupletsInGPU.phi[quintupletIndex] = __F2H(phi); quintupletsInGPU.score_rphisum[quintupletIndex] = __F2H(scores); quintupletsInGPU.layer[quintupletIndex] = layer; - quintupletsInGPU.isDup[quintupletIndex] = false; + quintupletsInGPU.isDup[quintupletIndex] = 0; quintupletsInGPU.TightCutFlag[quintupletIndex] = TightCutFlag; quintupletsInGPU.regressionRadius[quintupletIndex] = regressionRadius; quintupletsInGPU.regressionG[quintupletIndex] = regressionG; diff --git a/bin/sdl.cc b/bin/sdl.cc index 3adb17a2..3c37053d 100644 --- a/bin/sdl.cc +++ b/bin/sdl.cc @@ -298,12 +298,15 @@ int main(int argc, char** argv) void run_sdl() { SDL::Dev devAcc = alpaka::getDevByIdx(ALPAKA_ACCELERATOR_NAMESPACE::Platform{}, 0u); - SDL::QueueAcc queue(devAcc); + std::vector queues; + for (int s = 0; s < ana.streams; s++) { + queues.push_back(SDL::QueueAcc(devAcc)); + } // Load various maps used in the SDL reconstruction TStopwatch full_timer; full_timer.Start(); - loadMaps(devAcc, queue); + loadMaps(devAcc, queues[0]); float timeForMapLoading = full_timer.RealTime()*1000; if (ana.do_write_ntuple) @@ -385,7 +388,7 @@ void run_sdl() std::vector*> events; for (int s = 0; s < ana.streams; s++) { - SDL::Event *event = new SDL::Event(ana.verbose>=2, queue); + SDL::Event *event = new SDL::Event(ana.verbose>=2, queues[s]); events.push_back(event); } float timeForEventCreation = full_timer.RealTime()*1000;