From cacd983117d2ef70def8df3ef57d66416d905ccc Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Thu, 18 May 2023 19:12:08 -0400 Subject: [PATCH 01/44] remove unused memory functions --- SDL/MiniDoublet.cu | 8 -------- SDL/MiniDoublet.cuh | 1 - SDL/Module.cu | 33 --------------------------------- SDL/Module.cuh | 1 - SDL/PixelTriplet.cu | 25 ------------------------- SDL/PixelTriplet.cuh | 2 -- SDL/Segment.cu | 22 ---------------------- SDL/Segment.cuh | 1 - SDL/TrackCandidate.cu | 20 -------------------- SDL/TrackCandidate.cuh | 1 - SDL/Triplet.cu | 11 ----------- SDL/Triplet.cuh | 1 - 12 files changed, 126 deletions(-) diff --git a/SDL/MiniDoublet.cu b/SDL/MiniDoublet.cu index f3fb98bf..3cfab8dd 100644 --- a/SDL/MiniDoublet.cu +++ b/SDL/MiniDoublet.cu @@ -1,13 +1,5 @@ #include "MiniDoublet.cuh" -void SDL::miniDoublets::resetMemory(unsigned int nMemoryLocationsx, unsigned int nLowerModules,cudaStream_t stream) -{ - cudaMemsetAsync(anchorHitIndices,0, nMemoryLocationsx * 3 * sizeof(unsigned int),stream); - cudaMemsetAsync(dphichanges,0, nMemoryLocationsx * 9 * sizeof(float),stream); - cudaMemsetAsync(nMDs,0, (nLowerModules + 1) * sizeof(int),stream); - cudaMemsetAsync(totOccupancyMDs,0, (nLowerModules + 1) * sizeof(unsigned int),stream); -} - //FIXME:Add memory locations for the pixel MDs here! void SDL::createMDsInExplicitMemory(struct miniDoublets& mdsInGPU, unsigned int nMemoryLocations, uint16_t nLowerModules, unsigned int maxPixelMDs,cudaStream_t stream) { diff --git a/SDL/MiniDoublet.cuh b/SDL/MiniDoublet.cuh index 858a7fec..b8db05ca 100644 --- a/SDL/MiniDoublet.cuh +++ b/SDL/MiniDoublet.cuh @@ -64,7 +64,6 @@ namespace SDL ~miniDoublets(); void freeMemory(cudaStream_t stream); void freeMemoryCache(); - void resetMemory(unsigned int nMemoryLocations, unsigned int nModules,cudaStream_t stream); }; void createMDsInExplicitMemory(struct miniDoublets& mdsInGPU, unsigned int maxMDs,uint16_t nLowerModules, unsigned int maxPixelMDs,cudaStream_t stream); diff --git a/SDL/Module.cu b/SDL/Module.cu index 44427b52..e26eb899 100644 --- a/SDL/Module.cu +++ b/SDL/Module.cu @@ -163,39 +163,6 @@ void SDL::objectRanges::freeMemory() cudaFree(device_nTotalQuints); } -void SDL::freeModulesCache(struct modules& modulesInGPU,struct pixelMap& pixelMapping) -{ - int dev; - cudaGetDevice(&dev); - cms::cuda::free_device(dev,modulesInGPU.detIds); - cms::cuda::free_device(dev,modulesInGPU.moduleMap); - cms::cuda::free_device(dev,modulesInGPU.mapIdx); - cms::cuda::free_device(dev,modulesInGPU.mapdetId); - cms::cuda::free_device(dev,modulesInGPU.nConnectedModules); - cms::cuda::free_device(dev,modulesInGPU.drdzs); - cms::cuda::free_device(dev,modulesInGPU.slopes); - cms::cuda::free_device(dev,modulesInGPU.nModules); - cms::cuda::free_device(dev,modulesInGPU.nLowerModules); - cms::cuda::free_device(dev,modulesInGPU.layers); - cms::cuda::free_device(dev,modulesInGPU.rings); - cms::cuda::free_device(dev,modulesInGPU.modules); - cms::cuda::free_device(dev,modulesInGPU.rods); - cms::cuda::free_device(dev,modulesInGPU.subdets); - cms::cuda::free_device(dev,modulesInGPU.sides); - cms::cuda::free_device(dev,modulesInGPU.isInverted); - cms::cuda::free_device(dev,modulesInGPU.isLower); - cms::cuda::free_device(dev,modulesInGPU.isAnchor); - cms::cuda::free_device(dev,modulesInGPU.moduleType); - cms::cuda::free_device(dev,modulesInGPU.moduleLayerType); - cms::cuda::free_device(dev,modulesInGPU.connectedPixels); - cudaFreeHost(pixelMapping.connectedPixelsSizes); - cudaFreeHost(pixelMapping.connectedPixelsSizesPos); - cudaFreeHost(pixelMapping.connectedPixelsSizesNeg); - cudaFreeHost(pixelMapping.connectedPixelsIndex); - cudaFreeHost(pixelMapping.connectedPixelsIndexPos); - cudaFreeHost(pixelMapping.connectedPixelsIndexNeg); -} - void SDL::freeModules(struct modules& modulesInGPU, struct pixelMap& pixelMapping) { cudaFree(modulesInGPU.detIds); diff --git a/SDL/Module.cuh b/SDL/Module.cuh index c68d6a77..6e48abaf 100644 --- a/SDL/Module.cuh +++ b/SDL/Module.cuh @@ -143,7 +143,6 @@ namespace SDL void loadModulesFromFile(struct modules& modulesInGPU, uint16_t& nModules,uint16_t& nLowerModules,struct pixelMap& pixelMapping,cudaStream_t stream, const char* moduleMetaDataFilePath="data/centroid.txt"); void createModulesInExplicitMemory(struct modules& modulesInGPU,unsigned int nModules,cudaStream_t stream); void freeModules(struct modules& modulesInGPU,struct pixelMap& pixelMapping); - void freeModulesCache(struct modules& modulesInGPU,struct pixelMap& pixelMapping); void fillPixelMap(struct modules& modulesInGPU,struct pixelMap& pixelMapping,cudaStream_t stream); void fillConnectedModuleArrayExplicit(struct modules& modulesInGPU, unsigned int nModules,cudaStream_t stream); void fillMapArraysExplicit(struct modules& modulesInGPU, unsigned int nModules,cudaStream_t stream); diff --git a/SDL/PixelTriplet.cu b/SDL/PixelTriplet.cu index 456eb7d7..dc0a2496 100644 --- a/SDL/PixelTriplet.cu +++ b/SDL/PixelTriplet.cu @@ -69,19 +69,6 @@ SDL::pixelTriplets::~pixelTriplets() { } -void SDL::pixelTriplets::resetMemory(unsigned int maxPixelTriplets,cudaStream_t stream) -{ - cudaMemsetAsync(pixelSegmentIndices,0, maxPixelTriplets * sizeof(unsigned int),stream); - cudaMemsetAsync(tripletIndices, 0,maxPixelTriplets * sizeof(unsigned int),stream); - cudaMemsetAsync(nPixelTriplets, 0,sizeof(int),stream); - cudaMemsetAsync(totOccupancyPixelTriplets, 0,sizeof(int),stream); - cudaMemsetAsync(pixelRadius, 0,maxPixelTriplets * sizeof(FPX),stream); - cudaMemsetAsync(tripletRadius, 0,maxPixelTriplets * sizeof(FPX),stream); - cudaMemsetAsync(pt, 0,maxPixelTriplets * 6*sizeof(FPX),stream); - cudaMemsetAsync(isDup, 0,maxPixelTriplets * sizeof(bool),stream); - cudaMemsetAsync(partOfPT5, 0,maxPixelTriplets * sizeof(bool),stream); -} - void SDL::createPixelTripletsInExplicitMemory(struct pixelTriplets& pixelTripletsInGPU, unsigned int maxPixelTriplets, cudaStream_t stream) { #ifdef CACHE_ALLOC @@ -205,18 +192,6 @@ void SDL::pixelQuintuplets::freeMemory(cudaStream_t stream) cudaStreamSynchronize(stream); } -void SDL::pixelQuintuplets::resetMemory(unsigned int maxPixelQuintuplets,cudaStream_t stream) -{ - cudaMemsetAsync(pixelIndices,0, maxPixelQuintuplets * sizeof(unsigned int),stream); - cudaMemsetAsync(T5Indices,0, maxPixelQuintuplets * sizeof(unsigned int),stream); - cudaMemsetAsync(nPixelQuintuplets,0, sizeof(int),stream); - cudaMemsetAsync(totOccupancyPixelQuintuplets,0, sizeof(int),stream); - cudaMemsetAsync(isDup,0, maxPixelQuintuplets * sizeof(bool),stream); - cudaMemsetAsync(score,0, maxPixelQuintuplets * sizeof(FPX),stream); - cudaMemsetAsync(eta , 0, maxPixelQuintuplets * sizeof(FPX),stream); - cudaMemsetAsync(phi , 0, maxPixelQuintuplets * sizeof(FPX),stream); -} - void SDL::createPixelQuintupletsInExplicitMemory(struct SDL::pixelQuintuplets& pixelQuintupletsInGPU, unsigned int maxPixelQuintuplets,cudaStream_t stream) { #ifdef CACHE_ALLOC diff --git a/SDL/PixelTriplet.cuh b/SDL/PixelTriplet.cuh index 5fe49aa3..0f884bae 100644 --- a/SDL/PixelTriplet.cuh +++ b/SDL/PixelTriplet.cuh @@ -44,7 +44,6 @@ namespace SDL ~pixelTriplets(); void freeMemory(cudaStream_t stream); void freeMemoryCache(); - void resetMemory(unsigned int maxPixelTriplets,cudaStream_t stream); }; void createPixelTripletsInExplicitMemory(struct pixelTriplets& pixelTripletsinGPU, unsigned int maxPixelTriplets, cudaStream_t stream); @@ -1381,7 +1380,6 @@ namespace SDL ~pixelQuintuplets(); void freeMemory(cudaStream_t stream); void freeMemoryCache(); - void resetMemory(unsigned int maxPixelQuintuplets,cudaStream_t stream); }; diff --git a/SDL/Segment.cu b/SDL/Segment.cu index 79bd89a1..3d5f38eb 100644 --- a/SDL/Segment.cu +++ b/SDL/Segment.cu @@ -2,28 +2,6 @@ ///FIXME:NOTICE THE NEW maxPixelSegments! -void SDL::segments::resetMemory(unsigned int nMemoryLocationsx, unsigned int nLowerModules, unsigned int maxPixelSegments,cudaStream_t stream) -{ - cudaMemsetAsync(mdIndices,0, nMemoryLocationsx * 2 * sizeof(unsigned int),stream); - cudaMemsetAsync(innerLowerModuleIndices,0, nMemoryLocationsx * 2 * sizeof(uint16_t),stream); - cudaMemsetAsync(nSegments, 0,(nLowerModules+1) * sizeof(int),stream); - cudaMemsetAsync(totOccupancySegments, 0,(nLowerModules+1) * sizeof(int),stream); - cudaMemsetAsync(dPhis, 0,(nMemoryLocationsx * 6 )*sizeof(FPX),stream); - cudaMemsetAsync(ptIn, 0,(maxPixelSegments * 8)*sizeof(float),stream); - cudaMemsetAsync(superbin, 0,(maxPixelSegments )*sizeof(int),stream); - cudaMemsetAsync(pixelType, 0,(maxPixelSegments )*sizeof(int8_t),stream); - cudaMemsetAsync(isQuad, 0,(maxPixelSegments )*sizeof(char),stream); - cudaMemsetAsync(isDup, 0,(maxPixelSegments )*sizeof(bool),stream); - cudaMemsetAsync(score, 0,(maxPixelSegments )*sizeof(float),stream); - cudaMemsetAsync(charge, 0,maxPixelSegments * sizeof(int),stream); - cudaMemsetAsync(seedIdx, 0,maxPixelSegments * sizeof(unsigned int),stream); - cudaMemsetAsync(circleCenterX, 0,maxPixelSegments * sizeof(float),stream); - cudaMemsetAsync(circleCenterY, 0,maxPixelSegments * sizeof(float),stream); - cudaMemsetAsync(circleRadius, 0,maxPixelSegments * sizeof(float),stream); - cudaMemsetAsync(partOfPT5, 0,maxPixelSegments * sizeof(bool),stream); - cudaMemsetAsync(pLSHitsIdxs, 0,maxPixelSegments * sizeof(uint4),stream); -} - void SDL::createSegmentsInExplicitMemory(struct segments& segmentsInGPU, unsigned int nMemoryLocations, uint16_t nLowerModules, unsigned int maxPixelSegments, cudaStream_t stream) { //FIXME:Since the number of pixel segments is 10x the number of regular segments per module, we need to provide diff --git a/SDL/Segment.cuh b/SDL/Segment.cuh index 105e7ff1..0a9830cf 100644 --- a/SDL/Segment.cuh +++ b/SDL/Segment.cuh @@ -56,7 +56,6 @@ namespace SDL void freeMemory(cudaStream_t stream); void freeMemoryCache(); - void resetMemory(unsigned int nMemoryLocationsx, unsigned int nModules, unsigned int maxPixelSegments,cudaStream_t stream); }; void createSegmentsInExplicitMemory(struct segments& segmentsInGPU, unsigned int maxSegments, uint16_t nLowerModules, unsigned int maxPixelSegments,cudaStream_t stream); diff --git a/SDL/TrackCandidate.cu b/SDL/TrackCandidate.cu index d7c6dfdf..7853de30 100644 --- a/SDL/TrackCandidate.cu +++ b/SDL/TrackCandidate.cu @@ -1,25 +1,5 @@ #include "TrackCandidate.cuh" -void SDL::trackCandidates::resetMemory(unsigned int maxTrackCandidates,cudaStream_t stream) -{ - cudaMemsetAsync(trackCandidateType,0, maxTrackCandidates * sizeof(short),stream); - cudaMemsetAsync(directObjectIndices, 0, maxTrackCandidates * sizeof(unsigned int),stream); - cudaMemsetAsync(objectIndices, 0,2 * maxTrackCandidates * sizeof(unsigned int),stream); - cudaMemsetAsync(nTrackCandidates, 0,sizeof(int),stream); - cudaMemsetAsync(nTrackCandidatespT3, 0,sizeof(int),stream); - cudaMemsetAsync(nTrackCandidatesT5, 0,sizeof(int),stream); - cudaMemsetAsync(nTrackCandidatespT5,0, sizeof(int),stream); - cudaMemsetAsync(nTrackCandidatespLS, 0,sizeof(int),stream); - - cudaMemsetAsync(logicalLayers, 0, 7 * maxTrackCandidates * sizeof(uint8_t), stream); - cudaMemsetAsync(lowerModuleIndices, 0, 7 * maxTrackCandidates * sizeof(uint16_t), stream); - cudaMemsetAsync(hitIndices, 0, 14 * maxTrackCandidates * sizeof(unsigned int), stream); - cudaMemsetAsync(pixelSeedIndex, 0, maxTrackCandidates * sizeof(int), stream); - cudaMemsetAsync(centerX, 0, maxTrackCandidates * sizeof(FPX), stream); - cudaMemsetAsync(centerY, 0, maxTrackCandidates * sizeof(FPX), stream); - cudaMemsetAsync(radius , 0, maxTrackCandidates * sizeof(FPX), stream); -} - void SDL::createTrackCandidatesInExplicitMemory(struct trackCandidates& trackCandidatesInGPU, unsigned int maxTrackCandidates,cudaStream_t stream) { #ifdef CACHE_ALLOC diff --git a/SDL/TrackCandidate.cuh b/SDL/TrackCandidate.cuh index d8221ee2..d81a570d 100644 --- a/SDL/TrackCandidate.cuh +++ b/SDL/TrackCandidate.cuh @@ -36,7 +36,6 @@ namespace SDL ~trackCandidates(); void freeMemory(cudaStream_t stream); void freeMemoryCache(); - void resetMemory(unsigned int maxTrackCandidates,cudaStream_t stream); }; void createTrackCandidatesInExplicitMemory(struct trackCandidates& trackCandidatesInGPU, unsigned int maxTrackCandidates,cudaStream_t stream); diff --git a/SDL/Triplet.cu b/SDL/Triplet.cu index e568cc0a..218880e2 100644 --- a/SDL/Triplet.cu +++ b/SDL/Triplet.cu @@ -1,16 +1,5 @@ #include "Triplet.cuh" -void SDL::triplets::resetMemory(unsigned int maxTriplets, unsigned int nLowerModules,cudaStream_t stream) -{ - cudaMemsetAsync(segmentIndices,0, 5 * maxTriplets * sizeof(unsigned int),stream); - cudaMemsetAsync(nTriplets,0, nLowerModules * sizeof(unsigned int),stream); - cudaMemsetAsync(totOccupancyTriplets,0, nLowerModules * sizeof(unsigned int),stream); - cudaMemsetAsync(betaIn,0, maxTriplets * 3 * sizeof(FPX),stream); - cudaMemsetAsync(partOfPT5,0, maxTriplets * sizeof(bool),stream); - cudaMemsetAsync(partOfT5,0, maxTriplets * sizeof(bool), stream); - cudaMemsetAsync(partOfPT3, 0, maxTriplets * sizeof(bool), stream); -} - void SDL::createTripletsInExplicitMemory(struct triplets& tripletsInGPU, unsigned int maxTriplets, uint16_t nLowerModules, cudaStream_t stream) { #ifdef CACHE_ALLOC diff --git a/SDL/Triplet.cuh b/SDL/Triplet.cuh index a0278931..16ea085d 100644 --- a/SDL/Triplet.cuh +++ b/SDL/Triplet.cuh @@ -54,7 +54,6 @@ namespace SDL ~triplets(); void freeMemory(cudaStream_t stream); void freeMemoryCache(); - void resetMemory(unsigned int maxTriplets, unsigned int nLowerModules,cudaStream_t stream); }; void createTripletsInExplicitMemory(struct triplets& tripletsInGPU, unsigned int maxTriplets, uint16_t nLowerModules,cudaStream_t stream); From efa60af2d43f4072ca0b7fb58c5465045bca73d0 Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Wed, 31 May 2023 13:24:07 -0400 Subject: [PATCH 02/44] first working segment memory w/o ntuple --- Makefile | 10 +-- SDL/Constants.cuh | 48 ++++++++++++- SDL/Event.cu | 93 +++++++++++------------- SDL/Event.cuh | 14 +--- SDL/Makefile | 4 +- SDL/Segment.cu | 179 ---------------------------------------------- SDL/Segment.cuh | 159 +++++++++++++++++++++++++++++++++++----- 7 files changed, 236 insertions(+), 271 deletions(-) delete mode 100644 SDL/Segment.cu diff --git a/Makefile b/Makefile index 2f4210b5..e4f18272 100644 --- a/Makefile +++ b/Makefile @@ -19,7 +19,9 @@ CXXFLAGS = -g -O2 -Wall -fPIC -Wshadow -Woverloaded-virtual LDFLAGS = -g -O2 ROOTLIBS = $(shell root-config --libs) ROOTCFLAGS = $(foreach option, $(shell root-config --cflags), --compiler-options $(option)) -CFLAGS = $(ROOTCFLAGS) --compiler-options -Wall --compiler-options -Wno-unused-function --compiler-options -g --compiler-options -O2 --compiler-options -fPIC --compiler-options -fno-var-tracking -ISDL -I$(shell pwd) -Icode -Icode/core -I/mnt/data1/dsr/cub -I${CUDA_HOME}/include --compiler-options -fopenmp -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include +ALPAKAINCLUDE = -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED +ALPAKAFLAGS = -DALPAKA_ACC_GPU_CUDA_ENABLED -DALPAKA_ACC_GPU_CUDA_ONLY --expt-relaxed-constexpr -DALPAKA_DEBUG=0 +CFLAGS = $(ROOTCFLAGS) --compiler-options -Wall --compiler-options -Wno-unused-function --compiler-options -g --compiler-options -O0 --compiler-options -fPIC --compiler-options -fno-var-tracking -ISDL -I$(shell pwd) -Icode -Icode/core -I/mnt/data1/dsr/cub -I${CUDA_HOME}/include --compiler-options -fopenmp -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include EXTRACFLAGS = $(shell rooutil-config) EXTRAFLAGS = -fPIC -ITMultiDrawTreePlayer -Wunused-variable -lTMVA -lEG -lGenVector -lXMLIO -lMLP -lTreePlayer -L${CUDA_HOME}/lib64 -lcudart -fopenmp -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include DOQUINTUPLET = -DFP16_Base -DFP16_dPhi #-DFP16_circle -DFP16_seg -DFP16_T5 #-DDO_QUINTUPLET #-DDO_QUADRUPLET @@ -46,13 +48,13 @@ cutvalue_primitive: $(ROOUTIL) efficiency $(EXES) bin/doAnalysis: bin/doAnalysis.o $(OBJECTS) - $(CC) $(PT0P8) $(T3T3EXTENSION) $(LDFLAGS) $^ $(ROOTLIBS) $(EXTRACFLAGS) $(CUTVALUEFLAG) $(PRIMITIVEFLAG) $(EXTRAFLAGS) $(DOQUINTUPLET) -o $@ + $(CC) $(PT0P8) $(T3T3EXTENSION) $(LDFLAGS) $^ $(ROOTLIBS) $(EXTRACFLAGS) $(CUTVALUEFLAG) $(PRIMITIVEFLAG) $(EXTRAFLAGS) $(DOQUINTUPLET) $(ALPAKAINCLUDE) -o $@ bin/sdl: bin/sdl.o $(OBJECTS) - $(LD) $(PT0P8) $(T3T3EXTENSION) $(LDFLAGS) $^ $(ROOTLIBS) $(EXTRACFLAGS) $(CUTVALUEFLAG) $(PRIMITIVEFLAG) $(EXTRAFLAGS) $(DOQUINTUPLET) -o $@ + $(LD) $(PT0P8) $(T3T3EXTENSION) $(LDFLAGS) $^ $(ROOTLIBS) $(EXTRACFLAGS) $(CUTVALUEFLAG) $(PRIMITIVEFLAG) $(EXTRAFLAGS) $(DOQUINTUPLET) $(ALPAKAINCLUDE) -o $@ %.o: %.cc - $(CC) $(PT0P8) $(T3T3EXTENSION) $(CFLAGS) $(EXTRACFLAGS) $(CUTVALUEFLAG) $(PRIMITIVEFLAG) $(DOQUINTUPLET) $< -dc -o $@ + $(CC) $(PT0P8) $(T3T3EXTENSION) $(CFLAGS) $(EXTRACFLAGS) $(CUTVALUEFLAG) $(PRIMITIVEFLAG) $(DOQUINTUPLET) $(ALPAKAINCLUDE) $< -dc -o $@ $(ROOUTIL): $(MAKE) -C code/rooutil/ diff --git a/SDL/Constants.cuh b/SDL/Constants.cuh index 9990a402..b45e45e4 100644 --- a/SDL/Constants.cuh +++ b/SDL/Constants.cuh @@ -1,9 +1,8 @@ #ifndef Constants_cuh #define Constants_cuh -#include - #include +#include #ifdef FP16_Base //This changes pT5 and pT3 and T3 completely. T5 for non regression parameters #define __F2H __float2half @@ -51,6 +50,51 @@ typedef __half FPX_seg; typedef float FPX_seg; #endif +using Idx = std::size_t; +using Dim = alpaka::DimInt<3u>; +using Dim1d = alpaka::DimInt<1u>; +using Vec = alpaka::Vec; +using Vec1d = alpaka::Vec; +using QueueProperty = alpaka::NonBlocking; +using WorkDiv = alpaka::WorkDivMembers; + +// - AccGpuCudaRt +// - AccCpuThreads +// - AccCpuFibers +// - AccCpuSerial +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + using Acc = alpaka::AccGpuCudaRt; +#elif ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED + using Acc = alpaka::AccCpuThreads; +#elif ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLED + using Acc = alpaka::AccCpuFibers; +#elif ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED + using Acc = alpaka::AccCpuSerial; +#endif + +auto const devAcc = alpaka::getDevByIdx(0u); +using QueueAcc = alpaka::Queue; + +// Typical Buffer types used in the code. +using float_Buf = alpaka::Buf; +using int_Buf = alpaka::Buf; +using uint_Buf = alpaka::Buf; +using int8_t_Buf = alpaka::Buf; +using uint16_t_Buf = alpaka::Buf; +using char_Buf = alpaka::Buf; +using bool_Buf = alpaka::Buf; + +using FPX_Buf = alpaka::Buf; +using FPX_T5_Buf = alpaka::Buf; +using FPX_dPhi_Buf = alpaka::Buf; +using FPX_circle_Buf = alpaka::Buf; +using FPX_seg_Buf = alpaka::Buf; + +template +alpaka::Buf inline allocBufWrapper(TAcc const & devAcc, TSize nElements) { + return alpaka::allocBuf(devAcc, alpaka::Vec(static_cast(nElements))); +} + const unsigned int MAX_BLOCKS = 80; const unsigned int MAX_CONNECTED_MODULES = 40; const unsigned int N_MAX_PIXEL_MD_PER_MODULES = 100000; diff --git a/SDL/Event.cu b/SDL/Event.cu index 7bafc30e..a46c1a9a 100644 --- a/SDL/Event.cu +++ b/SDL/Event.cu @@ -66,18 +66,15 @@ SDL::Event::~Event() if(rangesInGPU){rangesInGPU->freeMemoryCache();} if(hitsInGPU){hitsInGPU->freeMemoryCache();} if(mdsInGPU){mdsInGPU->freeMemoryCache();} - if(segmentsInGPU){segmentsInGPU->freeMemoryCache();} if(tripletsInGPU){tripletsInGPU->freeMemoryCache();} if(quintupletsInGPU){quintupletsInGPU->freeMemoryCache();} if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemoryCache();} if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemoryCache();} if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemoryCache();} #else - if(rangesInGPU){rangesInGPU->freeMemory();} if(hitsInGPU){hitsInGPU->freeMemory();} if(mdsInGPU){mdsInGPU->freeMemory(stream);} - if(segmentsInGPU){segmentsInGPU->freeMemory(stream);} if(tripletsInGPU){tripletsInGPU->freeMemory(stream);} if(quintupletsInGPU){quintupletsInGPU->freeMemory(stream);} if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemory(stream);} @@ -266,18 +263,15 @@ void SDL::Event::resetEvent() if(mdsInGPU){mdsInGPU->freeMemoryCache();} if(quintupletsInGPU){quintupletsInGPU->freeMemoryCache();} if(rangesInGPU){rangesInGPU->freeMemoryCache();} - if(segmentsInGPU){segmentsInGPU->freeMemoryCache();} if(tripletsInGPU){tripletsInGPU->freeMemoryCache();} if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemoryCache();} if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemoryCache();} if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemoryCache();} - #else if(hitsInGPU){hitsInGPU->freeMemory();} if(quintupletsInGPU){quintupletsInGPU->freeMemory(stream);} if(rangesInGPU){rangesInGPU->freeMemory();} if(mdsInGPU){mdsInGPU->freeMemory(stream);} - if(segmentsInGPU){segmentsInGPU->freeMemory(stream);} if(tripletsInGPU){tripletsInGPU->freeMemory(stream);} if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemory(stream);} if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemory(stream);} @@ -308,7 +302,7 @@ void SDL::Event::resetEvent() mdsInGPU = nullptr;} if(rangesInGPU){cms::cuda::free_host(rangesInGPU); rangesInGPU = nullptr;} - if(segmentsInGPU){cms::cuda::free_host(segmentsInGPU); + if(segmentsInGPU){delete segmentsInGPU; segmentsInGPU = nullptr;} if(tripletsInGPU){cms::cuda::free_host(tripletsInGPU); tripletsInGPU = nullptr;} @@ -747,7 +741,7 @@ struct addPixelSegmentToEventKernel void SDL::Event::addPixelSegmentToEvent(std::vector hitIndices0,std::vector hitIndices1,std::vector hitIndices2,std::vector hitIndices3, std::vector dPhiChange, std::vector ptIn, std::vector ptErr, std::vector px, std::vector py, std::vector pz, std::vector eta, std::vector etaErr, std::vector phi, std::vector charge, std::vector seedIdx, std::vector superbin, std::vector pixelType, std::vector isQuad) { - const int size = ptIn.size(); + int size = ptIn.size(); unsigned int mdSize = 2 * size; uint16_t pixelModuleIndex = (*detIdToIndex)[1]; @@ -782,7 +776,6 @@ void SDL::Event::addPixelSegmentToEvent(std::vector hitIndices0,st } if(segmentsInGPU == nullptr) { - segmentsInGPU = (SDL::segments*)cms::cuda::allocate_host(sizeof(SDL::segments), stream); // can be optimized here: because we didn't distinguish pixel segments and outer-tracker segments and call them both "segments", so they use the index continuously. // If we want to further study the memory footprint in detail, we can separate the two and allocate different memories to them @@ -804,43 +797,47 @@ void SDL::Event::addPixelSegmentToEvent(std::vector hitIndices0,st cudaMemcpyAsync(&nTotalSegments,rangesInGPU->device_nTotalSegs,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream); cudaStreamSynchronize(stream); nTotalSegments += N_MAX_PIXEL_SEGMENTS_PER_MODULE; - createSegmentsInExplicitMemory(*segmentsInGPU, nTotalSegments, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE,stream); + + segmentsInGPU = new SDL::segments(nTotalSegments, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE, devAcc, queue); cudaMemcpyAsync(segmentsInGPU->nMemoryLocations, &nTotalSegments, sizeof(unsigned int), cudaMemcpyHostToDevice, stream);; cudaStreamSynchronize(stream); } - unsigned int* hitIndices0_dev = (unsigned int*)cms::cuda::allocate_device(dev, size*sizeof(unsigned int), stream); - unsigned int* hitIndices1_dev = (unsigned int*)cms::cuda::allocate_device(dev, size*sizeof(unsigned int), stream); - unsigned int* hitIndices2_dev = (unsigned int*)cms::cuda::allocate_device(dev, size*sizeof(unsigned int), stream); - unsigned int* hitIndices3_dev = (unsigned int*)cms::cuda::allocate_device(dev, size*sizeof(unsigned int), stream); - float* dPhiChange_dev = (float*)cms::cuda::allocate_device(dev, size*sizeof(float), stream); - - cudaMemcpyAsync(hitIndices0_dev, &hitIndices0[0], size*sizeof(unsigned int), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(hitIndices1_dev, &hitIndices1[0], size*sizeof(unsigned int), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(hitIndices2_dev, &hitIndices2[0], size*sizeof(unsigned int), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(hitIndices3_dev, &hitIndices3[0], size*sizeof(unsigned int), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(dPhiChange_dev, &dPhiChange[0], size*sizeof(float), cudaMemcpyHostToDevice, stream); - - cudaMemcpyAsync(segmentsInGPU->isQuad, &isQuad[0], size*sizeof(char), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(segmentsInGPU->ptIn, &ptIn[0], size*sizeof(float), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(segmentsInGPU->ptErr, &ptErr[0], size*sizeof(float), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(segmentsInGPU->px, &px[0], size*sizeof(float), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(segmentsInGPU->py, &py[0], size*sizeof(float), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(segmentsInGPU->pz, &pz[0], size*sizeof(float), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(segmentsInGPU->etaErr, &etaErr[0], size*sizeof(float), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(segmentsInGPU->eta, &eta[0], size*sizeof(float), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(segmentsInGPU->phi, &phi[0], size*sizeof(float), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(segmentsInGPU->charge, &charge[0], size*sizeof(int), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(segmentsInGPU->seedIdx, &seedIdx[0], size*sizeof(unsigned int), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(segmentsInGPU->superbin, &superbin[0], size*sizeof(int), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(segmentsInGPU->pixelType, &pixelType[0], size*sizeof(int8_t), cudaMemcpyHostToDevice, stream); + alpaka::Vec const extent(static_cast(size)); + + auto hitIndices0_dev = alpaka::allocBuf(devAcc, extent); + auto hitIndices1_dev = alpaka::allocBuf(devAcc, extent); + auto hitIndices2_dev = alpaka::allocBuf(devAcc, extent); + auto hitIndices3_dev = alpaka::allocBuf(devAcc, extent); + auto dPhiChange_dev = alpaka::allocBuf(devAcc, extent); + + alpaka::memcpy(queue, hitIndices0_dev, hitIndices0, size); + alpaka::memcpy(queue, hitIndices1_dev, hitIndices1, size); + alpaka::memcpy(queue, hitIndices2_dev, hitIndices2, size); + alpaka::memcpy(queue, hitIndices3_dev, hitIndices3, size); + alpaka::memcpy(queue, dPhiChange_dev, dPhiChange, size); + + alpaka::memcpy(queue, segmentsInGPU->ptIn_buf, ptIn, size); + alpaka::memcpy(queue, segmentsInGPU->ptErr_buf, ptErr, size); + alpaka::memcpy(queue, segmentsInGPU->px_buf, px, size); + alpaka::memcpy(queue, segmentsInGPU->py_buf, py, size); + alpaka::memcpy(queue, segmentsInGPU->pz_buf, pz, size); + alpaka::memcpy(queue, segmentsInGPU->etaErr_buf, etaErr, size); + alpaka::memcpy(queue, segmentsInGPU->isQuad_buf, isQuad, size); + alpaka::memcpy(queue, segmentsInGPU->eta_buf, eta, size); + alpaka::memcpy(queue, segmentsInGPU->phi_buf, phi, size); + alpaka::memcpy(queue, segmentsInGPU->charge_buf, charge, size); + alpaka::memcpy(queue, segmentsInGPU->seedIdx_buf, seedIdx, size); + alpaka::memcpy(queue, segmentsInGPU->superbin_buf, superbin, size); + alpaka::memcpy(queue, segmentsInGPU->pixelType_buf, pixelType, size); cudaMemcpyAsync(&(segmentsInGPU->nSegments)[pixelModuleIndex], &size, sizeof(int), cudaMemcpyHostToDevice, stream); cudaMemcpyAsync(&(segmentsInGPU->totOccupancySegments)[pixelModuleIndex], &size, sizeof(int), cudaMemcpyHostToDevice, stream); cudaMemcpyAsync(&(mdsInGPU->nMDs)[pixelModuleIndex], &mdSize, sizeof(unsigned int), cudaMemcpyHostToDevice, stream); cudaMemcpyAsync(&(mdsInGPU->totOccupancyMDs)[pixelModuleIndex], &mdSize, sizeof(unsigned int), cudaMemcpyHostToDevice, stream); cudaStreamSynchronize(stream); + alpaka::wait(queue); Vec const threadsPerBlock(static_cast(1), static_cast(1), static_cast(256)); Vec const blocksPerGrid(static_cast(1), static_cast(1), static_cast(MAX_BLOCKS)); @@ -855,23 +852,16 @@ void SDL::Event::addPixelSegmentToEvent(std::vector hitIndices0,st *hitsInGPU, *mdsInGPU, *segmentsInGPU, - hitIndices0_dev, - hitIndices1_dev, - hitIndices2_dev, - hitIndices3_dev, - dPhiChange_dev, + alpaka::getPtrNative(hitIndices0_dev), + alpaka::getPtrNative(hitIndices1_dev), + alpaka::getPtrNative(hitIndices2_dev), + alpaka::getPtrNative(hitIndices3_dev), + alpaka::getPtrNative(dPhiChange_dev), pixelModuleIndex, size)); alpaka::enqueue(queue, addPixelSegmentToEvent_task); alpaka::wait(queue); - - cms::cuda::free_device(dev, hitIndices0_dev); - cms::cuda::free_device(dev, hitIndices1_dev); - cms::cuda::free_device(dev, hitIndices2_dev); - cms::cuda::free_device(dev, hitIndices3_dev); - cms::cuda::free_device(dev, dPhiChange_dev); - cudaStreamSynchronize(stream); } void SDL::Event::addMiniDoubletsToEventExplicit() @@ -1055,8 +1045,7 @@ void SDL::Event::createSegmentsWithModuleMap() { if(segmentsInGPU == nullptr) { - segmentsInGPU = (SDL::segments*)cms::cuda::allocate_host(sizeof(SDL::segments), stream); - createSegmentsInExplicitMemory(*segmentsInGPU, nTotalSegments, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE,stream); + segmentsInGPU = new SDL::segments(nTotalSegments, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE, devAcc, queue); } Vec const threadsPerBlockCreateSeg(static_cast(1), static_cast(1), static_cast(64)); @@ -1133,7 +1122,7 @@ void SDL::Event::createTriplets() uint16_t *index_gpu; index_gpu = (uint16_t*)cms::cuda::allocate_device(dev, nLowerModules*sizeof(uint16_t), stream); unsigned int *nSegments = (unsigned int*)malloc(nLowerModules*sizeof(unsigned int)); - cudaMemcpyAsync((void *)nSegments, segmentsInGPU->nSegments, nLowerModules*sizeof(unsigned int), cudaMemcpyDeviceToHost,stream); + cudaMemcpyAsync((void *)nSegments, segmentsInGPU->nSegments, nLowerModules*sizeof(unsigned int), cudaMemcpyDeviceToHost,stream); cudaStreamSynchronize(stream); uint16_t* module_nConnectedModules; @@ -1141,7 +1130,7 @@ void SDL::Event::createTriplets() cudaMemcpyAsync(module_nConnectedModules,modulesInGPU->nConnectedModules,nLowerModules*sizeof(uint16_t),cudaMemcpyDeviceToHost,stream); cudaStreamSynchronize(stream); - for (uint16_t innerLowerModuleIndex = 0; innerLowerModuleIndex nSegments = new int[nLowerModules+1]; cudaMemcpyAsync(segmentsInCPU->nSegments, segmentsInGPU->nSegments, (nLowerModules+1) * sizeof(int), cudaMemcpyDeviceToHost,stream); diff --git a/SDL/Event.cuh b/SDL/Event.cuh index dad2c933..52e14448 100644 --- a/SDL/Event.cuh +++ b/SDL/Event.cuh @@ -14,27 +14,15 @@ #include "allocate.h" -// Temporary alpaka statements -using Dim = alpaka::DimInt<3u>; -using Idx = std::size_t; -using Vec = alpaka::Vec; -using QueueProperty = alpaka::NonBlocking; -using WorkDiv = alpaka::WorkDivMembers; - namespace SDL { class Event { private: + QueueAcc queue; cudaStream_t stream; bool addObjects; -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - using Acc = alpaka::AccGpuCudaRt; - using QueueAcc = alpaka::Queue; - QueueAcc queue; -#endif - std::array n_hits_by_layer_barrel_; std::array n_hits_by_layer_endcap_; std::array n_minidoublets_by_layer_barrel_; diff --git a/SDL/Makefile b/SDL/Makefile index a146a93d..c518ee68 100644 --- a/SDL/Makefile +++ b/SDL/Makefile @@ -20,7 +20,7 @@ LIB=libsdl.so CXX = nvcc CXXFLAGS = -g --compiler-options -Wall --compiler-options -Wshadow --compiler-options -Woverloaded-virtual --compiler-options -fPIC --compiler-options -fopenmp -dc -lineinfo --ptxas-options=-v --cudart shared -arch=compute_70 -I/mnt/data1/dsr/cub --use_fast_math --default-stream per-thread -I.. ROOTCFLAGS = --compiler-options -pthread --compiler-options -std=c++17 -m64 -I/cvmfs/cms.cern.ch/slc7_amd64_gcc900/cms/cmssw/CMSSW_11_2_0_pre5/external/slc7_amd64_gcc900/bin/../../../../../../../slc7_amd64_gcc900/lcg/root/6.20.06-ghbfee3/include -ALPAKAINCLUDE = -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include +ALPAKAINCLUDE = -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED ALPAKAFLAGS = -DALPAKA_ACC_GPU_CUDA_ENABLED -DALPAKA_ACC_GPU_CUDA_ONLY --expt-relaxed-constexpr -DALPAKA_DEBUG=0 LD = nvcc SOFLAGS = -g -shared --compiler-options -fPIC --cudart shared -arch=compute_70 -code=sm_72 @@ -45,7 +45,7 @@ CUTVALUEFLAG_FLAGS = -DCUT_VALUE_DEBUG $(LD) -x cu $(PT0P8) $(PRELOAD) $(T3T3EXTENSION) $(CXXFLAGS) $(LDFLAGS) $(ROOTLIBS) $(MEMFLAG) $(PRINTFLAG) $(CACHEFLAG) $(CUDALAUNCHFLAG) $(CUTVALUEFLAG) $(DUPLICATES) $(ALPAKAINCLUDE) $(ALPAKAFLAGS) $< -o $@ %_cpu.o : %.cc %.h - $(LD) -O2 $(PT0P8) $(PRELOAD) $(T3T3EXTENSION) $(CXXFLAGS) $(LDFLAGS) $(ROOTLIBS) $(MEMFLAG) $(PRINTFLAG) $(CACHEFLAG) $(CUDALAUNCHFLAG) $(DUPLICATES) $(ROOTCFLAGS) $(ALPAKAINCLUDE) $< -o $@ + $(LD) -O0 $(PT0P8) $(PRELOAD) $(T3T3EXTENSION) $(CXXFLAGS) $(LDFLAGS) $(ROOTLIBS) $(MEMFLAG) $(PRINTFLAG) $(CACHEFLAG) $(CUDALAUNCHFLAG) $(DUPLICATES) $(ROOTCFLAGS) $(ALPAKAINCLUDE) $< -o $@ $(LIB):$(CCOBJECTS) $(CUOBJECTS) #$(LIB):$(CUOBJECTS) diff --git a/SDL/Segment.cu b/SDL/Segment.cu deleted file mode 100644 index 3d5f38eb..00000000 --- a/SDL/Segment.cu +++ /dev/null @@ -1,179 +0,0 @@ -#include "Segment.cuh" - -///FIXME:NOTICE THE NEW maxPixelSegments! - -void SDL::createSegmentsInExplicitMemory(struct segments& segmentsInGPU, unsigned int nMemoryLocations, uint16_t nLowerModules, unsigned int maxPixelSegments, cudaStream_t stream) -{ - //FIXME:Since the number of pixel segments is 10x the number of regular segments per module, we need to provide - //extra memory to the pixel segments -#ifdef CACHE_ALLOC - int dev; - cudaGetDevice(&dev); - segmentsInGPU.mdIndices = (unsigned int*)cms::cuda::allocate_device(dev,nMemoryLocations*4 *sizeof(unsigned int),stream); - segmentsInGPU.innerLowerModuleIndices = (uint16_t*)cms::cuda::allocate_device(dev,nMemoryLocations*2 *sizeof(uint16_t),stream); - segmentsInGPU.nSegments = (int*)cms::cuda::allocate_device(dev, (nLowerModules + 1) *sizeof(int),stream); - segmentsInGPU.totOccupancySegments = (int*)cms::cuda::allocate_device(dev, (nLowerModules + 1) *sizeof(unsigned int),stream); - segmentsInGPU.dPhis = (FPX*)cms::cuda::allocate_device(dev,nMemoryLocations*6 *sizeof(FPX),stream); - segmentsInGPU.ptIn = (float*)cms::cuda::allocate_device(dev, maxPixelSegments * 8 *sizeof(float),stream); - segmentsInGPU.superbin = (int*)cms::cuda::allocate_device(dev,(maxPixelSegments) *sizeof(int),stream); - segmentsInGPU.pixelType = (int8_t*)cms::cuda::allocate_device(dev,(maxPixelSegments) *sizeof(int8_t),stream); - segmentsInGPU.isQuad = (char*)cms::cuda::allocate_device(dev,(maxPixelSegments) *sizeof(char),stream); - segmentsInGPU.isDup = (bool*)cms::cuda::allocate_device(dev,(maxPixelSegments) *sizeof(bool),stream); - segmentsInGPU.score = (float*)cms::cuda::allocate_device(dev,(maxPixelSegments) *sizeof(float),stream); - segmentsInGPU.charge = (int*)cms::cuda::allocate_device(dev, maxPixelSegments * sizeof(int), stream); - segmentsInGPU.seedIdx = (unsigned int*)cms::cuda::allocate_device(dev, maxPixelSegments * sizeof(unsigned int), stream); - segmentsInGPU.circleCenterX = (float*)cms::cuda::allocate_device(dev, maxPixelSegments * sizeof(float), stream); - segmentsInGPU.circleCenterY = (float*)cms::cuda::allocate_device(dev, maxPixelSegments * sizeof(float), stream); - segmentsInGPU.circleRadius = (float*)cms::cuda::allocate_device(dev, maxPixelSegments * sizeof(float), stream); - segmentsInGPU.partOfPT5 = (bool*)cms::cuda::allocate_device(dev, maxPixelSegments * sizeof(bool), stream); - segmentsInGPU.pLSHitsIdxs = (uint4*)cms::cuda::allocate_device(dev, maxPixelSegments * sizeof(uint4), stream); - segmentsInGPU.nMemoryLocations = (unsigned int*)cms::cuda::allocate_device(dev, sizeof(unsigned int), stream); -#else - cudaMalloc(&segmentsInGPU.mdIndices, nMemoryLocations * 4 * sizeof(unsigned int)); - cudaMalloc(&segmentsInGPU.innerLowerModuleIndices, nMemoryLocations * 2 * sizeof(uint16_t)); - cudaMalloc(&segmentsInGPU.nSegments, (nLowerModules + 1) * sizeof(int)); - cudaMalloc(&segmentsInGPU.totOccupancySegments, (nLowerModules + 1) * sizeof(int)); - cudaMalloc(&segmentsInGPU.dPhis, nMemoryLocations * 6 *sizeof(FPX)); - cudaMalloc(&segmentsInGPU.ptIn, maxPixelSegments * 8*sizeof(float)); - cudaMalloc(&segmentsInGPU.superbin, (maxPixelSegments )*sizeof(int)); - cudaMalloc(&segmentsInGPU.pixelType, (maxPixelSegments )*sizeof(int8_t)); - cudaMalloc(&segmentsInGPU.isQuad, (maxPixelSegments )*sizeof(char)); - cudaMalloc(&segmentsInGPU.isDup, (maxPixelSegments )*sizeof(bool)); - cudaMalloc(&segmentsInGPU.score, (maxPixelSegments )*sizeof(float)); - cudaMalloc(&segmentsInGPU.charge, maxPixelSegments * sizeof(int)); - cudaMalloc(&segmentsInGPU.seedIdx, maxPixelSegments * sizeof(unsigned int)); - cudaMalloc(&segmentsInGPU.circleCenterX, maxPixelSegments * sizeof(float)); - cudaMalloc(&segmentsInGPU.circleCenterY, maxPixelSegments * sizeof(float)); - cudaMalloc(&segmentsInGPU.circleRadius, maxPixelSegments * sizeof(float)); - cudaMalloc(&segmentsInGPU.partOfPT5, maxPixelSegments * sizeof(bool)); - cudaMalloc(&segmentsInGPU.pLSHitsIdxs, maxPixelSegments * sizeof(uint4)); - cudaMalloc(&segmentsInGPU.nMemoryLocations, sizeof(unsigned int)); -#endif - segmentsInGPU.outerLowerModuleIndices = segmentsInGPU.innerLowerModuleIndices + nMemoryLocations; - segmentsInGPU.innerMiniDoubletAnchorHitIndices = segmentsInGPU.mdIndices + nMemoryLocations * 2; - segmentsInGPU.outerMiniDoubletAnchorHitIndices = segmentsInGPU.mdIndices + nMemoryLocations * 3; - - segmentsInGPU.dPhiMins = segmentsInGPU.dPhis + nMemoryLocations; - segmentsInGPU.dPhiMaxs = segmentsInGPU.dPhis + nMemoryLocations * 2; - segmentsInGPU.dPhiChanges = segmentsInGPU.dPhis + nMemoryLocations * 3; - segmentsInGPU.dPhiChangeMins = segmentsInGPU.dPhis + nMemoryLocations * 4; - segmentsInGPU.dPhiChangeMaxs = segmentsInGPU.dPhis + nMemoryLocations * 5; - - segmentsInGPU.ptErr = segmentsInGPU.ptIn + maxPixelSegments; - segmentsInGPU.px = segmentsInGPU.ptIn + maxPixelSegments * 2; - segmentsInGPU.py = segmentsInGPU.ptIn + maxPixelSegments * 3; - segmentsInGPU.pz = segmentsInGPU.ptIn + maxPixelSegments * 4; - segmentsInGPU.etaErr = segmentsInGPU.ptIn + maxPixelSegments * 5; - segmentsInGPU.eta = segmentsInGPU.ptIn + maxPixelSegments * 6; - segmentsInGPU.phi = segmentsInGPU.ptIn + maxPixelSegments * 7; - - cudaMemsetAsync(segmentsInGPU.nSegments,0, (nLowerModules + 1) * sizeof(int),stream); - cudaMemsetAsync(segmentsInGPU.totOccupancySegments,0, (nLowerModules + 1) * sizeof(int),stream); - cudaMemsetAsync(segmentsInGPU.partOfPT5, false, maxPixelSegments * sizeof(bool),stream); - cudaMemsetAsync(segmentsInGPU.pLSHitsIdxs, 0, maxPixelSegments * sizeof(uint4),stream); - cudaMemsetAsync(segmentsInGPU.nMemoryLocations, nMemoryLocations, sizeof(unsigned int), stream); - cudaStreamSynchronize(stream); -} - -SDL::segments::segments() -{ - superbin = nullptr; - pixelType = nullptr; - isQuad = nullptr; - isDup = nullptr; - score = nullptr; - circleRadius = nullptr; - charge = nullptr; - seedIdx = nullptr; - circleCenterX = nullptr; - circleCenterY = nullptr; - mdIndices = nullptr; - innerLowerModuleIndices = nullptr; - outerLowerModuleIndices = nullptr; - innerMiniDoubletAnchorHitIndices = nullptr; - outerMiniDoubletAnchorHitIndices = nullptr; - - nSegments = nullptr; - totOccupancySegments = nullptr; - dPhis = nullptr; - dPhiMins = nullptr; - dPhiMaxs = nullptr; - dPhiChanges = nullptr; - dPhiChangeMins = nullptr; - dPhiChangeMaxs = nullptr; - partOfPT5 = nullptr; - pLSHitsIdxs = nullptr; -} - -SDL::segments::~segments() -{ -} - -void SDL::segments::freeMemoryCache() -{ - int dev; - cudaGetDevice(&dev); - cms::cuda::free_device(dev,mdIndices); - cms::cuda::free_device(dev,innerLowerModuleIndices); - cms::cuda::free_device(dev,dPhis); - cms::cuda::free_device(dev,ptIn); - cms::cuda::free_device(dev,nSegments); - cms::cuda::free_device(dev,totOccupancySegments); - cms::cuda::free_device(dev, charge); - cms::cuda::free_device(dev, seedIdx); - cms::cuda::free_device(dev,superbin); - cms::cuda::free_device(dev,pixelType); - cms::cuda::free_device(dev,isQuad); - cms::cuda::free_device(dev,isDup); - cms::cuda::free_device(dev,score); - cms::cuda::free_device(dev, circleCenterX); - cms::cuda::free_device(dev, circleCenterY); - cms::cuda::free_device(dev, circleRadius); - cms::cuda::free_device(dev, partOfPT5); - cms::cuda::free_device(dev, pLSHitsIdxs); - cms::cuda::free_device(dev, nMemoryLocations); -} - -void SDL::segments::freeMemory(cudaStream_t stream) -{ - cudaFree(mdIndices); - cudaFree(innerLowerModuleIndices); - cudaFree(nSegments); - cudaFree(totOccupancySegments); - cudaFree(dPhis); - cudaFree(ptIn); - cudaFree(superbin); - cudaFree(pixelType); - cudaFree(isQuad); - cudaFree(isDup); - cudaFree(score); - cudaFree(charge); - cudaFree(seedIdx); - cudaFree(circleCenterX); - cudaFree(circleCenterY); - cudaFree(circleRadius); - cudaFree(partOfPT5); - cudaFree(pLSHitsIdxs); - cudaFree(nMemoryLocations); -} - -void SDL::printSegment(struct SDL::segments& segmentsInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::hits& hitsInGPU, struct SDL::modules& modulesInGPU, unsigned int segmentIndex) -{ - unsigned int innerMDIndex = segmentsInGPU.mdIndices[segmentIndex * 2]; - unsigned int outerMDIndex = segmentsInGPU.mdIndices[segmentIndex * 2 + 1]; - std::cout<; + namespace SDL { struct segments { - unsigned int* nMemoryLocations; + // Buffer objects for each member variable + FPX_Buf dPhis_buf; + FPX_Buf dPhiMins_buf; + FPX_Buf dPhiMaxs_buf; + FPX_Buf dPhiChanges_buf; + FPX_Buf dPhiChangeMins_buf; + FPX_Buf dPhiChangeMaxs_buf; + + uint16_t_Buf innerLowerModuleIndices_buf; + uint16_t_Buf outerLowerModuleIndices_buf; + + uint_Buf seedIdx_buf; + uint_Buf mdIndices_buf; + uint_Buf innerMiniDoubletAnchorHitIndices_buf; + uint_Buf outerMiniDoubletAnchorHitIndices_buf; + uint_Buf nMemoryLocations_buf; + + int_Buf nSegments_buf; + int_Buf totOccupancySegments_buf; + int_Buf charge_buf; + int_Buf superbin_buf; + + uint4_Buf pLSHitsIdxs_buf; + + int8_t_Buf pixelType_buf; + + char_Buf isQuad_buf; + + bool_Buf isDup_buf; + bool_Buf partOfPT5_buf; + + float_Buf ptIn_buf; + float_Buf ptErr_buf; + float_Buf px_buf; + float_Buf py_buf; + float_Buf pz_buf; + float_Buf etaErr_buf; + float_Buf eta_buf; + float_Buf phi_buf; + float_Buf score_buf; + float_Buf circleCenterX_buf; + float_Buf circleCenterY_buf; + float_Buf circleRadius_buf; + + // Pointers towards the data of each buffer + FPX* dPhis; + FPX* dPhiMins; + FPX* dPhiMaxs; + FPX* dPhiChanges; + FPX* dPhiChangeMins; + FPX* dPhiChangeMaxs; - unsigned int* mdIndices; uint16_t* innerLowerModuleIndices; uint16_t* outerLowerModuleIndices; + + unsigned int* mdIndices; + unsigned int* nMemoryLocations; unsigned int* innerMiniDoubletAnchorHitIndices; unsigned int* outerMiniDoubletAnchorHitIndices; - + int* nSegments; //number of segments per inner lower module int* totOccupancySegments; //number of segments per inner lower module - FPX* dPhis; - FPX* dPhiMins; - FPX* dPhiMaxs; - FPX* dPhiChanges; - FPX* dPhiChangeMins; - FPX* dPhiChangeMaxs; - //not so optional pixel dudes float* ptIn; float* ptErr; float* px; @@ -51,15 +98,91 @@ namespace SDL bool* partOfPT5; uint4* pLSHitsIdxs; - segments(); - ~segments(); - - void freeMemory(cudaStream_t stream); - void freeMemoryCache(); + template + segments(unsigned int nMemoryLocationsIn, + uint16_t nLowerModules, + unsigned int maxPixelSegments, + TAcc const & devAcc, + TQueue& queue) : + mdIndices_buf(allocBufWrapper(devAcc, nMemoryLocationsIn*2)), + innerMiniDoubletAnchorHitIndices_buf(allocBufWrapper(devAcc, nMemoryLocationsIn)), + outerMiniDoubletAnchorHitIndices_buf(allocBufWrapper(devAcc, nMemoryLocationsIn)), + innerLowerModuleIndices_buf(allocBufWrapper(devAcc, nMemoryLocationsIn)), + outerLowerModuleIndices_buf(allocBufWrapper(devAcc, nMemoryLocationsIn)), + nSegments_buf(allocBufWrapper(devAcc, nLowerModules + 1)), + totOccupancySegments_buf(allocBufWrapper(devAcc, nLowerModules + 1)), + dPhis_buf(allocBufWrapper(devAcc, nMemoryLocationsIn)), + dPhiMins_buf(allocBufWrapper(devAcc, nMemoryLocationsIn)), + dPhiMaxs_buf(allocBufWrapper(devAcc, nMemoryLocationsIn)), + dPhiChanges_buf(allocBufWrapper(devAcc, nMemoryLocationsIn)), + dPhiChangeMins_buf(allocBufWrapper(devAcc, nMemoryLocationsIn)), + dPhiChangeMaxs_buf(allocBufWrapper(devAcc, nMemoryLocationsIn)), + ptIn_buf(allocBufWrapper(devAcc, maxPixelSegments)), + ptErr_buf(allocBufWrapper(devAcc, maxPixelSegments)), + px_buf(allocBufWrapper(devAcc, maxPixelSegments)), + py_buf(allocBufWrapper(devAcc, maxPixelSegments)), + pz_buf(allocBufWrapper(devAcc, maxPixelSegments)), + etaErr_buf(allocBufWrapper(devAcc, maxPixelSegments)), + eta_buf(allocBufWrapper(devAcc, maxPixelSegments)), + phi_buf(allocBufWrapper(devAcc, maxPixelSegments)), + superbin_buf(allocBufWrapper(devAcc, maxPixelSegments)), + pixelType_buf(allocBufWrapper(devAcc, maxPixelSegments)), + isQuad_buf(allocBufWrapper(devAcc, maxPixelSegments)), + isDup_buf(allocBufWrapper(devAcc, maxPixelSegments)), + score_buf(allocBufWrapper(devAcc, maxPixelSegments)), + charge_buf(allocBufWrapper(devAcc, maxPixelSegments)), + seedIdx_buf(allocBufWrapper(devAcc, maxPixelSegments)), + circleCenterX_buf(allocBufWrapper(devAcc, maxPixelSegments)), + circleCenterY_buf(allocBufWrapper(devAcc, maxPixelSegments)), + circleRadius_buf(allocBufWrapper(devAcc, maxPixelSegments)), + partOfPT5_buf(allocBufWrapper(devAcc, maxPixelSegments)), + pLSHitsIdxs_buf(allocBufWrapper(devAcc, maxPixelSegments)), + nMemoryLocations_buf(allocBufWrapper(devAcc, 1)) + { + mdIndices = alpaka::getPtrNative(mdIndices_buf); + innerMiniDoubletAnchorHitIndices = alpaka::getPtrNative(innerMiniDoubletAnchorHitIndices_buf); + outerMiniDoubletAnchorHitIndices = alpaka::getPtrNative(outerMiniDoubletAnchorHitIndices_buf); + innerLowerModuleIndices = alpaka::getPtrNative(innerLowerModuleIndices_buf); + outerLowerModuleIndices = alpaka::getPtrNative(outerLowerModuleIndices_buf); + nSegments = alpaka::getPtrNative(nSegments_buf); + totOccupancySegments = alpaka::getPtrNative(totOccupancySegments_buf); + dPhis = alpaka::getPtrNative(dPhis_buf); + dPhiMins = alpaka::getPtrNative(dPhiMins_buf); + dPhiMaxs = alpaka::getPtrNative(dPhiMaxs_buf); + dPhiChanges = alpaka::getPtrNative(dPhiChanges_buf); + dPhiChangeMins = alpaka::getPtrNative(dPhiChangeMins_buf); + dPhiChangeMaxs = alpaka::getPtrNative(dPhiChangeMaxs_buf); + ptIn = alpaka::getPtrNative(ptIn_buf); + ptErr = alpaka::getPtrNative(ptErr_buf); + px = alpaka::getPtrNative(px_buf); + py = alpaka::getPtrNative(py_buf); + pz = alpaka::getPtrNative(pz_buf); + etaErr = alpaka::getPtrNative(etaErr_buf); + eta = alpaka::getPtrNative(eta_buf); + phi = alpaka::getPtrNative(phi_buf); + superbin = alpaka::getPtrNative(superbin_buf); + pixelType = alpaka::getPtrNative(pixelType_buf); + isQuad = alpaka::getPtrNative(isQuad_buf); + isDup = alpaka::getPtrNative(isDup_buf); + score = alpaka::getPtrNative(score_buf); + charge = alpaka::getPtrNative(charge_buf); + seedIdx = alpaka::getPtrNative(seedIdx_buf); + circleCenterX = alpaka::getPtrNative(circleCenterX_buf); + circleCenterY = alpaka::getPtrNative(circleCenterY_buf); + circleRadius = alpaka::getPtrNative(circleRadius_buf); + partOfPT5 = alpaka::getPtrNative(partOfPT5_buf); + pLSHitsIdxs = alpaka::getPtrNative(pLSHitsIdxs_buf); + nMemoryLocations = alpaka::getPtrNative(nMemoryLocations_buf); + + alpaka::memset(queue, nSegments_buf, 0u, nLowerModules + 1); + alpaka::memset(queue, totOccupancySegments_buf, 0u, nLowerModules + 1); + alpaka::memset(queue, partOfPT5_buf, 0u, maxPixelSegments); + alpaka::memset(queue, pLSHitsIdxs_buf, 0u, maxPixelSegments); + alpaka::memset(queue, nMemoryLocations_buf, nMemoryLocationsIn, 1); + alpaka::wait(queue); + } }; - void createSegmentsInExplicitMemory(struct segments& segmentsInGPU, unsigned int maxSegments, uint16_t nLowerModules, unsigned int maxPixelSegments,cudaStream_t stream); - ALPAKA_FN_ACC ALPAKA_FN_INLINE float isTighterTiltedModules_seg(struct modules& modulesInGPU, unsigned int moduleIndex) { // The "tighter" tilted modules are the subset of tilted modules that have smaller spacing @@ -563,8 +686,6 @@ namespace SDL } }; - void printSegment(struct segments& segmentsInGPU, struct miniDoublets& mdsInGPU, struct hits& hitsInGPU, struct modules& modulesInGPU, unsigned int segmentIndex); - struct createSegmentsInGPUv2 { template From 8ef1c923422563a607f822a459545ea63f623c02 Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Wed, 31 May 2023 15:21:23 -0400 Subject: [PATCH 03/44] temporary fix for ntuple writing --- SDL/Event.cu | 8 +++---- SDL/Event.cuh | 4 ++-- SDL/Segment.cuh | 43 +++++++++++++++++++++++++++++++++++ code/core/AccessHelper.cc | 4 ++-- code/core/write_sdl_ntuple.cc | 20 ++++++++-------- 5 files changed, 61 insertions(+), 18 deletions(-) diff --git a/SDL/Event.cu b/SDL/Event.cu index a46c1a9a..aba4381e 100644 --- a/SDL/Event.cu +++ b/SDL/Event.cu @@ -2114,15 +2114,15 @@ SDL::miniDoublets* SDL::Event::getMiniDoublets() return mdsInCPU; } -SDL::segments* SDL::Event::getSegments() +SDL::segments_temp* SDL::Event::getSegments() { if(segmentsInCPU == nullptr) { - segmentsInCPU = new SDL::segments(nTotalSegments, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE, devAcc, queue); - + segmentsInCPU = new SDL::segments_temp; + segmentsInCPU->nSegments = new int[nLowerModules+1]; cudaMemcpyAsync(segmentsInCPU->nSegments, segmentsInGPU->nSegments, (nLowerModules+1) * sizeof(int), cudaMemcpyDeviceToHost,stream); - + segmentsInCPU->nMemoryLocations = new unsigned int; cudaMemcpyAsync(segmentsInCPU->nMemoryLocations, segmentsInGPU->nMemoryLocations, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream); cudaStreamSynchronize(stream); diff --git a/SDL/Event.cuh b/SDL/Event.cuh index 52e14448..38e6e38e 100644 --- a/SDL/Event.cuh +++ b/SDL/Event.cuh @@ -53,7 +53,7 @@ namespace SDL objectRanges* rangesInCPU; hits* hitsInCPU; miniDoublets* mdsInCPU; - segments* segmentsInCPU; + segments_temp* segmentsInCPU; triplets* tripletsInCPU; trackCandidates* trackCandidatesInCPU; modules* modulesInCPU; @@ -133,7 +133,7 @@ namespace SDL hits* getHits(); hits* getHitsInCMSSW(); miniDoublets* getMiniDoublets(); - segments* getSegments() ; + segments_temp* getSegments() ; triplets* getTriplets(); quintuplets* getQuintuplets(); trackCandidates* getTrackCandidates(); diff --git a/SDL/Segment.cuh b/SDL/Segment.cuh index ff789a16..020ea55a 100644 --- a/SDL/Segment.cuh +++ b/SDL/Segment.cuh @@ -12,6 +12,49 @@ using uint4_Buf = alpaka::Buf; namespace SDL { + // Temporary struct to handle ntuple writing + struct segments_temp + { + unsigned int* nMemoryLocations; + + unsigned int* mdIndices; + uint16_t* innerLowerModuleIndices; + uint16_t* outerLowerModuleIndices; + unsigned int* innerMiniDoubletAnchorHitIndices; + unsigned int* outerMiniDoubletAnchorHitIndices; + + int* nSegments; //number of segments per inner lower module + int* totOccupancySegments; //number of segments per inner lower module + FPX* dPhis; + FPX* dPhiMins; + FPX* dPhiMaxs; + FPX* dPhiChanges; + FPX* dPhiChangeMins; + FPX* dPhiChangeMaxs; + + //not so optional pixel dudes + float* ptIn; + float* ptErr; + float* px; + float* py; + float* pz; + float* etaErr; + float* eta; + float* phi; + int* charge; + unsigned int* seedIdx; + int* superbin; + int8_t* pixelType; + char* isQuad; + bool* isDup; + float* score; + float* circleCenterX; + float* circleCenterY; + float* circleRadius; + bool* partOfPT5; + uint4* pLSHitsIdxs; + }; + struct segments { // Buffer objects for each member variable diff --git a/code/core/AccessHelper.cc b/code/core/AccessHelper.cc index 2c19eb84..c721316c 100644 --- a/code/core/AccessHelper.cc +++ b/code/core/AccessHelper.cc @@ -28,7 +28,7 @@ std::tuple, std::vector> convertHitsToHi //____________________________________________________________________________________________ std::vector getPixelHitsFrompLS(SDL::Event* event, unsigned int pLS) { - SDL::segments& segments_ = *(event->getSegments()); + SDL::segments_temp& segments_ = *(event->getSegments()); SDL::miniDoublets& miniDoublets_ = *(event->getMiniDoublets()); SDL::objectRanges& rangesInGPU = (*event->getRanges()); SDL::modules& modulesInGPU = (*event->getModules()); @@ -96,7 +96,7 @@ std::tuple, std::vector> getHitIdxsAndHi //____________________________________________________________________________________________ std::vector getMDsFromLS(SDL::Event* event, unsigned int LS) { - SDL::segments& segments_ = *(event->getSegments()); + SDL::segments_temp& segments_ = *(event->getSegments()); unsigned int MD_1 = segments_.mdIndices[2 * LS]; unsigned int MD_2 = segments_.mdIndices[2 * LS + 1]; return {MD_1, MD_2}; diff --git a/code/core/write_sdl_ntuple.cc b/code/core/write_sdl_ntuple.cc index 824dd9db..da7ff47f 100644 --- a/code/core/write_sdl_ntuple.cc +++ b/code/core/write_sdl_ntuple.cc @@ -307,7 +307,7 @@ void setPixelQuintupletOutputBranches(SDL::Event* event) // ============ pT5 ============= SDL::pixelQuintuplets& pixelQuintupletsInGPU = (*event->getPixelQuintuplets()); SDL::quintuplets& quintupletsInGPU = (*event->getQuintuplets()); - SDL::segments& segmentsInGPU = (*event->getSegments()); + SDL::segments_temp& segmentsInGPU = (*event->getSegments()); SDL::modules& modulesInGPU = (*event->getModules()); int n_accepted_simtrk = ana.tx->getBranch>("sim_TC_matched").size(); @@ -476,7 +476,7 @@ void setPixelTripletOutputBranches(SDL::Event* event) SDL::pixelTriplets& pixelTripletsInGPU = (*event->getPixelTriplets()); SDL::triplets& tripletsInGPU = *(event->getTriplets()); SDL::modules& modulesInGPU = *(event->getModules()); - SDL::segments& segmentsInGPU = *(event->getSegments()); + SDL::segments_temp& segmentsInGPU = *(event->getSegments()); SDL::hits& hitsInGPU = *(event->getHits()); int n_accepted_simtrk = ana.tx->getBranch>("sim_TC_matched").size(); @@ -559,7 +559,7 @@ void setPixelTripletOutputBranches(SDL::Event* event) void setGnnNtupleBranches(SDL::Event* event) { // Get relevant information - SDL::segments& segmentsInGPU = (*event->getSegments()); + SDL::segments_temp& segmentsInGPU = (*event->getSegments()); SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); SDL::hits& hitsInGPU = (*event->getHits()); SDL::modules& modulesInGPU = (*event->getModules()); @@ -821,7 +821,7 @@ std::tuple, vector> pars // Get relevant information SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates()); SDL::triplets& tripletsInGPU = (*event->getTriplets()); - SDL::segments& segmentsInGPU = (*event->getSegments()); + SDL::segments_temp& segmentsInGPU = (*event->getSegments()); SDL::hits& hitsInGPU = (*event->getHits()); // @@ -959,7 +959,7 @@ std::tuple, vector> pars // Get relevant information SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates()); SDL::triplets& tripletsInGPU = (*event->getTriplets()); - SDL::segments& segmentsInGPU = (*event->getSegments()); + SDL::segments_temp& segmentsInGPU = (*event->getSegments()); SDL::hits& hitsInGPU = (*event->getHits()); // @@ -1059,7 +1059,7 @@ std::tuple, vector> pars std::tuple, vector> parsepLS(SDL::Event* event, unsigned int idx) { SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates()); - SDL::segments& segmentsInGPU = (*event->getSegments()); + SDL::segments_temp& segmentsInGPU = (*event->getSegments()); // Getting pLS index unsigned int pLS = trackCandidatesInGPU.directObjectIndices[idx]; @@ -1174,7 +1174,7 @@ void printMDs(SDL::Event* event) //________________________________________________________________________________________________________________________________ void printLSs(SDL::Event* event) { - SDL::segments& segmentsInGPU = (*event->getSegments()); + SDL::segments_temp& segmentsInGPU = (*event->getSegments()); SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); SDL::hits& hitsInGPU = (*event->getHits()); SDL::modules& modulesInGPU = (*event->getModules()); @@ -1207,7 +1207,7 @@ void printLSs(SDL::Event* event) //________________________________________________________________________________________________________________________________ void printpLSs(SDL::Event* event) { - SDL::segments& segmentsInGPU = (*event->getSegments()); + SDL::segments_temp& segmentsInGPU = (*event->getSegments()); SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); SDL::hits& hitsInGPU = (*event->getHits()); SDL::modules& modulesInGPU = (*event->getModules()); @@ -1238,7 +1238,7 @@ void printpLSs(SDL::Event* event) void printT3s(SDL::Event* event) { SDL::triplets& tripletsInGPU = (*event->getTriplets()); - SDL::segments& segmentsInGPU = (*event->getSegments()); + SDL::segments_temp& segmentsInGPU = (*event->getSegments()); SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); SDL::hits& hitsInGPU = (*event->getHits()); SDL::modules& modulesInGPU = (*event->getModules()); @@ -1281,7 +1281,7 @@ void debugPrintOutlierMultiplicities(SDL::Event* event) { SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates()); SDL::triplets& tripletsInGPU = (*event->getTriplets()); - SDL::segments& segmentsInGPU = (*event->getSegments()); + SDL::segments_temp& segmentsInGPU = (*event->getSegments()); SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); //SDL::hits& hitsInGPU = (*event->getHits()); SDL::modules& modulesInGPU = (*event->getModules()); From 11372df3ca56f89e5cc4fe704485c47705fa66c0 Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Wed, 31 May 2023 15:53:00 -0400 Subject: [PATCH 04/44] remove unused print util files --- SDL/Hit.cuh | 1 - SDL/MiniDoublet.cu | 25 ------------------------- SDL/MiniDoublet.cuh | 2 -- code/core/Hit.cc | 10 ---------- code/core/Hit.h | 1 - {SDL => cpu}/PrintUtil.cc | 0 {SDL => cpu}/PrintUtil.h | 0 7 files changed, 39 deletions(-) rename {SDL => cpu}/PrintUtil.cc (100%) rename {SDL => cpu}/PrintUtil.h (100%) diff --git a/SDL/Hit.cuh b/SDL/Hit.cuh index fe631b9e..12fc7bf2 100644 --- a/SDL/Hit.cuh +++ b/SDL/Hit.cuh @@ -6,7 +6,6 @@ #include "Constants.cuh" #include "Module.cuh" #include "allocate.h" -#include "PrintUtil.h" namespace SDL { diff --git a/SDL/MiniDoublet.cu b/SDL/MiniDoublet.cu index 3cfab8dd..3fd6d23a 100644 --- a/SDL/MiniDoublet.cu +++ b/SDL/MiniDoublet.cu @@ -137,28 +137,3 @@ void SDL::miniDoublets::freeMemory(cudaStream_t stream) cudaFree(outerHighEdgeX); cudaFree(nMemoryLocations); } - -void SDL::printMD(struct miniDoublets& mdsInGPU, struct hits& hitsInGPU, SDL::modules& modulesInGPU, unsigned int mdIndex) -{ - std::cout< #include "MathUtil.h" -#include "PrintUtil.h" namespace SDL { diff --git a/SDL/PrintUtil.cc b/cpu/PrintUtil.cc similarity index 100% rename from SDL/PrintUtil.cc rename to cpu/PrintUtil.cc diff --git a/SDL/PrintUtil.h b/cpu/PrintUtil.h similarity index 100% rename from SDL/PrintUtil.h rename to cpu/PrintUtil.h From b937556b9920bf19c9671c87f764b9cae778a590 Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Wed, 31 May 2023 18:07:05 -0400 Subject: [PATCH 05/44] move remaining buffers to wrapper --- SDL/Event.cu | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/SDL/Event.cu b/SDL/Event.cu index aba4381e..e17c19b0 100644 --- a/SDL/Event.cu +++ b/SDL/Event.cu @@ -741,7 +741,7 @@ struct addPixelSegmentToEventKernel void SDL::Event::addPixelSegmentToEvent(std::vector hitIndices0,std::vector hitIndices1,std::vector hitIndices2,std::vector hitIndices3, std::vector dPhiChange, std::vector ptIn, std::vector ptErr, std::vector px, std::vector py, std::vector pz, std::vector eta, std::vector etaErr, std::vector phi, std::vector charge, std::vector seedIdx, std::vector superbin, std::vector pixelType, std::vector isQuad) { - int size = ptIn.size(); + const int size = ptIn.size(); unsigned int mdSize = 2 * size; uint16_t pixelModuleIndex = (*detIdToIndex)[1]; @@ -804,13 +804,11 @@ void SDL::Event::addPixelSegmentToEvent(std::vector hitIndices0,st cudaStreamSynchronize(stream); } - alpaka::Vec const extent(static_cast(size)); - - auto hitIndices0_dev = alpaka::allocBuf(devAcc, extent); - auto hitIndices1_dev = alpaka::allocBuf(devAcc, extent); - auto hitIndices2_dev = alpaka::allocBuf(devAcc, extent); - auto hitIndices3_dev = alpaka::allocBuf(devAcc, extent); - auto dPhiChange_dev = alpaka::allocBuf(devAcc, extent); + auto hitIndices0_dev = allocBufWrapper(devAcc, size); + auto hitIndices1_dev = allocBufWrapper(devAcc, size); + auto hitIndices2_dev = allocBufWrapper(devAcc, size); + auto hitIndices3_dev = allocBufWrapper(devAcc, size); + auto dPhiChange_dev = allocBufWrapper(devAcc, size); alpaka::memcpy(queue, hitIndices0_dev, hitIndices0, size); alpaka::memcpy(queue, hitIndices1_dev, hitIndices1, size); @@ -2118,6 +2116,7 @@ SDL::segments_temp* SDL::Event::getSegments() { if(segmentsInCPU == nullptr) { + std::cout << "run" << std::endl; segmentsInCPU = new SDL::segments_temp; segmentsInCPU->nSegments = new int[nLowerModules+1]; From a68cdb91fc510f3fbb96ecc07a08f11504612432 Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Wed, 31 May 2023 18:10:38 -0400 Subject: [PATCH 06/44] debug cleanup --- SDL/Event.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/SDL/Event.cu b/SDL/Event.cu index e17c19b0..df31dbed 100644 --- a/SDL/Event.cu +++ b/SDL/Event.cu @@ -2116,7 +2116,6 @@ SDL::segments_temp* SDL::Event::getSegments() { if(segmentsInCPU == nullptr) { - std::cout << "run" << std::endl; segmentsInCPU = new SDL::segments_temp; segmentsInCPU->nSegments = new int[nLowerModules+1]; From 0286d4340b9973bdd391c36bef84f8ca27e9567a Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Wed, 31 May 2023 18:13:40 -0400 Subject: [PATCH 07/44] more debug removal --- Makefile | 2 +- SDL/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index e4f18272..a49be49e 100644 --- a/Makefile +++ b/Makefile @@ -21,7 +21,7 @@ ROOTLIBS = $(shell root-config --libs) ROOTCFLAGS = $(foreach option, $(shell root-config --cflags), --compiler-options $(option)) ALPAKAINCLUDE = -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED ALPAKAFLAGS = -DALPAKA_ACC_GPU_CUDA_ENABLED -DALPAKA_ACC_GPU_CUDA_ONLY --expt-relaxed-constexpr -DALPAKA_DEBUG=0 -CFLAGS = $(ROOTCFLAGS) --compiler-options -Wall --compiler-options -Wno-unused-function --compiler-options -g --compiler-options -O0 --compiler-options -fPIC --compiler-options -fno-var-tracking -ISDL -I$(shell pwd) -Icode -Icode/core -I/mnt/data1/dsr/cub -I${CUDA_HOME}/include --compiler-options -fopenmp -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include +CFLAGS = $(ROOTCFLAGS) --compiler-options -Wall --compiler-options -Wno-unused-function --compiler-options -g --compiler-options -O2 --compiler-options -fPIC --compiler-options -fno-var-tracking -ISDL -I$(shell pwd) -Icode -Icode/core -I/mnt/data1/dsr/cub -I${CUDA_HOME}/include --compiler-options -fopenmp -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include EXTRACFLAGS = $(shell rooutil-config) EXTRAFLAGS = -fPIC -ITMultiDrawTreePlayer -Wunused-variable -lTMVA -lEG -lGenVector -lXMLIO -lMLP -lTreePlayer -L${CUDA_HOME}/lib64 -lcudart -fopenmp -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include DOQUINTUPLET = -DFP16_Base -DFP16_dPhi #-DFP16_circle -DFP16_seg -DFP16_T5 #-DDO_QUINTUPLET #-DDO_QUADRUPLET diff --git a/SDL/Makefile b/SDL/Makefile index c518ee68..abc9a160 100644 --- a/SDL/Makefile +++ b/SDL/Makefile @@ -45,7 +45,7 @@ CUTVALUEFLAG_FLAGS = -DCUT_VALUE_DEBUG $(LD) -x cu $(PT0P8) $(PRELOAD) $(T3T3EXTENSION) $(CXXFLAGS) $(LDFLAGS) $(ROOTLIBS) $(MEMFLAG) $(PRINTFLAG) $(CACHEFLAG) $(CUDALAUNCHFLAG) $(CUTVALUEFLAG) $(DUPLICATES) $(ALPAKAINCLUDE) $(ALPAKAFLAGS) $< -o $@ %_cpu.o : %.cc %.h - $(LD) -O0 $(PT0P8) $(PRELOAD) $(T3T3EXTENSION) $(CXXFLAGS) $(LDFLAGS) $(ROOTLIBS) $(MEMFLAG) $(PRINTFLAG) $(CACHEFLAG) $(CUDALAUNCHFLAG) $(DUPLICATES) $(ROOTCFLAGS) $(ALPAKAINCLUDE) $< -o $@ + $(LD) -O2 $(PT0P8) $(PRELOAD) $(T3T3EXTENSION) $(CXXFLAGS) $(LDFLAGS) $(ROOTLIBS) $(MEMFLAG) $(PRINTFLAG) $(CACHEFLAG) $(CUDALAUNCHFLAG) $(DUPLICATES) $(ROOTCFLAGS) $(ALPAKAINCLUDE) $< -o $@ $(LIB):$(CCOBJECTS) $(CUOBJECTS) #$(LIB):$(CUOBJECTS) From fbccc37912a1e850135d75ba542f585f995cb40f Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Wed, 31 May 2023 19:45:45 -0400 Subject: [PATCH 08/44] generalize to host allocations --- SDL/Constants.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/SDL/Constants.cuh b/SDL/Constants.cuh index b45e45e4..8ce135d7 100644 --- a/SDL/Constants.cuh +++ b/SDL/Constants.cuh @@ -72,6 +72,7 @@ using WorkDiv = alpaka::WorkDivMembers; using Acc = alpaka::AccCpuSerial; #endif +auto const devHost = alpaka::getDevByIdx(0u); auto const devAcc = alpaka::getDevByIdx(0u); using QueueAcc = alpaka::Queue; @@ -91,7 +92,7 @@ using FPX_circle_Buf = alpaka::Buf; using FPX_seg_Buf = alpaka::Buf; template -alpaka::Buf inline allocBufWrapper(TAcc const & devAcc, TSize nElements) { +alpaka::Buf inline allocBufWrapper(TAcc const & devAcc, TSize nElements) { return alpaka::allocBuf(devAcc, alpaka::Vec(static_cast(nElements))); } From 9a61b02ebf280a45d3a4af8586c2477a2280d382 Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Thu, 1 Jun 2023 11:30:26 -0400 Subject: [PATCH 09/44] templated buffer type --- SDL/Constants.cuh | 17 ++-------- SDL/Segment.cuh | 86 +++++++++++++++++++++++------------------------ 2 files changed, 45 insertions(+), 58 deletions(-) diff --git a/SDL/Constants.cuh b/SDL/Constants.cuh index 8ce135d7..07a6ba79 100644 --- a/SDL/Constants.cuh +++ b/SDL/Constants.cuh @@ -76,20 +76,9 @@ auto const devHost = alpaka::getDevByIdx(0u); auto const devAcc = alpaka::getDevByIdx(0u); using QueueAcc = alpaka::Queue; -// Typical Buffer types used in the code. -using float_Buf = alpaka::Buf; -using int_Buf = alpaka::Buf; -using uint_Buf = alpaka::Buf; -using int8_t_Buf = alpaka::Buf; -using uint16_t_Buf = alpaka::Buf; -using char_Buf = alpaka::Buf; -using bool_Buf = alpaka::Buf; - -using FPX_Buf = alpaka::Buf; -using FPX_T5_Buf = alpaka::Buf; -using FPX_dPhi_Buf = alpaka::Buf; -using FPX_circle_Buf = alpaka::Buf; -using FPX_seg_Buf = alpaka::Buf; +// Buffer type for allocations where auto type can't be used. +template +using Buf = alpaka::Buf; template alpaka::Buf inline allocBufWrapper(TAcc const & devAcc, TSize nElements) { diff --git a/SDL/Segment.cuh b/SDL/Segment.cuh index 020ea55a..2dd34f0e 100644 --- a/SDL/Segment.cuh +++ b/SDL/Segment.cuh @@ -8,8 +8,6 @@ #include "Module.cuh" #include "Hit.cuh" -using uint4_Buf = alpaka::Buf; - namespace SDL { // Temporary struct to handle ntuple writing @@ -58,48 +56,48 @@ namespace SDL struct segments { // Buffer objects for each member variable - FPX_Buf dPhis_buf; - FPX_Buf dPhiMins_buf; - FPX_Buf dPhiMaxs_buf; - FPX_Buf dPhiChanges_buf; - FPX_Buf dPhiChangeMins_buf; - FPX_Buf dPhiChangeMaxs_buf; - - uint16_t_Buf innerLowerModuleIndices_buf; - uint16_t_Buf outerLowerModuleIndices_buf; - - uint_Buf seedIdx_buf; - uint_Buf mdIndices_buf; - uint_Buf innerMiniDoubletAnchorHitIndices_buf; - uint_Buf outerMiniDoubletAnchorHitIndices_buf; - uint_Buf nMemoryLocations_buf; - - int_Buf nSegments_buf; - int_Buf totOccupancySegments_buf; - int_Buf charge_buf; - int_Buf superbin_buf; - - uint4_Buf pLSHitsIdxs_buf; - - int8_t_Buf pixelType_buf; - - char_Buf isQuad_buf; - - bool_Buf isDup_buf; - bool_Buf partOfPT5_buf; - - float_Buf ptIn_buf; - float_Buf ptErr_buf; - float_Buf px_buf; - float_Buf py_buf; - float_Buf pz_buf; - float_Buf etaErr_buf; - float_Buf eta_buf; - float_Buf phi_buf; - float_Buf score_buf; - float_Buf circleCenterX_buf; - float_Buf circleCenterY_buf; - float_Buf circleRadius_buf; + Buf dPhis_buf; + Buf dPhiMins_buf; + Buf dPhiMaxs_buf; + Buf dPhiChanges_buf; + Buf dPhiChangeMins_buf; + Buf dPhiChangeMaxs_buf; + + Buf innerLowerModuleIndices_buf; + Buf outerLowerModuleIndices_buf; + + Buf seedIdx_buf; + Buf mdIndices_buf; + Buf innerMiniDoubletAnchorHitIndices_buf; + Buf outerMiniDoubletAnchorHitIndices_buf; + Buf nMemoryLocations_buf; + + Buf nSegments_buf; + Buf totOccupancySegments_buf; + Buf charge_buf; + Buf superbin_buf; + + Buf pLSHitsIdxs_buf; // Please ensure that the 'uint4' type is defined and available in your scope. + + Buf pixelType_buf; + + Buf isQuad_buf; + + Buf isDup_buf; + Buf partOfPT5_buf; + + Buf ptIn_buf; + Buf ptErr_buf; + Buf px_buf; + Buf py_buf; + Buf pz_buf; + Buf etaErr_buf; + Buf eta_buf; + Buf phi_buf; + Buf score_buf; + Buf circleCenterX_buf; + Buf circleCenterY_buf; + Buf circleRadius_buf; // Pointers towards the data of each buffer FPX* dPhis; From 96d3b8b1815d65c624ddd600d8f9ae508f9afed2 Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Thu, 1 Jun 2023 15:33:57 -0400 Subject: [PATCH 10/44] working ntuple writing with templated segments --- SDL/Constants.cuh | 4 +- SDL/Event.cu | 79 ++++--------- SDL/Event.cuh | 6 +- SDL/Kernels.cuh | 5 +- SDL/PixelTriplet.cuh | 26 +++-- SDL/Quintuplet.cuh | 15 +-- SDL/Segment.cuh | 209 ++++++++++++++-------------------- SDL/TrackCandidate.cuh | 13 ++- SDL/Triplet.cuh | 33 +++--- code/core/AccessHelper.cc | 4 +- code/core/write_sdl_ntuple.cc | 20 ++-- 11 files changed, 176 insertions(+), 238 deletions(-) diff --git a/SDL/Constants.cuh b/SDL/Constants.cuh index 07a6ba79..007e022c 100644 --- a/SDL/Constants.cuh +++ b/SDL/Constants.cuh @@ -81,8 +81,8 @@ template using Buf = alpaka::Buf; template -alpaka::Buf inline allocBufWrapper(TAcc const & devAcc, TSize nElements) { - return alpaka::allocBuf(devAcc, alpaka::Vec(static_cast(nElements))); +Buf inline allocBufWrapper(TAcc const & devAccIn, TSize nElements) { + return alpaka::allocBuf(devAccIn, Vec1d(static_cast(nElements))); } const unsigned int MAX_BLOCKS = 80; diff --git a/SDL/Event.cu b/SDL/Event.cu index df31dbed..9db88f40 100644 --- a/SDL/Event.cu +++ b/SDL/Event.cu @@ -118,15 +118,6 @@ SDL::Event::~Event() if(segmentsInCPU != nullptr) { - delete[] segmentsInCPU->mdIndices; - delete[] segmentsInCPU->nSegments; - delete[] segmentsInCPU->totOccupancySegments; - delete[] segmentsInCPU->innerMiniDoubletAnchorHitIndices; - delete[] segmentsInCPU->outerMiniDoubletAnchorHitIndices; - delete[] segmentsInCPU->ptIn; - delete[] segmentsInCPU->eta; - delete[] segmentsInCPU->phi; - delete segmentsInCPU->nMemoryLocations; delete segmentsInCPU; } @@ -345,14 +336,6 @@ void SDL::Event::resetEvent() if(segmentsInCPU != nullptr) { - delete[] segmentsInCPU->mdIndices; - delete[] segmentsInCPU->nSegments; - delete[] segmentsInCPU->totOccupancySegments; - delete[] segmentsInCPU->innerMiniDoubletAnchorHitIndices; - delete[] segmentsInCPU->outerMiniDoubletAnchorHitIndices; - delete[] segmentsInCPU->ptIn; - delete[] segmentsInCPU->eta; - delete[] segmentsInCPU->phi; delete segmentsInCPU; segmentsInCPU = nullptr; } @@ -698,7 +681,7 @@ struct addPixelSegmentToEventKernel struct SDL::objectRanges& rangesInGPU, struct SDL::hits& hitsInGPU, struct SDL::miniDoublets& mdsInGPU, - struct SDL::segments& segmentsInGPU, + SDL::segments& segmentsInGPU, unsigned int* hitIndices0, unsigned int* hitIndices1, unsigned int* hitIndices2, @@ -798,7 +781,7 @@ void SDL::Event::addPixelSegmentToEvent(std::vector hitIndices0,st cudaStreamSynchronize(stream); nTotalSegments += N_MAX_PIXEL_SEGMENTS_PER_MODULE; - segmentsInGPU = new SDL::segments(nTotalSegments, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE, devAcc, queue); + segmentsInGPU = new SDL::segments(nTotalSegments, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE, devAcc, queue); cudaMemcpyAsync(segmentsInGPU->nMemoryLocations, &nTotalSegments, sizeof(unsigned int), cudaMemcpyHostToDevice, stream);; cudaStreamSynchronize(stream); @@ -1043,7 +1026,7 @@ void SDL::Event::createSegmentsWithModuleMap() { if(segmentsInGPU == nullptr) { - segmentsInGPU = new SDL::segments(nTotalSegments, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE, devAcc, queue); + segmentsInGPU = new SDL::segments(nTotalSegments, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE, devAcc, queue); } Vec const threadsPerBlockCreateSeg(static_cast(1), static_cast(1), static_cast(64)); @@ -2112,44 +2095,32 @@ SDL::miniDoublets* SDL::Event::getMiniDoublets() return mdsInCPU; } -SDL::segments_temp* SDL::Event::getSegments() +SDL::segments* SDL::Event::getSegments() { if(segmentsInCPU == nullptr) { - segmentsInCPU = new SDL::segments_temp; - - segmentsInCPU->nSegments = new int[nLowerModules+1]; - cudaMemcpyAsync(segmentsInCPU->nSegments, segmentsInGPU->nSegments, (nLowerModules+1) * sizeof(int), cudaMemcpyDeviceToHost,stream); - - segmentsInCPU->nMemoryLocations = new unsigned int; - cudaMemcpyAsync(segmentsInCPU->nMemoryLocations, segmentsInGPU->nMemoryLocations, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream); - cudaStreamSynchronize(stream); + // Get nMemoryLocations parameter to initilize host based segmentsInCPU + auto nMemLocal_buf = allocBufWrapper(devHost, 1); + alpaka::memcpy(queue, nMemLocal_buf, segmentsInGPU->nMemoryLocations_buf, 1); + alpaka::wait(queue); - segmentsInCPU->mdIndices = new unsigned int[2 * *(segmentsInCPU->nMemoryLocations)]; - segmentsInCPU->innerMiniDoubletAnchorHitIndices = new unsigned int[*(segmentsInCPU->nMemoryLocations)]; - segmentsInCPU->outerMiniDoubletAnchorHitIndices = new unsigned int[*(segmentsInCPU->nMemoryLocations)]; - segmentsInCPU->totOccupancySegments = new int[nLowerModules+1]; - - segmentsInCPU->ptIn = new float[N_MAX_PIXEL_SEGMENTS_PER_MODULE]; - segmentsInCPU->eta = new float[N_MAX_PIXEL_SEGMENTS_PER_MODULE]; - segmentsInCPU->phi = new float[N_MAX_PIXEL_SEGMENTS_PER_MODULE]; - segmentsInCPU->seedIdx = new unsigned int[N_MAX_PIXEL_SEGMENTS_PER_MODULE]; - segmentsInCPU->isDup = new bool[N_MAX_PIXEL_SEGMENTS_PER_MODULE]; - segmentsInCPU->isQuad = new char[N_MAX_PIXEL_SEGMENTS_PER_MODULE]; - segmentsInCPU->score = new float[N_MAX_PIXEL_SEGMENTS_PER_MODULE]; - - cudaMemcpyAsync(segmentsInCPU->mdIndices, segmentsInGPU->mdIndices, 2 * *(segmentsInCPU->nMemoryLocations) * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(segmentsInCPU->innerMiniDoubletAnchorHitIndices, segmentsInGPU->innerMiniDoubletAnchorHitIndices, *(segmentsInCPU->nMemoryLocations) * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(segmentsInCPU->outerMiniDoubletAnchorHitIndices, segmentsInGPU->outerMiniDoubletAnchorHitIndices, *(segmentsInCPU->nMemoryLocations) * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(segmentsInCPU->totOccupancySegments, segmentsInGPU->totOccupancySegments, (nLowerModules+1) * sizeof(int), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(segmentsInCPU->ptIn, segmentsInGPU->ptIn, N_MAX_PIXEL_SEGMENTS_PER_MODULE * sizeof(float), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(segmentsInCPU->eta, segmentsInGPU->eta, N_MAX_PIXEL_SEGMENTS_PER_MODULE * sizeof(float), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(segmentsInCPU->phi, segmentsInGPU->phi, N_MAX_PIXEL_SEGMENTS_PER_MODULE * sizeof(float), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(segmentsInCPU->seedIdx, segmentsInGPU->seedIdx, N_MAX_PIXEL_SEGMENTS_PER_MODULE * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(segmentsInCPU->isDup, segmentsInGPU->isDup, N_MAX_PIXEL_SEGMENTS_PER_MODULE * sizeof(bool), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(segmentsInCPU->isQuad, segmentsInGPU->isQuad, N_MAX_PIXEL_SEGMENTS_PER_MODULE * sizeof(char), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(segmentsInCPU->score, segmentsInGPU->score, N_MAX_PIXEL_SEGMENTS_PER_MODULE * sizeof(float), cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); + unsigned int nMemLocal = *alpaka::getPtrNative(nMemLocal_buf); + segmentsInCPU = new SDL::segments(nMemLocal, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE, devHost, queue); + + *alpaka::getPtrNative(segmentsInCPU->nMemoryLocations_buf) = nMemLocal; + alpaka::memcpy(queue, segmentsInCPU->nSegments_buf, segmentsInGPU->nSegments_buf, (nLowerModules+1)); + alpaka::memcpy(queue, segmentsInCPU->mdIndices_buf, segmentsInGPU->mdIndices_buf, 2 * nMemLocal); + alpaka::memcpy(queue, segmentsInCPU->innerMiniDoubletAnchorHitIndices_buf, segmentsInGPU->innerMiniDoubletAnchorHitIndices_buf, nMemLocal); + alpaka::memcpy(queue, segmentsInCPU->outerMiniDoubletAnchorHitIndices_buf, segmentsInGPU->outerMiniDoubletAnchorHitIndices_buf, nMemLocal); + alpaka::memcpy(queue, segmentsInCPU->totOccupancySegments_buf, segmentsInGPU->totOccupancySegments_buf, (nLowerModules+1)); + alpaka::memcpy(queue, segmentsInCPU->ptIn_buf, segmentsInGPU->ptIn_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE); + alpaka::memcpy(queue, segmentsInCPU->eta_buf, segmentsInGPU->eta_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE); + alpaka::memcpy(queue, segmentsInCPU->phi_buf, segmentsInGPU->phi_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE); + alpaka::memcpy(queue, segmentsInCPU->seedIdx_buf, segmentsInGPU->seedIdx_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE); + alpaka::memcpy(queue, segmentsInCPU->isDup_buf, segmentsInGPU->isDup_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE); + alpaka::memcpy(queue, segmentsInCPU->isQuad_buf, segmentsInGPU->isQuad_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE); + alpaka::memcpy(queue, segmentsInCPU->score_buf, segmentsInGPU->score_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE); + alpaka::wait(queue); } return segmentsInCPU; } diff --git a/SDL/Event.cuh b/SDL/Event.cuh index 38e6e38e..f5b8327c 100644 --- a/SDL/Event.cuh +++ b/SDL/Event.cuh @@ -42,7 +42,7 @@ namespace SDL struct objectRanges* rangesInGPU; struct hits* hitsInGPU; struct miniDoublets* mdsInGPU; - struct segments* segmentsInGPU; + struct segments* segmentsInGPU; struct triplets* tripletsInGPU; struct quintuplets* quintupletsInGPU; struct trackCandidates* trackCandidatesInGPU; @@ -53,7 +53,7 @@ namespace SDL objectRanges* rangesInCPU; hits* hitsInCPU; miniDoublets* mdsInCPU; - segments_temp* segmentsInCPU; + segments* segmentsInCPU; triplets* tripletsInCPU; trackCandidates* trackCandidatesInCPU; modules* modulesInCPU; @@ -133,7 +133,7 @@ namespace SDL hits* getHits(); hits* getHitsInCMSSW(); miniDoublets* getMiniDoublets(); - segments_temp* getSegments() ; + segments* getSegments() ; triplets* getTriplets(); quintuplets* getQuintuplets(); trackCandidates* getTrackCandidates(); diff --git a/SDL/Kernels.cuh b/SDL/Kernels.cuh index 8fd7d952..068e66f6 100644 --- a/SDL/Kernels.cuh +++ b/SDL/Kernels.cuh @@ -27,7 +27,8 @@ namespace SDL pixelQuintupletsInGPU.isDup[pixelQuintupletIndex] = 1; }; - ALPAKA_FN_ACC ALPAKA_FN_INLINE void rmPixelSegmentFromMemory(struct SDL::segments& segmentsInGPU, unsigned int pixelSegmentArrayIndex) + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void rmPixelSegmentFromMemory(SDL::segments& segmentsInGPU, unsigned int pixelSegmentArrayIndex) { segmentsInGPU.isDup[pixelSegmentArrayIndex] = 1; }; @@ -452,7 +453,7 @@ namespace SDL ALPAKA_FN_ACC void operator()( TAcc const & acc, struct SDL::modules& modulesInGPU, - struct SDL::segments& segmentsInGPU, + SDL::segments& segmentsInGPU, bool secondpass) const { using Dim = alpaka::Dim; diff --git a/SDL/PixelTriplet.cuh b/SDL/PixelTriplet.cuh index 0f884bae..d8d37fef 100644 --- a/SDL/PixelTriplet.cuh +++ b/SDL/PixelTriplet.cuh @@ -48,7 +48,8 @@ namespace SDL void createPixelTripletsInExplicitMemory(struct pixelTriplets& pixelTripletsinGPU, unsigned int maxPixelTriplets, cudaStream_t stream); - ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, struct triplets& tripletsInGPU, struct pixelTriplets& pixelTripletsInGPU, unsigned int pixelSegmentIndex, unsigned int tripletIndex, float pixelRadius, float tripletRadius, float centerX, float centerY, float rPhiChiSquared, float rPhiChiSquaredInwards, float rzChiSquared, unsigned int pixelTripletIndex, float pt, float eta, float phi, float eta_pix, float phi_pix,float score) + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, struct pixelTriplets& pixelTripletsInGPU, unsigned int pixelSegmentIndex, unsigned int tripletIndex, float pixelRadius, float tripletRadius, float centerX, float centerY, float rPhiChiSquared, float rPhiChiSquaredInwards, float rzChiSquared, unsigned int pixelTripletIndex, float pt, float eta, float phi, float eta_pix, float phi_pix,float score) { pixelTripletsInGPU.pixelSegmentIndices[pixelTripletIndex] = pixelSegmentIndex; pixelTripletsInGPU.tripletIndices[pixelTripletIndex] = tripletIndex; @@ -130,7 +131,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTrackletDefaultAlgopT3(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& pixelLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTrackletDefaultAlgopT3(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& pixelLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) { zLo = -999; zHi = -999; @@ -663,7 +664,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTripletDefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& pixelSegmentIndex, unsigned int tripletIndex, float& pixelRadius, float& pixelRadiusError, float& tripletRadius, float& centerX, float& centerY, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, bool runChiSquaredCuts = true) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTripletDefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& pixelSegmentIndex, unsigned int tripletIndex, float& pixelRadius, float& pixelRadiusError, float& tripletRadius, float& centerX, float& centerY, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, bool runChiSquaredCuts = true) { bool pass = true; @@ -768,7 +769,7 @@ namespace SDL struct SDL::modules& modulesInGPU, struct SDL::objectRanges& rangesInGPU, struct SDL::miniDoublets& mdsInGPU, - struct SDL::segments& segmentsInGPU, + SDL::segments& segmentsInGPU, struct SDL::triplets& tripletsInGPU, struct SDL::pixelTriplets& pixelTripletsInGPU, unsigned int* connectedPixelSize, @@ -911,7 +912,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPBB(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int thirdMDIndex, unsigned int& fourthMDIndex, float& /*z_OutLo*/, float& /*rt_OutLo*/, float& dPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaOutCut, float& deltaBetaCut) // pixel to BB and BE segments + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPBB(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int thirdMDIndex, unsigned int& fourthMDIndex, float& /*z_OutLo*/, float& /*rt_OutLo*/, float& dPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaOutCut, float& deltaBetaCut) // pixel to BB and BE segments { bool pass = true; @@ -1124,7 +1125,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPEE(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU ,struct segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& /*z_OutLo*/, float& /*rt_OutLo*/, float& deltaPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) // pixel to EE segments + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPEE(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU ,SDL::segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& /*z_OutLo*/, float& /*rt_OutLo*/, float& deltaPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) // pixel to EE segments { bool pass = true; bool isPS_OutLo = (modulesInGPU.moduleType[outerInnerLowerModuleIndex] == SDL::PS); @@ -1385,7 +1386,8 @@ namespace SDL void createPixelQuintupletsInExplicitMemory(struct SDL::pixelQuintuplets& pixelQuintupletsInGPU, unsigned int maxPixelQuintuplets, cudaStream_t stream); - ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelQuintupletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, struct quintuplets& quintupletsInGPU, struct pixelQuintuplets& pixelQuintupletsInGPU, unsigned int pixelIndex, unsigned int T5Index, unsigned int pixelQuintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float score, float eta, float phi, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY) + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelQuintupletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, struct quintuplets& quintupletsInGPU, struct pixelQuintuplets& pixelQuintupletsInGPU, unsigned int pixelIndex, unsigned int T5Index, unsigned int pixelQuintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float score, float eta, float phi, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY) { pixelQuintupletsInGPU.pixelIndices[pixelQuintupletIndex] = pixelIndex; pixelQuintupletsInGPU.T5Indices[pixelQuintupletIndex] = T5Index; @@ -1966,7 +1968,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelQuintupletDefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, struct triplets& tripletsInGPU, struct quintuplets& quintupletsInGPU, unsigned int& pixelSegmentIndex, unsigned int& quintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY, unsigned int pixelSegmentArrayIndex) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelQuintupletDefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, struct quintuplets& quintupletsInGPU, unsigned int& pixelSegmentIndex, unsigned int& quintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY, unsigned int pixelSegmentArrayIndex) { bool pass = true; @@ -2100,7 +2102,7 @@ namespace SDL TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, - struct SDL::segments& segmentsInGPU, + SDL::segments& segmentsInGPU, struct SDL::triplets& tripletsInGPU, struct SDL::quintuplets& quintupletsInGPU, struct SDL::pixelQuintuplets& pixelQuintupletsInGPU, @@ -2226,7 +2228,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgoPPBB(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int thirdMDIndex, unsigned int& fourthMDIndex, float& z_OutLo, float& rt_OutLo, float& dPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaOutCut, float& deltaBetaCut) // pixel to BB and BE segments + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgoPPBB(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int thirdMDIndex, unsigned int& fourthMDIndex, float& z_OutLo, float& rt_OutLo, float& dPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaOutCut, float& deltaBetaCut) // pixel to BB and BE segments { bool pass = true; @@ -2433,7 +2435,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgoPPEE(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU ,struct segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& z_OutLo, float& rt_OutLo, float& deltaPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) // pixel to EE segments + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgoPPEE(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU ,SDL::segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& z_OutLo, float& rt_OutLo, float& deltaPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) // pixel to EE segments { bool pass = true; bool isPS_OutLo = (modulesInGPU.moduleType[outerInnerLowerModuleIndex] == SDL::PS); @@ -2648,7 +2650,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& pixelLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& pixelLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) { zLo = -999; zHi = -999; diff --git a/SDL/Quintuplet.cuh b/SDL/Quintuplet.cuh index e6384152..1a34d763 100644 --- a/SDL/Quintuplet.cuh +++ b/SDL/Quintuplet.cuh @@ -644,7 +644,8 @@ namespace SDL return true; }; - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool T5HasCommonMiniDoublet(struct SDL::triplets& tripletsInGPU, struct SDL::segments& segmentsInGPU, unsigned int innerTripletIndex, unsigned int outerTripletIndex) + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool T5HasCommonMiniDoublet(struct SDL::triplets& tripletsInGPU, SDL::segments& segmentsInGPU, unsigned int innerTripletIndex, unsigned int outerTripletIndex) { unsigned int innerOuterSegmentIndex = tripletsInGPU.segmentIndices[2 * innerTripletIndex + 1]; unsigned int outerInnerSegmentIndex = tripletsInGPU.segmentIndices[2 * outerTripletIndex]; @@ -1204,7 +1205,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoBBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoBBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut) { bool pass = true; @@ -1397,7 +1398,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoBBEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoBBEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) { bool pass = true; bool isPS_InLo = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS); @@ -1608,7 +1609,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoEEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoEEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) { bool pass = true; @@ -1815,7 +1816,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletAlgoSelector(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletAlgoSelector(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) { bool pass = false; @@ -1873,7 +1874,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct SDL::triplets& tripletsInGPU, uint16_t& lowerModuleIndex1, uint16_t& lowerModuleIndex2, uint16_t& lowerModuleIndex3, uint16_t& lowerModuleIndex4, uint16_t& lowerModuleIndex5, unsigned int& innerTripletIndex, unsigned int& outerTripletIndex, float& innerRadius, float& outerRadius, float& bridgeRadius, float& regressionG, float& regressionF, float& regressionRadius, float& rzChiSquared, float& chiSquared, float& nonAnchorChiSquared, bool& TightCutFlag) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, struct SDL::triplets& tripletsInGPU, uint16_t& lowerModuleIndex1, uint16_t& lowerModuleIndex2, uint16_t& lowerModuleIndex3, uint16_t& lowerModuleIndex4, uint16_t& lowerModuleIndex5, unsigned int& innerTripletIndex, unsigned int& outerTripletIndex, float& innerRadius, float& outerRadius, float& bridgeRadius, float& regressionG, float& regressionF, float& regressionRadius, float& rzChiSquared, float& chiSquared, float& nonAnchorChiSquared, bool& TightCutFlag) { bool pass = true; unsigned int firstSegmentIndex = tripletsInGPU.segmentIndices[2 * innerTripletIndex]; @@ -2077,7 +2078,7 @@ namespace SDL TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, - struct SDL::segments& segmentsInGPU, + SDL::segments& segmentsInGPU, struct SDL::triplets& tripletsInGPU, struct SDL::quintuplets& quintupletsInGPU, struct SDL::objectRanges& rangesInGPU, diff --git a/SDL/Segment.cuh b/SDL/Segment.cuh index 2dd34f0e..f7e1104e 100644 --- a/SDL/Segment.cuh +++ b/SDL/Segment.cuh @@ -10,94 +10,52 @@ namespace SDL { - // Temporary struct to handle ntuple writing - struct segments_temp - { - unsigned int* nMemoryLocations; - - unsigned int* mdIndices; - uint16_t* innerLowerModuleIndices; - uint16_t* outerLowerModuleIndices; - unsigned int* innerMiniDoubletAnchorHitIndices; - unsigned int* outerMiniDoubletAnchorHitIndices; - - int* nSegments; //number of segments per inner lower module - int* totOccupancySegments; //number of segments per inner lower module - FPX* dPhis; - FPX* dPhiMins; - FPX* dPhiMaxs; - FPX* dPhiChanges; - FPX* dPhiChangeMins; - FPX* dPhiChangeMaxs; - - //not so optional pixel dudes - float* ptIn; - float* ptErr; - float* px; - float* py; - float* pz; - float* etaErr; - float* eta; - float* phi; - int* charge; - unsigned int* seedIdx; - int* superbin; - int8_t* pixelType; - char* isQuad; - bool* isDup; - float* score; - float* circleCenterX; - float* circleCenterY; - float* circleRadius; - bool* partOfPT5; - uint4* pLSHitsIdxs; - }; - + template struct segments { // Buffer objects for each member variable - Buf dPhis_buf; - Buf dPhiMins_buf; - Buf dPhiMaxs_buf; - Buf dPhiChanges_buf; - Buf dPhiChangeMins_buf; - Buf dPhiChangeMaxs_buf; - - Buf innerLowerModuleIndices_buf; - Buf outerLowerModuleIndices_buf; - - Buf seedIdx_buf; - Buf mdIndices_buf; - Buf innerMiniDoubletAnchorHitIndices_buf; - Buf outerMiniDoubletAnchorHitIndices_buf; - Buf nMemoryLocations_buf; - - Buf nSegments_buf; - Buf totOccupancySegments_buf; - Buf charge_buf; - Buf superbin_buf; - - Buf pLSHitsIdxs_buf; // Please ensure that the 'uint4' type is defined and available in your scope. - - Buf pixelType_buf; - - Buf isQuad_buf; - - Buf isDup_buf; - Buf partOfPT5_buf; - - Buf ptIn_buf; - Buf ptErr_buf; - Buf px_buf; - Buf py_buf; - Buf pz_buf; - Buf etaErr_buf; - Buf eta_buf; - Buf phi_buf; - Buf score_buf; - Buf circleCenterX_buf; - Buf circleCenterY_buf; - Buf circleRadius_buf; + Buf dPhis_buf; + Buf dPhiMins_buf; + Buf dPhiMaxs_buf; + Buf dPhiChanges_buf; + Buf dPhiChangeMins_buf; + Buf dPhiChangeMaxs_buf; + + Buf innerLowerModuleIndices_buf; + Buf outerLowerModuleIndices_buf; + + Buf seedIdx_buf; + Buf mdIndices_buf; + Buf innerMiniDoubletAnchorHitIndices_buf; + Buf outerMiniDoubletAnchorHitIndices_buf; + Buf nMemoryLocations_buf; + + Buf nSegments_buf; + Buf totOccupancySegments_buf; + Buf charge_buf; + Buf superbin_buf; + + Buf pLSHitsIdxs_buf; + + Buf pixelType_buf; + + Buf isQuad_buf; + + Buf isDup_buf; + Buf partOfPT5_buf; + + Buf ptIn_buf; + Buf ptErr_buf; + Buf px_buf; + Buf py_buf; + Buf pz_buf; + Buf etaErr_buf; + Buf eta_buf; + Buf phi_buf; + Buf score_buf; + Buf circleCenterX_buf; + Buf circleCenterY_buf; + Buf circleRadius_buf; // Pointers towards the data of each buffer FPX* dPhis; @@ -139,46 +97,46 @@ namespace SDL bool* partOfPT5; uint4* pLSHitsIdxs; - template + template segments(unsigned int nMemoryLocationsIn, uint16_t nLowerModules, unsigned int maxPixelSegments, - TAcc const & devAcc, + TDevAcc const & devAccIn, TQueue& queue) : - mdIndices_buf(allocBufWrapper(devAcc, nMemoryLocationsIn*2)), - innerMiniDoubletAnchorHitIndices_buf(allocBufWrapper(devAcc, nMemoryLocationsIn)), - outerMiniDoubletAnchorHitIndices_buf(allocBufWrapper(devAcc, nMemoryLocationsIn)), - innerLowerModuleIndices_buf(allocBufWrapper(devAcc, nMemoryLocationsIn)), - outerLowerModuleIndices_buf(allocBufWrapper(devAcc, nMemoryLocationsIn)), - nSegments_buf(allocBufWrapper(devAcc, nLowerModules + 1)), - totOccupancySegments_buf(allocBufWrapper(devAcc, nLowerModules + 1)), - dPhis_buf(allocBufWrapper(devAcc, nMemoryLocationsIn)), - dPhiMins_buf(allocBufWrapper(devAcc, nMemoryLocationsIn)), - dPhiMaxs_buf(allocBufWrapper(devAcc, nMemoryLocationsIn)), - dPhiChanges_buf(allocBufWrapper(devAcc, nMemoryLocationsIn)), - dPhiChangeMins_buf(allocBufWrapper(devAcc, nMemoryLocationsIn)), - dPhiChangeMaxs_buf(allocBufWrapper(devAcc, nMemoryLocationsIn)), - ptIn_buf(allocBufWrapper(devAcc, maxPixelSegments)), - ptErr_buf(allocBufWrapper(devAcc, maxPixelSegments)), - px_buf(allocBufWrapper(devAcc, maxPixelSegments)), - py_buf(allocBufWrapper(devAcc, maxPixelSegments)), - pz_buf(allocBufWrapper(devAcc, maxPixelSegments)), - etaErr_buf(allocBufWrapper(devAcc, maxPixelSegments)), - eta_buf(allocBufWrapper(devAcc, maxPixelSegments)), - phi_buf(allocBufWrapper(devAcc, maxPixelSegments)), - superbin_buf(allocBufWrapper(devAcc, maxPixelSegments)), - pixelType_buf(allocBufWrapper(devAcc, maxPixelSegments)), - isQuad_buf(allocBufWrapper(devAcc, maxPixelSegments)), - isDup_buf(allocBufWrapper(devAcc, maxPixelSegments)), - score_buf(allocBufWrapper(devAcc, maxPixelSegments)), - charge_buf(allocBufWrapper(devAcc, maxPixelSegments)), - seedIdx_buf(allocBufWrapper(devAcc, maxPixelSegments)), - circleCenterX_buf(allocBufWrapper(devAcc, maxPixelSegments)), - circleCenterY_buf(allocBufWrapper(devAcc, maxPixelSegments)), - circleRadius_buf(allocBufWrapper(devAcc, maxPixelSegments)), - partOfPT5_buf(allocBufWrapper(devAcc, maxPixelSegments)), - pLSHitsIdxs_buf(allocBufWrapper(devAcc, maxPixelSegments)), - nMemoryLocations_buf(allocBufWrapper(devAcc, 1)) + mdIndices_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn*2)), + innerMiniDoubletAnchorHitIndices_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), + outerMiniDoubletAnchorHitIndices_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), + innerLowerModuleIndices_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), + outerLowerModuleIndices_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), + nSegments_buf(allocBufWrapper(devAccIn, nLowerModules + 1)), + totOccupancySegments_buf(allocBufWrapper(devAccIn, nLowerModules + 1)), + dPhis_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), + dPhiMins_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), + dPhiMaxs_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), + dPhiChanges_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), + dPhiChangeMins_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), + dPhiChangeMaxs_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), + ptIn_buf(allocBufWrapper(devAccIn, maxPixelSegments)), + ptErr_buf(allocBufWrapper(devAccIn, maxPixelSegments)), + px_buf(allocBufWrapper(devAccIn, maxPixelSegments)), + py_buf(allocBufWrapper(devAccIn, maxPixelSegments)), + pz_buf(allocBufWrapper(devAccIn, maxPixelSegments)), + etaErr_buf(allocBufWrapper(devAccIn, maxPixelSegments)), + eta_buf(allocBufWrapper(devAccIn, maxPixelSegments)), + phi_buf(allocBufWrapper(devAccIn, maxPixelSegments)), + superbin_buf(allocBufWrapper(devAccIn, maxPixelSegments)), + pixelType_buf(allocBufWrapper(devAccIn, maxPixelSegments)), + isQuad_buf(allocBufWrapper(devAccIn, maxPixelSegments)), + isDup_buf(allocBufWrapper(devAccIn, maxPixelSegments)), + score_buf(allocBufWrapper(devAccIn, maxPixelSegments)), + charge_buf(allocBufWrapper(devAccIn, maxPixelSegments)), + seedIdx_buf(allocBufWrapper(devAccIn, maxPixelSegments)), + circleCenterX_buf(allocBufWrapper(devAccIn, maxPixelSegments)), + circleCenterY_buf(allocBufWrapper(devAccIn, maxPixelSegments)), + circleRadius_buf(allocBufWrapper(devAccIn, maxPixelSegments)), + partOfPT5_buf(allocBufWrapper(devAccIn, maxPixelSegments)), + pLSHitsIdxs_buf(allocBufWrapper(devAccIn, maxPixelSegments)), + nMemoryLocations_buf(allocBufWrapper(devAccIn, 1)) { mdIndices = alpaka::getPtrNative(mdIndices_buf); innerMiniDoubletAnchorHitIndices = alpaka::getPtrNative(innerMiniDoubletAnchorHitIndices_buf); @@ -481,7 +439,8 @@ namespace SDL dAlphaThresholdValues[2] = dAlpha_Bfield + alpaka::math::sqrt(acc, dAlpha_res * dAlpha_res + sdMuls * sdMuls); }; - ALPAKA_FN_ACC ALPAKA_FN_INLINE void addSegmentToMemory(struct segments& segmentsInGPU, unsigned int lowerMDIndex, unsigned int upperMDIndex, uint16_t innerLowerModuleIndex, uint16_t outerLowerModuleIndex, unsigned int innerMDAnchorHitIndex, unsigned int outerMDAnchorHitIndex, float& dPhi, float& dPhiMin, float& dPhiMax, float& dPhiChange, float& dPhiChangeMin, float& dPhiChangeMax, unsigned int idx) + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addSegmentToMemory(SDL::segments& segmentsInGPU, unsigned int lowerMDIndex, unsigned int upperMDIndex, uint16_t innerLowerModuleIndex, uint16_t outerLowerModuleIndex, unsigned int innerMDAnchorHitIndex, unsigned int outerMDAnchorHitIndex, float& dPhi, float& dPhiMin, float& dPhiMax, float& dPhiChange, float& dPhiChangeMin, float& dPhiChangeMax, unsigned int idx) { //idx will be computed in the kernel, which is the index into which the //segment will be written @@ -503,7 +462,7 @@ namespace SDL } template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelSegmentToMemory(TAcc const & acc, struct segments& segmentsInGPU, struct miniDoublets& mdsInGPU, unsigned int innerMDIndex, unsigned int outerMDIndex, uint16_t pixelModuleIndex, unsigned int hitIdxs[4], unsigned int innerAnchorHitIndex, unsigned int outerAnchorHitIndex, float dPhiChange, unsigned int idx, unsigned int pixelSegmentArrayIndex, float score) + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelSegmentToMemory(TAcc const & acc, SDL::segments& segmentsInGPU, struct miniDoublets& mdsInGPU, unsigned int innerMDIndex, unsigned int outerMDIndex, uint16_t pixelModuleIndex, unsigned int hitIdxs[4], unsigned int innerAnchorHitIndex, unsigned int outerAnchorHitIndex, float dPhiChange, unsigned int idx, unsigned int pixelSegmentArrayIndex, float score) { segmentsInGPU.mdIndices[idx * 2] = innerMDIndex; segmentsInGPU.mdIndices[idx * 2 + 1] = outerMDIndex; @@ -734,7 +693,7 @@ namespace SDL TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, - struct SDL::segments& segmentsInGPU, + struct SDL::segments& segmentsInGPU, struct SDL::objectRanges& rangesInGPU) const { using Dim = alpaka::Dim; @@ -886,7 +845,7 @@ namespace SDL ALPAKA_FN_ACC void operator()( TAcc const & acc, struct modules& modulesInGPU, - struct segments& segmentsInGPU, + SDL::segments& segmentsInGPU, struct objectRanges& rangesInGPU) const { using Dim = alpaka::Dim; diff --git a/SDL/TrackCandidate.cuh b/SDL/TrackCandidate.cuh index d81a570d..2e48c48c 100644 --- a/SDL/TrackCandidate.cuh +++ b/SDL/TrackCandidate.cuh @@ -81,7 +81,8 @@ namespace SDL trackCandidatesInGPU.radius[trackCandidateIndex] = __F2H(radius); }; - ALPAKA_FN_ACC ALPAKA_FN_INLINE int checkPixelHits(unsigned int ix, unsigned int jx, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct SDL::hits& hitsInGPU) + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE int checkPixelHits(unsigned int ix, unsigned int jx, struct SDL::miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, struct SDL::hits& hitsInGPU) { int phits1[4] = {-1,-1,-1,-1}; int phits2[4] = {-1,-1,-1,-1}; @@ -127,7 +128,7 @@ namespace SDL struct SDL::modules& modulesInGPU, struct SDL::objectRanges& rangesInGPU, struct SDL::pixelTriplets& pixelTripletsInGPU, - struct SDL::segments& segmentsInGPU, + SDL::segments& segmentsInGPU, struct SDL::pixelQuintuplets& pixelQuintupletsInGPU) const { using Dim = alpaka::Dim; @@ -242,7 +243,7 @@ namespace SDL struct SDL::objectRanges& rangesInGPU, struct SDL::pixelTriplets& pixelTripletsInGPU, struct SDL::trackCandidates& trackCandidatesInGPU, - struct SDL::segments& segmentsInGPU, + SDL::segments& segmentsInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::hits& hitsInGPU, struct SDL::quintuplets& quintupletsInGPU) const @@ -327,7 +328,7 @@ namespace SDL uint16_t nLowerModules, struct SDL::pixelTriplets& pixelTripletsInGPU, struct SDL::trackCandidates& trackCandidatesInGPU, - struct SDL::segments& segmentsInGPU, + SDL::segments& segmentsInGPU, struct SDL::objectRanges& rangesInGPU) const { using Dim = alpaka::Dim; @@ -398,7 +399,7 @@ namespace SDL TAcc const & acc, uint16_t nLowerModules, struct SDL::trackCandidates& trackCandidatesInGPU, - struct SDL::segments& segmentsInGPU) const + SDL::segments& segmentsInGPU) const { using Dim = alpaka::Dim; using Idx = alpaka::Idx; @@ -428,7 +429,7 @@ namespace SDL uint16_t nLowerModules, struct SDL::pixelQuintuplets& pixelQuintupletsInGPU, struct SDL::trackCandidates& trackCandidatesInGPU, - struct SDL::segments& segmentsInGPU, + SDL::segments& segmentsInGPU, struct SDL::objectRanges& rangesInGPU) const { using Dim = alpaka::Dim; diff --git a/SDL/Triplet.cuh b/SDL/Triplet.cuh index 16ea085d..c548ee21 100644 --- a/SDL/Triplet.cuh +++ b/SDL/Triplet.cuh @@ -59,9 +59,11 @@ namespace SDL void createTripletsInExplicitMemory(struct triplets& tripletsInGPU, unsigned int maxTriplets, uint16_t nLowerModules,cudaStream_t stream); #ifdef CUT_VALUE_DEBUG - ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float&zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ, unsigned int& tripletIndex) + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float&zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ, unsigned int& tripletIndex) #else - ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& betaIn, float& betaOut, float& pt_beta, unsigned int& tripletIndex) + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& betaIn, float& betaOut, float& pt_beta, unsigned int& tripletIndex) #endif { tripletsInGPU.segmentIndices[tripletIndex * 2] = innerSegmentIndex; @@ -108,7 +110,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passRZConstraint(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passRZConstraint(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex) { //get the rt and z const float& r1 = mdsInGPU.anchorRt[firstMDIndex]; @@ -189,7 +191,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut) { bool pass = true; bool isPSIn = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS); @@ -248,7 +250,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut) { bool pass = true; //unsigned int outerInnerLowerModuleIndex = middleLowerModuleIndex; @@ -327,7 +329,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut) { bool pass = true; bool isPSIn = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS); @@ -407,7 +409,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraint(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraint(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut) { short innerInnerLowerModuleSubdet = modulesInGPU.subdets[innerInnerLowerModuleIndex]; short middleLowerModuleSubdet = modulesInGPU.subdets[middleLowerModuleIndex]; @@ -442,7 +444,8 @@ namespace SDL return false; // failsafe }; - void printTriplet(struct triplets& tripletsInGPU, struct segments& segmentsInGPU, struct miniDoublets& mdsInGPU, struct hits& hitsInGPU, struct modules& modulesInGPU, unsigned int tripletIndex); + template + void printTriplet(struct triplets& tripletsInGPU, SDL::segments& segmentsInGPU, struct miniDoublets& mdsInGPU, struct hits& hitsInGPU, struct modules& modulesInGPU, unsigned int tripletIndex); template ALPAKA_FN_ACC ALPAKA_FN_INLINE void runDeltaBetaIterationsT3(TAcc const & acc, float& betaIn, float& betaOut, float& betaAv, float & pt_beta, float sdIn_dr, float sdOut_dr, float dr, float lIn) @@ -490,7 +493,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoBBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoBBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut) { bool pass = true; @@ -686,7 +689,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoBBEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoBBEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) { bool pass = true; bool isPS_InLo = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS); @@ -901,7 +904,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoEEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoEEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) { bool pass = true; @@ -1110,7 +1113,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) { bool pass = false; @@ -1175,7 +1178,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletConstraintsAndAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float &zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletConstraintsAndAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float &zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) { bool pass = true; @@ -1202,7 +1205,7 @@ namespace SDL TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, - struct SDL::segments& segmentsInGPU, + SDL::segments& segmentsInGPU, struct SDL::triplets& tripletsInGPU, struct SDL::objectRanges& rangesInGPU, uint16_t *index_gpu, @@ -1277,7 +1280,7 @@ namespace SDL TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, - struct segments& segmentsInGPU) const + SDL::segments& segmentsInGPU) const { using Dim = alpaka::Dim; using Idx = alpaka::Idx; diff --git a/code/core/AccessHelper.cc b/code/core/AccessHelper.cc index c721316c..2bf534b2 100644 --- a/code/core/AccessHelper.cc +++ b/code/core/AccessHelper.cc @@ -28,7 +28,7 @@ std::tuple, std::vector> convertHitsToHi //____________________________________________________________________________________________ std::vector getPixelHitsFrompLS(SDL::Event* event, unsigned int pLS) { - SDL::segments_temp& segments_ = *(event->getSegments()); + SDL::segments& segments_ = *(event->getSegments()); SDL::miniDoublets& miniDoublets_ = *(event->getMiniDoublets()); SDL::objectRanges& rangesInGPU = (*event->getRanges()); SDL::modules& modulesInGPU = (*event->getModules()); @@ -96,7 +96,7 @@ std::tuple, std::vector> getHitIdxsAndHi //____________________________________________________________________________________________ std::vector getMDsFromLS(SDL::Event* event, unsigned int LS) { - SDL::segments_temp& segments_ = *(event->getSegments()); + SDL::segments& segments_ = *(event->getSegments()); unsigned int MD_1 = segments_.mdIndices[2 * LS]; unsigned int MD_2 = segments_.mdIndices[2 * LS + 1]; return {MD_1, MD_2}; diff --git a/code/core/write_sdl_ntuple.cc b/code/core/write_sdl_ntuple.cc index da7ff47f..dcc9f070 100644 --- a/code/core/write_sdl_ntuple.cc +++ b/code/core/write_sdl_ntuple.cc @@ -307,7 +307,7 @@ void setPixelQuintupletOutputBranches(SDL::Event* event) // ============ pT5 ============= SDL::pixelQuintuplets& pixelQuintupletsInGPU = (*event->getPixelQuintuplets()); SDL::quintuplets& quintupletsInGPU = (*event->getQuintuplets()); - SDL::segments_temp& segmentsInGPU = (*event->getSegments()); + SDL::segments& segmentsInGPU = (*event->getSegments()); SDL::modules& modulesInGPU = (*event->getModules()); int n_accepted_simtrk = ana.tx->getBranch>("sim_TC_matched").size(); @@ -476,7 +476,7 @@ void setPixelTripletOutputBranches(SDL::Event* event) SDL::pixelTriplets& pixelTripletsInGPU = (*event->getPixelTriplets()); SDL::triplets& tripletsInGPU = *(event->getTriplets()); SDL::modules& modulesInGPU = *(event->getModules()); - SDL::segments_temp& segmentsInGPU = *(event->getSegments()); + SDL::segments& segmentsInGPU = *(event->getSegments()); SDL::hits& hitsInGPU = *(event->getHits()); int n_accepted_simtrk = ana.tx->getBranch>("sim_TC_matched").size(); @@ -559,7 +559,7 @@ void setPixelTripletOutputBranches(SDL::Event* event) void setGnnNtupleBranches(SDL::Event* event) { // Get relevant information - SDL::segments_temp& segmentsInGPU = (*event->getSegments()); + SDL::segments& segmentsInGPU = (*event->getSegments()); SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); SDL::hits& hitsInGPU = (*event->getHits()); SDL::modules& modulesInGPU = (*event->getModules()); @@ -821,7 +821,7 @@ std::tuple, vector> pars // Get relevant information SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates()); SDL::triplets& tripletsInGPU = (*event->getTriplets()); - SDL::segments_temp& segmentsInGPU = (*event->getSegments()); + SDL::segments& segmentsInGPU = (*event->getSegments()); SDL::hits& hitsInGPU = (*event->getHits()); // @@ -959,7 +959,7 @@ std::tuple, vector> pars // Get relevant information SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates()); SDL::triplets& tripletsInGPU = (*event->getTriplets()); - SDL::segments_temp& segmentsInGPU = (*event->getSegments()); + SDL::segments& segmentsInGPU = (*event->getSegments()); SDL::hits& hitsInGPU = (*event->getHits()); // @@ -1059,7 +1059,7 @@ std::tuple, vector> pars std::tuple, vector> parsepLS(SDL::Event* event, unsigned int idx) { SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates()); - SDL::segments_temp& segmentsInGPU = (*event->getSegments()); + SDL::segments& segmentsInGPU = (*event->getSegments()); // Getting pLS index unsigned int pLS = trackCandidatesInGPU.directObjectIndices[idx]; @@ -1174,7 +1174,7 @@ void printMDs(SDL::Event* event) //________________________________________________________________________________________________________________________________ void printLSs(SDL::Event* event) { - SDL::segments_temp& segmentsInGPU = (*event->getSegments()); + SDL::segments& segmentsInGPU = (*event->getSegments()); SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); SDL::hits& hitsInGPU = (*event->getHits()); SDL::modules& modulesInGPU = (*event->getModules()); @@ -1207,7 +1207,7 @@ void printLSs(SDL::Event* event) //________________________________________________________________________________________________________________________________ void printpLSs(SDL::Event* event) { - SDL::segments_temp& segmentsInGPU = (*event->getSegments()); + SDL::segments& segmentsInGPU = (*event->getSegments()); SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); SDL::hits& hitsInGPU = (*event->getHits()); SDL::modules& modulesInGPU = (*event->getModules()); @@ -1238,7 +1238,7 @@ void printpLSs(SDL::Event* event) void printT3s(SDL::Event* event) { SDL::triplets& tripletsInGPU = (*event->getTriplets()); - SDL::segments_temp& segmentsInGPU = (*event->getSegments()); + SDL::segments& segmentsInGPU = (*event->getSegments()); SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); SDL::hits& hitsInGPU = (*event->getHits()); SDL::modules& modulesInGPU = (*event->getModules()); @@ -1281,7 +1281,7 @@ void debugPrintOutlierMultiplicities(SDL::Event* event) { SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates()); SDL::triplets& tripletsInGPU = (*event->getTriplets()); - SDL::segments_temp& segmentsInGPU = (*event->getSegments()); + SDL::segments& segmentsInGPU = (*event->getSegments()); SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); //SDL::hits& hitsInGPU = (*event->getHits()); SDL::modules& modulesInGPU = (*event->getModules()); From 7670310e9521af2c3fc3b03683a7bf6e88f2aced Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Thu, 1 Jun 2023 18:04:45 -0400 Subject: [PATCH 11/44] formatting fixes --- SDL/Constants.cuh | 2 +- SDL/Event.cu | 2 +- SDL/Kernels.cuh | 4 ++-- SDL/PixelTriplet.cuh | 24 ++++++++++++------------ SDL/Quintuplet.cuh | 14 +++++++------- SDL/Segment.cuh | 6 +++--- SDL/TrackCandidate.cuh | 12 ++++++------ SDL/Triplet.cuh | 30 +++++++++++++++--------------- 8 files changed, 47 insertions(+), 47 deletions(-) diff --git a/SDL/Constants.cuh b/SDL/Constants.cuh index 007e022c..7dfbbbe8 100644 --- a/SDL/Constants.cuh +++ b/SDL/Constants.cuh @@ -81,7 +81,7 @@ template using Buf = alpaka::Buf; template -Buf inline allocBufWrapper(TAcc const & devAccIn, TSize nElements) { +ALPAKA_FN_HOST ALPAKA_FN_INLINE Buf allocBufWrapper(TAcc const & devAccIn, TSize nElements) { return alpaka::allocBuf(devAccIn, Vec1d(static_cast(nElements))); } diff --git a/SDL/Event.cu b/SDL/Event.cu index 9db88f40..d7a89ec6 100644 --- a/SDL/Event.cu +++ b/SDL/Event.cu @@ -681,7 +681,7 @@ struct addPixelSegmentToEventKernel struct SDL::objectRanges& rangesInGPU, struct SDL::hits& hitsInGPU, struct SDL::miniDoublets& mdsInGPU, - SDL::segments& segmentsInGPU, + struct SDL::segments& segmentsInGPU, unsigned int* hitIndices0, unsigned int* hitIndices1, unsigned int* hitIndices2, diff --git a/SDL/Kernels.cuh b/SDL/Kernels.cuh index 068e66f6..4cc1310d 100644 --- a/SDL/Kernels.cuh +++ b/SDL/Kernels.cuh @@ -28,7 +28,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void rmPixelSegmentFromMemory(SDL::segments& segmentsInGPU, unsigned int pixelSegmentArrayIndex) + ALPAKA_FN_ACC ALPAKA_FN_INLINE void rmPixelSegmentFromMemory(struct SDL::segments& segmentsInGPU, unsigned int pixelSegmentArrayIndex) { segmentsInGPU.isDup[pixelSegmentArrayIndex] = 1; }; @@ -453,7 +453,7 @@ namespace SDL ALPAKA_FN_ACC void operator()( TAcc const & acc, struct SDL::modules& modulesInGPU, - SDL::segments& segmentsInGPU, + struct SDL::segments& segmentsInGPU, bool secondpass) const { using Dim = alpaka::Dim; diff --git a/SDL/PixelTriplet.cuh b/SDL/PixelTriplet.cuh index d8d37fef..f7aaa54e 100644 --- a/SDL/PixelTriplet.cuh +++ b/SDL/PixelTriplet.cuh @@ -49,7 +49,7 @@ namespace SDL void createPixelTripletsInExplicitMemory(struct pixelTriplets& pixelTripletsinGPU, unsigned int maxPixelTriplets, cudaStream_t stream); template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, struct pixelTriplets& pixelTripletsInGPU, unsigned int pixelSegmentIndex, unsigned int tripletIndex, float pixelRadius, float tripletRadius, float centerX, float centerY, float rPhiChiSquared, float rPhiChiSquaredInwards, float rzChiSquared, unsigned int pixelTripletIndex, float pt, float eta, float phi, float eta_pix, float phi_pix,float score) + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, struct pixelTriplets& pixelTripletsInGPU, unsigned int pixelSegmentIndex, unsigned int tripletIndex, float pixelRadius, float tripletRadius, float centerX, float centerY, float rPhiChiSquared, float rPhiChiSquaredInwards, float rzChiSquared, unsigned int pixelTripletIndex, float pt, float eta, float phi, float eta_pix, float phi_pix,float score) { pixelTripletsInGPU.pixelSegmentIndices[pixelTripletIndex] = pixelSegmentIndex; pixelTripletsInGPU.tripletIndices[pixelTripletIndex] = tripletIndex; @@ -131,7 +131,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTrackletDefaultAlgopT3(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& pixelLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTrackletDefaultAlgopT3(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& pixelLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) { zLo = -999; zHi = -999; @@ -664,7 +664,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTripletDefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& pixelSegmentIndex, unsigned int tripletIndex, float& pixelRadius, float& pixelRadiusError, float& tripletRadius, float& centerX, float& centerY, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, bool runChiSquaredCuts = true) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTripletDefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& pixelSegmentIndex, unsigned int tripletIndex, float& pixelRadius, float& pixelRadiusError, float& tripletRadius, float& centerX, float& centerY, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, bool runChiSquaredCuts = true) { bool pass = true; @@ -769,7 +769,7 @@ namespace SDL struct SDL::modules& modulesInGPU, struct SDL::objectRanges& rangesInGPU, struct SDL::miniDoublets& mdsInGPU, - SDL::segments& segmentsInGPU, + struct SDL::segments& segmentsInGPU, struct SDL::triplets& tripletsInGPU, struct SDL::pixelTriplets& pixelTripletsInGPU, unsigned int* connectedPixelSize, @@ -912,7 +912,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPBB(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int thirdMDIndex, unsigned int& fourthMDIndex, float& /*z_OutLo*/, float& /*rt_OutLo*/, float& dPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaOutCut, float& deltaBetaCut) // pixel to BB and BE segments + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPBB(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int thirdMDIndex, unsigned int& fourthMDIndex, float& /*z_OutLo*/, float& /*rt_OutLo*/, float& dPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaOutCut, float& deltaBetaCut) // pixel to BB and BE segments { bool pass = true; @@ -1125,7 +1125,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPEE(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU ,SDL::segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& /*z_OutLo*/, float& /*rt_OutLo*/, float& deltaPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) // pixel to EE segments + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPEE(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU ,struct SDL::segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& /*z_OutLo*/, float& /*rt_OutLo*/, float& deltaPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) // pixel to EE segments { bool pass = true; bool isPS_OutLo = (modulesInGPU.moduleType[outerInnerLowerModuleIndex] == SDL::PS); @@ -1387,7 +1387,7 @@ namespace SDL void createPixelQuintupletsInExplicitMemory(struct SDL::pixelQuintuplets& pixelQuintupletsInGPU, unsigned int maxPixelQuintuplets, cudaStream_t stream); template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelQuintupletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, struct quintuplets& quintupletsInGPU, struct pixelQuintuplets& pixelQuintupletsInGPU, unsigned int pixelIndex, unsigned int T5Index, unsigned int pixelQuintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float score, float eta, float phi, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY) + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelQuintupletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct quintuplets& quintupletsInGPU, struct pixelQuintuplets& pixelQuintupletsInGPU, unsigned int pixelIndex, unsigned int T5Index, unsigned int pixelQuintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float score, float eta, float phi, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY) { pixelQuintupletsInGPU.pixelIndices[pixelQuintupletIndex] = pixelIndex; pixelQuintupletsInGPU.T5Indices[pixelQuintupletIndex] = T5Index; @@ -1968,7 +1968,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelQuintupletDefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, struct quintuplets& quintupletsInGPU, unsigned int& pixelSegmentIndex, unsigned int& quintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY, unsigned int pixelSegmentArrayIndex) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelQuintupletDefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, struct quintuplets& quintupletsInGPU, unsigned int& pixelSegmentIndex, unsigned int& quintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY, unsigned int pixelSegmentArrayIndex) { bool pass = true; @@ -2102,7 +2102,7 @@ namespace SDL TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, - SDL::segments& segmentsInGPU, + struct SDL::segments& segmentsInGPU, struct SDL::triplets& tripletsInGPU, struct SDL::quintuplets& quintupletsInGPU, struct SDL::pixelQuintuplets& pixelQuintupletsInGPU, @@ -2228,7 +2228,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgoPPBB(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int thirdMDIndex, unsigned int& fourthMDIndex, float& z_OutLo, float& rt_OutLo, float& dPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaOutCut, float& deltaBetaCut) // pixel to BB and BE segments + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgoPPBB(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int thirdMDIndex, unsigned int& fourthMDIndex, float& z_OutLo, float& rt_OutLo, float& dPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaOutCut, float& deltaBetaCut) // pixel to BB and BE segments { bool pass = true; @@ -2435,7 +2435,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgoPPEE(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU ,SDL::segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& z_OutLo, float& rt_OutLo, float& deltaPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) // pixel to EE segments + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgoPPEE(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU ,struct SDL::segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& z_OutLo, float& rt_OutLo, float& deltaPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) // pixel to EE segments { bool pass = true; bool isPS_OutLo = (modulesInGPU.moduleType[outerInnerLowerModuleIndex] == SDL::PS); @@ -2650,7 +2650,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& pixelLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& pixelLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) { zLo = -999; zHi = -999; diff --git a/SDL/Quintuplet.cuh b/SDL/Quintuplet.cuh index 1a34d763..b4fe6b3d 100644 --- a/SDL/Quintuplet.cuh +++ b/SDL/Quintuplet.cuh @@ -645,7 +645,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool T5HasCommonMiniDoublet(struct SDL::triplets& tripletsInGPU, SDL::segments& segmentsInGPU, unsigned int innerTripletIndex, unsigned int outerTripletIndex) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool T5HasCommonMiniDoublet(struct SDL::triplets& tripletsInGPU, struct SDL::segments& segmentsInGPU, unsigned int innerTripletIndex, unsigned int outerTripletIndex) { unsigned int innerOuterSegmentIndex = tripletsInGPU.segmentIndices[2 * innerTripletIndex + 1]; unsigned int outerInnerSegmentIndex = tripletsInGPU.segmentIndices[2 * outerTripletIndex]; @@ -1205,7 +1205,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoBBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoBBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut) { bool pass = true; @@ -1398,7 +1398,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoBBEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoBBEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) { bool pass = true; bool isPS_InLo = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS); @@ -1609,7 +1609,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoEEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoEEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) { bool pass = true; @@ -1816,7 +1816,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletAlgoSelector(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletAlgoSelector(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) { bool pass = false; @@ -1874,7 +1874,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, struct SDL::triplets& tripletsInGPU, uint16_t& lowerModuleIndex1, uint16_t& lowerModuleIndex2, uint16_t& lowerModuleIndex3, uint16_t& lowerModuleIndex4, uint16_t& lowerModuleIndex5, unsigned int& innerTripletIndex, unsigned int& outerTripletIndex, float& innerRadius, float& outerRadius, float& bridgeRadius, float& regressionG, float& regressionF, float& regressionRadius, float& rzChiSquared, float& chiSquared, float& nonAnchorChiSquared, bool& TightCutFlag) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct SDL::triplets& tripletsInGPU, uint16_t& lowerModuleIndex1, uint16_t& lowerModuleIndex2, uint16_t& lowerModuleIndex3, uint16_t& lowerModuleIndex4, uint16_t& lowerModuleIndex5, unsigned int& innerTripletIndex, unsigned int& outerTripletIndex, float& innerRadius, float& outerRadius, float& bridgeRadius, float& regressionG, float& regressionF, float& regressionRadius, float& rzChiSquared, float& chiSquared, float& nonAnchorChiSquared, bool& TightCutFlag) { bool pass = true; unsigned int firstSegmentIndex = tripletsInGPU.segmentIndices[2 * innerTripletIndex]; @@ -2078,7 +2078,7 @@ namespace SDL TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, - SDL::segments& segmentsInGPU, + struct SDL::segments& segmentsInGPU, struct SDL::triplets& tripletsInGPU, struct SDL::quintuplets& quintupletsInGPU, struct SDL::objectRanges& rangesInGPU, diff --git a/SDL/Segment.cuh b/SDL/Segment.cuh index f7e1104e..5eb74b30 100644 --- a/SDL/Segment.cuh +++ b/SDL/Segment.cuh @@ -440,7 +440,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void addSegmentToMemory(SDL::segments& segmentsInGPU, unsigned int lowerMDIndex, unsigned int upperMDIndex, uint16_t innerLowerModuleIndex, uint16_t outerLowerModuleIndex, unsigned int innerMDAnchorHitIndex, unsigned int outerMDAnchorHitIndex, float& dPhi, float& dPhiMin, float& dPhiMax, float& dPhiChange, float& dPhiChangeMin, float& dPhiChangeMax, unsigned int idx) + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addSegmentToMemory(struct SDL::segments& segmentsInGPU, unsigned int lowerMDIndex, unsigned int upperMDIndex, uint16_t innerLowerModuleIndex, uint16_t outerLowerModuleIndex, unsigned int innerMDAnchorHitIndex, unsigned int outerMDAnchorHitIndex, float& dPhi, float& dPhiMin, float& dPhiMax, float& dPhiChange, float& dPhiChangeMin, float& dPhiChangeMax, unsigned int idx) { //idx will be computed in the kernel, which is the index into which the //segment will be written @@ -462,7 +462,7 @@ namespace SDL } template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelSegmentToMemory(TAcc const & acc, SDL::segments& segmentsInGPU, struct miniDoublets& mdsInGPU, unsigned int innerMDIndex, unsigned int outerMDIndex, uint16_t pixelModuleIndex, unsigned int hitIdxs[4], unsigned int innerAnchorHitIndex, unsigned int outerAnchorHitIndex, float dPhiChange, unsigned int idx, unsigned int pixelSegmentArrayIndex, float score) + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelSegmentToMemory(TAcc const & acc, struct SDL::segments& segmentsInGPU, struct miniDoublets& mdsInGPU, unsigned int innerMDIndex, unsigned int outerMDIndex, uint16_t pixelModuleIndex, unsigned int hitIdxs[4], unsigned int innerAnchorHitIndex, unsigned int outerAnchorHitIndex, float dPhiChange, unsigned int idx, unsigned int pixelSegmentArrayIndex, float score) { segmentsInGPU.mdIndices[idx * 2] = innerMDIndex; segmentsInGPU.mdIndices[idx * 2 + 1] = outerMDIndex; @@ -845,7 +845,7 @@ namespace SDL ALPAKA_FN_ACC void operator()( TAcc const & acc, struct modules& modulesInGPU, - SDL::segments& segmentsInGPU, + struct SDL::segments& segmentsInGPU, struct objectRanges& rangesInGPU) const { using Dim = alpaka::Dim; diff --git a/SDL/TrackCandidate.cuh b/SDL/TrackCandidate.cuh index 2e48c48c..738037fa 100644 --- a/SDL/TrackCandidate.cuh +++ b/SDL/TrackCandidate.cuh @@ -82,7 +82,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE int checkPixelHits(unsigned int ix, unsigned int jx, struct SDL::miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, struct SDL::hits& hitsInGPU) + ALPAKA_FN_ACC ALPAKA_FN_INLINE int checkPixelHits(unsigned int ix, unsigned int jx, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct SDL::hits& hitsInGPU) { int phits1[4] = {-1,-1,-1,-1}; int phits2[4] = {-1,-1,-1,-1}; @@ -128,7 +128,7 @@ namespace SDL struct SDL::modules& modulesInGPU, struct SDL::objectRanges& rangesInGPU, struct SDL::pixelTriplets& pixelTripletsInGPU, - SDL::segments& segmentsInGPU, + struct SDL::segments& segmentsInGPU, struct SDL::pixelQuintuplets& pixelQuintupletsInGPU) const { using Dim = alpaka::Dim; @@ -243,7 +243,7 @@ namespace SDL struct SDL::objectRanges& rangesInGPU, struct SDL::pixelTriplets& pixelTripletsInGPU, struct SDL::trackCandidates& trackCandidatesInGPU, - SDL::segments& segmentsInGPU, + struct SDL::segments& segmentsInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::hits& hitsInGPU, struct SDL::quintuplets& quintupletsInGPU) const @@ -328,7 +328,7 @@ namespace SDL uint16_t nLowerModules, struct SDL::pixelTriplets& pixelTripletsInGPU, struct SDL::trackCandidates& trackCandidatesInGPU, - SDL::segments& segmentsInGPU, + struct SDL::segments& segmentsInGPU, struct SDL::objectRanges& rangesInGPU) const { using Dim = alpaka::Dim; @@ -399,7 +399,7 @@ namespace SDL TAcc const & acc, uint16_t nLowerModules, struct SDL::trackCandidates& trackCandidatesInGPU, - SDL::segments& segmentsInGPU) const + struct SDL::segments& segmentsInGPU) const { using Dim = alpaka::Dim; using Idx = alpaka::Idx; @@ -429,7 +429,7 @@ namespace SDL uint16_t nLowerModules, struct SDL::pixelQuintuplets& pixelQuintupletsInGPU, struct SDL::trackCandidates& trackCandidatesInGPU, - SDL::segments& segmentsInGPU, + struct SDL::segments& segmentsInGPU, struct SDL::objectRanges& rangesInGPU) const { using Dim = alpaka::Dim; diff --git a/SDL/Triplet.cuh b/SDL/Triplet.cuh index c548ee21..5baa5a3f 100644 --- a/SDL/Triplet.cuh +++ b/SDL/Triplet.cuh @@ -60,10 +60,10 @@ namespace SDL #ifdef CUT_VALUE_DEBUG template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float&zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ, unsigned int& tripletIndex) + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float&zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ, unsigned int& tripletIndex) #else template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& betaIn, float& betaOut, float& pt_beta, unsigned int& tripletIndex) + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& betaIn, float& betaOut, float& pt_beta, unsigned int& tripletIndex) #endif { tripletsInGPU.segmentIndices[tripletIndex * 2] = innerSegmentIndex; @@ -110,7 +110,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passRZConstraint(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passRZConstraint(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex) { //get the rt and z const float& r1 = mdsInGPU.anchorRt[firstMDIndex]; @@ -191,7 +191,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut) { bool pass = true; bool isPSIn = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS); @@ -250,7 +250,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut) { bool pass = true; //unsigned int outerInnerLowerModuleIndex = middleLowerModuleIndex; @@ -329,7 +329,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut) { bool pass = true; bool isPSIn = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS); @@ -409,7 +409,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraint(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraint(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut) { short innerInnerLowerModuleSubdet = modulesInGPU.subdets[innerInnerLowerModuleIndex]; short middleLowerModuleSubdet = modulesInGPU.subdets[middleLowerModuleIndex]; @@ -445,7 +445,7 @@ namespace SDL }; template - void printTriplet(struct triplets& tripletsInGPU, SDL::segments& segmentsInGPU, struct miniDoublets& mdsInGPU, struct hits& hitsInGPU, struct modules& modulesInGPU, unsigned int tripletIndex); + void printTriplet(struct triplets& tripletsInGPU, struct SDL::segments& segmentsInGPU, struct miniDoublets& mdsInGPU, struct hits& hitsInGPU, struct modules& modulesInGPU, unsigned int tripletIndex); template ALPAKA_FN_ACC ALPAKA_FN_INLINE void runDeltaBetaIterationsT3(TAcc const & acc, float& betaIn, float& betaOut, float& betaAv, float & pt_beta, float sdIn_dr, float sdOut_dr, float dr, float lIn) @@ -493,7 +493,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoBBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoBBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut) { bool pass = true; @@ -689,7 +689,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoBBEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoBBEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) { bool pass = true; bool isPS_InLo = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS); @@ -904,7 +904,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoEEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoEEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) { bool pass = true; @@ -1113,7 +1113,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) { bool pass = false; @@ -1178,7 +1178,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletConstraintsAndAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float &zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletConstraintsAndAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float &zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) { bool pass = true; @@ -1205,7 +1205,7 @@ namespace SDL TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, - SDL::segments& segmentsInGPU, + struct SDL::segments& segmentsInGPU, struct SDL::triplets& tripletsInGPU, struct SDL::objectRanges& rangesInGPU, uint16_t *index_gpu, @@ -1280,7 +1280,7 @@ namespace SDL TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, - SDL::segments& segmentsInGPU) const + struct SDL::segments& segmentsInGPU) const { using Dim = alpaka::Dim; using Idx = alpaka::Idx; From 955c3f027c16adf4856726873dc901487b27e2fe Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Thu, 1 Jun 2023 18:19:22 -0400 Subject: [PATCH 12/44] remove extra alpaka flags --- Makefile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index a49be49e..44c520d9 100644 --- a/Makefile +++ b/Makefile @@ -20,10 +20,9 @@ LDFLAGS = -g -O2 ROOTLIBS = $(shell root-config --libs) ROOTCFLAGS = $(foreach option, $(shell root-config --cflags), --compiler-options $(option)) ALPAKAINCLUDE = -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED -ALPAKAFLAGS = -DALPAKA_ACC_GPU_CUDA_ENABLED -DALPAKA_ACC_GPU_CUDA_ONLY --expt-relaxed-constexpr -DALPAKA_DEBUG=0 -CFLAGS = $(ROOTCFLAGS) --compiler-options -Wall --compiler-options -Wno-unused-function --compiler-options -g --compiler-options -O2 --compiler-options -fPIC --compiler-options -fno-var-tracking -ISDL -I$(shell pwd) -Icode -Icode/core -I/mnt/data1/dsr/cub -I${CUDA_HOME}/include --compiler-options -fopenmp -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include +CFLAGS = $(ROOTCFLAGS) --compiler-options -Wall --compiler-options -Wno-unused-function --compiler-options -g --compiler-options -O2 --compiler-options -fPIC --compiler-options -fno-var-tracking -ISDL -I$(shell pwd) -Icode -Icode/core -I/mnt/data1/dsr/cub -I${CUDA_HOME}/include --compiler-options -fopenmp EXTRACFLAGS = $(shell rooutil-config) -EXTRAFLAGS = -fPIC -ITMultiDrawTreePlayer -Wunused-variable -lTMVA -lEG -lGenVector -lXMLIO -lMLP -lTreePlayer -L${CUDA_HOME}/lib64 -lcudart -fopenmp -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include +EXTRAFLAGS = -fPIC -ITMultiDrawTreePlayer -Wunused-variable -lTMVA -lEG -lGenVector -lXMLIO -lMLP -lTreePlayer -L${CUDA_HOME}/lib64 -lcudart -fopenmp DOQUINTUPLET = -DFP16_Base -DFP16_dPhi #-DFP16_circle -DFP16_seg -DFP16_T5 #-DDO_QUINTUPLET #-DDO_QUADRUPLET PT0P8 = T3T3EXTENSION= From e4827f6a2bf4927c7b03202b47064e5920e5bc24 Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Thu, 1 Jun 2023 18:51:13 -0400 Subject: [PATCH 13/44] move elementsPerThread to Constants.cuh --- SDL/Constants.cuh | 2 ++ SDL/Event.cu | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/SDL/Constants.cuh b/SDL/Constants.cuh index 7dfbbbe8..66a874a7 100644 --- a/SDL/Constants.cuh +++ b/SDL/Constants.cuh @@ -58,6 +58,8 @@ using Vec1d = alpaka::Vec; using QueueProperty = alpaka::NonBlocking; using WorkDiv = alpaka::WorkDivMembers; +Vec const elementsPerThread(Vec::all(static_cast(1))); + // - AccGpuCudaRt // - AccCpuThreads // - AccCpuFibers diff --git a/SDL/Event.cu b/SDL/Event.cu index d7a89ec6..ae1e9d00 100644 --- a/SDL/Event.cu +++ b/SDL/Event.cu @@ -5,9 +5,6 @@ struct SDL::pixelMap* SDL::pixelMapping = nullptr; uint16_t SDL::nModules; uint16_t SDL::nLowerModules; -// Temporary alpaka statements -Vec const elementsPerThread(Vec::all(static_cast(1))); - SDL::Event::Event(cudaStream_t estream, bool verbose): queue(alpaka::getDevByIdx(0u)) { int version; From 1813b982e08ea48aa6fe74df59cd41f658d8fd21 Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Sat, 3 Jun 2023 12:31:41 -0400 Subject: [PATCH 14/44] fix cuda Malloc/Free bug, formatting fixes --- SDL/Event.cu | 2 +- SDL/Segment.cuh | 100 +++++++++++++++++++++++++++--------------------- 2 files changed, 57 insertions(+), 45 deletions(-) diff --git a/SDL/Event.cu b/SDL/Event.cu index ae1e9d00..694399d5 100644 --- a/SDL/Event.cu +++ b/SDL/Event.cu @@ -80,7 +80,7 @@ SDL::Event::~Event() #endif if(rangesInGPU != nullptr){cms::cuda::free_host(rangesInGPU);} if(mdsInGPU != nullptr){cms::cuda::free_host(mdsInGPU);} - if(segmentsInGPU!= nullptr){cms::cuda::free_host(segmentsInGPU);} + if(segmentsInGPU != nullptr){delete segmentsInGPU;} if(tripletsInGPU!= nullptr){cms::cuda::free_host(tripletsInGPU);} if(trackCandidatesInGPU!= nullptr){cms::cuda::free_host(trackCandidatesInGPU);} if(hitsInGPU!= nullptr){cms::cuda::free_host(hitsInGPU);} diff --git a/SDL/Segment.cuh b/SDL/Segment.cuh index 5eb74b30..065bee14 100644 --- a/SDL/Segment.cuh +++ b/SDL/Segment.cuh @@ -26,14 +26,14 @@ namespace SDL Buf seedIdx_buf; Buf mdIndices_buf; + Buf nMemoryLocations_buf; Buf innerMiniDoubletAnchorHitIndices_buf; Buf outerMiniDoubletAnchorHitIndices_buf; - Buf nMemoryLocations_buf; - Buf nSegments_buf; - Buf totOccupancySegments_buf; Buf charge_buf; Buf superbin_buf; + Buf nSegments_buf; + Buf totOccupancySegments_buf; Buf pLSHitsIdxs_buf; @@ -68,14 +68,26 @@ namespace SDL uint16_t* innerLowerModuleIndices; uint16_t* outerLowerModuleIndices; + unsigned int* seedIdx; unsigned int* mdIndices; unsigned int* nMemoryLocations; unsigned int* innerMiniDoubletAnchorHitIndices; unsigned int* outerMiniDoubletAnchorHitIndices; + int* charge; + int* superbin; int* nSegments; //number of segments per inner lower module int* totOccupancySegments; //number of segments per inner lower module + uint4* pLSHitsIdxs; + + int8_t* pixelType; + + char* isQuad; + + bool* isDup; + bool* partOfPT5; + float* ptIn; float* ptErr; float* px; @@ -84,18 +96,10 @@ namespace SDL float* etaErr; float* eta; float* phi; - int* charge; - unsigned int* seedIdx; - int* superbin; - int8_t* pixelType; - char* isQuad; - bool* isDup; float* score; float* circleCenterX; float* circleCenterY; float* circleRadius; - bool* partOfPT5; - uint4* pLSHitsIdxs; template segments(unsigned int nMemoryLocationsIn, @@ -103,19 +107,28 @@ namespace SDL unsigned int maxPixelSegments, TDevAcc const & devAccIn, TQueue& queue) : - mdIndices_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn*2)), - innerMiniDoubletAnchorHitIndices_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), - outerMiniDoubletAnchorHitIndices_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), - innerLowerModuleIndices_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), - outerLowerModuleIndices_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), - nSegments_buf(allocBufWrapper(devAccIn, nLowerModules + 1)), - totOccupancySegments_buf(allocBufWrapper(devAccIn, nLowerModules + 1)), dPhis_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), dPhiMins_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), dPhiMaxs_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), dPhiChanges_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), dPhiChangeMins_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), dPhiChangeMaxs_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), + innerLowerModuleIndices_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), + outerLowerModuleIndices_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), + seedIdx_buf(allocBufWrapper(devAccIn, maxPixelSegments)), + mdIndices_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn*2)), + innerMiniDoubletAnchorHitIndices_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), + outerMiniDoubletAnchorHitIndices_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), + nMemoryLocations_buf(allocBufWrapper(devAccIn, 1)), + nSegments_buf(allocBufWrapper(devAccIn, nLowerModules + 1)), + totOccupancySegments_buf(allocBufWrapper(devAccIn, nLowerModules + 1)), + charge_buf(allocBufWrapper(devAccIn, maxPixelSegments)), + superbin_buf(allocBufWrapper(devAccIn, maxPixelSegments)), + pLSHitsIdxs_buf(allocBufWrapper(devAccIn, maxPixelSegments)), + pixelType_buf(allocBufWrapper(devAccIn, maxPixelSegments)), + isQuad_buf(allocBufWrapper(devAccIn, maxPixelSegments)), + isDup_buf(allocBufWrapper(devAccIn, maxPixelSegments)), + partOfPT5_buf(allocBufWrapper(devAccIn, maxPixelSegments)), ptIn_buf(allocBufWrapper(devAccIn, maxPixelSegments)), ptErr_buf(allocBufWrapper(devAccIn, maxPixelSegments)), px_buf(allocBufWrapper(devAccIn, maxPixelSegments)), @@ -124,33 +137,41 @@ namespace SDL etaErr_buf(allocBufWrapper(devAccIn, maxPixelSegments)), eta_buf(allocBufWrapper(devAccIn, maxPixelSegments)), phi_buf(allocBufWrapper(devAccIn, maxPixelSegments)), - superbin_buf(allocBufWrapper(devAccIn, maxPixelSegments)), - pixelType_buf(allocBufWrapper(devAccIn, maxPixelSegments)), - isQuad_buf(allocBufWrapper(devAccIn, maxPixelSegments)), - isDup_buf(allocBufWrapper(devAccIn, maxPixelSegments)), score_buf(allocBufWrapper(devAccIn, maxPixelSegments)), - charge_buf(allocBufWrapper(devAccIn, maxPixelSegments)), - seedIdx_buf(allocBufWrapper(devAccIn, maxPixelSegments)), circleCenterX_buf(allocBufWrapper(devAccIn, maxPixelSegments)), circleCenterY_buf(allocBufWrapper(devAccIn, maxPixelSegments)), - circleRadius_buf(allocBufWrapper(devAccIn, maxPixelSegments)), - partOfPT5_buf(allocBufWrapper(devAccIn, maxPixelSegments)), - pLSHitsIdxs_buf(allocBufWrapper(devAccIn, maxPixelSegments)), - nMemoryLocations_buf(allocBufWrapper(devAccIn, 1)) + circleRadius_buf(allocBufWrapper(devAccIn, maxPixelSegments)) { - mdIndices = alpaka::getPtrNative(mdIndices_buf); - innerMiniDoubletAnchorHitIndices = alpaka::getPtrNative(innerMiniDoubletAnchorHitIndices_buf); - outerMiniDoubletAnchorHitIndices = alpaka::getPtrNative(outerMiniDoubletAnchorHitIndices_buf); - innerLowerModuleIndices = alpaka::getPtrNative(innerLowerModuleIndices_buf); - outerLowerModuleIndices = alpaka::getPtrNative(outerLowerModuleIndices_buf); - nSegments = alpaka::getPtrNative(nSegments_buf); - totOccupancySegments = alpaka::getPtrNative(totOccupancySegments_buf); dPhis = alpaka::getPtrNative(dPhis_buf); dPhiMins = alpaka::getPtrNative(dPhiMins_buf); dPhiMaxs = alpaka::getPtrNative(dPhiMaxs_buf); dPhiChanges = alpaka::getPtrNative(dPhiChanges_buf); dPhiChangeMins = alpaka::getPtrNative(dPhiChangeMins_buf); dPhiChangeMaxs = alpaka::getPtrNative(dPhiChangeMaxs_buf); + + innerLowerModuleIndices = alpaka::getPtrNative(innerLowerModuleIndices_buf); + outerLowerModuleIndices = alpaka::getPtrNative(outerLowerModuleIndices_buf); + + seedIdx = alpaka::getPtrNative(seedIdx_buf); + mdIndices = alpaka::getPtrNative(mdIndices_buf); + nMemoryLocations = alpaka::getPtrNative(nMemoryLocations_buf); + innerMiniDoubletAnchorHitIndices = alpaka::getPtrNative(innerMiniDoubletAnchorHitIndices_buf); + outerMiniDoubletAnchorHitIndices = alpaka::getPtrNative(outerMiniDoubletAnchorHitIndices_buf); + + charge = alpaka::getPtrNative(charge_buf); + superbin = alpaka::getPtrNative(superbin_buf); + nSegments = alpaka::getPtrNative(nSegments_buf); + totOccupancySegments = alpaka::getPtrNative(totOccupancySegments_buf); + + pLSHitsIdxs = alpaka::getPtrNative(pLSHitsIdxs_buf); + + pixelType = alpaka::getPtrNative(pixelType_buf); + + isQuad = alpaka::getPtrNative(isQuad_buf); + + isDup = alpaka::getPtrNative(isDup_buf); + partOfPT5 = alpaka::getPtrNative(partOfPT5_buf); + ptIn = alpaka::getPtrNative(ptIn_buf); ptErr = alpaka::getPtrNative(ptErr_buf); px = alpaka::getPtrNative(px_buf); @@ -159,19 +180,10 @@ namespace SDL etaErr = alpaka::getPtrNative(etaErr_buf); eta = alpaka::getPtrNative(eta_buf); phi = alpaka::getPtrNative(phi_buf); - superbin = alpaka::getPtrNative(superbin_buf); - pixelType = alpaka::getPtrNative(pixelType_buf); - isQuad = alpaka::getPtrNative(isQuad_buf); - isDup = alpaka::getPtrNative(isDup_buf); score = alpaka::getPtrNative(score_buf); - charge = alpaka::getPtrNative(charge_buf); - seedIdx = alpaka::getPtrNative(seedIdx_buf); circleCenterX = alpaka::getPtrNative(circleCenterX_buf); circleCenterY = alpaka::getPtrNative(circleCenterY_buf); circleRadius = alpaka::getPtrNative(circleRadius_buf); - partOfPT5 = alpaka::getPtrNative(partOfPT5_buf); - pLSHitsIdxs = alpaka::getPtrNative(pLSHitsIdxs_buf); - nMemoryLocations = alpaka::getPtrNative(nMemoryLocations_buf); alpaka::memset(queue, nSegments_buf, 0u, nLowerModules + 1); alpaka::memset(queue, totOccupancySegments_buf, 0u, nLowerModules + 1); From d7d466ed38907a70d89a1c5747e33885f7b00b2d Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Sun, 4 Jun 2023 17:57:26 -0400 Subject: [PATCH 15/44] first working hits.cu to alpaka memory --- SDL/Event.cu | 293 ++++++---------------------------- SDL/Event.cuh | 8 +- SDL/Hit.cu | 149 ----------------- SDL/Hit.cuh | 222 +++++++++++++++++++++++--- SDL/LST.cc | 2 +- SDL/MiniDoublet.cuh | 11 +- SDL/Segment.cuh | 52 +++++- SDL/TrackCandidate.cuh | 4 +- SDL/Triplet.cuh | 3 - code/core/AccessHelper.cc | 16 +- code/core/write_sdl_ntuple.cc | 22 ++- 11 files changed, 332 insertions(+), 450 deletions(-) delete mode 100644 SDL/Hit.cu diff --git a/SDL/Event.cu b/SDL/Event.cu index 694399d5..cb8e4747 100644 --- a/SDL/Event.cu +++ b/SDL/Event.cu @@ -61,7 +61,6 @@ SDL::Event::~Event() { #ifdef CACHE_ALLOC if(rangesInGPU){rangesInGPU->freeMemoryCache();} - if(hitsInGPU){hitsInGPU->freeMemoryCache();} if(mdsInGPU){mdsInGPU->freeMemoryCache();} if(tripletsInGPU){tripletsInGPU->freeMemoryCache();} if(quintupletsInGPU){quintupletsInGPU->freeMemoryCache();} @@ -70,7 +69,6 @@ SDL::Event::~Event() if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemoryCache();} #else if(rangesInGPU){rangesInGPU->freeMemory();} - if(hitsInGPU){hitsInGPU->freeMemory();} if(mdsInGPU){mdsInGPU->freeMemory(stream);} if(tripletsInGPU){tripletsInGPU->freeMemory(stream);} if(quintupletsInGPU){quintupletsInGPU->freeMemory(stream);} @@ -83,19 +81,13 @@ SDL::Event::~Event() if(segmentsInGPU != nullptr){delete segmentsInGPU;} if(tripletsInGPU!= nullptr){cms::cuda::free_host(tripletsInGPU);} if(trackCandidatesInGPU!= nullptr){cms::cuda::free_host(trackCandidatesInGPU);} - if(hitsInGPU!= nullptr){cms::cuda::free_host(hitsInGPU);} + if(hitsInGPU!= nullptr){delete hitsInGPU;} if(pixelTripletsInGPU!= nullptr){cms::cuda::free_host(pixelTripletsInGPU);} if(pixelQuintupletsInGPU!= nullptr){cms::cuda::free_host(pixelQuintupletsInGPU);} if(quintupletsInGPU!= nullptr){cms::cuda::free_host(quintupletsInGPU);} if(hitsInCPU != nullptr) { - delete[] hitsInCPU->idxs; - delete[] hitsInCPU->xs; - delete[] hitsInCPU->ys; - delete[] hitsInCPU->zs; - delete[] hitsInCPU->moduleIndices; - delete hitsInCPU->nHits; delete hitsInCPU; } if(rangesInCPU != nullptr) @@ -200,7 +192,6 @@ SDL::Event::~Event() delete trackCandidatesInCPU; } - if(modulesInCPU != nullptr) { delete[] modulesInCPU->nLowerModules; @@ -247,7 +238,6 @@ SDL::Event::~Event() void SDL::Event::resetEvent() { #ifdef CACHE_ALLOC - if(hitsInGPU){hitsInGPU->freeMemoryCache();} if(mdsInGPU){mdsInGPU->freeMemoryCache();} if(quintupletsInGPU){quintupletsInGPU->freeMemoryCache();} if(rangesInGPU){rangesInGPU->freeMemoryCache();} @@ -256,7 +246,6 @@ void SDL::Event::resetEvent() if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemoryCache();} if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemoryCache();} #else - if(hitsInGPU){hitsInGPU->freeMemory();} if(quintupletsInGPU){quintupletsInGPU->freeMemory(stream);} if(rangesInGPU){rangesInGPU->freeMemory();} if(mdsInGPU){mdsInGPU->freeMemory(stream);} @@ -284,7 +273,7 @@ void SDL::Event::resetEvent() n_quintuplets_by_layer_endcap_[i] = 0; } } - if(hitsInGPU){cms::cuda::free_host(hitsInGPU); + if(hitsInGPU){delete hitsInGPU; hitsInGPU = nullptr;} if(mdsInGPU){cms::cuda::free_host(mdsInGPU); mdsInGPU = nullptr;} @@ -305,12 +294,6 @@ void SDL::Event::resetEvent() if(hitsInCPU != nullptr) { - delete[] hitsInCPU->idxs; - delete[] hitsInCPU->xs; - delete[] hitsInCPU->ys; - delete[] hitsInCPU->zs; - delete[] hitsInCPU->moduleIndices; - delete hitsInCPU->nHits; delete hitsInCPU; hitsInCPU = nullptr; } @@ -321,7 +304,6 @@ void SDL::Event::resetEvent() delete rangesInCPU; rangesInCPU = nullptr; } - if(mdsInCPU != nullptr) { delete[] mdsInCPU->anchorHitIndices; @@ -330,13 +312,11 @@ void SDL::Event::resetEvent() delete mdsInCPU; mdsInCPU = nullptr; } - if(segmentsInCPU != nullptr) { delete segmentsInCPU; segmentsInCPU = nullptr; } - if(tripletsInCPU != nullptr) { delete[] tripletsInCPU->segmentIndices; @@ -381,7 +361,6 @@ void SDL::Event::resetEvent() delete pixelTripletsInCPU; pixelTripletsInCPU = nullptr; } - if(pixelQuintupletsInCPU != nullptr) { delete[] pixelQuintupletsInCPU->pixelIndices; @@ -407,7 +386,6 @@ void SDL::Event::resetEvent() delete trackCandidatesInCPU; trackCandidatesInCPU = nullptr; } - if(modulesInCPU != nullptr) { delete[] modulesInCPU->nLowerModules; @@ -444,14 +422,11 @@ void SDL::Event::resetEvent() delete[] modulesInCPUFull->r; delete[] modulesInCPUFull->isInverted; delete[] modulesInCPUFull->isLower; - - delete[] modulesInCPUFull->moduleType; delete[] modulesInCPUFull->moduleLayerType; delete[] modulesInCPUFull; modulesInCPUFull = nullptr; } - } void SDL::initModules(const char* moduleMetaDataFilePath) @@ -480,154 +455,39 @@ void SDL::Event::resetObjectsInModule() resetObjectRanges(*rangesInGPU,nModules,stream); } -ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE int binary_search( - unsigned int *data, // Array that we are searching over - unsigned int search_val, // Value we want to find in data array - unsigned int ndata) // Number of elements in data array -{ - unsigned int low = 0; - unsigned int high = ndata - 1; - - while(low <= high) - { - unsigned int mid = (low + high)/2; - unsigned int test_val = data[mid]; - if (test_val == search_val) - return mid; - else if (test_val > search_val) - high = mid - 1; - else - low = mid + 1; - } - // Couldn't find search value in array. - return -1; -} - -struct moduleRangesKernel -{ - ALPAKA_NO_HOST_ACC_WARNING - template - ALPAKA_FN_ACC void operator()( - TAcc const & acc, - struct SDL::modules *modulesInGPU, - struct SDL::hits *hitsInGPU, - int const & nLowerModules) const - { - using Dim = alpaka::Dim; - using Idx = alpaka::Idx; - using Vec = alpaka::Vec; - - Vec const globalThreadIdx = alpaka::getIdx(acc); - Vec const gridThreadExtent = alpaka::getWorkDiv(acc); - - for(int lowerIndex = globalThreadIdx[2]; lowerIndex < nLowerModules; lowerIndex += gridThreadExtent[2]) - { - uint16_t upperIndex = modulesInGPU->partnerModuleIndices[lowerIndex]; - if (hitsInGPU->hitRanges[lowerIndex * 2] != -1 && hitsInGPU->hitRanges[upperIndex * 2] != -1) - { - hitsInGPU->hitRangesLower[lowerIndex] = hitsInGPU->hitRanges[lowerIndex * 2]; - hitsInGPU->hitRangesUpper[lowerIndex] = hitsInGPU->hitRanges[upperIndex * 2]; - hitsInGPU->hitRangesnLower[lowerIndex] = hitsInGPU->hitRanges[lowerIndex * 2 + 1] - hitsInGPU->hitRanges[lowerIndex * 2] + 1; - hitsInGPU->hitRangesnUpper[lowerIndex] = hitsInGPU->hitRanges[upperIndex * 2 + 1] - hitsInGPU->hitRanges[upperIndex * 2] + 1; - } - } - } -}; - -struct hitLoopKernel -{ - ALPAKA_NO_HOST_ACC_WARNING - template - ALPAKA_FN_ACC void operator()( - TAcc const & acc, - uint16_t Endcap, // Integer corresponding to endcap in module subdets - uint16_t TwoS, // Integer corresponding to TwoS in moduleType - unsigned int nModules, // Number of modules - unsigned int nEndCapMap, // Number of elements in endcap map - unsigned int* geoMapDetId, // DetId's from endcap map - float* geoMapPhi, // Phi values from endcap map - struct SDL::modules *modulesInGPU, - struct SDL::hits *hitsInGPU, - int const & nHits) const // Total number of hits in event - { - using Dim = alpaka::Dim; - using Idx = alpaka::Idx; - using Vec = alpaka::Vec; - - Vec const globalThreadIdx = alpaka::getIdx(acc); - Vec const gridThreadExtent = alpaka::getWorkDiv(acc); - - for(int ihit = globalThreadIdx[2]; ihit < nHits; ihit += gridThreadExtent[2]) - { - float ihit_x = hitsInGPU->xs[ihit]; - float ihit_y = hitsInGPU->ys[ihit]; - float ihit_z = hitsInGPU->zs[ihit]; - int iDetId = hitsInGPU->detid[ihit]; - - hitsInGPU->rts[ihit] = alpaka::math::sqrt(acc, ihit_x*ihit_x + ihit_y*ihit_y); - hitsInGPU->phis[ihit] = SDL::phi(acc, ihit_x,ihit_y); - // Acosh has no supported implementation in Alpaka right now. - hitsInGPU->etas[ihit] = ((ihit_z>0)-(ihit_z<0)) * SDL::temp_acosh(acc, alpaka::math::sqrt(acc, ihit_x*ihit_x+ihit_y*ihit_y+ihit_z*ihit_z)/hitsInGPU->rts[ihit]); - int found_index = binary_search(modulesInGPU->mapdetId, iDetId, nModules); - uint16_t lastModuleIndex = modulesInGPU->mapIdx[found_index]; - - hitsInGPU->moduleIndices[ihit] = lastModuleIndex; - - if(modulesInGPU->subdets[lastModuleIndex] == Endcap && modulesInGPU->moduleType[lastModuleIndex] == TwoS) - { - int found_index = binary_search(geoMapDetId, iDetId, nEndCapMap); - float phi = 0; - // Unclear why these are not in map, but CPU map returns phi = 0 for all exceptions. - if (found_index != -1) - phi = geoMapPhi[found_index]; - float cos_phi = alpaka::math::cos(acc, phi); - hitsInGPU->highEdgeXs[ihit] = ihit_x + 2.5f * cos_phi; - hitsInGPU->lowEdgeXs[ihit] = ihit_x - 2.5f * cos_phi; - float sin_phi = alpaka::math::sin(acc, phi); - hitsInGPU->highEdgeYs[ihit] = ihit_y + 2.5f * sin_phi; - hitsInGPU->lowEdgeYs[ihit] = ihit_y - 2.5f * sin_phi; - } - // Need to set initial value if index hasn't been seen before. - int old = alpaka::atomicOp(acc, &(hitsInGPU->hitRanges[lastModuleIndex * 2]), -1, ihit); - // For subsequent visits, stores the min value. - if (old != -1) - alpaka::atomicOp(acc, &hitsInGPU->hitRanges[lastModuleIndex * 2], ihit); - - alpaka::atomicOp(acc, &hitsInGPU->hitRanges[lastModuleIndex * 2 + 1], ihit); - } - } -}; - void SDL::Event::addHitToEvent(std::vector x, std::vector y, std::vector z, std::vector detId, std::vector idxInNtuple) { // Use the actual number of hits instead of a max. const int nHits = x.size(); + // Needed for the memcpy to hitsInGPU below. + auto nHits_buf = allocBufWrapper(devHost, 1); + *alpaka::getPtrNative(nHits_buf) = nHits; + // Get current device for future use. cudaGetDevice(&dev); // Initialize space on device/host for next event. if (hitsInGPU == nullptr) { - hitsInGPU = (SDL::hits*)cms::cuda::allocate_host(sizeof(SDL::hits), stream); - // Unclear why but this has to be 2*nHits to avoid crashing. - createHitsInExplicitMemory(*hitsInGPU, nModules, 2*nHits, stream, 1); + hitsInGPU = new SDL::hits(nModules, nHits, devAcc, queue); } + if (rangesInGPU == nullptr) { rangesInGPU = (SDL::objectRanges*)cms::cuda::allocate_host(sizeof(SDL::objectRanges), stream); - createRangesInExplicitMemory(*rangesInGPU, nModules, stream, nLowerModules); + createRangesInExplicitMemory(*rangesInGPU, nModules, stream, nLowerModules); resetObjectsInModule(); } - cudaStreamSynchronize(stream); + // Copy the host arrays to the GPU. - cudaMemcpyAsync(hitsInGPU->xs, &x[0], nHits*sizeof(float), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(hitsInGPU->ys, &y[0], nHits*sizeof(float), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(hitsInGPU->zs, &z[0], nHits*sizeof(float), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(hitsInGPU->detid, &detId[0], nHits*sizeof(unsigned int), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(hitsInGPU->idxs, &idxInNtuple[0], nHits*sizeof(unsigned int), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(hitsInGPU->nHits, &nHits, sizeof(unsigned int), cudaMemcpyHostToDevice, stream); - cudaStreamSynchronize(stream); + alpaka::memcpy(queue, hitsInGPU->xs_buf, x, nHits); + alpaka::memcpy(queue, hitsInGPU->ys_buf, y, nHits); + alpaka::memcpy(queue, hitsInGPU->zs_buf, z, nHits); + alpaka::memcpy(queue, hitsInGPU->detid_buf, detId, nHits); + alpaka::memcpy(queue, hitsInGPU->idxs_buf, idxInNtuple, nHits); + alpaka::memcpy(queue, hitsInGPU->nHits_buf, nHits_buf, 1); + alpaka::wait(queue); Vec const threadsPerBlock1(static_cast(1), static_cast(1), static_cast(256)); Vec const blocksPerGrid1(static_cast(1), static_cast(1), static_cast(MAX_BLOCKS)); @@ -643,12 +503,11 @@ void SDL::Event::addHitToEvent(std::vector x, std::vector y, std:: SDL::endcapGeometry.nEndCapMap, SDL::endcapGeometry.geoMapDetId, SDL::endcapGeometry.geoMapPhi, - modulesInGPU, - hitsInGPU, + *modulesInGPU, + *hitsInGPU, nHits)); alpaka::enqueue(queue, hit_loop_task); - alpaka::wait(queue); Vec const threadsPerBlock2(static_cast(1), static_cast(1), static_cast(256)); Vec const blocksPerGrid2(static_cast(1), static_cast(1), static_cast(MAX_BLOCKS)); @@ -658,8 +517,8 @@ void SDL::Event::addHitToEvent(std::vector x, std::vector y, std:: auto const module_ranges_task(alpaka::createTaskKernel( module_ranges_workdiv, module_ranges_kernel, - modulesInGPU, - hitsInGPU, + *modulesInGPU, + *hitsInGPU, nLowerModules)); // Waiting isn't needed after second kernel call. Saves ~100 us. @@ -668,57 +527,6 @@ void SDL::Event::addHitToEvent(std::vector x, std::vector y, std:: alpaka::enqueue(queue, module_ranges_task); } -struct addPixelSegmentToEventKernel -{ - ALPAKA_NO_HOST_ACC_WARNING - template - ALPAKA_FN_ACC void operator()( - TAcc const & acc, - struct SDL::modules& modulesInGPU, - struct SDL::objectRanges& rangesInGPU, - struct SDL::hits& hitsInGPU, - struct SDL::miniDoublets& mdsInGPU, - struct SDL::segments& segmentsInGPU, - unsigned int* hitIndices0, - unsigned int* hitIndices1, - unsigned int* hitIndices2, - unsigned int* hitIndices3, - float* dPhiChange, - uint16_t pixelModuleIndex, - const int size) const - { - using Dim = alpaka::Dim; - using Idx = alpaka::Idx; - using Vec = alpaka::Vec; - - Vec const globalThreadIdx = alpaka::getIdx(acc); - Vec const gridThreadExtent = alpaka::getWorkDiv(acc); - - for(int tid = globalThreadIdx[2]; tid < size; tid += gridThreadExtent[2]) - { - unsigned int innerMDIndex = rangesInGPU.miniDoubletModuleIndices[pixelModuleIndex] + 2*(tid); - unsigned int outerMDIndex = rangesInGPU.miniDoubletModuleIndices[pixelModuleIndex] + 2*(tid) +1; - unsigned int pixelSegmentIndex = rangesInGPU.segmentModuleIndices[pixelModuleIndex] + tid; - - addMDToMemory(mdsInGPU, hitsInGPU, modulesInGPU, hitIndices0[tid], hitIndices1[tid], pixelModuleIndex, 0,0,0,0,0,0,0,0,0,innerMDIndex); - addMDToMemory(mdsInGPU, hitsInGPU, modulesInGPU, hitIndices2[tid], hitIndices3[tid], pixelModuleIndex, 0,0,0,0,0,0,0,0,0,outerMDIndex); - - //in outer hits - pt, eta, phi - float slope = SDL::temp_sinh(acc, hitsInGPU.ys[mdsInGPU.outerHitIndices[innerMDIndex]]); - float intercept = hitsInGPU.zs[mdsInGPU.anchorHitIndices[innerMDIndex]] - slope * hitsInGPU.rts[mdsInGPU.anchorHitIndices[innerMDIndex]]; - float score_lsq=(hitsInGPU.rts[mdsInGPU.anchorHitIndices[outerMDIndex]] * slope + intercept) - (hitsInGPU.zs[mdsInGPU.anchorHitIndices[outerMDIndex]]); - score_lsq = score_lsq * score_lsq; - - unsigned int hits1[4]; - hits1[0] = hitsInGPU.idxs[mdsInGPU.anchorHitIndices[innerMDIndex]]; - hits1[1] = hitsInGPU.idxs[mdsInGPU.anchorHitIndices[outerMDIndex]]; - hits1[2] = hitsInGPU.idxs[mdsInGPU.outerHitIndices[innerMDIndex]]; - hits1[3] = hitsInGPU.idxs[mdsInGPU.outerHitIndices[outerMDIndex]]; - addPixelSegmentToMemory(acc, segmentsInGPU, mdsInGPU, innerMDIndex, outerMDIndex, pixelModuleIndex, hits1, hitIndices0[tid], hitIndices2[tid], dPhiChange[tid], pixelSegmentIndex, tid, score_lsq); - } - } -}; - void SDL::Event::addPixelSegmentToEvent(std::vector hitIndices0,std::vector hitIndices1,std::vector hitIndices2,std::vector hitIndices3, std::vector dPhiChange, std::vector ptIn, std::vector ptErr, std::vector px, std::vector py, std::vector pz, std::vector eta, std::vector etaErr, std::vector phi, std::vector charge, std::vector seedIdx, std::vector superbin, std::vector pixelType, std::vector isQuad) { const int size = ptIn.size(); @@ -1016,7 +824,6 @@ void SDL::Event::createMiniDoublets() { addMiniDoubletsToEventExplicit(); } - } void SDL::Event::createSegmentsWithModuleMap() @@ -1414,7 +1221,6 @@ void SDL::Event::createPixelTriplets() cms::cuda::free_device(dev, connectedPixelSize_dev); cms::cuda::free_device(dev, connectedPixelIndex_dev); - #ifdef Warnings int nPixelTriplets; cudaMemcpyAsync(&nPixelTriplets, pixelTripletsInGPU->nPixelTriplets, sizeof(int), cudaMemcpyDeviceToHost,stream); @@ -1916,6 +1722,7 @@ int SDL::Event::getNumberOfPixelQuintuplets() cudaStreamSynchronize(stream); return nPixelQuintuplets; } + unsigned int SDL::Event::getNumberOfQuintuplets() { unsigned int quintuplets = 0; @@ -2003,45 +1810,43 @@ int SDL::Event::getNumberOfT5TrackCandidates() return nTrackCandidatesT5; } -SDL::hits* SDL::Event::getHits() //std::shared_ptr should take care of garbage collection +SDL::hits* SDL::Event::getHits() //std::shared_ptr should take care of garbage collection { if(hitsInCPU == nullptr) { - hitsInCPU = new SDL::hits; - hitsInCPU->nHits = new unsigned int; - unsigned int nHits; - cudaMemcpyAsync(&nHits, hitsInGPU->nHits, sizeof(unsigned int), cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); - *(hitsInCPU->nHits) = nHits; - hitsInCPU->idxs = new unsigned int[nHits]; - hitsInCPU->detid = new unsigned int[nHits]; - hitsInCPU->xs = new float[nHits]; - hitsInCPU->ys = new float[nHits]; - hitsInCPU->zs = new float[nHits]; - hitsInCPU->moduleIndices = new uint16_t[nHits]; - cudaMemcpyAsync(hitsInCPU->idxs, hitsInGPU->idxs,sizeof(unsigned int) * nHits, cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(hitsInCPU->detid, hitsInGPU->detid, sizeof(unsigned int) * nHits, cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(hitsInCPU->xs, hitsInGPU->xs, sizeof(float) * nHits, cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(hitsInCPU->ys, hitsInGPU->ys, sizeof(float) * nHits, cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(hitsInCPU->zs, hitsInGPU->zs, sizeof(float) * nHits, cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(hitsInCPU->moduleIndices, hitsInGPU->moduleIndices, sizeof(uint16_t) * nHits, cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); + auto nHits_buf = allocBufWrapper(devHost, 1); + alpaka::memcpy(queue, nHits_buf, hitsInGPU->nHits_buf, 1); + alpaka::wait(queue); + + unsigned int nHits = *alpaka::getPtrNative(nHits_buf); + hitsInCPU = new SDL::hits(nModules, nHits, devHost, queue); + + *alpaka::getPtrNative(hitsInCPU->nHits_buf) = nHits; + alpaka::memcpy(queue, hitsInCPU->idxs_buf, hitsInGPU->idxs_buf, nHits); + alpaka::memcpy(queue, hitsInCPU->detid_buf, hitsInGPU->detid_buf, nHits); + alpaka::memcpy(queue, hitsInCPU->xs_buf, hitsInGPU->xs_buf, nHits); + alpaka::memcpy(queue, hitsInCPU->ys_buf, hitsInGPU->ys_buf, nHits); + alpaka::memcpy(queue, hitsInCPU->zs_buf, hitsInGPU->zs_buf, nHits); + alpaka::memcpy(queue, hitsInCPU->moduleIndices_buf, hitsInGPU->moduleIndices_buf, nHits); + alpaka::wait(queue); } return hitsInCPU; } -SDL::hits* SDL::Event::getHitsInCMSSW() +SDL::hits* SDL::Event::getHitsInCMSSW() { if(hitsInCPU == nullptr) { - hitsInCPU = new SDL::hits; - hitsInCPU->nHits = new unsigned int; - unsigned int nHits; - cudaMemcpyAsync(&nHits, hitsInGPU->nHits, sizeof(unsigned int), cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); - hitsInCPU->idxs = new unsigned int[nHits]; - cudaMemcpyAsync(hitsInCPU->idxs, hitsInGPU->idxs,sizeof(unsigned int) * nHits, cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); + auto nHits_buf = allocBufWrapper(devHost, 1); + alpaka::memcpy(queue, nHits_buf, hitsInGPU->nHits_buf, 1); + alpaka::wait(queue); + + unsigned int nHits = *alpaka::getPtrNative(nHits_buf); + hitsInCPU = new SDL::hits(nModules, nHits, devHost, queue); + + *alpaka::getPtrNative(hitsInCPU->nHits_buf) = nHits; + alpaka::memcpy(queue, hitsInCPU->idxs_buf, hitsInGPU->idxs_buf, nHits); + alpaka::wait(queue); } return hitsInCPU; } diff --git a/SDL/Event.cuh b/SDL/Event.cuh index f5b8327c..4b431d33 100644 --- a/SDL/Event.cuh +++ b/SDL/Event.cuh @@ -40,7 +40,7 @@ namespace SDL int dev; int nTotalSegments; struct objectRanges* rangesInGPU; - struct hits* hitsInGPU; + struct hits* hitsInGPU; struct miniDoublets* mdsInGPU; struct segments* segmentsInGPU; struct triplets* tripletsInGPU; @@ -51,7 +51,7 @@ namespace SDL //CPU interface stuff objectRanges* rangesInCPU; - hits* hitsInCPU; + hits* hitsInCPU; miniDoublets* mdsInCPU; segments* segmentsInCPU; triplets* tripletsInCPU; @@ -130,8 +130,8 @@ namespace SDL unsigned int getNumberOfT3T3ExtendedTracks(); objectRanges* getRanges(); - hits* getHits(); - hits* getHitsInCMSSW(); + hits* getHits(); + hits* getHitsInCMSSW(); miniDoublets* getMiniDoublets(); segments* getSegments() ; triplets* getTriplets(); diff --git a/SDL/Hit.cu b/SDL/Hit.cu deleted file mode 100644 index 79f887e0..00000000 --- a/SDL/Hit.cu +++ /dev/null @@ -1,149 +0,0 @@ -# include "Hit.cuh" - -SDL::hits::hits() -{ - nHits = nullptr; - xs = nullptr; - ys = nullptr; - zs = nullptr; - moduleIndices = nullptr; - detid = nullptr; - rts = nullptr; - phis = nullptr; - etas = nullptr; - highEdgeXs = nullptr; - highEdgeYs = nullptr; - lowEdgeXs = nullptr; - lowEdgeYs = nullptr; - hitRanges = nullptr; - hitRangesLower = nullptr; - hitRangesUpper = nullptr; - hitRangesnLower = nullptr; - hitRangesnUpper = nullptr; -} - -SDL::hits::~hits() -{ -} - -void SDL::createHitsInExplicitMemory(struct hits& hitsInGPU, int nModules, unsigned int nMaxHits,cudaStream_t stream,unsigned int evtnum) -{ -#if defined(CACHE_ALLOC) - int dev; - cudaGetDevice(&dev); - hitsInGPU.xs = (float*)cms::cuda::allocate_device(dev,nMaxHits*sizeof(float),stream); - hitsInGPU.ys = (float*)cms::cuda::allocate_device(dev,nMaxHits*sizeof(float),stream); - hitsInGPU.zs = (float*)cms::cuda::allocate_device(dev,nMaxHits*sizeof(float),stream); - - hitsInGPU.rts = (float*)cms::cuda::allocate_device(dev,nMaxHits*sizeof(float),stream); - hitsInGPU.phis = (float*)cms::cuda::allocate_device(dev,nMaxHits*sizeof(float),stream); - hitsInGPU.etas = (float*)cms::cuda::allocate_device(dev,nMaxHits*sizeof(float),stream); - - hitsInGPU.moduleIndices = (uint16_t*)cms::cuda::allocate_device(dev,nMaxHits*sizeof(uint16_t),stream); - hitsInGPU.idxs = (unsigned int*)cms::cuda::allocate_device(dev,nMaxHits*sizeof(unsigned int),stream); - hitsInGPU.detid = (unsigned int*)cms::cuda::allocate_device(dev,nMaxHits*sizeof(unsigned int),stream); - - hitsInGPU.highEdgeXs = (float*)cms::cuda::allocate_device(dev,nMaxHits*sizeof(float),stream); - hitsInGPU.highEdgeYs = (float*)cms::cuda::allocate_device(dev,nMaxHits*sizeof(float),stream); - hitsInGPU.lowEdgeXs = (float*)cms::cuda::allocate_device(dev,nMaxHits*sizeof(float),stream); - hitsInGPU.lowEdgeYs = (float*)cms::cuda::allocate_device(dev,nMaxHits*sizeof(float),stream); - - hitsInGPU.nHits = (unsigned int*)cms::cuda::allocate_device(dev,evtnum*sizeof(unsigned int),stream); - - hitsInGPU.hitRanges = (int*)cms::cuda::allocate_device(dev, evtnum*nModules * 2 * sizeof(int),stream); - hitsInGPU.hitRangesLower = (int*)cms::cuda::allocate_device(dev, evtnum*nModules * sizeof(int),stream); - hitsInGPU.hitRangesUpper = (int*)cms::cuda::allocate_device(dev, evtnum*nModules * sizeof(int),stream); - hitsInGPU.hitRangesnLower = (int8_t*)cms::cuda::allocate_device(dev,evtnum*nModules * sizeof(int8_t),stream); - hitsInGPU.hitRangesnUpper = (int8_t*)cms::cuda::allocate_device(dev,evtnum*nModules * sizeof(int8_t),stream); -#else - cudaMalloc(&hitsInGPU.xs, nMaxHits * sizeof(float)); - cudaMalloc(&hitsInGPU.ys, nMaxHits * sizeof(float)); - cudaMalloc(&hitsInGPU.zs, nMaxHits * sizeof(float)); - - cudaMalloc(&hitsInGPU.moduleIndices, nMaxHits * sizeof(uint16_t)); - cudaMalloc(&hitsInGPU.idxs, nMaxHits * sizeof(unsigned int)); - cudaMalloc(&hitsInGPU.detid, nMaxHits * sizeof(unsigned int)); - - cudaMalloc(&hitsInGPU.rts, nMaxHits * sizeof(float)); - cudaMalloc(&hitsInGPU.phis, nMaxHits * sizeof(float)); - cudaMalloc(&hitsInGPU.etas, nMaxHits * sizeof(float)); - - cudaMalloc(&hitsInGPU.highEdgeXs, nMaxHits * sizeof(float)); - cudaMalloc(&hitsInGPU.highEdgeYs, nMaxHits * sizeof(float)); - cudaMalloc(&hitsInGPU.lowEdgeXs, nMaxHits * sizeof(float)); - cudaMalloc(&hitsInGPU.lowEdgeYs, nMaxHits * sizeof(float)); - - //counters - cudaMalloc(&hitsInGPU.nHits,evtnum* sizeof(unsigned int)); - - cudaMalloc(&hitsInGPU.hitRanges,evtnum*nModules * 2 * sizeof(int)); - cudaMalloc(&hitsInGPU.hitRangesLower,evtnum*nModules * sizeof(int)); - cudaMalloc(&hitsInGPU.hitRangesUpper,evtnum*nModules * sizeof(int)); - cudaMalloc(&hitsInGPU.hitRangesnLower,evtnum*nModules * sizeof(int8_t)); - cudaMalloc(&hitsInGPU.hitRangesnUpper,evtnum* nModules * sizeof(int8_t)); -#endif - cudaMemsetAsync(hitsInGPU.nHits,0,evtnum*sizeof(unsigned int),stream); - cudaMemsetAsync(hitsInGPU.hitRanges, -1, evtnum*nModules*2*sizeof(int),stream); - cudaMemsetAsync(hitsInGPU.hitRangesLower, -1, evtnum*nModules*sizeof(int),stream); - cudaMemsetAsync(hitsInGPU.hitRangesUpper, -1, evtnum*nModules*sizeof(int),stream); - cudaMemsetAsync(hitsInGPU.hitRangesnLower, -1,evtnum*nModules*sizeof(int8_t),stream); - cudaMemsetAsync(hitsInGPU.hitRangesnUpper, -1,evtnum*nModules*sizeof(int8_t),stream); - cudaStreamSynchronize(stream); -} - -void SDL::printHit(struct hits& hitsInGPU, struct modules& modulesInGPU, unsigned int hitIndex) -{ - std::cout << "Hit(x=" << hitsInGPU.xs[hitIndex] << ", y=" << hitsInGPU.ys[hitIndex] << ", z=" << hitsInGPU.zs[hitIndex] << ", rt=" << hitsInGPU.rts[hitIndex] << ", phi=" << hitsInGPU.phis[hitIndex] <<", module subdet = "< - #include "Constants.cuh" #include "Module.cuh" -#include "allocate.h" namespace SDL { + template struct hits { - unsigned int *nHits; //single number - float *xs; - float *ys; - float *zs; + Buf nHits_buf; + Buf xs_buf; + Buf ys_buf; + Buf zs_buf; + Buf moduleIndices_buf; + Buf idxs_buf; + Buf detid_buf; + Buf rts_buf; + Buf phis_buf; + Buf etas_buf; + Buf highEdgeXs_buf; + Buf highEdgeYs_buf; + Buf lowEdgeXs_buf; + Buf lowEdgeYs_buf; + Buf hitRanges_buf; + Buf hitRangesLower_buf; + Buf hitRangesUpper_buf; + Buf hitRangesnLower_buf; + Buf hitRangesnUpper_buf; + unsigned int* nHits; + float* xs; + float* ys; + float* zs; uint16_t* moduleIndices; unsigned int* idxs; unsigned int* detid; - - float *rts; + float* rts; float* phis; float* etas; - - float *highEdgeXs; - float *highEdgeYs; - float *lowEdgeXs; - float *lowEdgeYs; - + float* highEdgeXs; + float* highEdgeYs; + float* lowEdgeXs; + float* lowEdgeYs; int* hitRanges; int* hitRangesLower; int* hitRangesUpper; int8_t* hitRangesnLower; int8_t* hitRangesnUpper; - - hits(); - void freeMemory(); - void freeMemoryCache(); - ~hits(); - }; - void printHit(struct hits& hitsInGPU, struct modules& modulesInGPU, unsigned int hitIndex); - void createHitsInExplicitMemory(struct hits& hitsInGPU, int nModules, unsigned int maxHits,cudaStream_t stream,unsigned int evtnum); + template + hits(unsigned int nModules, + unsigned int nMaxHits, + TDevAcc const & devAccIn, + TQueue& queue) : + nHits_buf(allocBufWrapper(devAccIn, 1)), + xs_buf(allocBufWrapper(devAccIn, nMaxHits)), + ys_buf(allocBufWrapper(devAccIn, nMaxHits)), + zs_buf(allocBufWrapper(devAccIn, nMaxHits)), + moduleIndices_buf(allocBufWrapper(devAccIn, nMaxHits)), + idxs_buf(allocBufWrapper(devAccIn, nMaxHits)), + detid_buf(allocBufWrapper(devAccIn, nMaxHits)), + rts_buf(allocBufWrapper(devAccIn, nMaxHits)), + phis_buf(allocBufWrapper(devAccIn, nMaxHits)), + etas_buf(allocBufWrapper(devAccIn, nMaxHits)), + highEdgeXs_buf(allocBufWrapper(devAccIn, nMaxHits)), + highEdgeYs_buf(allocBufWrapper(devAccIn, nMaxHits)), + lowEdgeXs_buf(allocBufWrapper(devAccIn, nMaxHits)), + lowEdgeYs_buf(allocBufWrapper(devAccIn, nMaxHits)), + hitRanges_buf(allocBufWrapper(devAccIn, nModules*2)), + hitRangesLower_buf(allocBufWrapper(devAccIn, nModules)), + hitRangesUpper_buf(allocBufWrapper(devAccIn, nModules)), + hitRangesnLower_buf(allocBufWrapper(devAccIn, nModules)), + hitRangesnUpper_buf(allocBufWrapper(devAccIn, nModules)) + { + nHits = alpaka::getPtrNative(nHits_buf); + xs = alpaka::getPtrNative(xs_buf); + ys = alpaka::getPtrNative(ys_buf); + zs = alpaka::getPtrNative(zs_buf); + moduleIndices = alpaka::getPtrNative(moduleIndices_buf); + idxs = alpaka::getPtrNative(idxs_buf); + detid = alpaka::getPtrNative(detid_buf); + rts = alpaka::getPtrNative(rts_buf); + phis = alpaka::getPtrNative(phis_buf); + etas = alpaka::getPtrNative(etas_buf); + highEdgeXs = alpaka::getPtrNative(highEdgeXs_buf); + highEdgeYs = alpaka::getPtrNative(highEdgeYs_buf); + lowEdgeXs = alpaka::getPtrNative(lowEdgeXs_buf); + lowEdgeYs = alpaka::getPtrNative(lowEdgeYs_buf); + hitRanges = alpaka::getPtrNative(hitRanges_buf); + hitRangesLower = alpaka::getPtrNative(hitRangesLower_buf); + hitRangesUpper = alpaka::getPtrNative(hitRangesUpper_buf); + hitRangesnLower = alpaka::getPtrNative(hitRangesnLower_buf); + hitRangesnUpper = alpaka::getPtrNative(hitRangesnUpper_buf); + + alpaka::memset(queue, hitRanges_buf, -1, nModules*2); + alpaka::memset(queue, hitRangesLower_buf, -1, nModules); + alpaka::memset(queue, hitRangesUpper_buf, -1, nModules); + alpaka::memset(queue, hitRangesnLower_buf, -1, nModules); + alpaka::memset(queue, hitRangesnUpper_buf, -1, nModules); + alpaka::wait(queue); + } + }; // Hyperbolic functions were just merged into Alpaka early 2023, // so we have to make use of temporary functions for now. @@ -123,5 +182,122 @@ namespace SDL return dPhi; }; + + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE int binary_search( + unsigned int *data, // Array that we are searching over + unsigned int search_val, // Value we want to find in data array + unsigned int ndata) // Number of elements in data array + { + unsigned int low = 0; + unsigned int high = ndata - 1; + + while(low <= high) + { + unsigned int mid = (low + high)/2; + unsigned int test_val = data[mid]; + if (test_val == search_val) + return mid; + else if (test_val > search_val) + high = mid - 1; + else + low = mid + 1; + } + // Couldn't find search value in array. + return -1; + }; + + struct moduleRangesKernel + { + ALPAKA_NO_HOST_ACC_WARNING + template + ALPAKA_FN_ACC void operator()( + TAcc const & acc, + struct SDL::modules& modulesInGPU, + struct SDL::hits& hitsInGPU, + int const & nLowerModules) const + { + using Dim = alpaka::Dim; + using Idx = alpaka::Idx; + using Vec = alpaka::Vec; + + Vec const globalThreadIdx = alpaka::getIdx(acc); + Vec const gridThreadExtent = alpaka::getWorkDiv(acc); + + for(int lowerIndex = globalThreadIdx[2]; lowerIndex < nLowerModules; lowerIndex += gridThreadExtent[2]) + { + uint16_t upperIndex = modulesInGPU.partnerModuleIndices[lowerIndex]; + if (hitsInGPU.hitRanges[lowerIndex * 2] != -1 && hitsInGPU.hitRanges[upperIndex * 2] != -1) + { + hitsInGPU.hitRangesLower[lowerIndex] = hitsInGPU.hitRanges[lowerIndex * 2]; + hitsInGPU.hitRangesUpper[lowerIndex] = hitsInGPU.hitRanges[upperIndex * 2]; + hitsInGPU.hitRangesnLower[lowerIndex] = hitsInGPU.hitRanges[lowerIndex * 2 + 1] - hitsInGPU.hitRanges[lowerIndex * 2] + 1; + hitsInGPU.hitRangesnUpper[lowerIndex] = hitsInGPU.hitRanges[upperIndex * 2 + 1] - hitsInGPU.hitRanges[upperIndex * 2] + 1; + } + } + } + }; + + struct hitLoopKernel + { + ALPAKA_NO_HOST_ACC_WARNING + template + ALPAKA_FN_ACC void operator()( + TAcc const & acc, + uint16_t Endcap, // Integer corresponding to endcap in module subdets + uint16_t TwoS, // Integer corresponding to TwoS in moduleType + unsigned int nModules, // Number of modules + unsigned int nEndCapMap, // Number of elements in endcap map + unsigned int* geoMapDetId, // DetId's from endcap map + float* geoMapPhi, // Phi values from endcap map + struct SDL::modules& modulesInGPU, + struct SDL::hits& hitsInGPU, + unsigned int const & nHits) const // Total number of hits in event + { + using Dim = alpaka::Dim; + using Idx = alpaka::Idx; + using Vec = alpaka::Vec; + + Vec const globalThreadIdx = alpaka::getIdx(acc); + Vec const gridThreadExtent = alpaka::getWorkDiv(acc); + for(int ihit = globalThreadIdx[2]; ihit < nHits; ihit += gridThreadExtent[2]) + { + float ihit_x = hitsInGPU.xs[ihit]; + float ihit_y = hitsInGPU.ys[ihit]; + float ihit_z = hitsInGPU.zs[ihit]; + int iDetId = hitsInGPU.detid[ihit]; + + hitsInGPU.rts[ihit] = alpaka::math::sqrt(acc, ihit_x*ihit_x + ihit_y*ihit_y); + hitsInGPU.phis[ihit] = SDL::phi(acc, ihit_x,ihit_y); + // Acosh has no supported implementation in Alpaka right now. + hitsInGPU.etas[ihit] = ((ihit_z>0)-(ihit_z<0)) * SDL::temp_acosh(acc, alpaka::math::sqrt(acc, ihit_x*ihit_x+ihit_y*ihit_y+ihit_z*ihit_z)/hitsInGPU.rts[ihit]); + int found_index = binary_search(modulesInGPU.mapdetId, iDetId, nModules); + uint16_t lastModuleIndex = modulesInGPU.mapIdx[found_index]; + + hitsInGPU.moduleIndices[ihit] = lastModuleIndex; + + if(modulesInGPU.subdets[lastModuleIndex] == Endcap && modulesInGPU.moduleType[lastModuleIndex] == TwoS) + { + found_index = binary_search(geoMapDetId, iDetId, nEndCapMap); + float phi = 0; + // Unclear why these are not in map, but CPU map returns phi = 0 for all exceptions. + if (found_index != -1) + phi = geoMapPhi[found_index]; + float cos_phi = alpaka::math::cos(acc, phi); + hitsInGPU.highEdgeXs[ihit] = ihit_x + 2.5f * cos_phi; + hitsInGPU.lowEdgeXs[ihit] = ihit_x - 2.5f * cos_phi; + float sin_phi = alpaka::math::sin(acc, phi); + hitsInGPU.highEdgeYs[ihit] = ihit_y + 2.5f * sin_phi; + hitsInGPU.lowEdgeYs[ihit] = ihit_y - 2.5f * sin_phi; + } + // Need to set initial value if index hasn't been seen before. + int old = alpaka::atomicOp(acc, &(hitsInGPU.hitRanges[lastModuleIndex * 2]), -1, ihit); + // For subsequent visits, stores the min value. + if (old != -1) + alpaka::atomicOp(acc, &hitsInGPU.hitRanges[lastModuleIndex * 2], ihit); + + alpaka::atomicOp(acc, &hitsInGPU.hitRanges[lastModuleIndex * 2 + 1], ihit); + } + } + }; } #endif \ No newline at end of file diff --git a/SDL/LST.cc b/SDL/LST.cc index 23253a95..83481428 100644 --- a/SDL/LST.cc +++ b/SDL/LST.cc @@ -403,7 +403,7 @@ void SDL::LST::getOutput(SDL::Event& event) { std::vector tc_seedIdx_; std::vector tc_trackCandidateType_; - SDL::hits& hitsInGPU = (*event.getHitsInCMSSW()); + SDL::hits& hitsInGPU = (*event.getHitsInCMSSW()); SDL::trackCandidates& trackCandidatesInGPU = (*event.getTrackCandidatesInCMSSW()); unsigned int nTrackCandidates = *trackCandidatesInGPU.nTrackCandidates; diff --git a/SDL/MiniDoublet.cuh b/SDL/MiniDoublet.cuh index 4f136336..cfc81f55 100644 --- a/SDL/MiniDoublet.cuh +++ b/SDL/MiniDoublet.cuh @@ -68,7 +68,8 @@ namespace SDL void createMDsInExplicitMemory(struct miniDoublets& mdsInGPU, unsigned int maxMDs,uint16_t nLowerModules, unsigned int maxPixelMDs,cudaStream_t stream); - ALPAKA_FN_ACC ALPAKA_FN_INLINE void addMDToMemory(struct miniDoublets& mdsInGPU, struct hits& hitsInGPU, struct modules& modulesInGPU, unsigned int lowerHitIdx, unsigned int upperHitIdx, uint16_t& lowerModuleIdx, float dz, float dPhi, float dPhiChange, float shiftedX, float shiftedY, float shiftedZ, float noShiftedDz, float noShiftedDphi, float noShiftedDPhiChange, unsigned int idx) + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addMDToMemory(struct miniDoublets& mdsInGPU, struct SDL::hits& hitsInGPU, struct modules& modulesInGPU, unsigned int lowerHitIdx, unsigned int upperHitIdx, uint16_t& lowerModuleIdx, float dz, float dPhi, float dPhiChange, float shiftedX, float shiftedY, float shiftedZ, float noShiftedDz, float noShiftedDphi, float noShiftedDPhiChange, unsigned int idx) { //the index into which this MD needs to be written will be computed in the kernel //nMDs variable will be incremented in the kernel, no need to worry about that here @@ -659,7 +660,7 @@ namespace SDL ALPAKA_FN_ACC void operator()( TAcc const & acc, struct SDL::modules& modulesInGPU, - struct SDL::hits& hitsInGPU, + struct SDL::hits& hitsInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::objectRanges& rangesInGPU) const { @@ -676,6 +677,10 @@ namespace SDL int nLowerHits = hitsInGPU.hitRangesnLower[lowerModuleIndex]; int nUpperHits = hitsInGPU.hitRangesnUpper[lowerModuleIndex]; if(hitsInGPU.hitRangesLower[lowerModuleIndex] == -1) continue; + if(hitsInGPU.hitRangesLower[lowerModuleIndex] == -1) + { + printf("IS THIS EVER RUN"); + } const int maxHits = alpaka::math::max(acc, nUpperHits, nLowerHits); unsigned int upHitArrayIndex = hitsInGPU.hitRangesUpper[lowerModuleIndex]; unsigned int loHitArrayIndex = hitsInGPU.hitRangesLower[lowerModuleIndex]; @@ -799,7 +804,7 @@ namespace SDL struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct objectRanges& rangesInGPU, - struct hits& hitsInGPU) const + struct SDL::hits& hitsInGPU) const { using Dim = alpaka::Dim; using Idx = alpaka::Idx; diff --git a/SDL/Segment.cuh b/SDL/Segment.cuh index 065bee14..85a1df54 100644 --- a/SDL/Segment.cuh +++ b/SDL/Segment.cuh @@ -189,7 +189,6 @@ namespace SDL alpaka::memset(queue, totOccupancySegments_buf, 0u, nLowerModules + 1); alpaka::memset(queue, partOfPT5_buf, 0u, maxPixelSegments); alpaka::memset(queue, pLSHitsIdxs_buf, 0u, maxPixelSegments); - alpaka::memset(queue, nMemoryLocations_buf, nMemoryLocationsIn, 1); alpaka::wait(queue); } }; @@ -882,6 +881,57 @@ namespace SDL } } }; + + struct addPixelSegmentToEventKernel + { + ALPAKA_NO_HOST_ACC_WARNING + template + ALPAKA_FN_ACC void operator()( + TAcc const & acc, + struct SDL::modules& modulesInGPU, + struct SDL::objectRanges& rangesInGPU, + struct SDL::hits& hitsInGPU, + struct SDL::miniDoublets& mdsInGPU, + struct SDL::segments& segmentsInGPU, + unsigned int* hitIndices0, + unsigned int* hitIndices1, + unsigned int* hitIndices2, + unsigned int* hitIndices3, + float* dPhiChange, + uint16_t pixelModuleIndex, + const int size) const + { + using Dim = alpaka::Dim; + using Idx = alpaka::Idx; + using Vec = alpaka::Vec; + + Vec const globalThreadIdx = alpaka::getIdx(acc); + Vec const gridThreadExtent = alpaka::getWorkDiv(acc); + + for(int tid = globalThreadIdx[2]; tid < size; tid += gridThreadExtent[2]) + { + unsigned int innerMDIndex = rangesInGPU.miniDoubletModuleIndices[pixelModuleIndex] + 2*(tid); + unsigned int outerMDIndex = rangesInGPU.miniDoubletModuleIndices[pixelModuleIndex] + 2*(tid) +1; + unsigned int pixelSegmentIndex = rangesInGPU.segmentModuleIndices[pixelModuleIndex] + tid; + + addMDToMemory(mdsInGPU, hitsInGPU, modulesInGPU, hitIndices0[tid], hitIndices1[tid], pixelModuleIndex, 0,0,0,0,0,0,0,0,0,innerMDIndex); + addMDToMemory(mdsInGPU, hitsInGPU, modulesInGPU, hitIndices2[tid], hitIndices3[tid], pixelModuleIndex, 0,0,0,0,0,0,0,0,0,outerMDIndex); + + //in outer hits - pt, eta, phi + float slope = SDL::temp_sinh(acc, hitsInGPU.ys[mdsInGPU.outerHitIndices[innerMDIndex]]); + float intercept = hitsInGPU.zs[mdsInGPU.anchorHitIndices[innerMDIndex]] - slope * hitsInGPU.rts[mdsInGPU.anchorHitIndices[innerMDIndex]]; + float score_lsq=(hitsInGPU.rts[mdsInGPU.anchorHitIndices[outerMDIndex]] * slope + intercept) - (hitsInGPU.zs[mdsInGPU.anchorHitIndices[outerMDIndex]]); + score_lsq = score_lsq * score_lsq; + + unsigned int hits1[4]; + hits1[0] = hitsInGPU.idxs[mdsInGPU.anchorHitIndices[innerMDIndex]]; + hits1[1] = hitsInGPU.idxs[mdsInGPU.anchorHitIndices[outerMDIndex]]; + hits1[2] = hitsInGPU.idxs[mdsInGPU.outerHitIndices[innerMDIndex]]; + hits1[3] = hitsInGPU.idxs[mdsInGPU.outerHitIndices[outerMDIndex]]; + addPixelSegmentToMemory(acc, segmentsInGPU, mdsInGPU, innerMDIndex, outerMDIndex, pixelModuleIndex, hits1, hitIndices0[tid], hitIndices2[tid], dPhiChange[tid], pixelSegmentIndex, tid, score_lsq); + } + } + }; } #endif diff --git a/SDL/TrackCandidate.cuh b/SDL/TrackCandidate.cuh index 738037fa..b4564106 100644 --- a/SDL/TrackCandidate.cuh +++ b/SDL/TrackCandidate.cuh @@ -82,7 +82,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE int checkPixelHits(unsigned int ix, unsigned int jx, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct SDL::hits& hitsInGPU) + ALPAKA_FN_ACC ALPAKA_FN_INLINE int checkPixelHits(unsigned int ix, unsigned int jx, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct SDL::hits& hitsInGPU) { int phits1[4] = {-1,-1,-1,-1}; int phits2[4] = {-1,-1,-1,-1}; @@ -245,7 +245,7 @@ namespace SDL struct SDL::trackCandidates& trackCandidatesInGPU, struct SDL::segments& segmentsInGPU, struct SDL::miniDoublets& mdsInGPU, - struct SDL::hits& hitsInGPU, + struct SDL::hits& hitsInGPU, struct SDL::quintuplets& quintupletsInGPU) const { using Dim = alpaka::Dim; diff --git a/SDL/Triplet.cuh b/SDL/Triplet.cuh index 5baa5a3f..1b1d1063 100644 --- a/SDL/Triplet.cuh +++ b/SDL/Triplet.cuh @@ -444,9 +444,6 @@ namespace SDL return false; // failsafe }; - template - void printTriplet(struct triplets& tripletsInGPU, struct SDL::segments& segmentsInGPU, struct miniDoublets& mdsInGPU, struct hits& hitsInGPU, struct modules& modulesInGPU, unsigned int tripletIndex); - template ALPAKA_FN_ACC ALPAKA_FN_INLINE void runDeltaBetaIterationsT3(TAcc const & acc, float& betaIn, float& betaOut, float& betaAv, float & pt_beta, float sdIn_dr, float sdOut_dr, float dr, float lIn) { diff --git a/code/core/AccessHelper.cc b/code/core/AccessHelper.cc index 2bf534b2..df6caf6a 100644 --- a/code/core/AccessHelper.cc +++ b/code/core/AccessHelper.cc @@ -7,7 +7,7 @@ //____________________________________________________________________________________________ std::tuple, std::vector> convertHitsToHitIdxsAndHitTypes(SDL::Event* event, std::vector hits) { - SDL::hits& hitsInGPU = *(event->getHits()); + SDL::hits& hitsInGPU = *(event->getHits()); std::vector hitidxs; std::vector hittypes; for (auto& hit : hits) @@ -48,7 +48,7 @@ std::vector getPixelHitsFrompLS(SDL::Event* event, unsigned int pL //____________________________________________________________________________________________ std::vector getPixelHitIdxsFrompLS(SDL::Event* event, unsigned int pLS) { - SDL::hits& hitsInGPU = *(event->getHits()); + SDL::hits& hitsInGPU = *(event->getHits()); std::vector hits = getPixelHitsFrompLS(event, pLS); std::vector hitidxs; for (auto& hit : hits) @@ -203,7 +203,7 @@ std::vector getHitsFromT5(SDL::Event* event, unsigned int T5) //____________________________________________________________________________________________ std::vector getHitIdxsFromT5(SDL::Event* event, unsigned int T5) { - SDL::hits& hitsInGPU = *(event->getHits()); + SDL::hits& hitsInGPU = *(event->getHits()); std::vector hits = getHitsFromT5(event, T5); std::vector hitidxs; for (auto& hit : hits) @@ -215,7 +215,7 @@ std::vector getModuleIdxsFromT5(SDL::Event* event, unsigned int T5 { std::vector hits = getHitsFromT5(event, T5); std::vector module_idxs; - SDL::hits& hitsInGPU = *(event->getHits()); + SDL::hits& hitsInGPU = *(event->getHits()); for(auto &hitIdx:hits) { module_idxs.push_back(hitsInGPU.moduleIndices[hitIdx]); @@ -297,7 +297,7 @@ std::vector getHitsFrompT3(SDL::Event* event, unsigned int pT3) //____________________________________________________________________________________________ std::vector getHitIdxsFrompT3(SDL::Event* event, unsigned int pT3) { - SDL::hits& hitsInGPU = *(event->getHits()); + SDL::hits& hitsInGPU = *(event->getHits()); std::vector hits = getHitsFrompT3(event, pT3); std::vector hitidxs; for (auto& hit : hits) @@ -309,7 +309,7 @@ std::vector getModuleIdxsFrompT3(SDL::Event* event, unsigned int p { std::vector hits = getOuterTrackerHitsFrompT3(event, pT3); std::vector module_idxs; - SDL::hits& hitsInGPU = *(event->getHits()); + SDL::hits& hitsInGPU = *(event->getHits()); for(auto &hitIdx:hits) { module_idxs.push_back(hitsInGPU.moduleIndices[hitIdx]); @@ -405,7 +405,7 @@ std::vector getHitsFrompT5(SDL::Event* event, unsigned int pT5) //____________________________________________________________________________________________ std::vector getHitIdxsFrompT5(SDL::Event* event, unsigned int pT5) { - SDL::hits& hitsInGPU = *(event->getHits()); + SDL::hits& hitsInGPU = *(event->getHits()); std::vector hits = getHitsFrompT5(event, pT5); std::vector hitidxs; for (auto& hit : hits) @@ -418,7 +418,7 @@ std::vector getModuleIdxsFrompT5(SDL::Event* event, unsigned int p { std::vector hits = getOuterTrackerHitsFrompT5(event, pT5); std::vector module_idxs; - SDL::hits& hitsInGPU = *(event->getHits()); + SDL::hits& hitsInGPU = *(event->getHits()); for(auto &hitIdx:hits) { module_idxs.push_back(hitsInGPU.moduleIndices[hitIdx]); diff --git a/code/core/write_sdl_ntuple.cc b/code/core/write_sdl_ntuple.cc index dcc9f070..cce8b6f1 100644 --- a/code/core/write_sdl_ntuple.cc +++ b/code/core/write_sdl_ntuple.cc @@ -477,7 +477,7 @@ void setPixelTripletOutputBranches(SDL::Event* event) SDL::triplets& tripletsInGPU = *(event->getTriplets()); SDL::modules& modulesInGPU = *(event->getModules()); SDL::segments& segmentsInGPU = *(event->getSegments()); - SDL::hits& hitsInGPU = *(event->getHits()); + SDL::hits& hitsInGPU = *(event->getHits()); int n_accepted_simtrk = ana.tx->getBranch>("sim_TC_matched").size(); unsigned int nPixelTriplets = *pixelTripletsInGPU.nPixelTriplets; @@ -561,7 +561,7 @@ void setGnnNtupleBranches(SDL::Event* event) // Get relevant information SDL::segments& segmentsInGPU = (*event->getSegments()); SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); - SDL::hits& hitsInGPU = (*event->getHits()); + SDL::hits& hitsInGPU = (*event->getHits()); SDL::modules& modulesInGPU = (*event->getModules()); SDL::objectRanges& rangesInGPU = (*event->getRanges()); SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates()); @@ -717,7 +717,7 @@ void setGnnNtupleMiniDoublet(SDL::Event* event, unsigned int MD) { // Get relevant information SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); - SDL::hits& hitsInGPU = (*event->getHits()); + SDL::hits& hitsInGPU = (*event->getHits()); // Get the hit indices unsigned int hit0 = miniDoubletsInGPU.anchorHitIndices[MD]; @@ -822,7 +822,7 @@ std::tuple, vector> pars SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates()); SDL::triplets& tripletsInGPU = (*event->getTriplets()); SDL::segments& segmentsInGPU = (*event->getSegments()); - SDL::hits& hitsInGPU = (*event->getHits()); + SDL::hits& hitsInGPU = (*event->getHits()); // // pictorial representation of a pT5 @@ -960,7 +960,7 @@ std::tuple, vector> pars SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates()); SDL::triplets& tripletsInGPU = (*event->getTriplets()); SDL::segments& segmentsInGPU = (*event->getSegments()); - SDL::hits& hitsInGPU = (*event->getHits()); + SDL::hits& hitsInGPU = (*event->getHits()); // // pictorial representation of a pT3 @@ -1006,7 +1006,7 @@ std::tuple, vector> pars { SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates()); SDL::triplets& tripletsInGPU = (*event->getTriplets()); - SDL::hits& hitsInGPU = (*event->getHits()); + SDL::hits& hitsInGPU = (*event->getHits()); unsigned int T5 = trackCandidatesInGPU.directObjectIndices[idx]; std::vector T3s = getT3sFromT5(event, T5); std::vector hits = getHitsFromT5(event, T5); @@ -1106,7 +1106,6 @@ float computeRadiusFromThreeAnchorHits(float x1, float y1, float x2, float y2, f //________________________________________________________________________________________________________________________________ void printHitMultiplicities(SDL::Event* event) { - //SDL::hits& hitsInGPU = (*event->getHits()); SDL::modules& modulesInGPU = (*event->getModules()); SDL::objectRanges& rangesInGPU = (*event->getRanges()); @@ -1152,7 +1151,7 @@ void printAllObjects(SDL::Event* event) void printMDs(SDL::Event* event) { SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); - SDL::hits& hitsInGPU = (*event->getHits()); + SDL::hits& hitsInGPU = (*event->getHits()); SDL::modules& modulesInGPU = (*event->getModules()); SDL::objectRanges& rangesInGPU = (*event->getRanges()); @@ -1176,7 +1175,7 @@ void printLSs(SDL::Event* event) { SDL::segments& segmentsInGPU = (*event->getSegments()); SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); - SDL::hits& hitsInGPU = (*event->getHits()); + SDL::hits& hitsInGPU = (*event->getHits()); SDL::modules& modulesInGPU = (*event->getModules()); SDL::objectRanges& rangesInGPU = (*event->getRanges()); @@ -1209,7 +1208,7 @@ void printpLSs(SDL::Event* event) { SDL::segments& segmentsInGPU = (*event->getSegments()); SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); - SDL::hits& hitsInGPU = (*event->getHits()); + SDL::hits& hitsInGPU = (*event->getHits()); SDL::modules& modulesInGPU = (*event->getModules()); SDL::objectRanges& rangesInGPU = (*event->getRanges()); @@ -1240,7 +1239,7 @@ void printT3s(SDL::Event* event) SDL::triplets& tripletsInGPU = (*event->getTriplets()); SDL::segments& segmentsInGPU = (*event->getSegments()); SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); - SDL::hits& hitsInGPU = (*event->getHits()); + SDL::hits& hitsInGPU = (*event->getHits()); SDL::modules& modulesInGPU = (*event->getModules()); int nTriplets = 0; for (unsigned int i = 0; i < *(modulesInGPU.nLowerModules); ++i) @@ -1283,7 +1282,6 @@ void debugPrintOutlierMultiplicities(SDL::Event* event) SDL::triplets& tripletsInGPU = (*event->getTriplets()); SDL::segments& segmentsInGPU = (*event->getSegments()); SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); - //SDL::hits& hitsInGPU = (*event->getHits()); SDL::modules& modulesInGPU = (*event->getModules()); SDL::objectRanges& rangesInGPU = (*event->getRanges()); //int nTrackCandidates = 0; From c2a60463023499aba088d6830915df2232dfcfa8 Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Tue, 6 Jun 2023 20:58:23 -0400 Subject: [PATCH 16/44] Move Hit.cu + Segments.cu to inheritance technique --- SDL/Event.cu | 121 +++++++++++---------- SDL/Event.cuh | 16 +-- SDL/Hit.cuh | 101 +++++++++-------- SDL/Kernels.cuh | 5 +- SDL/LST.cc | 2 +- SDL/MiniDoublet.cuh | 7 +- SDL/PixelTriplet.cuh | 26 +++-- SDL/Quintuplet.cuh | 14 +-- SDL/Segment.cuh | 198 +++++++++++++++------------------- SDL/TrackCandidate.cuh | 15 ++- SDL/Triplet.cuh | 30 +++--- code/core/AccessHelper.cc | 20 ++-- code/core/write_sdl_ntuple.cc | 40 +++---- 13 files changed, 293 insertions(+), 302 deletions(-) diff --git a/SDL/Event.cu b/SDL/Event.cu index cb8e4747..01036c57 100644 --- a/SDL/Event.cu +++ b/SDL/Event.cu @@ -78,10 +78,10 @@ SDL::Event::~Event() #endif if(rangesInGPU != nullptr){cms::cuda::free_host(rangesInGPU);} if(mdsInGPU != nullptr){cms::cuda::free_host(mdsInGPU);} - if(segmentsInGPU != nullptr){delete segmentsInGPU;} + if(segmentsInGPU != nullptr){delete segmentsInGPU; delete segmentsBuffers;} if(tripletsInGPU!= nullptr){cms::cuda::free_host(tripletsInGPU);} if(trackCandidatesInGPU!= nullptr){cms::cuda::free_host(trackCandidatesInGPU);} - if(hitsInGPU!= nullptr){delete hitsInGPU;} + if(hitsInGPU!= nullptr){delete hitsInGPU; delete hitsBuffers;} if(pixelTripletsInGPU!= nullptr){cms::cuda::free_host(pixelTripletsInGPU);} if(pixelQuintupletsInGPU!= nullptr){cms::cuda::free_host(pixelQuintupletsInGPU);} if(quintupletsInGPU!= nullptr){cms::cuda::free_host(quintupletsInGPU);} @@ -273,13 +273,13 @@ void SDL::Event::resetEvent() n_quintuplets_by_layer_endcap_[i] = 0; } } - if(hitsInGPU){delete hitsInGPU; + if(hitsInGPU){delete hitsInGPU; delete hitsBuffers; hitsInGPU = nullptr;} if(mdsInGPU){cms::cuda::free_host(mdsInGPU); mdsInGPU = nullptr;} if(rangesInGPU){cms::cuda::free_host(rangesInGPU); rangesInGPU = nullptr;} - if(segmentsInGPU){delete segmentsInGPU; + if(segmentsInGPU){delete segmentsInGPU; delete segmentsBuffers; segmentsInGPU = nullptr;} if(tripletsInGPU){cms::cuda::free_host(tripletsInGPU); tripletsInGPU = nullptr;} @@ -470,7 +470,9 @@ void SDL::Event::addHitToEvent(std::vector x, std::vector y, std:: // Initialize space on device/host for next event. if (hitsInGPU == nullptr) { - hitsInGPU = new SDL::hits(nModules, nHits, devAcc, queue); + hitsInGPU = new SDL::hits(); + hitsBuffers = new SDL::hitsBuffer(nModules, nHits, devAcc, queue); + hitsInGPU->setData(*hitsBuffers); } if (rangesInGPU == nullptr) @@ -481,12 +483,12 @@ void SDL::Event::addHitToEvent(std::vector x, std::vector y, std:: } // Copy the host arrays to the GPU. - alpaka::memcpy(queue, hitsInGPU->xs_buf, x, nHits); - alpaka::memcpy(queue, hitsInGPU->ys_buf, y, nHits); - alpaka::memcpy(queue, hitsInGPU->zs_buf, z, nHits); - alpaka::memcpy(queue, hitsInGPU->detid_buf, detId, nHits); - alpaka::memcpy(queue, hitsInGPU->idxs_buf, idxInNtuple, nHits); - alpaka::memcpy(queue, hitsInGPU->nHits_buf, nHits_buf, 1); + alpaka::memcpy(queue, hitsBuffers->xs_buf, x, nHits); + alpaka::memcpy(queue, hitsBuffers->ys_buf, y, nHits); + alpaka::memcpy(queue, hitsBuffers->zs_buf, z, nHits); + alpaka::memcpy(queue, hitsBuffers->detid_buf, detId, nHits); + alpaka::memcpy(queue, hitsBuffers->idxs_buf, idxInNtuple, nHits); + alpaka::memcpy(queue, hitsBuffers->nHits_buf, nHits_buf, 1); alpaka::wait(queue); Vec const threadsPerBlock1(static_cast(1), static_cast(1), static_cast(256)); @@ -522,7 +524,7 @@ void SDL::Event::addHitToEvent(std::vector x, std::vector y, std:: nLowerModules)); // Waiting isn't needed after second kernel call. Saves ~100 us. - // This is because addPixelSegmentToEvent (which is run next) doesn't rely on hitsinGPU->hitrange variables. + // This is because addPixelSegmentToEvent (which is run next) doesn't rely on hitsBuffers->hitrange variables. // Also, modulesInGPU->partnerModuleIndices is not alterned in addPixelSegmentToEvent. alpaka::enqueue(queue, module_ranges_task); } @@ -586,7 +588,9 @@ void SDL::Event::addPixelSegmentToEvent(std::vector hitIndices0,st cudaStreamSynchronize(stream); nTotalSegments += N_MAX_PIXEL_SEGMENTS_PER_MODULE; - segmentsInGPU = new SDL::segments(nTotalSegments, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE, devAcc, queue); + segmentsInGPU = new SDL::segments(); + segmentsBuffers = new SDL::segmentsBuffer(nTotalSegments, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE, devAcc, queue); + segmentsInGPU->setData(*segmentsBuffers); cudaMemcpyAsync(segmentsInGPU->nMemoryLocations, &nTotalSegments, sizeof(unsigned int), cudaMemcpyHostToDevice, stream);; cudaStreamSynchronize(stream); @@ -604,19 +608,19 @@ void SDL::Event::addPixelSegmentToEvent(std::vector hitIndices0,st alpaka::memcpy(queue, hitIndices3_dev, hitIndices3, size); alpaka::memcpy(queue, dPhiChange_dev, dPhiChange, size); - alpaka::memcpy(queue, segmentsInGPU->ptIn_buf, ptIn, size); - alpaka::memcpy(queue, segmentsInGPU->ptErr_buf, ptErr, size); - alpaka::memcpy(queue, segmentsInGPU->px_buf, px, size); - alpaka::memcpy(queue, segmentsInGPU->py_buf, py, size); - alpaka::memcpy(queue, segmentsInGPU->pz_buf, pz, size); - alpaka::memcpy(queue, segmentsInGPU->etaErr_buf, etaErr, size); - alpaka::memcpy(queue, segmentsInGPU->isQuad_buf, isQuad, size); - alpaka::memcpy(queue, segmentsInGPU->eta_buf, eta, size); - alpaka::memcpy(queue, segmentsInGPU->phi_buf, phi, size); - alpaka::memcpy(queue, segmentsInGPU->charge_buf, charge, size); - alpaka::memcpy(queue, segmentsInGPU->seedIdx_buf, seedIdx, size); - alpaka::memcpy(queue, segmentsInGPU->superbin_buf, superbin, size); - alpaka::memcpy(queue, segmentsInGPU->pixelType_buf, pixelType, size); + alpaka::memcpy(queue, segmentsBuffers->ptIn_buf, ptIn, size); + alpaka::memcpy(queue, segmentsBuffers->ptErr_buf, ptErr, size); + alpaka::memcpy(queue, segmentsBuffers->px_buf, px, size); + alpaka::memcpy(queue, segmentsBuffers->py_buf, py, size); + alpaka::memcpy(queue, segmentsBuffers->pz_buf, pz, size); + alpaka::memcpy(queue, segmentsBuffers->etaErr_buf, etaErr, size); + alpaka::memcpy(queue, segmentsBuffers->isQuad_buf, isQuad, size); + alpaka::memcpy(queue, segmentsBuffers->eta_buf, eta, size); + alpaka::memcpy(queue, segmentsBuffers->phi_buf, phi, size); + alpaka::memcpy(queue, segmentsBuffers->charge_buf, charge, size); + alpaka::memcpy(queue, segmentsBuffers->seedIdx_buf, seedIdx, size); + alpaka::memcpy(queue, segmentsBuffers->superbin_buf, superbin, size); + alpaka::memcpy(queue, segmentsBuffers->pixelType_buf, pixelType, size); cudaMemcpyAsync(&(segmentsInGPU->nSegments)[pixelModuleIndex], &size, sizeof(int), cudaMemcpyHostToDevice, stream); cudaMemcpyAsync(&(segmentsInGPU->totOccupancySegments)[pixelModuleIndex], &size, sizeof(int), cudaMemcpyHostToDevice, stream); @@ -830,7 +834,9 @@ void SDL::Event::createSegmentsWithModuleMap() { if(segmentsInGPU == nullptr) { - segmentsInGPU = new SDL::segments(nTotalSegments, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE, devAcc, queue); + segmentsInGPU = new SDL::segments(); + segmentsBuffers = new SDL::segmentsBuffer(nTotalSegments, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE, devAcc, queue); + segmentsInGPU->setData(*segmentsBuffers); } Vec const threadsPerBlockCreateSeg(static_cast(1), static_cast(1), static_cast(64)); @@ -1810,42 +1816,44 @@ int SDL::Event::getNumberOfT5TrackCandidates() return nTrackCandidatesT5; } -SDL::hits* SDL::Event::getHits() //std::shared_ptr should take care of garbage collection +SDL::hitsBuffer* SDL::Event::getHits() //std::shared_ptr should take care of garbage collection { if(hitsInCPU == nullptr) { auto nHits_buf = allocBufWrapper(devHost, 1); - alpaka::memcpy(queue, nHits_buf, hitsInGPU->nHits_buf, 1); + alpaka::memcpy(queue, nHits_buf, hitsBuffers->nHits_buf, 1); alpaka::wait(queue); unsigned int nHits = *alpaka::getPtrNative(nHits_buf); - hitsInCPU = new SDL::hits(nModules, nHits, devHost, queue); + hitsInCPU = new SDL::hitsBuffer(nModules, nHits, devHost, queue); + hitsInCPU->setData(*hitsInCPU); *alpaka::getPtrNative(hitsInCPU->nHits_buf) = nHits; - alpaka::memcpy(queue, hitsInCPU->idxs_buf, hitsInGPU->idxs_buf, nHits); - alpaka::memcpy(queue, hitsInCPU->detid_buf, hitsInGPU->detid_buf, nHits); - alpaka::memcpy(queue, hitsInCPU->xs_buf, hitsInGPU->xs_buf, nHits); - alpaka::memcpy(queue, hitsInCPU->ys_buf, hitsInGPU->ys_buf, nHits); - alpaka::memcpy(queue, hitsInCPU->zs_buf, hitsInGPU->zs_buf, nHits); - alpaka::memcpy(queue, hitsInCPU->moduleIndices_buf, hitsInGPU->moduleIndices_buf, nHits); + alpaka::memcpy(queue, hitsInCPU->idxs_buf, hitsBuffers->idxs_buf, nHits); + alpaka::memcpy(queue, hitsInCPU->detid_buf, hitsBuffers->detid_buf, nHits); + alpaka::memcpy(queue, hitsInCPU->xs_buf, hitsBuffers->xs_buf, nHits); + alpaka::memcpy(queue, hitsInCPU->ys_buf, hitsBuffers->ys_buf, nHits); + alpaka::memcpy(queue, hitsInCPU->zs_buf, hitsBuffers->zs_buf, nHits); + alpaka::memcpy(queue, hitsInCPU->moduleIndices_buf, hitsBuffers->moduleIndices_buf, nHits); alpaka::wait(queue); } return hitsInCPU; } -SDL::hits* SDL::Event::getHitsInCMSSW() +SDL::hitsBuffer* SDL::Event::getHitsInCMSSW() { if(hitsInCPU == nullptr) { auto nHits_buf = allocBufWrapper(devHost, 1); - alpaka::memcpy(queue, nHits_buf, hitsInGPU->nHits_buf, 1); + alpaka::memcpy(queue, nHits_buf, hitsBuffers->nHits_buf, 1); alpaka::wait(queue); unsigned int nHits = *alpaka::getPtrNative(nHits_buf); - hitsInCPU = new SDL::hits(nModules, nHits, devHost, queue); + hitsInCPU = new SDL::hitsBuffer(nModules, nHits, devHost, queue); + hitsInCPU->setData(*hitsInCPU); *alpaka::getPtrNative(hitsInCPU->nHits_buf) = nHits; - alpaka::memcpy(queue, hitsInCPU->idxs_buf, hitsInGPU->idxs_buf, nHits); + alpaka::memcpy(queue, hitsInCPU->idxs_buf, hitsBuffers->idxs_buf, nHits); alpaka::wait(queue); } return hitsInCPU; @@ -1858,7 +1866,7 @@ SDL::objectRanges* SDL::Event::getRanges() rangesInCPU = new SDL::objectRanges; rangesInCPU->hitRanges = new int[2*nModules]; rangesInCPU->quintupletModuleIndices = new int[nLowerModules]; - cudaMemcpyAsync(rangesInCPU->hitRanges, hitsInGPU->hitRanges, 2*nModules * sizeof(int), cudaMemcpyDeviceToHost,stream); + cudaMemcpyAsync(rangesInCPU->hitRanges, hitsBuffers->hitRanges, 2*nModules * sizeof(int), cudaMemcpyDeviceToHost,stream); rangesInCPU->miniDoubletModuleIndices = new int[nLowerModules+1]; rangesInCPU->segmentModuleIndices = new int[nLowerModules + 1]; rangesInCPU->tripletModuleIndices = new int[nLowerModules]; @@ -1897,31 +1905,32 @@ SDL::miniDoublets* SDL::Event::getMiniDoublets() return mdsInCPU; } -SDL::segments* SDL::Event::getSegments() +SDL::segmentsBuffer* SDL::Event::getSegments() { if(segmentsInCPU == nullptr) { // Get nMemoryLocations parameter to initilize host based segmentsInCPU auto nMemLocal_buf = allocBufWrapper(devHost, 1); - alpaka::memcpy(queue, nMemLocal_buf, segmentsInGPU->nMemoryLocations_buf, 1); + alpaka::memcpy(queue, nMemLocal_buf, segmentsBuffers->nMemoryLocations_buf, 1); alpaka::wait(queue); unsigned int nMemLocal = *alpaka::getPtrNative(nMemLocal_buf); - segmentsInCPU = new SDL::segments(nMemLocal, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE, devHost, queue); + segmentsInCPU = new SDL::segmentsBuffer(nMemLocal, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE, devHost, queue); + segmentsInCPU->setData(*segmentsInCPU); *alpaka::getPtrNative(segmentsInCPU->nMemoryLocations_buf) = nMemLocal; - alpaka::memcpy(queue, segmentsInCPU->nSegments_buf, segmentsInGPU->nSegments_buf, (nLowerModules+1)); - alpaka::memcpy(queue, segmentsInCPU->mdIndices_buf, segmentsInGPU->mdIndices_buf, 2 * nMemLocal); - alpaka::memcpy(queue, segmentsInCPU->innerMiniDoubletAnchorHitIndices_buf, segmentsInGPU->innerMiniDoubletAnchorHitIndices_buf, nMemLocal); - alpaka::memcpy(queue, segmentsInCPU->outerMiniDoubletAnchorHitIndices_buf, segmentsInGPU->outerMiniDoubletAnchorHitIndices_buf, nMemLocal); - alpaka::memcpy(queue, segmentsInCPU->totOccupancySegments_buf, segmentsInGPU->totOccupancySegments_buf, (nLowerModules+1)); - alpaka::memcpy(queue, segmentsInCPU->ptIn_buf, segmentsInGPU->ptIn_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE); - alpaka::memcpy(queue, segmentsInCPU->eta_buf, segmentsInGPU->eta_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE); - alpaka::memcpy(queue, segmentsInCPU->phi_buf, segmentsInGPU->phi_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE); - alpaka::memcpy(queue, segmentsInCPU->seedIdx_buf, segmentsInGPU->seedIdx_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE); - alpaka::memcpy(queue, segmentsInCPU->isDup_buf, segmentsInGPU->isDup_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE); - alpaka::memcpy(queue, segmentsInCPU->isQuad_buf, segmentsInGPU->isQuad_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE); - alpaka::memcpy(queue, segmentsInCPU->score_buf, segmentsInGPU->score_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE); + alpaka::memcpy(queue, segmentsInCPU->nSegments_buf, segmentsBuffers->nSegments_buf, (nLowerModules+1)); + alpaka::memcpy(queue, segmentsInCPU->mdIndices_buf, segmentsBuffers->mdIndices_buf, 2 * nMemLocal); + alpaka::memcpy(queue, segmentsInCPU->innerMiniDoubletAnchorHitIndices_buf, segmentsBuffers->innerMiniDoubletAnchorHitIndices_buf, nMemLocal); + alpaka::memcpy(queue, segmentsInCPU->outerMiniDoubletAnchorHitIndices_buf, segmentsBuffers->outerMiniDoubletAnchorHitIndices_buf, nMemLocal); + alpaka::memcpy(queue, segmentsInCPU->totOccupancySegments_buf, segmentsBuffers->totOccupancySegments_buf, (nLowerModules+1)); + alpaka::memcpy(queue, segmentsInCPU->ptIn_buf, segmentsBuffers->ptIn_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE); + alpaka::memcpy(queue, segmentsInCPU->eta_buf, segmentsBuffers->eta_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE); + alpaka::memcpy(queue, segmentsInCPU->phi_buf, segmentsBuffers->phi_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE); + alpaka::memcpy(queue, segmentsInCPU->seedIdx_buf, segmentsBuffers->seedIdx_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE); + alpaka::memcpy(queue, segmentsInCPU->isDup_buf, segmentsBuffers->isDup_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE); + alpaka::memcpy(queue, segmentsInCPU->isQuad_buf, segmentsBuffers->isQuad_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE); + alpaka::memcpy(queue, segmentsInCPU->score_buf, segmentsBuffers->score_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE); alpaka::wait(queue); } return segmentsInCPU; diff --git a/SDL/Event.cuh b/SDL/Event.cuh index 4b431d33..0a24a210 100644 --- a/SDL/Event.cuh +++ b/SDL/Event.cuh @@ -40,9 +40,11 @@ namespace SDL int dev; int nTotalSegments; struct objectRanges* rangesInGPU; - struct hits* hitsInGPU; + struct hits* hitsInGPU; + struct hitsBuffer* hitsBuffers; struct miniDoublets* mdsInGPU; - struct segments* segmentsInGPU; + struct segments* segmentsInGPU; + struct segmentsBuffer* segmentsBuffers; struct triplets* tripletsInGPU; struct quintuplets* quintupletsInGPU; struct trackCandidates* trackCandidatesInGPU; @@ -51,9 +53,9 @@ namespace SDL //CPU interface stuff objectRanges* rangesInCPU; - hits* hitsInCPU; + hitsBuffer* hitsInCPU; miniDoublets* mdsInCPU; - segments* segmentsInCPU; + segmentsBuffer* segmentsInCPU; triplets* tripletsInCPU; trackCandidates* trackCandidatesInCPU; modules* modulesInCPU; @@ -130,10 +132,10 @@ namespace SDL unsigned int getNumberOfT3T3ExtendedTracks(); objectRanges* getRanges(); - hits* getHits(); - hits* getHitsInCMSSW(); + hitsBuffer* getHits(); + hitsBuffer* getHitsInCMSSW(); miniDoublets* getMiniDoublets(); - segments* getSegments() ; + segmentsBuffer* getSegments() ; triplets* getTriplets(); quintuplets* getQuintuplets(); trackCandidates* getTrackCandidates(); diff --git a/SDL/Hit.cuh b/SDL/Hit.cuh index a2502d11..61a26cbd 100644 --- a/SDL/Hit.cuh +++ b/SDL/Hit.cuh @@ -6,29 +6,8 @@ namespace SDL { - template struct hits { - Buf nHits_buf; - Buf xs_buf; - Buf ys_buf; - Buf zs_buf; - Buf moduleIndices_buf; - Buf idxs_buf; - Buf detid_buf; - Buf rts_buf; - Buf phis_buf; - Buf etas_buf; - Buf highEdgeXs_buf; - Buf highEdgeYs_buf; - Buf lowEdgeXs_buf; - Buf lowEdgeYs_buf; - Buf hitRanges_buf; - Buf hitRangesLower_buf; - Buf hitRangesUpper_buf; - Buf hitRangesnLower_buf; - Buf hitRangesnUpper_buf; - unsigned int* nHits; float* xs; float* ys; @@ -49,11 +28,59 @@ namespace SDL int8_t* hitRangesnLower; int8_t* hitRangesnUpper; + template + void setData(TBuff& hitsbuf) + { + nHits = alpaka::getPtrNative(hitsbuf.nHits_buf); + xs = alpaka::getPtrNative(hitsbuf.xs_buf); + ys = alpaka::getPtrNative(hitsbuf.ys_buf); + zs = alpaka::getPtrNative(hitsbuf.zs_buf); + moduleIndices = alpaka::getPtrNative(hitsbuf.moduleIndices_buf); + idxs = alpaka::getPtrNative(hitsbuf.idxs_buf); + detid = alpaka::getPtrNative(hitsbuf.detid_buf); + rts = alpaka::getPtrNative(hitsbuf.rts_buf); + phis = alpaka::getPtrNative(hitsbuf.phis_buf); + etas = alpaka::getPtrNative(hitsbuf.etas_buf); + highEdgeXs = alpaka::getPtrNative(hitsbuf.highEdgeXs_buf); + highEdgeYs = alpaka::getPtrNative(hitsbuf.highEdgeYs_buf); + lowEdgeXs = alpaka::getPtrNative(hitsbuf.lowEdgeXs_buf); + lowEdgeYs = alpaka::getPtrNative(hitsbuf.lowEdgeYs_buf); + hitRanges = alpaka::getPtrNative(hitsbuf.hitRanges_buf); + hitRangesLower = alpaka::getPtrNative(hitsbuf.hitRangesLower_buf); + hitRangesUpper = alpaka::getPtrNative(hitsbuf.hitRangesUpper_buf); + hitRangesnLower = alpaka::getPtrNative(hitsbuf.hitRangesnLower_buf); + hitRangesnUpper = alpaka::getPtrNative(hitsbuf.hitRangesnUpper_buf); + } + }; + + template + struct hitsBuffer : hits + { + Buf nHits_buf; + Buf xs_buf; + Buf ys_buf; + Buf zs_buf; + Buf moduleIndices_buf; + Buf idxs_buf; + Buf detid_buf; + Buf rts_buf; + Buf phis_buf; + Buf etas_buf; + Buf highEdgeXs_buf; + Buf highEdgeYs_buf; + Buf lowEdgeXs_buf; + Buf lowEdgeYs_buf; + Buf hitRanges_buf; + Buf hitRangesLower_buf; + Buf hitRangesUpper_buf; + Buf hitRangesnLower_buf; + Buf hitRangesnUpper_buf; + template - hits(unsigned int nModules, - unsigned int nMaxHits, - TDevAcc const & devAccIn, - TQueue& queue) : + hitsBuffer(unsigned int nModules, + unsigned int nMaxHits, + TDevAcc const & devAccIn, + TQueue& queue) : nHits_buf(allocBufWrapper(devAccIn, 1)), xs_buf(allocBufWrapper(devAccIn, nMaxHits)), ys_buf(allocBufWrapper(devAccIn, nMaxHits)), @@ -74,26 +101,6 @@ namespace SDL hitRangesnLower_buf(allocBufWrapper(devAccIn, nModules)), hitRangesnUpper_buf(allocBufWrapper(devAccIn, nModules)) { - nHits = alpaka::getPtrNative(nHits_buf); - xs = alpaka::getPtrNative(xs_buf); - ys = alpaka::getPtrNative(ys_buf); - zs = alpaka::getPtrNative(zs_buf); - moduleIndices = alpaka::getPtrNative(moduleIndices_buf); - idxs = alpaka::getPtrNative(idxs_buf); - detid = alpaka::getPtrNative(detid_buf); - rts = alpaka::getPtrNative(rts_buf); - phis = alpaka::getPtrNative(phis_buf); - etas = alpaka::getPtrNative(etas_buf); - highEdgeXs = alpaka::getPtrNative(highEdgeXs_buf); - highEdgeYs = alpaka::getPtrNative(highEdgeYs_buf); - lowEdgeXs = alpaka::getPtrNative(lowEdgeXs_buf); - lowEdgeYs = alpaka::getPtrNative(lowEdgeYs_buf); - hitRanges = alpaka::getPtrNative(hitRanges_buf); - hitRangesLower = alpaka::getPtrNative(hitRangesLower_buf); - hitRangesUpper = alpaka::getPtrNative(hitRangesUpper_buf); - hitRangesnLower = alpaka::getPtrNative(hitRangesnLower_buf); - hitRangesnUpper = alpaka::getPtrNative(hitRangesnUpper_buf); - alpaka::memset(queue, hitRanges_buf, -1, nModules*2); alpaka::memset(queue, hitRangesLower_buf, -1, nModules); alpaka::memset(queue, hitRangesUpper_buf, -1, nModules); @@ -213,7 +220,7 @@ namespace SDL ALPAKA_FN_ACC void operator()( TAcc const & acc, struct SDL::modules& modulesInGPU, - struct SDL::hits& hitsInGPU, + struct SDL::hits& hitsInGPU, int const & nLowerModules) const { using Dim = alpaka::Dim; @@ -250,7 +257,7 @@ namespace SDL unsigned int* geoMapDetId, // DetId's from endcap map float* geoMapPhi, // Phi values from endcap map struct SDL::modules& modulesInGPU, - struct SDL::hits& hitsInGPU, + struct SDL::hits& hitsInGPU, unsigned int const & nHits) const // Total number of hits in event { using Dim = alpaka::Dim; diff --git a/SDL/Kernels.cuh b/SDL/Kernels.cuh index 4cc1310d..8fd7d952 100644 --- a/SDL/Kernels.cuh +++ b/SDL/Kernels.cuh @@ -27,8 +27,7 @@ namespace SDL pixelQuintupletsInGPU.isDup[pixelQuintupletIndex] = 1; }; - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void rmPixelSegmentFromMemory(struct SDL::segments& segmentsInGPU, unsigned int pixelSegmentArrayIndex) + ALPAKA_FN_ACC ALPAKA_FN_INLINE void rmPixelSegmentFromMemory(struct SDL::segments& segmentsInGPU, unsigned int pixelSegmentArrayIndex) { segmentsInGPU.isDup[pixelSegmentArrayIndex] = 1; }; @@ -453,7 +452,7 @@ namespace SDL ALPAKA_FN_ACC void operator()( TAcc const & acc, struct SDL::modules& modulesInGPU, - struct SDL::segments& segmentsInGPU, + struct SDL::segments& segmentsInGPU, bool secondpass) const { using Dim = alpaka::Dim; diff --git a/SDL/LST.cc b/SDL/LST.cc index 83481428..a46a6167 100644 --- a/SDL/LST.cc +++ b/SDL/LST.cc @@ -403,7 +403,7 @@ void SDL::LST::getOutput(SDL::Event& event) { std::vector tc_seedIdx_; std::vector tc_trackCandidateType_; - SDL::hits& hitsInGPU = (*event.getHitsInCMSSW()); + SDL::hitsBuffer& hitsInGPU = (*event.getHitsInCMSSW()); SDL::trackCandidates& trackCandidatesInGPU = (*event.getTrackCandidatesInCMSSW()); unsigned int nTrackCandidates = *trackCandidatesInGPU.nTrackCandidates; diff --git a/SDL/MiniDoublet.cuh b/SDL/MiniDoublet.cuh index cfc81f55..850f01d8 100644 --- a/SDL/MiniDoublet.cuh +++ b/SDL/MiniDoublet.cuh @@ -68,8 +68,7 @@ namespace SDL void createMDsInExplicitMemory(struct miniDoublets& mdsInGPU, unsigned int maxMDs,uint16_t nLowerModules, unsigned int maxPixelMDs,cudaStream_t stream); - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void addMDToMemory(struct miniDoublets& mdsInGPU, struct SDL::hits& hitsInGPU, struct modules& modulesInGPU, unsigned int lowerHitIdx, unsigned int upperHitIdx, uint16_t& lowerModuleIdx, float dz, float dPhi, float dPhiChange, float shiftedX, float shiftedY, float shiftedZ, float noShiftedDz, float noShiftedDphi, float noShiftedDPhiChange, unsigned int idx) + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addMDToMemory(struct miniDoublets& mdsInGPU, struct SDL::hits& hitsInGPU, struct modules& modulesInGPU, unsigned int lowerHitIdx, unsigned int upperHitIdx, uint16_t& lowerModuleIdx, float dz, float dPhi, float dPhiChange, float shiftedX, float shiftedY, float shiftedZ, float noShiftedDz, float noShiftedDphi, float noShiftedDPhiChange, unsigned int idx) { //the index into which this MD needs to be written will be computed in the kernel //nMDs variable will be incremented in the kernel, no need to worry about that here @@ -660,7 +659,7 @@ namespace SDL ALPAKA_FN_ACC void operator()( TAcc const & acc, struct SDL::modules& modulesInGPU, - struct SDL::hits& hitsInGPU, + struct SDL::hits& hitsInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::objectRanges& rangesInGPU) const { @@ -804,7 +803,7 @@ namespace SDL struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct objectRanges& rangesInGPU, - struct SDL::hits& hitsInGPU) const + struct SDL::hits& hitsInGPU) const { using Dim = alpaka::Dim; using Idx = alpaka::Idx; diff --git a/SDL/PixelTriplet.cuh b/SDL/PixelTriplet.cuh index f7aaa54e..660aaeb7 100644 --- a/SDL/PixelTriplet.cuh +++ b/SDL/PixelTriplet.cuh @@ -48,8 +48,7 @@ namespace SDL void createPixelTripletsInExplicitMemory(struct pixelTriplets& pixelTripletsinGPU, unsigned int maxPixelTriplets, cudaStream_t stream); - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, struct pixelTriplets& pixelTripletsInGPU, unsigned int pixelSegmentIndex, unsigned int tripletIndex, float pixelRadius, float tripletRadius, float centerX, float centerY, float rPhiChiSquared, float rPhiChiSquaredInwards, float rzChiSquared, unsigned int pixelTripletIndex, float pt, float eta, float phi, float eta_pix, float phi_pix,float score) + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, struct pixelTriplets& pixelTripletsInGPU, unsigned int pixelSegmentIndex, unsigned int tripletIndex, float pixelRadius, float tripletRadius, float centerX, float centerY, float rPhiChiSquared, float rPhiChiSquaredInwards, float rzChiSquared, unsigned int pixelTripletIndex, float pt, float eta, float phi, float eta_pix, float phi_pix,float score) { pixelTripletsInGPU.pixelSegmentIndices[pixelTripletIndex] = pixelSegmentIndex; pixelTripletsInGPU.tripletIndices[pixelTripletIndex] = tripletIndex; @@ -131,7 +130,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTrackletDefaultAlgopT3(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& pixelLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTrackletDefaultAlgopT3(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& pixelLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) { zLo = -999; zHi = -999; @@ -664,7 +663,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTripletDefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& pixelSegmentIndex, unsigned int tripletIndex, float& pixelRadius, float& pixelRadiusError, float& tripletRadius, float& centerX, float& centerY, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, bool runChiSquaredCuts = true) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTripletDefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& pixelSegmentIndex, unsigned int tripletIndex, float& pixelRadius, float& pixelRadiusError, float& tripletRadius, float& centerX, float& centerY, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, bool runChiSquaredCuts = true) { bool pass = true; @@ -769,7 +768,7 @@ namespace SDL struct SDL::modules& modulesInGPU, struct SDL::objectRanges& rangesInGPU, struct SDL::miniDoublets& mdsInGPU, - struct SDL::segments& segmentsInGPU, + struct SDL::segments& segmentsInGPU, struct SDL::triplets& tripletsInGPU, struct SDL::pixelTriplets& pixelTripletsInGPU, unsigned int* connectedPixelSize, @@ -912,7 +911,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPBB(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int thirdMDIndex, unsigned int& fourthMDIndex, float& /*z_OutLo*/, float& /*rt_OutLo*/, float& dPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaOutCut, float& deltaBetaCut) // pixel to BB and BE segments + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPBB(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int thirdMDIndex, unsigned int& fourthMDIndex, float& /*z_OutLo*/, float& /*rt_OutLo*/, float& dPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaOutCut, float& deltaBetaCut) // pixel to BB and BE segments { bool pass = true; @@ -1125,7 +1124,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPEE(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU ,struct SDL::segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& /*z_OutLo*/, float& /*rt_OutLo*/, float& deltaPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) // pixel to EE segments + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPEE(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU ,struct SDL::segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& /*z_OutLo*/, float& /*rt_OutLo*/, float& deltaPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) // pixel to EE segments { bool pass = true; bool isPS_OutLo = (modulesInGPU.moduleType[outerInnerLowerModuleIndex] == SDL::PS); @@ -1386,8 +1385,7 @@ namespace SDL void createPixelQuintupletsInExplicitMemory(struct SDL::pixelQuintuplets& pixelQuintupletsInGPU, unsigned int maxPixelQuintuplets, cudaStream_t stream); - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelQuintupletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct quintuplets& quintupletsInGPU, struct pixelQuintuplets& pixelQuintupletsInGPU, unsigned int pixelIndex, unsigned int T5Index, unsigned int pixelQuintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float score, float eta, float phi, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY) + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelQuintupletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct quintuplets& quintupletsInGPU, struct pixelQuintuplets& pixelQuintupletsInGPU, unsigned int pixelIndex, unsigned int T5Index, unsigned int pixelQuintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float score, float eta, float phi, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY) { pixelQuintupletsInGPU.pixelIndices[pixelQuintupletIndex] = pixelIndex; pixelQuintupletsInGPU.T5Indices[pixelQuintupletIndex] = T5Index; @@ -1968,7 +1966,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelQuintupletDefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, struct quintuplets& quintupletsInGPU, unsigned int& pixelSegmentIndex, unsigned int& quintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY, unsigned int pixelSegmentArrayIndex) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelQuintupletDefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, struct quintuplets& quintupletsInGPU, unsigned int& pixelSegmentIndex, unsigned int& quintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY, unsigned int pixelSegmentArrayIndex) { bool pass = true; @@ -2102,7 +2100,7 @@ namespace SDL TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, - struct SDL::segments& segmentsInGPU, + struct SDL::segments& segmentsInGPU, struct SDL::triplets& tripletsInGPU, struct SDL::quintuplets& quintupletsInGPU, struct SDL::pixelQuintuplets& pixelQuintupletsInGPU, @@ -2228,7 +2226,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgoPPBB(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int thirdMDIndex, unsigned int& fourthMDIndex, float& z_OutLo, float& rt_OutLo, float& dPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaOutCut, float& deltaBetaCut) // pixel to BB and BE segments + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgoPPBB(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int thirdMDIndex, unsigned int& fourthMDIndex, float& z_OutLo, float& rt_OutLo, float& dPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaOutCut, float& deltaBetaCut) // pixel to BB and BE segments { bool pass = true; @@ -2435,7 +2433,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgoPPEE(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU ,struct SDL::segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& z_OutLo, float& rt_OutLo, float& deltaPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) // pixel to EE segments + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgoPPEE(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU ,struct SDL::segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& z_OutLo, float& rt_OutLo, float& deltaPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) // pixel to EE segments { bool pass = true; bool isPS_OutLo = (modulesInGPU.moduleType[outerInnerLowerModuleIndex] == SDL::PS); @@ -2650,7 +2648,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& pixelLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& pixelLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) { zLo = -999; zHi = -999; diff --git a/SDL/Quintuplet.cuh b/SDL/Quintuplet.cuh index b4fe6b3d..dc7893a0 100644 --- a/SDL/Quintuplet.cuh +++ b/SDL/Quintuplet.cuh @@ -645,7 +645,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool T5HasCommonMiniDoublet(struct SDL::triplets& tripletsInGPU, struct SDL::segments& segmentsInGPU, unsigned int innerTripletIndex, unsigned int outerTripletIndex) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool T5HasCommonMiniDoublet(struct SDL::triplets& tripletsInGPU, struct SDL::segments& segmentsInGPU, unsigned int innerTripletIndex, unsigned int outerTripletIndex) { unsigned int innerOuterSegmentIndex = tripletsInGPU.segmentIndices[2 * innerTripletIndex + 1]; unsigned int outerInnerSegmentIndex = tripletsInGPU.segmentIndices[2 * outerTripletIndex]; @@ -1205,7 +1205,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoBBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoBBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut) { bool pass = true; @@ -1398,7 +1398,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoBBEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoBBEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) { bool pass = true; bool isPS_InLo = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS); @@ -1609,7 +1609,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoEEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoEEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) { bool pass = true; @@ -1816,7 +1816,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletAlgoSelector(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletAlgoSelector(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) { bool pass = false; @@ -1874,7 +1874,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct SDL::triplets& tripletsInGPU, uint16_t& lowerModuleIndex1, uint16_t& lowerModuleIndex2, uint16_t& lowerModuleIndex3, uint16_t& lowerModuleIndex4, uint16_t& lowerModuleIndex5, unsigned int& innerTripletIndex, unsigned int& outerTripletIndex, float& innerRadius, float& outerRadius, float& bridgeRadius, float& regressionG, float& regressionF, float& regressionRadius, float& rzChiSquared, float& chiSquared, float& nonAnchorChiSquared, bool& TightCutFlag) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct SDL::triplets& tripletsInGPU, uint16_t& lowerModuleIndex1, uint16_t& lowerModuleIndex2, uint16_t& lowerModuleIndex3, uint16_t& lowerModuleIndex4, uint16_t& lowerModuleIndex5, unsigned int& innerTripletIndex, unsigned int& outerTripletIndex, float& innerRadius, float& outerRadius, float& bridgeRadius, float& regressionG, float& regressionF, float& regressionRadius, float& rzChiSquared, float& chiSquared, float& nonAnchorChiSquared, bool& TightCutFlag) { bool pass = true; unsigned int firstSegmentIndex = tripletsInGPU.segmentIndices[2 * innerTripletIndex]; @@ -2078,7 +2078,7 @@ namespace SDL TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, - struct SDL::segments& segmentsInGPU, + struct SDL::segments& segmentsInGPU, struct SDL::triplets& tripletsInGPU, struct SDL::quintuplets& quintupletsInGPU, struct SDL::objectRanges& rangesInGPU, diff --git a/SDL/Segment.cuh b/SDL/Segment.cuh index 85a1df54..88f5718f 100644 --- a/SDL/Segment.cuh +++ b/SDL/Segment.cuh @@ -10,84 +10,30 @@ namespace SDL { - template struct segments { - // Buffer objects for each member variable - Buf dPhis_buf; - Buf dPhiMins_buf; - Buf dPhiMaxs_buf; - Buf dPhiChanges_buf; - Buf dPhiChangeMins_buf; - Buf dPhiChangeMaxs_buf; - - Buf innerLowerModuleIndices_buf; - Buf outerLowerModuleIndices_buf; - - Buf seedIdx_buf; - Buf mdIndices_buf; - Buf nMemoryLocations_buf; - Buf innerMiniDoubletAnchorHitIndices_buf; - Buf outerMiniDoubletAnchorHitIndices_buf; - - Buf charge_buf; - Buf superbin_buf; - Buf nSegments_buf; - Buf totOccupancySegments_buf; - - Buf pLSHitsIdxs_buf; - - Buf pixelType_buf; - - Buf isQuad_buf; - - Buf isDup_buf; - Buf partOfPT5_buf; - - Buf ptIn_buf; - Buf ptErr_buf; - Buf px_buf; - Buf py_buf; - Buf pz_buf; - Buf etaErr_buf; - Buf eta_buf; - Buf phi_buf; - Buf score_buf; - Buf circleCenterX_buf; - Buf circleCenterY_buf; - Buf circleRadius_buf; - - // Pointers towards the data of each buffer FPX* dPhis; FPX* dPhiMins; FPX* dPhiMaxs; FPX* dPhiChanges; FPX* dPhiChangeMins; FPX* dPhiChangeMaxs; - uint16_t* innerLowerModuleIndices; uint16_t* outerLowerModuleIndices; - unsigned int* seedIdx; unsigned int* mdIndices; unsigned int* nMemoryLocations; unsigned int* innerMiniDoubletAnchorHitIndices; unsigned int* outerMiniDoubletAnchorHitIndices; - int* charge; int* superbin; int* nSegments; //number of segments per inner lower module int* totOccupancySegments; //number of segments per inner lower module - uint4* pLSHitsIdxs; - int8_t* pixelType; - char* isQuad; - bool* isDup; bool* partOfPT5; - float* ptIn; float* ptErr; float* px; @@ -101,12 +47,90 @@ namespace SDL float* circleCenterY; float* circleRadius; + template + void setData(TBuff& segmentsbuf) + { + dPhis = alpaka::getPtrNative(segmentsbuf.dPhis_buf); + dPhiMins = alpaka::getPtrNative(segmentsbuf.dPhiMins_buf); + dPhiMaxs = alpaka::getPtrNative(segmentsbuf.dPhiMaxs_buf); + dPhiChanges = alpaka::getPtrNative(segmentsbuf.dPhiChanges_buf); + dPhiChangeMins = alpaka::getPtrNative(segmentsbuf.dPhiChangeMins_buf); + dPhiChangeMaxs = alpaka::getPtrNative(segmentsbuf.dPhiChangeMaxs_buf); + innerLowerModuleIndices = alpaka::getPtrNative(segmentsbuf.innerLowerModuleIndices_buf); + outerLowerModuleIndices = alpaka::getPtrNative(segmentsbuf.outerLowerModuleIndices_buf); + seedIdx = alpaka::getPtrNative(segmentsbuf.seedIdx_buf); + mdIndices = alpaka::getPtrNative(segmentsbuf.mdIndices_buf); + nMemoryLocations = alpaka::getPtrNative(segmentsbuf.nMemoryLocations_buf); + innerMiniDoubletAnchorHitIndices = alpaka::getPtrNative(segmentsbuf.innerMiniDoubletAnchorHitIndices_buf); + outerMiniDoubletAnchorHitIndices = alpaka::getPtrNative(segmentsbuf.outerMiniDoubletAnchorHitIndices_buf); + charge = alpaka::getPtrNative(segmentsbuf.charge_buf); + superbin = alpaka::getPtrNative(segmentsbuf.superbin_buf); + nSegments = alpaka::getPtrNative(segmentsbuf.nSegments_buf); + totOccupancySegments = alpaka::getPtrNative(segmentsbuf.totOccupancySegments_buf); + pLSHitsIdxs = alpaka::getPtrNative(segmentsbuf.pLSHitsIdxs_buf); + pixelType = alpaka::getPtrNative(segmentsbuf.pixelType_buf); + isQuad = alpaka::getPtrNative(segmentsbuf.isQuad_buf); + isDup = alpaka::getPtrNative(segmentsbuf.isDup_buf); + partOfPT5 = alpaka::getPtrNative(segmentsbuf.partOfPT5_buf); + ptIn = alpaka::getPtrNative(segmentsbuf.ptIn_buf); + ptErr = alpaka::getPtrNative(segmentsbuf.ptErr_buf); + px = alpaka::getPtrNative(segmentsbuf.px_buf); + py = alpaka::getPtrNative(segmentsbuf.py_buf); + pz = alpaka::getPtrNative(segmentsbuf.pz_buf); + etaErr = alpaka::getPtrNative(segmentsbuf.etaErr_buf); + eta = alpaka::getPtrNative(segmentsbuf.eta_buf); + phi = alpaka::getPtrNative(segmentsbuf.phi_buf); + score = alpaka::getPtrNative(segmentsbuf.score_buf); + circleCenterX = alpaka::getPtrNative(segmentsbuf.circleCenterX_buf); + circleCenterY = alpaka::getPtrNative(segmentsbuf.circleCenterY_buf); + circleRadius = alpaka::getPtrNative(segmentsbuf.circleRadius_buf); + } + }; + + template + struct segmentsBuffer : segments + { + Buf dPhis_buf; + Buf dPhiMins_buf; + Buf dPhiMaxs_buf; + Buf dPhiChanges_buf; + Buf dPhiChangeMins_buf; + Buf dPhiChangeMaxs_buf; + Buf innerLowerModuleIndices_buf; + Buf outerLowerModuleIndices_buf; + Buf seedIdx_buf; + Buf mdIndices_buf; + Buf nMemoryLocations_buf; + Buf innerMiniDoubletAnchorHitIndices_buf; + Buf outerMiniDoubletAnchorHitIndices_buf; + Buf charge_buf; + Buf superbin_buf; + Buf nSegments_buf; + Buf totOccupancySegments_buf; + Buf pLSHitsIdxs_buf; + Buf pixelType_buf; + Buf isQuad_buf; + Buf isDup_buf; + Buf partOfPT5_buf; + Buf ptIn_buf; + Buf ptErr_buf; + Buf px_buf; + Buf py_buf; + Buf pz_buf; + Buf etaErr_buf; + Buf eta_buf; + Buf phi_buf; + Buf score_buf; + Buf circleCenterX_buf; + Buf circleCenterY_buf; + Buf circleRadius_buf; + template - segments(unsigned int nMemoryLocationsIn, - uint16_t nLowerModules, - unsigned int maxPixelSegments, - TDevAcc const & devAccIn, - TQueue& queue) : + segmentsBuffer(unsigned int nMemoryLocationsIn, + uint16_t nLowerModules, + unsigned int maxPixelSegments, + TDevAcc const & devAccIn, + TQueue& queue) : dPhis_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), dPhiMins_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), dPhiMaxs_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), @@ -142,49 +166,6 @@ namespace SDL circleCenterY_buf(allocBufWrapper(devAccIn, maxPixelSegments)), circleRadius_buf(allocBufWrapper(devAccIn, maxPixelSegments)) { - dPhis = alpaka::getPtrNative(dPhis_buf); - dPhiMins = alpaka::getPtrNative(dPhiMins_buf); - dPhiMaxs = alpaka::getPtrNative(dPhiMaxs_buf); - dPhiChanges = alpaka::getPtrNative(dPhiChanges_buf); - dPhiChangeMins = alpaka::getPtrNative(dPhiChangeMins_buf); - dPhiChangeMaxs = alpaka::getPtrNative(dPhiChangeMaxs_buf); - - innerLowerModuleIndices = alpaka::getPtrNative(innerLowerModuleIndices_buf); - outerLowerModuleIndices = alpaka::getPtrNative(outerLowerModuleIndices_buf); - - seedIdx = alpaka::getPtrNative(seedIdx_buf); - mdIndices = alpaka::getPtrNative(mdIndices_buf); - nMemoryLocations = alpaka::getPtrNative(nMemoryLocations_buf); - innerMiniDoubletAnchorHitIndices = alpaka::getPtrNative(innerMiniDoubletAnchorHitIndices_buf); - outerMiniDoubletAnchorHitIndices = alpaka::getPtrNative(outerMiniDoubletAnchorHitIndices_buf); - - charge = alpaka::getPtrNative(charge_buf); - superbin = alpaka::getPtrNative(superbin_buf); - nSegments = alpaka::getPtrNative(nSegments_buf); - totOccupancySegments = alpaka::getPtrNative(totOccupancySegments_buf); - - pLSHitsIdxs = alpaka::getPtrNative(pLSHitsIdxs_buf); - - pixelType = alpaka::getPtrNative(pixelType_buf); - - isQuad = alpaka::getPtrNative(isQuad_buf); - - isDup = alpaka::getPtrNative(isDup_buf); - partOfPT5 = alpaka::getPtrNative(partOfPT5_buf); - - ptIn = alpaka::getPtrNative(ptIn_buf); - ptErr = alpaka::getPtrNative(ptErr_buf); - px = alpaka::getPtrNative(px_buf); - py = alpaka::getPtrNative(py_buf); - pz = alpaka::getPtrNative(pz_buf); - etaErr = alpaka::getPtrNative(etaErr_buf); - eta = alpaka::getPtrNative(eta_buf); - phi = alpaka::getPtrNative(phi_buf); - score = alpaka::getPtrNative(score_buf); - circleCenterX = alpaka::getPtrNative(circleCenterX_buf); - circleCenterY = alpaka::getPtrNative(circleCenterY_buf); - circleRadius = alpaka::getPtrNative(circleRadius_buf); - alpaka::memset(queue, nSegments_buf, 0u, nLowerModules + 1); alpaka::memset(queue, totOccupancySegments_buf, 0u, nLowerModules + 1); alpaka::memset(queue, partOfPT5_buf, 0u, maxPixelSegments); @@ -450,8 +431,7 @@ namespace SDL dAlphaThresholdValues[2] = dAlpha_Bfield + alpaka::math::sqrt(acc, dAlpha_res * dAlpha_res + sdMuls * sdMuls); }; - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void addSegmentToMemory(struct SDL::segments& segmentsInGPU, unsigned int lowerMDIndex, unsigned int upperMDIndex, uint16_t innerLowerModuleIndex, uint16_t outerLowerModuleIndex, unsigned int innerMDAnchorHitIndex, unsigned int outerMDAnchorHitIndex, float& dPhi, float& dPhiMin, float& dPhiMax, float& dPhiChange, float& dPhiChangeMin, float& dPhiChangeMax, unsigned int idx) + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addSegmentToMemory(struct SDL::segments& segmentsInGPU, unsigned int lowerMDIndex, unsigned int upperMDIndex, uint16_t innerLowerModuleIndex, uint16_t outerLowerModuleIndex, unsigned int innerMDAnchorHitIndex, unsigned int outerMDAnchorHitIndex, float& dPhi, float& dPhiMin, float& dPhiMax, float& dPhiChange, float& dPhiChangeMin, float& dPhiChangeMax, unsigned int idx) { //idx will be computed in the kernel, which is the index into which the //segment will be written @@ -473,7 +453,7 @@ namespace SDL } template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelSegmentToMemory(TAcc const & acc, struct SDL::segments& segmentsInGPU, struct miniDoublets& mdsInGPU, unsigned int innerMDIndex, unsigned int outerMDIndex, uint16_t pixelModuleIndex, unsigned int hitIdxs[4], unsigned int innerAnchorHitIndex, unsigned int outerAnchorHitIndex, float dPhiChange, unsigned int idx, unsigned int pixelSegmentArrayIndex, float score) + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelSegmentToMemory(TAcc const & acc, struct SDL::segments& segmentsInGPU, struct miniDoublets& mdsInGPU, unsigned int innerMDIndex, unsigned int outerMDIndex, uint16_t pixelModuleIndex, unsigned int hitIdxs[4], unsigned int innerAnchorHitIndex, unsigned int outerAnchorHitIndex, float dPhiChange, unsigned int idx, unsigned int pixelSegmentArrayIndex, float score) { segmentsInGPU.mdIndices[idx * 2] = innerMDIndex; segmentsInGPU.mdIndices[idx * 2 + 1] = outerMDIndex; @@ -704,7 +684,7 @@ namespace SDL TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, - struct SDL::segments& segmentsInGPU, + struct SDL::segments& segmentsInGPU, struct SDL::objectRanges& rangesInGPU) const { using Dim = alpaka::Dim; @@ -856,7 +836,7 @@ namespace SDL ALPAKA_FN_ACC void operator()( TAcc const & acc, struct modules& modulesInGPU, - struct SDL::segments& segmentsInGPU, + struct SDL::segments& segmentsInGPU, struct objectRanges& rangesInGPU) const { using Dim = alpaka::Dim; @@ -890,9 +870,9 @@ namespace SDL TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::objectRanges& rangesInGPU, - struct SDL::hits& hitsInGPU, + struct SDL::hits& hitsInGPU, struct SDL::miniDoublets& mdsInGPU, - struct SDL::segments& segmentsInGPU, + struct SDL::segments& segmentsInGPU, unsigned int* hitIndices0, unsigned int* hitIndices1, unsigned int* hitIndices2, diff --git a/SDL/TrackCandidate.cuh b/SDL/TrackCandidate.cuh index b4564106..d81a570d 100644 --- a/SDL/TrackCandidate.cuh +++ b/SDL/TrackCandidate.cuh @@ -81,8 +81,7 @@ namespace SDL trackCandidatesInGPU.radius[trackCandidateIndex] = __F2H(radius); }; - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE int checkPixelHits(unsigned int ix, unsigned int jx, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct SDL::hits& hitsInGPU) + ALPAKA_FN_ACC ALPAKA_FN_INLINE int checkPixelHits(unsigned int ix, unsigned int jx, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct SDL::hits& hitsInGPU) { int phits1[4] = {-1,-1,-1,-1}; int phits2[4] = {-1,-1,-1,-1}; @@ -128,7 +127,7 @@ namespace SDL struct SDL::modules& modulesInGPU, struct SDL::objectRanges& rangesInGPU, struct SDL::pixelTriplets& pixelTripletsInGPU, - struct SDL::segments& segmentsInGPU, + struct SDL::segments& segmentsInGPU, struct SDL::pixelQuintuplets& pixelQuintupletsInGPU) const { using Dim = alpaka::Dim; @@ -243,9 +242,9 @@ namespace SDL struct SDL::objectRanges& rangesInGPU, struct SDL::pixelTriplets& pixelTripletsInGPU, struct SDL::trackCandidates& trackCandidatesInGPU, - struct SDL::segments& segmentsInGPU, + struct SDL::segments& segmentsInGPU, struct SDL::miniDoublets& mdsInGPU, - struct SDL::hits& hitsInGPU, + struct SDL::hits& hitsInGPU, struct SDL::quintuplets& quintupletsInGPU) const { using Dim = alpaka::Dim; @@ -328,7 +327,7 @@ namespace SDL uint16_t nLowerModules, struct SDL::pixelTriplets& pixelTripletsInGPU, struct SDL::trackCandidates& trackCandidatesInGPU, - struct SDL::segments& segmentsInGPU, + struct SDL::segments& segmentsInGPU, struct SDL::objectRanges& rangesInGPU) const { using Dim = alpaka::Dim; @@ -399,7 +398,7 @@ namespace SDL TAcc const & acc, uint16_t nLowerModules, struct SDL::trackCandidates& trackCandidatesInGPU, - struct SDL::segments& segmentsInGPU) const + struct SDL::segments& segmentsInGPU) const { using Dim = alpaka::Dim; using Idx = alpaka::Idx; @@ -429,7 +428,7 @@ namespace SDL uint16_t nLowerModules, struct SDL::pixelQuintuplets& pixelQuintupletsInGPU, struct SDL::trackCandidates& trackCandidatesInGPU, - struct SDL::segments& segmentsInGPU, + struct SDL::segments& segmentsInGPU, struct SDL::objectRanges& rangesInGPU) const { using Dim = alpaka::Dim; diff --git a/SDL/Triplet.cuh b/SDL/Triplet.cuh index 1b1d1063..8a9fc96f 100644 --- a/SDL/Triplet.cuh +++ b/SDL/Triplet.cuh @@ -59,11 +59,9 @@ namespace SDL void createTripletsInExplicitMemory(struct triplets& tripletsInGPU, unsigned int maxTriplets, uint16_t nLowerModules,cudaStream_t stream); #ifdef CUT_VALUE_DEBUG - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float&zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ, unsigned int& tripletIndex) + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float&zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ, unsigned int& tripletIndex) #else - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& betaIn, float& betaOut, float& pt_beta, unsigned int& tripletIndex) + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& betaIn, float& betaOut, float& pt_beta, unsigned int& tripletIndex) #endif { tripletsInGPU.segmentIndices[tripletIndex * 2] = innerSegmentIndex; @@ -110,7 +108,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passRZConstraint(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passRZConstraint(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex) { //get the rt and z const float& r1 = mdsInGPU.anchorRt[firstMDIndex]; @@ -191,7 +189,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut) { bool pass = true; bool isPSIn = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS); @@ -250,7 +248,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut) { bool pass = true; //unsigned int outerInnerLowerModuleIndex = middleLowerModuleIndex; @@ -329,7 +327,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut) { bool pass = true; bool isPSIn = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS); @@ -409,7 +407,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraint(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraint(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut) { short innerInnerLowerModuleSubdet = modulesInGPU.subdets[innerInnerLowerModuleIndex]; short middleLowerModuleSubdet = modulesInGPU.subdets[middleLowerModuleIndex]; @@ -490,7 +488,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoBBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoBBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut) { bool pass = true; @@ -686,7 +684,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoBBEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoBBEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) { bool pass = true; bool isPS_InLo = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS); @@ -901,7 +899,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoEEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoEEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) { bool pass = true; @@ -1110,7 +1108,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) { bool pass = false; @@ -1175,7 +1173,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletConstraintsAndAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float &zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletConstraintsAndAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float &zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) { bool pass = true; @@ -1202,7 +1200,7 @@ namespace SDL TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, - struct SDL::segments& segmentsInGPU, + struct SDL::segments& segmentsInGPU, struct SDL::triplets& tripletsInGPU, struct SDL::objectRanges& rangesInGPU, uint16_t *index_gpu, @@ -1277,7 +1275,7 @@ namespace SDL TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, - struct SDL::segments& segmentsInGPU) const + struct SDL::segments& segmentsInGPU) const { using Dim = alpaka::Dim; using Idx = alpaka::Idx; diff --git a/code/core/AccessHelper.cc b/code/core/AccessHelper.cc index df6caf6a..3ab470fd 100644 --- a/code/core/AccessHelper.cc +++ b/code/core/AccessHelper.cc @@ -7,7 +7,7 @@ //____________________________________________________________________________________________ std::tuple, std::vector> convertHitsToHitIdxsAndHitTypes(SDL::Event* event, std::vector hits) { - SDL::hits& hitsInGPU = *(event->getHits()); + SDL::hitsBuffer& hitsInGPU = *(event->getHits()); std::vector hitidxs; std::vector hittypes; for (auto& hit : hits) @@ -28,7 +28,7 @@ std::tuple, std::vector> convertHitsToHi //____________________________________________________________________________________________ std::vector getPixelHitsFrompLS(SDL::Event* event, unsigned int pLS) { - SDL::segments& segments_ = *(event->getSegments()); + SDL::segmentsBuffer& segments_ = *(event->getSegments()); SDL::miniDoublets& miniDoublets_ = *(event->getMiniDoublets()); SDL::objectRanges& rangesInGPU = (*event->getRanges()); SDL::modules& modulesInGPU = (*event->getModules()); @@ -48,7 +48,7 @@ std::vector getPixelHitsFrompLS(SDL::Event* event, unsigned int pL //____________________________________________________________________________________________ std::vector getPixelHitIdxsFrompLS(SDL::Event* event, unsigned int pLS) { - SDL::hits& hitsInGPU = *(event->getHits()); + SDL::hitsBuffer& hitsInGPU = *(event->getHits()); std::vector hits = getPixelHitsFrompLS(event, pLS); std::vector hitidxs; for (auto& hit : hits) @@ -96,7 +96,7 @@ std::tuple, std::vector> getHitIdxsAndHi //____________________________________________________________________________________________ std::vector getMDsFromLS(SDL::Event* event, unsigned int LS) { - SDL::segments& segments_ = *(event->getSegments()); + SDL::segmentsBuffer& segments_ = *(event->getSegments()); unsigned int MD_1 = segments_.mdIndices[2 * LS]; unsigned int MD_2 = segments_.mdIndices[2 * LS + 1]; return {MD_1, MD_2}; @@ -203,7 +203,7 @@ std::vector getHitsFromT5(SDL::Event* event, unsigned int T5) //____________________________________________________________________________________________ std::vector getHitIdxsFromT5(SDL::Event* event, unsigned int T5) { - SDL::hits& hitsInGPU = *(event->getHits()); + SDL::hitsBuffer& hitsInGPU = *(event->getHits()); std::vector hits = getHitsFromT5(event, T5); std::vector hitidxs; for (auto& hit : hits) @@ -215,7 +215,7 @@ std::vector getModuleIdxsFromT5(SDL::Event* event, unsigned int T5 { std::vector hits = getHitsFromT5(event, T5); std::vector module_idxs; - SDL::hits& hitsInGPU = *(event->getHits()); + SDL::hitsBuffer& hitsInGPU = *(event->getHits()); for(auto &hitIdx:hits) { module_idxs.push_back(hitsInGPU.moduleIndices[hitIdx]); @@ -297,7 +297,7 @@ std::vector getHitsFrompT3(SDL::Event* event, unsigned int pT3) //____________________________________________________________________________________________ std::vector getHitIdxsFrompT3(SDL::Event* event, unsigned int pT3) { - SDL::hits& hitsInGPU = *(event->getHits()); + SDL::hitsBuffer& hitsInGPU = *(event->getHits()); std::vector hits = getHitsFrompT3(event, pT3); std::vector hitidxs; for (auto& hit : hits) @@ -309,7 +309,7 @@ std::vector getModuleIdxsFrompT3(SDL::Event* event, unsigned int p { std::vector hits = getOuterTrackerHitsFrompT3(event, pT3); std::vector module_idxs; - SDL::hits& hitsInGPU = *(event->getHits()); + SDL::hitsBuffer& hitsInGPU = *(event->getHits()); for(auto &hitIdx:hits) { module_idxs.push_back(hitsInGPU.moduleIndices[hitIdx]); @@ -405,7 +405,7 @@ std::vector getHitsFrompT5(SDL::Event* event, unsigned int pT5) //____________________________________________________________________________________________ std::vector getHitIdxsFrompT5(SDL::Event* event, unsigned int pT5) { - SDL::hits& hitsInGPU = *(event->getHits()); + SDL::hitsBuffer& hitsInGPU = *(event->getHits()); std::vector hits = getHitsFrompT5(event, pT5); std::vector hitidxs; for (auto& hit : hits) @@ -418,7 +418,7 @@ std::vector getModuleIdxsFrompT5(SDL::Event* event, unsigned int p { std::vector hits = getOuterTrackerHitsFrompT5(event, pT5); std::vector module_idxs; - SDL::hits& hitsInGPU = *(event->getHits()); + SDL::hitsBuffer& hitsInGPU = *(event->getHits()); for(auto &hitIdx:hits) { module_idxs.push_back(hitsInGPU.moduleIndices[hitIdx]); diff --git a/code/core/write_sdl_ntuple.cc b/code/core/write_sdl_ntuple.cc index cce8b6f1..43d7831f 100644 --- a/code/core/write_sdl_ntuple.cc +++ b/code/core/write_sdl_ntuple.cc @@ -307,7 +307,7 @@ void setPixelQuintupletOutputBranches(SDL::Event* event) // ============ pT5 ============= SDL::pixelQuintuplets& pixelQuintupletsInGPU = (*event->getPixelQuintuplets()); SDL::quintuplets& quintupletsInGPU = (*event->getQuintuplets()); - SDL::segments& segmentsInGPU = (*event->getSegments()); + SDL::segmentsBuffer& segmentsInGPU = (*event->getSegments()); SDL::modules& modulesInGPU = (*event->getModules()); int n_accepted_simtrk = ana.tx->getBranch>("sim_TC_matched").size(); @@ -476,8 +476,8 @@ void setPixelTripletOutputBranches(SDL::Event* event) SDL::pixelTriplets& pixelTripletsInGPU = (*event->getPixelTriplets()); SDL::triplets& tripletsInGPU = *(event->getTriplets()); SDL::modules& modulesInGPU = *(event->getModules()); - SDL::segments& segmentsInGPU = *(event->getSegments()); - SDL::hits& hitsInGPU = *(event->getHits()); + SDL::segmentsBuffer& segmentsInGPU = *(event->getSegments()); + SDL::hitsBuffer& hitsInGPU = *(event->getHits()); int n_accepted_simtrk = ana.tx->getBranch>("sim_TC_matched").size(); unsigned int nPixelTriplets = *pixelTripletsInGPU.nPixelTriplets; @@ -559,9 +559,9 @@ void setPixelTripletOutputBranches(SDL::Event* event) void setGnnNtupleBranches(SDL::Event* event) { // Get relevant information - SDL::segments& segmentsInGPU = (*event->getSegments()); + SDL::segmentsBuffer& segmentsInGPU = (*event->getSegments()); SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); - SDL::hits& hitsInGPU = (*event->getHits()); + SDL::hitsBuffer& hitsInGPU = (*event->getHits()); SDL::modules& modulesInGPU = (*event->getModules()); SDL::objectRanges& rangesInGPU = (*event->getRanges()); SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates()); @@ -717,7 +717,7 @@ void setGnnNtupleMiniDoublet(SDL::Event* event, unsigned int MD) { // Get relevant information SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); - SDL::hits& hitsInGPU = (*event->getHits()); + SDL::hitsBuffer& hitsInGPU = (*event->getHits()); // Get the hit indices unsigned int hit0 = miniDoubletsInGPU.anchorHitIndices[MD]; @@ -821,8 +821,8 @@ std::tuple, vector> pars // Get relevant information SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates()); SDL::triplets& tripletsInGPU = (*event->getTriplets()); - SDL::segments& segmentsInGPU = (*event->getSegments()); - SDL::hits& hitsInGPU = (*event->getHits()); + SDL::segmentsBuffer& segmentsInGPU = (*event->getSegments()); + SDL::hitsBuffer& hitsInGPU = (*event->getHits()); // // pictorial representation of a pT5 @@ -959,8 +959,8 @@ std::tuple, vector> pars // Get relevant information SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates()); SDL::triplets& tripletsInGPU = (*event->getTriplets()); - SDL::segments& segmentsInGPU = (*event->getSegments()); - SDL::hits& hitsInGPU = (*event->getHits()); + SDL::segmentsBuffer& segmentsInGPU = (*event->getSegments()); + SDL::hitsBuffer& hitsInGPU = (*event->getHits()); // // pictorial representation of a pT3 @@ -1006,7 +1006,7 @@ std::tuple, vector> pars { SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates()); SDL::triplets& tripletsInGPU = (*event->getTriplets()); - SDL::hits& hitsInGPU = (*event->getHits()); + SDL::hitsBuffer& hitsInGPU = (*event->getHits()); unsigned int T5 = trackCandidatesInGPU.directObjectIndices[idx]; std::vector T3s = getT3sFromT5(event, T5); std::vector hits = getHitsFromT5(event, T5); @@ -1059,7 +1059,7 @@ std::tuple, vector> pars std::tuple, vector> parsepLS(SDL::Event* event, unsigned int idx) { SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates()); - SDL::segments& segmentsInGPU = (*event->getSegments()); + SDL::segmentsBuffer& segmentsInGPU = (*event->getSegments()); // Getting pLS index unsigned int pLS = trackCandidatesInGPU.directObjectIndices[idx]; @@ -1151,7 +1151,7 @@ void printAllObjects(SDL::Event* event) void printMDs(SDL::Event* event) { SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); - SDL::hits& hitsInGPU = (*event->getHits()); + SDL::hitsBuffer& hitsInGPU = (*event->getHits()); SDL::modules& modulesInGPU = (*event->getModules()); SDL::objectRanges& rangesInGPU = (*event->getRanges()); @@ -1173,9 +1173,9 @@ void printMDs(SDL::Event* event) //________________________________________________________________________________________________________________________________ void printLSs(SDL::Event* event) { - SDL::segments& segmentsInGPU = (*event->getSegments()); + SDL::segmentsBuffer& segmentsInGPU = (*event->getSegments()); SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); - SDL::hits& hitsInGPU = (*event->getHits()); + SDL::hitsBuffer& hitsInGPU = (*event->getHits()); SDL::modules& modulesInGPU = (*event->getModules()); SDL::objectRanges& rangesInGPU = (*event->getRanges()); @@ -1206,9 +1206,9 @@ void printLSs(SDL::Event* event) //________________________________________________________________________________________________________________________________ void printpLSs(SDL::Event* event) { - SDL::segments& segmentsInGPU = (*event->getSegments()); + SDL::segmentsBuffer& segmentsInGPU = (*event->getSegments()); SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); - SDL::hits& hitsInGPU = (*event->getHits()); + SDL::hitsBuffer& hitsInGPU = (*event->getHits()); SDL::modules& modulesInGPU = (*event->getModules()); SDL::objectRanges& rangesInGPU = (*event->getRanges()); @@ -1237,9 +1237,9 @@ void printpLSs(SDL::Event* event) void printT3s(SDL::Event* event) { SDL::triplets& tripletsInGPU = (*event->getTriplets()); - SDL::segments& segmentsInGPU = (*event->getSegments()); + SDL::segmentsBuffer& segmentsInGPU = (*event->getSegments()); SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); - SDL::hits& hitsInGPU = (*event->getHits()); + SDL::hitsBuffer& hitsInGPU = (*event->getHits()); SDL::modules& modulesInGPU = (*event->getModules()); int nTriplets = 0; for (unsigned int i = 0; i < *(modulesInGPU.nLowerModules); ++i) @@ -1280,7 +1280,7 @@ void debugPrintOutlierMultiplicities(SDL::Event* event) { SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates()); SDL::triplets& tripletsInGPU = (*event->getTriplets()); - SDL::segments& segmentsInGPU = (*event->getSegments()); + SDL::segmentsBuffer& segmentsInGPU = (*event->getSegments()); SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); SDL::modules& modulesInGPU = (*event->getModules()); SDL::objectRanges& rangesInGPU = (*event->getRanges()); From d42ebb047a71ac73bd17749a87eddcc5651159b5 Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Wed, 7 Jun 2023 19:11:34 -0400 Subject: [PATCH 17/44] Move objectranges to Alpaka memory --- SDL/Event.cu | 70 +++++++---------- SDL/Event.cuh | 5 +- SDL/Module.cu | 140 ---------------------------------- SDL/Module.cuh | 116 +++++++++++++++++++++++++++- code/core/AccessHelper.cc | 6 +- code/core/write_sdl_ntuple.cc | 14 ++-- 6 files changed, 153 insertions(+), 198 deletions(-) diff --git a/SDL/Event.cu b/SDL/Event.cu index 01036c57..3d7d0805 100644 --- a/SDL/Event.cu +++ b/SDL/Event.cu @@ -60,7 +60,6 @@ SDL::Event::Event(cudaStream_t estream, bool verbose): queue(alpaka::getDevByIdx SDL::Event::~Event() { #ifdef CACHE_ALLOC - if(rangesInGPU){rangesInGPU->freeMemoryCache();} if(mdsInGPU){mdsInGPU->freeMemoryCache();} if(tripletsInGPU){tripletsInGPU->freeMemoryCache();} if(quintupletsInGPU){quintupletsInGPU->freeMemoryCache();} @@ -68,7 +67,6 @@ SDL::Event::~Event() if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemoryCache();} if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemoryCache();} #else - if(rangesInGPU){rangesInGPU->freeMemory();} if(mdsInGPU){mdsInGPU->freeMemory(stream);} if(tripletsInGPU){tripletsInGPU->freeMemory(stream);} if(quintupletsInGPU){quintupletsInGPU->freeMemory(stream);} @@ -76,7 +74,7 @@ SDL::Event::~Event() if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemory(stream);} if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemory(stream);} #endif - if(rangesInGPU != nullptr){cms::cuda::free_host(rangesInGPU);} + if(rangesInGPU != nullptr){delete rangesInGPU; delete rangesBuffers;} if(mdsInGPU != nullptr){cms::cuda::free_host(mdsInGPU);} if(segmentsInGPU != nullptr){delete segmentsInGPU; delete segmentsBuffers;} if(tripletsInGPU!= nullptr){cms::cuda::free_host(tripletsInGPU);} @@ -92,7 +90,6 @@ SDL::Event::~Event() } if(rangesInCPU != nullptr) { - delete[] rangesInCPU->quintupletModuleIndices; delete rangesInCPU; } @@ -240,14 +237,12 @@ void SDL::Event::resetEvent() #ifdef CACHE_ALLOC if(mdsInGPU){mdsInGPU->freeMemoryCache();} if(quintupletsInGPU){quintupletsInGPU->freeMemoryCache();} - if(rangesInGPU){rangesInGPU->freeMemoryCache();} if(tripletsInGPU){tripletsInGPU->freeMemoryCache();} if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemoryCache();} if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemoryCache();} if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemoryCache();} #else if(quintupletsInGPU){quintupletsInGPU->freeMemory(stream);} - if(rangesInGPU){rangesInGPU->freeMemory();} if(mdsInGPU){mdsInGPU->freeMemory(stream);} if(tripletsInGPU){tripletsInGPU->freeMemory(stream);} if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemory(stream);} @@ -277,7 +272,7 @@ void SDL::Event::resetEvent() hitsInGPU = nullptr;} if(mdsInGPU){cms::cuda::free_host(mdsInGPU); mdsInGPU = nullptr;} - if(rangesInGPU){cms::cuda::free_host(rangesInGPU); + if(rangesInGPU){delete rangesInGPU; delete rangesBuffers; rangesInGPU = nullptr;} if(segmentsInGPU){delete segmentsInGPU; delete segmentsBuffers; segmentsInGPU = nullptr;} @@ -299,8 +294,6 @@ void SDL::Event::resetEvent() } if(rangesInCPU != nullptr) { - delete[] rangesInCPU->hitRanges; - delete[] rangesInCPU->quintupletModuleIndices; delete rangesInCPU; rangesInCPU = nullptr; } @@ -440,7 +433,6 @@ void SDL::initModules(const char* moduleMetaDataFilePath) loadModulesFromFile(*modulesInGPU,nModules,nLowerModules, *pixelMapping, default_stream, moduleMetaDataFilePath); cudaStreamSynchronize(default_stream); } - //resetObjectRanges(*modulesInGPU,nModules, default_stream); } void SDL::cleanModules() @@ -450,11 +442,6 @@ void SDL::cleanModules() cudaFreeHost(pixelMapping); } -void SDL::Event::resetObjectsInModule() -{ - resetObjectRanges(*rangesInGPU,nModules,stream); -} - void SDL::Event::addHitToEvent(std::vector x, std::vector y, std::vector z, std::vector detId, std::vector idxInNtuple) { // Use the actual number of hits instead of a max. @@ -477,9 +464,22 @@ void SDL::Event::addHitToEvent(std::vector x, std::vector y, std:: if (rangesInGPU == nullptr) { - rangesInGPU = (SDL::objectRanges*)cms::cuda::allocate_host(sizeof(SDL::objectRanges), stream); - createRangesInExplicitMemory(*rangesInGPU, nModules, stream, nLowerModules); - resetObjectsInModule(); + rangesInGPU = new SDL::objectRanges(); + rangesBuffers = new SDL::objectRangesBuffer(nModules, nLowerModules, devAcc, queue); + rangesInGPU->setData(*rangesBuffers); + } + + unsigned int hostValue; + + // Copy from device to host + cudaError_t err = cudaMemcpy(&hostValue, &rangesInGPU->hitRangesnUpper[0], sizeof(int8_t), cudaMemcpyDeviceToHost); + + // Check for errors + if (err != cudaSuccess) { + printf("cudaMemcpy failed with error: %s\n", cudaGetErrorString(err)); + } else { + // Print the value + printf("The value is: %u\n", hostValue); } // Copy the host arrays to the GPU. @@ -754,10 +754,8 @@ void SDL::Event::createMiniDoublets() if(mdsInGPU == nullptr) { mdsInGPU = (SDL::miniDoublets*)cms::cuda::allocate_host(sizeof(SDL::miniDoublets), stream); - //FIXME: Add memory locations for pixel MDs createMDsInExplicitMemory(*mdsInGPU, nTotalMDs, nLowerModules, N_MAX_PIXEL_MD_PER_MODULES, stream); - } cudaStreamSynchronize(stream); @@ -1254,14 +1252,6 @@ void SDL::Event::createPixelTriplets() void SDL::Event::createQuintuplets() { uint16_t nEligibleT5Modules = 0; - -#ifdef CACHE_ALLOC - rangesInGPU->indicesOfEligibleT5Modules = (uint16_t*)cms::cuda::allocate_device(dev, nLowerModules * sizeof(uint16_t), stream); -#else - cudaMalloc(&(rangesInGPU->indicesOfEligibleT5Modules), nLowerModules * sizeof(uint16_t)); -#endif - cudaMemsetAsync(rangesInGPU->quintupletModuleIndices, -1, sizeof(int) * (nLowerModules),stream); - cudaStreamSynchronize(stream); unsigned int nTotalQuintuplets; Vec const threadsPerBlockCreateQuints(static_cast(1), static_cast(1), static_cast(1024)); @@ -1308,7 +1298,6 @@ void SDL::Event::createQuintuplets() nEligibleT5Modules)); alpaka::enqueue(queue, createQuintupletsInGPUv2Task); - alpaka::wait(queue); Vec const threadsPerBlockDupQuint(static_cast(1), static_cast(16), static_cast(16)); Vec const blocksPerGridDupQuint(static_cast(MAX_BLOCKS), static_cast(1), static_cast(1)); @@ -1859,22 +1848,19 @@ SDL::hitsBuffer* SDL::Event::getHitsInCMSSW() return hitsInCPU; } -SDL::objectRanges* SDL::Event::getRanges() +SDL::objectRangesBuffer* SDL::Event::getRanges() { if(rangesInCPU == nullptr) { - rangesInCPU = new SDL::objectRanges; - rangesInCPU->hitRanges = new int[2*nModules]; - rangesInCPU->quintupletModuleIndices = new int[nLowerModules]; - cudaMemcpyAsync(rangesInCPU->hitRanges, hitsBuffers->hitRanges, 2*nModules * sizeof(int), cudaMemcpyDeviceToHost,stream); - rangesInCPU->miniDoubletModuleIndices = new int[nLowerModules+1]; - rangesInCPU->segmentModuleIndices = new int[nLowerModules + 1]; - rangesInCPU->tripletModuleIndices = new int[nLowerModules]; - cudaMemcpyAsync(rangesInCPU->quintupletModuleIndices, rangesInGPU->quintupletModuleIndices, nLowerModules * sizeof(int), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(rangesInCPU->miniDoubletModuleIndices, rangesInGPU->miniDoubletModuleIndices, (nLowerModules + 1) * sizeof(int), cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(rangesInCPU->segmentModuleIndices, rangesInGPU->segmentModuleIndices, (nLowerModules + 1) * sizeof(int), cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(rangesInCPU->tripletModuleIndices, rangesInGPU->tripletModuleIndices, nLowerModules * sizeof(int), cudaMemcpyDeviceToHost, stream); - cudaStreamSynchronize(stream); + rangesInCPU = new SDL::objectRangesBuffer(nModules, nLowerModules, devHost, queue); + rangesInCPU->setData(*rangesInCPU); + + alpaka::memcpy(queue, rangesInCPU->hitRanges_buf, rangesBuffers->hitRanges_buf, 2 * nModules); + alpaka::memcpy(queue, rangesInCPU->quintupletModuleIndices_buf, rangesBuffers->quintupletModuleIndices_buf, nLowerModules); + alpaka::memcpy(queue, rangesInCPU->miniDoubletModuleIndices_buf, rangesBuffers->miniDoubletModuleIndices_buf, nLowerModules + 1); + alpaka::memcpy(queue, rangesInCPU->segmentModuleIndices_buf, rangesBuffers->segmentModuleIndices_buf, nLowerModules + 1); + alpaka::memcpy(queue, rangesInCPU->tripletModuleIndices_buf, rangesBuffers->tripletModuleIndices_buf, nLowerModules); + alpaka::wait(queue); } return rangesInCPU; } diff --git a/SDL/Event.cuh b/SDL/Event.cuh index 0a24a210..8a84a449 100644 --- a/SDL/Event.cuh +++ b/SDL/Event.cuh @@ -40,6 +40,7 @@ namespace SDL int dev; int nTotalSegments; struct objectRanges* rangesInGPU; + struct objectRangesBuffer* rangesBuffers; struct hits* hitsInGPU; struct hitsBuffer* hitsBuffers; struct miniDoublets* mdsInGPU; @@ -52,7 +53,7 @@ namespace SDL struct pixelQuintuplets* pixelQuintupletsInGPU; //CPU interface stuff - objectRanges* rangesInCPU; + objectRangesBuffer* rangesInCPU; hitsBuffer* hitsInCPU; miniDoublets* mdsInCPU; segmentsBuffer* segmentsInCPU; @@ -131,7 +132,7 @@ namespace SDL unsigned int getNumberOfExtendedTracks(); unsigned int getNumberOfT3T3ExtendedTracks(); - objectRanges* getRanges(); + objectRangesBuffer* getRanges(); hitsBuffer* getHits(); hitsBuffer* getHitsInCMSSW(); miniDoublets* getMiniDoublets(); diff --git a/SDL/Module.cu b/SDL/Module.cu index e26eb899..01c0b162 100644 --- a/SDL/Module.cu +++ b/SDL/Module.cu @@ -7,70 +7,6 @@ std::map *SDL::module_z; std::map *SDL::module_type; // 23 : Ph2PSP, 24 : Ph2PSS, 25 : Ph2SS // https://github.com/cms-sw/cmssw/blob/5e809e8e0a625578aa265dc4b128a93830cb5429/Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h#L29 -void SDL::createRangesInExplicitMemory(struct objectRanges& rangesInGPU,unsigned int nModules,cudaStream_t stream, unsigned int nLowerModules) -{ - /* modules stucture object will be created in Event.cu*/ -#ifdef CACHE_ALLOC - int dev; - cudaGetDevice(&dev); - rangesInGPU.hitRanges = (int*)cms::cuda::allocate_device(dev,nModules * 2 * sizeof(int),stream); - rangesInGPU.hitRangesLower = (int*)cms::cuda::allocate_device(dev,nModules * sizeof(int),stream); - rangesInGPU.hitRangesUpper = (int*)cms::cuda::allocate_device(dev,nModules * sizeof(int),stream); - rangesInGPU.hitRangesnLower = (int8_t*)cms::cuda::allocate_device(dev,nModules * sizeof(int8_t),stream); - rangesInGPU.hitRangesnUpper = (int8_t*)cms::cuda::allocate_device(dev,nModules * sizeof(int8_t),stream); - rangesInGPU.mdRanges = (int*)cms::cuda::allocate_device(dev,nModules * 2 * sizeof(int),stream); - rangesInGPU.segmentRanges = (int*)cms::cuda::allocate_device(dev,nModules * 2 * sizeof(int),stream); - rangesInGPU.trackletRanges = (int*)cms::cuda::allocate_device(dev,nModules * 2 * sizeof(int),stream); - rangesInGPU.tripletRanges = (int*)cms::cuda::allocate_device(dev,nModules * 2 * sizeof(int),stream); - rangesInGPU.trackCandidateRanges = (int*)cms::cuda::allocate_device(dev,nModules * 2 * sizeof(int),stream); - rangesInGPU.quintupletRanges = (int*)cms::cuda::allocate_device(dev,nModules * 2 * sizeof(int),stream); - rangesInGPU.nEligibleT5Modules = (uint16_t*)cms::cuda::allocate_device(dev,sizeof(unsigned int),stream); - - rangesInGPU.quintupletModuleIndices = (int*)cms::cuda::allocate_device(dev,nLowerModules * sizeof(int),stream); - rangesInGPU.quintupletModuleOccupancy = (int*)cms::cuda::allocate_device(dev,nLowerModules * sizeof(int),stream); - rangesInGPU.miniDoubletModuleIndices = (int*)cms::cuda::allocate_device(dev, (nLowerModules + 1) * sizeof(int), stream); - rangesInGPU.miniDoubletModuleOccupancy = (int*)cms::cuda::allocate_device(dev, (nLowerModules + 1) * sizeof(int), stream); - rangesInGPU.segmentModuleIndices = (int*)cms::cuda::allocate_device(dev, (nLowerModules + 1) * sizeof(int), stream); - rangesInGPU.segmentModuleOccupancy = (int*)cms::cuda::allocate_device(dev, (nLowerModules + 1) * sizeof(int), stream); - rangesInGPU.tripletModuleIndices = (int*)cms::cuda::allocate_device(dev, nLowerModules * sizeof(int), stream); - rangesInGPU.tripletModuleOccupancy = (int*)cms::cuda::allocate_device(dev, nLowerModules * sizeof(int), stream); - - rangesInGPU.device_nTotalMDs = (unsigned int*)cms::cuda::allocate_device(dev, sizeof(unsigned int), stream); - rangesInGPU.device_nTotalSegs = (unsigned int*)cms::cuda::allocate_device(dev, sizeof(unsigned int), stream); - rangesInGPU.device_nTotalTrips = (unsigned int*)cms::cuda::allocate_device(dev, sizeof(unsigned int), stream); - rangesInGPU.device_nTotalQuints = (unsigned int*)cms::cuda::allocate_device(dev, sizeof(unsigned int), stream); - -#else - cudaMalloc(&rangesInGPU.hitRanges,nModules * 2 * sizeof(int)); - cudaMalloc(&rangesInGPU.hitRangesLower,nModules * sizeof(int)); - cudaMalloc(&rangesInGPU.hitRangesUpper,nModules * sizeof(int)); - cudaMalloc(&rangesInGPU.hitRangesnLower,nModules * sizeof(int8_t)); - cudaMalloc(&rangesInGPU.hitRangesnUpper,nModules * sizeof(int8_t)); - cudaMalloc(&rangesInGPU.mdRanges,nModules * 2 * sizeof(int)); - cudaMalloc(&rangesInGPU.segmentRanges,nModules * 2 * sizeof(int)); - cudaMalloc(&rangesInGPU.trackletRanges,nModules * 2 * sizeof(int)); - cudaMalloc(&rangesInGPU.tripletRanges,nModules * 2 * sizeof(int)); - cudaMalloc(&rangesInGPU.trackCandidateRanges, nModules * 2 * sizeof(int)); - cudaMalloc(&rangesInGPU.quintupletRanges, nModules * 2 * sizeof(int)); - cudaMalloc(&rangesInGPU.nEligibleT5Modules, sizeof(uint16_t)); - cudaMalloc(&rangesInGPU.quintupletModuleIndices, nLowerModules * sizeof(int)); - cudaMalloc(&rangesInGPU.quintupletModuleOccupancy, nLowerModules * sizeof(int)); - - cudaMalloc(&rangesInGPU.miniDoubletModuleIndices, (nLowerModules + 1) * sizeof(int)); - cudaMalloc(&rangesInGPU.miniDoubletModuleOccupancy, (nLowerModules + 1) * sizeof(int)); - cudaMalloc(&rangesInGPU.segmentModuleIndices, (nLowerModules + 1) * sizeof(int)); - cudaMalloc(&rangesInGPU.segmentModuleOccupancy, (nLowerModules + 1) * sizeof(int)); - cudaMalloc(&rangesInGPU.tripletModuleIndices, nLowerModules * sizeof(int)); - cudaMalloc(&rangesInGPU.tripletModuleOccupancy, nLowerModules * sizeof(int)); - - cudaMalloc(&rangesInGPU.device_nTotalMDs, sizeof(unsigned int)); - cudaMalloc(&rangesInGPU.device_nTotalSegs, sizeof(unsigned int)); - cudaMalloc(&rangesInGPU.device_nTotalTrips, sizeof(unsigned int)); - cudaMalloc(&rangesInGPU.device_nTotalQuints, sizeof(unsigned int)); - -#endif -} - void SDL::createModulesInExplicitMemory(struct modules& modulesInGPU,unsigned int nModules,cudaStream_t stream) { /* modules stucture object will be created in Event.cu*/ @@ -103,66 +39,6 @@ void SDL::createModulesInExplicitMemory(struct modules& modulesInGPU,unsigned in cudaStreamSynchronize(stream); } -void SDL::objectRanges::freeMemoryCache() -{ - int dev; - cudaGetDevice(&dev); - cms::cuda::free_device(dev,hitRanges); - cms::cuda::free_device(dev,mdRanges); - cms::cuda::free_device(dev,segmentRanges); - cms::cuda::free_device(dev,trackletRanges); - cms::cuda::free_device(dev,tripletRanges); - cms::cuda::free_device(dev,trackCandidateRanges); - cms::cuda::free_device(dev,quintupletRanges); - cms::cuda::free_device(dev,nEligibleT5Modules); - cms::cuda::free_device(dev, indicesOfEligibleT5Modules); - cms::cuda::free_device(dev,quintupletModuleIndices); - cms::cuda::free_device(dev,quintupletModuleOccupancy); - cms::cuda::free_device(dev, hitRangesLower); - cms::cuda::free_device(dev, hitRangesUpper); - cms::cuda::free_device(dev, hitRangesnLower); - cms::cuda::free_device(dev, hitRangesnUpper); - cms::cuda::free_device(dev, miniDoubletModuleIndices); - cms::cuda::free_device(dev, miniDoubletModuleOccupancy); - cms::cuda::free_device(dev, segmentModuleIndices); - cms::cuda::free_device(dev, segmentModuleOccupancy); - cms::cuda::free_device(dev, tripletModuleIndices); - cms::cuda::free_device(dev, tripletModuleOccupancy); - cms::cuda::free_device(dev, device_nTotalMDs); - cms::cuda::free_device(dev, device_nTotalSegs); - cms::cuda::free_device(dev, device_nTotalTrips); - cms::cuda::free_device(dev, device_nTotalQuints); -} - -void SDL::objectRanges::freeMemory() -{ - cudaFree(hitRanges); - cudaFree(hitRangesLower); - cudaFree(hitRangesUpper); - cudaFree(hitRangesnLower); - cudaFree(hitRangesnUpper); - cudaFree(mdRanges); - cudaFree(segmentRanges); - cudaFree(trackletRanges); - cudaFree(tripletRanges); - cudaFree(trackCandidateRanges); - cudaFree(quintupletRanges); - cudaFree(nEligibleT5Modules); - cudaFree(indicesOfEligibleT5Modules); - cudaFree(quintupletModuleIndices); - cudaFree(quintupletModuleOccupancy); - cudaFree(miniDoubletModuleIndices); - cudaFree(miniDoubletModuleOccupancy); - cudaFree(segmentModuleIndices); - cudaFree(segmentModuleOccupancy); - cudaFree(tripletModuleIndices); - cudaFree(tripletModuleOccupancy); - cudaFree(device_nTotalMDs); - cudaFree(device_nTotalSegs); - cudaFree(device_nTotalTrips); - cudaFree(device_nTotalQuints); -} - void SDL::freeModules(struct modules& modulesInGPU, struct pixelMap& pixelMapping) { cudaFree(modulesInGPU.detIds); @@ -716,19 +592,3 @@ unsigned int SDL::modules::parsePartnerModuleId(unsigned int detId, bool isLower { return isLowerx ? (isInvertedx ? detId - 1 : detId + 1) : (isInvertedx ? detId + 1 : detId - 1); } - -void SDL::resetObjectRanges(struct objectRanges& rangesInGPU, unsigned int nModules,cudaStream_t stream) -{ - cudaMemsetAsync(rangesInGPU.hitRanges, -1,nModules*2*sizeof(int),stream); - cudaMemsetAsync(rangesInGPU.hitRangesLower, -1,nModules*sizeof(int),stream); - cudaMemsetAsync(rangesInGPU.hitRangesUpper, -1,nModules*sizeof(int),stream); - cudaMemsetAsync(rangesInGPU.hitRangesnLower, -1,nModules*sizeof(int8_t),stream); - cudaMemsetAsync(rangesInGPU.hitRangesnUpper, -1,nModules*sizeof(int8_t),stream); - cudaMemsetAsync(rangesInGPU.mdRanges, -1,nModules*2*sizeof(int),stream); - cudaMemsetAsync(rangesInGPU.segmentRanges, -1,nModules*2*sizeof(int),stream); - cudaMemsetAsync(rangesInGPU.trackletRanges, -1,nModules*2*sizeof(int),stream); - cudaMemsetAsync(rangesInGPU.tripletRanges, -1,nModules*2*sizeof(int),stream); - cudaMemsetAsync(rangesInGPU.trackCandidateRanges, -1,nModules*2*sizeof(int),stream); - cudaMemsetAsync(rangesInGPU.quintupletRanges, -1, nModules*2*sizeof(int),stream); - cudaStreamSynchronize(stream); -} diff --git a/SDL/Module.cuh b/SDL/Module.cuh index 6e48abaf..d4e1457f 100644 --- a/SDL/Module.cuh +++ b/SDL/Module.cuh @@ -73,9 +73,118 @@ namespace SDL unsigned int *device_nTotalSegs; unsigned int *device_nTotalTrips; unsigned int *device_nTotalQuints; - - void freeMemoryCache(); - void freeMemory(); + + template + void setData(TBuff& objectRangesbuf) + { + hitRanges = alpaka::getPtrNative(objectRangesbuf.hitRanges_buf); + hitRangesLower = alpaka::getPtrNative(objectRangesbuf.hitRangesLower_buf); + hitRangesUpper = alpaka::getPtrNative(objectRangesbuf.hitRangesUpper_buf); + hitRangesnLower = alpaka::getPtrNative(objectRangesbuf.hitRangesnLower_buf); + hitRangesnUpper = alpaka::getPtrNative(objectRangesbuf.hitRangesnUpper_buf); + mdRanges = alpaka::getPtrNative(objectRangesbuf.mdRanges_buf); + segmentRanges = alpaka::getPtrNative(objectRangesbuf.segmentRanges_buf); + trackletRanges = alpaka::getPtrNative(objectRangesbuf.trackletRanges_buf); + tripletRanges = alpaka::getPtrNative(objectRangesbuf.tripletRanges_buf); + trackCandidateRanges = alpaka::getPtrNative(objectRangesbuf.trackCandidateRanges_buf); + quintupletRanges = alpaka::getPtrNative(objectRangesbuf.quintupletRanges_buf); + + nEligibleT5Modules = alpaka::getPtrNative(objectRangesbuf.nEligibleT5Modules_buf); + indicesOfEligibleT5Modules = alpaka::getPtrNative(objectRangesbuf.indicesOfEligibleT5Modules_buf); + + quintupletModuleIndices = alpaka::getPtrNative(objectRangesbuf.quintupletModuleIndices_buf); + quintupletModuleOccupancy = alpaka::getPtrNative(objectRangesbuf.quintupletModuleOccupancy_buf); + miniDoubletModuleIndices = alpaka::getPtrNative(objectRangesbuf.miniDoubletModuleIndices_buf); + miniDoubletModuleOccupancy = alpaka::getPtrNative(objectRangesbuf.miniDoubletModuleOccupancy_buf); + segmentModuleIndices = alpaka::getPtrNative(objectRangesbuf.segmentModuleIndices_buf); + segmentModuleOccupancy = alpaka::getPtrNative(objectRangesbuf.segmentModuleOccupancy_buf); + tripletModuleIndices = alpaka::getPtrNative(objectRangesbuf.tripletModuleIndices_buf); + tripletModuleOccupancy = alpaka::getPtrNative(objectRangesbuf.tripletModuleOccupancy_buf); + + device_nTotalMDs = alpaka::getPtrNative(objectRangesbuf.device_nTotalMDs_buf); + device_nTotalSegs = alpaka::getPtrNative(objectRangesbuf.device_nTotalSegs_buf); + device_nTotalTrips = alpaka::getPtrNative(objectRangesbuf.device_nTotalTrips_buf); + device_nTotalQuints = alpaka::getPtrNative(objectRangesbuf.device_nTotalQuints_buf); + } + }; + + template + struct objectRangesBuffer : objectRanges + { + Buf hitRanges_buf; + Buf hitRangesLower_buf; + Buf hitRangesUpper_buf; + Buf hitRangesnLower_buf; + Buf hitRangesnUpper_buf; + Buf mdRanges_buf; + Buf segmentRanges_buf; + Buf trackletRanges_buf; + Buf tripletRanges_buf; + Buf trackCandidateRanges_buf; + Buf quintupletRanges_buf; + + Buf nEligibleT5Modules_buf; + Buf indicesOfEligibleT5Modules_buf; + + Buf quintupletModuleIndices_buf; + Buf quintupletModuleOccupancy_buf; + Buf miniDoubletModuleIndices_buf; + Buf miniDoubletModuleOccupancy_buf; + Buf segmentModuleIndices_buf; + Buf segmentModuleOccupancy_buf; + Buf tripletModuleIndices_buf; + Buf tripletModuleOccupancy_buf; + + Buf device_nTotalMDs_buf; + Buf device_nTotalSegs_buf; + Buf device_nTotalTrips_buf; + Buf device_nTotalQuints_buf; + + template + objectRangesBuffer(unsigned int nModules, + unsigned int nLowerModules, + TDevAcc const & devAccIn, + TQueue& queue) : + hitRanges_buf(allocBufWrapper(devAccIn, nModules*2)), + hitRangesLower_buf(allocBufWrapper(devAccIn, nModules)), + hitRangesUpper_buf(allocBufWrapper(devAccIn, nModules)), + hitRangesnLower_buf(allocBufWrapper(devAccIn, nModules)), + hitRangesnUpper_buf(allocBufWrapper(devAccIn, nModules)), + mdRanges_buf(allocBufWrapper(devAccIn, nModules*2)), + segmentRanges_buf(allocBufWrapper(devAccIn, nModules*2)), + trackletRanges_buf(allocBufWrapper(devAccIn, nModules*2)), + tripletRanges_buf(allocBufWrapper(devAccIn, nModules*2)), + trackCandidateRanges_buf(allocBufWrapper(devAccIn, nModules*2)), + quintupletRanges_buf(allocBufWrapper(devAccIn, nModules*2)), + nEligibleT5Modules_buf(allocBufWrapper(devAccIn, 1)), + indicesOfEligibleT5Modules_buf(allocBufWrapper(devAccIn, nLowerModules)), + quintupletModuleIndices_buf(allocBufWrapper(devAccIn, nLowerModules)), + quintupletModuleOccupancy_buf(allocBufWrapper(devAccIn, nLowerModules)), + miniDoubletModuleIndices_buf(allocBufWrapper(devAccIn, nLowerModules+1)), + miniDoubletModuleOccupancy_buf(allocBufWrapper(devAccIn, nLowerModules+1)), + segmentModuleIndices_buf(allocBufWrapper(devAccIn, nLowerModules+1)), + segmentModuleOccupancy_buf(allocBufWrapper(devAccIn, nLowerModules+1)), + tripletModuleIndices_buf(allocBufWrapper(devAccIn, nLowerModules)), + tripletModuleOccupancy_buf(allocBufWrapper(devAccIn, nLowerModules)), + device_nTotalMDs_buf(allocBufWrapper(devAccIn, 1)), + device_nTotalSegs_buf(allocBufWrapper(devAccIn, 1)), + device_nTotalTrips_buf(allocBufWrapper(devAccIn, 1)), + device_nTotalQuints_buf(allocBufWrapper(devAccIn, 1)) + { + alpaka::memset(queue, hitRanges_buf, -1, nModules*2); + alpaka::memset(queue, hitRangesLower_buf, -1, nModules); + alpaka::memset(queue, hitRangesUpper_buf, -1, nModules); + alpaka::memset(queue, hitRangesnLower_buf, -1, nModules); + alpaka::memset(queue, hitRangesnUpper_buf, -1, nModules); + alpaka::memset(queue, mdRanges_buf, -1, nModules*2); + alpaka::memset(queue, segmentRanges_buf, -1, nModules*2); + alpaka::memset(queue, trackletRanges_buf, -1, nModules*2); + alpaka::memset(queue, tripletRanges_buf, -1, nModules*2); + alpaka::memset(queue, trackCandidateRanges_buf, -1, nModules*2); + alpaka::memset(queue, quintupletRanges_buf, -1, nModules*2); + alpaka::memset(queue, quintupletModuleIndices_buf, -1, nLowerModules); + alpaka::wait(queue); + } }; struct modules @@ -148,7 +257,6 @@ namespace SDL void fillMapArraysExplicit(struct modules& modulesInGPU, unsigned int nModules,cudaStream_t stream); void fillConnectedModuleArray(struct modules& modulesInGPU, unsigned int nModules); void setDerivedQuantities(unsigned int detId, unsigned short& layer, unsigned short& ring, unsigned short& rod, unsigned short& module, unsigned short& subdet, unsigned short& side, float m_x, float m_y, float m_z, float& eta, float& r); - void resetObjectRanges(struct objectRanges& rangesInGPU, unsigned int nModules,cudaStream_t stream); void createRangesInExplicitMemory(struct objectRanges& rangesInGPU,unsigned int nModules,cudaStream_t stream, unsigned int nLowerModules); } #endif diff --git a/code/core/AccessHelper.cc b/code/core/AccessHelper.cc index 3ab470fd..83e7f51d 100644 --- a/code/core/AccessHelper.cc +++ b/code/core/AccessHelper.cc @@ -30,7 +30,7 @@ std::vector getPixelHitsFrompLS(SDL::Event* event, unsigned int pL { SDL::segmentsBuffer& segments_ = *(event->getSegments()); SDL::miniDoublets& miniDoublets_ = *(event->getMiniDoublets()); - SDL::objectRanges& rangesInGPU = (*event->getRanges()); + SDL::objectRangesBuffer& rangesInGPU = (*event->getRanges()); SDL::modules& modulesInGPU = (*event->getModules()); const unsigned int pLS_offset = rangesInGPU.segmentModuleIndices[*(modulesInGPU.nLowerModules)]; unsigned int MD_1 = segments_.mdIndices[2 * (pLS + pLS_offset)]; @@ -242,7 +242,7 @@ std::tuple, std::vector> getHitIdxsAndHi unsigned int getPixelLSFrompT3(SDL::Event* event, unsigned int pT3) { SDL::pixelTriplets& pixelTriplets_ = *(event->getPixelTriplets()); - SDL::objectRanges& rangesInGPU = (*event->getRanges()); + SDL::objectRangesBuffer& rangesInGPU = (*event->getRanges()); SDL::modules& modulesInGPU = (*event->getModules()); const unsigned int pLS_offset = rangesInGPU.segmentModuleIndices[*(modulesInGPU.nLowerModules)]; return pixelTriplets_.pixelSegmentIndices[pT3] - pLS_offset; @@ -342,7 +342,7 @@ std::tuple, std::vector> getHitIdxsAndHi unsigned int getPixelLSFrompT5(SDL::Event* event, unsigned int pT5) { SDL::pixelQuintuplets& pixelQuintuplets_ = *(event->getPixelQuintuplets()); - SDL::objectRanges& rangesInGPU = (*event->getRanges()); + SDL::objectRangesBuffer& rangesInGPU = (*event->getRanges()); SDL::modules& modulesInGPU = (*event->getModules()); const unsigned int pLS_offset = rangesInGPU.segmentModuleIndices[*(modulesInGPU.nLowerModules)]; return pixelQuintuplets_.pixelIndices[pT5] - pLS_offset; diff --git a/code/core/write_sdl_ntuple.cc b/code/core/write_sdl_ntuple.cc index 43d7831f..15a8eb39 100644 --- a/code/core/write_sdl_ntuple.cc +++ b/code/core/write_sdl_ntuple.cc @@ -392,7 +392,7 @@ void setPixelQuintupletOutputBranches(SDL::Event* event) void setQuintupletOutputBranches(SDL::Event* event) { SDL::quintuplets& quintupletsInGPU = (*event->getQuintuplets()); - SDL::objectRanges& rangesInGPU = (*event->getRanges()); + SDL::objectRangesBuffer& rangesInGPU = (*event->getRanges()); SDL::modules& modulesInGPU = (*event->getModules()); const float kRinv1GeVf = (2.99792458e-3 * 3.8); int n_accepted_simtrk = ana.tx->getBranch>("sim_TC_matched").size(); @@ -563,7 +563,7 @@ void setGnnNtupleBranches(SDL::Event* event) SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); SDL::hitsBuffer& hitsInGPU = (*event->getHits()); SDL::modules& modulesInGPU = (*event->getModules()); - SDL::objectRanges& rangesInGPU = (*event->getRanges()); + SDL::objectRangesBuffer& rangesInGPU = (*event->getRanges()); SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates()); std::set mds_used_in_sg; @@ -1107,7 +1107,7 @@ float computeRadiusFromThreeAnchorHits(float x1, float y1, float x2, float y2, f void printHitMultiplicities(SDL::Event* event) { SDL::modules& modulesInGPU = (*event->getModules()); - SDL::objectRanges& rangesInGPU = (*event->getRanges()); + SDL::objectRangesBuffer& rangesInGPU = (*event->getRanges()); int nHits = 0; for (unsigned int idx = 0; idx <= *(modulesInGPU.nLowerModules); idx++) // "<=" because cheating to include pixel track candidate lower module @@ -1153,7 +1153,7 @@ void printMDs(SDL::Event* event) SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); SDL::hitsBuffer& hitsInGPU = (*event->getHits()); SDL::modules& modulesInGPU = (*event->getModules()); - SDL::objectRanges& rangesInGPU = (*event->getRanges()); + SDL::objectRangesBuffer& rangesInGPU = (*event->getRanges()); // Then obtain the lower module index for (unsigned int idx = 0; idx <= *(modulesInGPU.nLowerModules); ++idx) @@ -1177,7 +1177,7 @@ void printLSs(SDL::Event* event) SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); SDL::hitsBuffer& hitsInGPU = (*event->getHits()); SDL::modules& modulesInGPU = (*event->getModules()); - SDL::objectRanges& rangesInGPU = (*event->getRanges()); + SDL::objectRangesBuffer& rangesInGPU = (*event->getRanges()); int nSegments = 0; for (unsigned int i = 0; i < *(modulesInGPU.nLowerModules); ++i) @@ -1210,7 +1210,7 @@ void printpLSs(SDL::Event* event) SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); SDL::hitsBuffer& hitsInGPU = (*event->getHits()); SDL::modules& modulesInGPU = (*event->getModules()); - SDL::objectRanges& rangesInGPU = (*event->getRanges()); + SDL::objectRangesBuffer& rangesInGPU = (*event->getRanges()); unsigned int i = *(modulesInGPU.nLowerModules); unsigned int idx = i;//modulesInGPU.lowerModuleIndices[i]; @@ -1283,7 +1283,7 @@ void debugPrintOutlierMultiplicities(SDL::Event* event) SDL::segmentsBuffer& segmentsInGPU = (*event->getSegments()); SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); SDL::modules& modulesInGPU = (*event->getModules()); - SDL::objectRanges& rangesInGPU = (*event->getRanges()); + SDL::objectRangesBuffer& rangesInGPU = (*event->getRanges()); //int nTrackCandidates = 0; for (unsigned int idx = 0; idx <= *(modulesInGPU.nLowerModules); ++idx) { From a3f89d11353c4ee2fb3468bf6fe5499d34616f4f Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Wed, 7 Jun 2023 19:15:26 -0400 Subject: [PATCH 18/44] remove debug --- SDL/Event.cu | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/SDL/Event.cu b/SDL/Event.cu index 3d7d0805..e6b4dd52 100644 --- a/SDL/Event.cu +++ b/SDL/Event.cu @@ -469,19 +469,6 @@ void SDL::Event::addHitToEvent(std::vector x, std::vector y, std:: rangesInGPU->setData(*rangesBuffers); } - unsigned int hostValue; - - // Copy from device to host - cudaError_t err = cudaMemcpy(&hostValue, &rangesInGPU->hitRangesnUpper[0], sizeof(int8_t), cudaMemcpyDeviceToHost); - - // Check for errors - if (err != cudaSuccess) { - printf("cudaMemcpy failed with error: %s\n", cudaGetErrorString(err)); - } else { - // Print the value - printf("The value is: %u\n", hostValue); - } - // Copy the host arrays to the GPU. alpaka::memcpy(queue, hitsBuffers->xs_buf, x, nHits); alpaka::memcpy(queue, hitsBuffers->ys_buf, y, nHits); From b6324bcdc1fe78db99d7a86fadf626aae3ee4287 Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Wed, 7 Jun 2023 23:16:23 -0400 Subject: [PATCH 19/44] Move triplets to Alpaka memory --- SDL/Event.cu | 130 +++++++------------------ SDL/Event.cuh | 5 +- SDL/Hit.cuh | 6 +- SDL/MiniDoublet.cuh | 1 - SDL/Quintuplet.cuh | 1 - SDL/Segment.cuh | 1 - SDL/Triplet.cu | 176 ---------------------------------- SDL/Triplet.cuh | 123 +++++++++++++++++++++--- code/core/AccessHelper.cc | 2 +- code/core/write_sdl_ntuple.cc | 12 +-- 10 files changed, 162 insertions(+), 295 deletions(-) delete mode 100644 SDL/Triplet.cu diff --git a/SDL/Event.cu b/SDL/Event.cu index e6b4dd52..8139af57 100644 --- a/SDL/Event.cu +++ b/SDL/Event.cu @@ -11,7 +11,6 @@ SDL::Event::Event(cudaStream_t estream, bool verbose): queue(alpaka::getDevByIdx int driver; cudaRuntimeGetVersion(&version); cudaDriverGetVersion(&driver); - //printf("version: %d Driver %d\n",version, driver); stream = estream; addObjects = verbose; hitsInGPU = nullptr; @@ -61,14 +60,12 @@ SDL::Event::~Event() { #ifdef CACHE_ALLOC if(mdsInGPU){mdsInGPU->freeMemoryCache();} - if(tripletsInGPU){tripletsInGPU->freeMemoryCache();} if(quintupletsInGPU){quintupletsInGPU->freeMemoryCache();} if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemoryCache();} if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemoryCache();} if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemoryCache();} #else if(mdsInGPU){mdsInGPU->freeMemory(stream);} - if(tripletsInGPU){tripletsInGPU->freeMemory(stream);} if(quintupletsInGPU){quintupletsInGPU->freeMemory(stream);} if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemory(stream);} if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemory(stream);} @@ -77,7 +74,7 @@ SDL::Event::~Event() if(rangesInGPU != nullptr){delete rangesInGPU; delete rangesBuffers;} if(mdsInGPU != nullptr){cms::cuda::free_host(mdsInGPU);} if(segmentsInGPU != nullptr){delete segmentsInGPU; delete segmentsBuffers;} - if(tripletsInGPU!= nullptr){cms::cuda::free_host(tripletsInGPU);} + if(tripletsInGPU!= nullptr){delete tripletsInGPU; delete tripletsBuffers;} if(trackCandidatesInGPU!= nullptr){cms::cuda::free_host(trackCandidatesInGPU);} if(hitsInGPU!= nullptr){delete hitsInGPU; delete hitsBuffers;} if(pixelTripletsInGPU!= nullptr){cms::cuda::free_host(pixelTripletsInGPU);} @@ -109,30 +106,6 @@ SDL::Event::~Event() if(tripletsInCPU != nullptr) { - delete[] tripletsInCPU->segmentIndices; - delete[] tripletsInCPU->nTriplets; - delete[] tripletsInCPU->totOccupancyTriplets; - delete[] tripletsInCPU->betaIn; - delete[] tripletsInCPU->betaOut; - delete[] tripletsInCPU->pt_beta; - delete[] tripletsInCPU->hitIndices; - delete[] tripletsInCPU->logicalLayers; - delete[] tripletsInCPU->lowerModuleIndices; - delete tripletsInCPU->nMemoryLocations; -#ifdef CUT_VALUE_DEBUG - delete[] tripletsInCPU->zOut; - delete[] tripletsInCPU->zLo; - delete[] tripletsInCPU->zHi; - delete[] tripletsInCPU->zLoPointed; - delete[] tripletsInCPU->zHiPointed; - delete[] tripletsInCPU->sdlCut; - delete[] tripletsInCPU->betaInCut; - delete[] tripletsInCPU->betaOutCut; - delete[] tripletsInCPU->deltaBetaCut; - delete[] tripletsInCPU->rtLo; - delete[] tripletsInCPU->rtHi; - delete[] tripletsInCPU->kZ; -#endif delete tripletsInCPU; } if(quintupletsInCPU != nullptr) @@ -237,14 +210,12 @@ void SDL::Event::resetEvent() #ifdef CACHE_ALLOC if(mdsInGPU){mdsInGPU->freeMemoryCache();} if(quintupletsInGPU){quintupletsInGPU->freeMemoryCache();} - if(tripletsInGPU){tripletsInGPU->freeMemoryCache();} if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemoryCache();} if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemoryCache();} if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemoryCache();} #else if(quintupletsInGPU){quintupletsInGPU->freeMemory(stream);} if(mdsInGPU){mdsInGPU->freeMemory(stream);} - if(tripletsInGPU){tripletsInGPU->freeMemory(stream);} if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemory(stream);} if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemory(stream);} if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemory(stream);} @@ -276,7 +247,7 @@ void SDL::Event::resetEvent() rangesInGPU = nullptr;} if(segmentsInGPU){delete segmentsInGPU; delete segmentsBuffers; segmentsInGPU = nullptr;} - if(tripletsInGPU){cms::cuda::free_host(tripletsInGPU); + if(tripletsInGPU){delete tripletsInGPU; delete tripletsBuffers; tripletsInGPU = nullptr;} if(quintupletsInGPU){cms::cuda::free_host(quintupletsInGPU); quintupletsInGPU = nullptr;} @@ -312,15 +283,6 @@ void SDL::Event::resetEvent() } if(tripletsInCPU != nullptr) { - delete[] tripletsInCPU->segmentIndices; - delete[] tripletsInCPU->nTriplets; - delete[] tripletsInCPU->totOccupancyTriplets; - delete[] tripletsInCPU->betaIn; - delete[] tripletsInCPU->betaOut; - delete[] tripletsInCPU->pt_beta; - delete[] tripletsInCPU->logicalLayers; - delete[] tripletsInCPU->lowerModuleIndices; - delete[] tripletsInCPU->hitIndices; delete tripletsInCPU; tripletsInCPU = nullptr; } @@ -865,7 +827,6 @@ void SDL::Event::createTriplets() { if(tripletsInGPU == nullptr) { - tripletsInGPU = (SDL::triplets*)cms::cuda::allocate_host(sizeof(SDL::triplets), stream); unsigned int maxTriplets; Vec const threadsPerBlockCreateTrip(static_cast(1), static_cast(1), static_cast(1024)); @@ -886,11 +847,14 @@ void SDL::Event::createTriplets() cudaMemcpyAsync(&maxTriplets,rangesInGPU->device_nTotalTrips,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream); cudaStreamSynchronize(stream); - createTripletsInExplicitMemory(*tripletsInGPU, maxTriplets, nLowerModules,stream); + tripletsInGPU = new SDL::triplets(); + tripletsBuffers = new SDL::tripletsBuffer(maxTriplets, nLowerModules, devAcc, queue); + tripletsInGPU->setData(*tripletsBuffers); cudaMemcpyAsync(tripletsInGPU->nMemoryLocations, &maxTriplets, sizeof(unsigned int), cudaMemcpyHostToDevice, stream); cudaStreamSynchronize(stream); } + //TODO:Move this also inside the ranges function uint16_t nonZeroModules=0; unsigned int max_InnerSeg=0; @@ -1909,63 +1873,43 @@ SDL::segmentsBuffer* SDL::Event::getSegments() return segmentsInCPU; } -SDL::triplets* SDL::Event::getTriplets() +SDL::tripletsBuffer* SDL::Event::getTriplets() { if(tripletsInCPU == nullptr) { - tripletsInCPU = new SDL::triplets; - tripletsInCPU->nMemoryLocations = new unsigned int; - cudaMemcpyAsync(tripletsInCPU->nMemoryLocations, tripletsInGPU->nMemoryLocations, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream); - cudaStreamSynchronize(stream); + // Get nMemoryLocations parameter to initilize host based tripletsInCPU + auto nMemLocal_buf = allocBufWrapper(devHost, 1); + alpaka::memcpy(queue, nMemLocal_buf, tripletsBuffers->nMemoryLocations_buf, 1); + alpaka::wait(queue); + + unsigned int nMemLocal = *alpaka::getPtrNative(nMemLocal_buf); + tripletsInCPU = new SDL::tripletsBuffer(nMemLocal, nLowerModules, devHost, queue); + tripletsInCPU->setData(*tripletsInCPU); - tripletsInCPU->segmentIndices = new unsigned[2 * *(tripletsInCPU->nMemoryLocations)]; - tripletsInCPU->nTriplets = new int[nLowerModules]; - tripletsInCPU->betaIn = new FPX[*(tripletsInCPU->nMemoryLocations)]; - tripletsInCPU->betaOut = new FPX[*(tripletsInCPU->nMemoryLocations)]; - tripletsInCPU->pt_beta = new FPX[*(tripletsInCPU->nMemoryLocations)]; - tripletsInCPU->hitIndices = new unsigned int[6 * *(tripletsInCPU->nMemoryLocations)]; - tripletsInCPU->logicalLayers = new uint8_t[3 * *(tripletsInCPU->nMemoryLocations)]; + *alpaka::getPtrNative(tripletsInCPU->nMemoryLocations_buf) = nMemLocal; #ifdef CUT_VALUE_DEBUG - tripletsInCPU->zOut = new float[4 * *(tripletsInCPU->nMemoryLocations)]; - tripletsInCPU->zLo = new float[*(tripletsInCPU->nMemoryLocations)]; - tripletsInCPU->zHi = new float[*(tripletsInCPU->nMemoryLocations)]; - tripletsInCPU->zLoPointed = new float[*(tripletsInCPU->nMemoryLocations)]; - tripletsInCPU->zHiPointed = new float[*(tripletsInCPU->nMemoryLocations)]; - tripletsInCPU->sdlCut = new float[*(tripletsInCPU->nMemoryLocations)]; - tripletsInCPU->betaInCut = new float[*(tripletsInCPU->nMemoryLocations)]; - tripletsInCPU->betaOutCut = new float[*(tripletsInCPU->nMemoryLocations)]; - tripletsInCPU->deltaBetaCut = new float[*(tripletsInCPU->nMemoryLocations)]; - tripletsInCPU->rtLo = new float[*(tripletsInCPU->nMemoryLocations)]; - tripletsInCPU->rtHi = new float[*(tripletsInCPU->nMemoryLocations)]; - tripletsInCPU->kZ = new float[*(tripletsInCPU->nMemoryLocations)]; - - tripletsInCPU->rtOut = tripletsInCPU->zOut + *(tripletsInCPU->nMemoryLocations); - tripletsInCPU->deltaPhiPos = tripletsInCPU->zOut + 2 * *(tripletsInCPU->nMemoryLocations); - tripletsInCPU->deltaPhi = tripletsInCPU->zOut + 3 * *(tripletsInCPU->nMemoryLocations); - - cudaMemcpyAsync(tripletsInCPU->zOut, tripletsInGPU->zOut, 4 * * (tripletsInCPU->nMemoryLocations)* sizeof(unsigned int), cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(tripletsInCPU->zLo, tripletsInGPU->zLo, * (tripletsInCPU->nMemoryLocations)* sizeof(unsigned int), cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(tripletsInCPU->zHi, tripletsInGPU->zHi, * (tripletsInCPU->nMemoryLocations)* sizeof(unsigned int), cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(tripletsInCPU->zLoPointed, tripletsInGPU->zLoPointed, 4 * * (tripletsInCPU->nMemoryLocations)* sizeof(unsigned int), cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(tripletsInCPU->zHiPointed, tripletsInGPU->zHiPointed, * (tripletsInCPU->nMemoryLocations)* sizeof(unsigned int), cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(tripletsInCPU->sdlCut, tripletsInGPU->sdlCut, *(tripletsInCPU->nMemoryLocations)* sizeof(unsigned int), cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(tripletsInCPU->betaInCut, tripletsInGPU->betaInCut, * (tripletsInCPU->nMemoryLocations)* sizeof(unsigned int), cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(tripletsInCPU->betaOutCut, tripletsInGPU->betaOutCut, * (tripletsInCPU->nMemoryLocations)* sizeof(unsigned int), cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(tripletsInCPU->deltaBetaCut, tripletsInGPU->deltaBetaCut, *(tripletsInCPU->nMemoryLocations)*sizeof(unsigned int), cudaMemcpyDeviceToHost); - cudaMemcpyAsync(tripletsInCPU->rtLo, tripletsInGPU->rtLo, * (tripletsInCPU->nMemoryLocations)* sizeof(unsigned int), cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(tripletsInCPU->rtHi, tripletsInGPU->rtHi, * (tripletsInCPU->nMemoryLocations)* sizeof(unsigned int), cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(tripletsInCPU->kZ, tripletsInGPU->kZ, * (tripletsInCPU->nMemoryLocations) * sizeof(unsigned int), cudaMemcpyDeviceToHost, stream); + alpaka::memcpy(queue, tripletsInCPU->zOut_buf, tripletsBuffers->zOut_buf, 4 * nMemLocal); + alpaka::memcpy(queue, tripletsInCPU->zLo_buf, tripletsBuffers->zLo_buf, nMemLocal); + alpaka::memcpy(queue, tripletsInCPU->zHi_buf, tripletsBuffers->zHi_buf, nMemLocal); + alpaka::memcpy(queue, tripletsInCPU->zLoPointed_buf, tripletsBuffers->zLoPointed_buf, 4 * nMemLocal); + alpaka::memcpy(queue, tripletsInCPU->zHiPointed_buf, tripletsBuffers->zHiPointed_buf, nMemLocal); + alpaka::memcpy(queue, tripletsInCPU->sdlCut_buf, tripletsBuffers->sdlCut_buf, nMemLocal); + alpaka::memcpy(queue, tripletsInCPU->betaInCut_buf, tripletsBuffers->betaInCut_buf, nMemLocal); + alpaka::memcpy(queue, tripletsInCPU->betaOutCut_buf, tripletsBuffers->betaOutCut_buf, nMemLocal); + alpaka::memcpy(queue, tripletsInCPU->deltaBetaCut_buf, tripletsBuffers->deltaBetaCut_buf, nMemLocal); + alpaka::memcpy(queue, tripletsInCPU->rtLo_buf, tripletsBuffers->rtLo_buf, nMemLocal); + alpaka::memcpy(queue, tripletsInCPU->rtHi_buf, tripletsBuffers->rtHi_buf, nMemLocal); + alpaka::memcpy(queue, tripletsInCPU->kZ_buf, tripletsBuffers->kZ_buf, nMemLocal); #endif - cudaMemcpyAsync(tripletsInCPU->hitIndices, tripletsInGPU->hitIndices, 6 * *(tripletsInCPU->nMemoryLocations) * sizeof(unsigned int), cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(tripletsInCPU->logicalLayers, tripletsInGPU->logicalLayers, 3 * *(tripletsInCPU->nMemoryLocations) * sizeof(uint8_t), cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(tripletsInCPU->segmentIndices, tripletsInGPU->segmentIndices, 2 * *(tripletsInCPU->nMemoryLocations) * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(tripletsInCPU->betaIn, tripletsInGPU->betaIn, *(tripletsInCPU->nMemoryLocations) * sizeof(FPX), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(tripletsInCPU->betaOut, tripletsInGPU->betaOut, *(tripletsInCPU->nMemoryLocations) * sizeof(FPX), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(tripletsInCPU->pt_beta, tripletsInGPU->pt_beta, *(tripletsInCPU->nMemoryLocations) * sizeof(FPX), cudaMemcpyDeviceToHost,stream); - tripletsInCPU->totOccupancyTriplets = new int[nLowerModules]; - cudaMemcpyAsync(tripletsInCPU->nTriplets, tripletsInGPU->nTriplets, nLowerModules * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(tripletsInCPU->totOccupancyTriplets, tripletsInGPU->totOccupancyTriplets, nLowerModules * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); + alpaka::memcpy(queue, tripletsInCPU->hitIndices_buf, tripletsBuffers->hitIndices_buf, 6 * nMemLocal); + alpaka::memcpy(queue, tripletsInCPU->logicalLayers_buf, tripletsBuffers->logicalLayers_buf, 3 * nMemLocal); + alpaka::memcpy(queue, tripletsInCPU->segmentIndices_buf, tripletsBuffers->segmentIndices_buf, 2 * nMemLocal); + alpaka::memcpy(queue, tripletsInCPU->betaIn_buf, tripletsBuffers->betaIn_buf, nMemLocal); + alpaka::memcpy(queue, tripletsInCPU->betaOut_buf, tripletsBuffers->betaOut_buf, nMemLocal); + alpaka::memcpy(queue, tripletsInCPU->pt_beta_buf, tripletsBuffers->pt_beta_buf, nMemLocal); + alpaka::memcpy(queue, tripletsInCPU->nTriplets_buf, tripletsBuffers->nTriplets_buf, nLowerModules); + alpaka::memcpy(queue, tripletsInCPU->totOccupancyTriplets_buf, tripletsBuffers->totOccupancyTriplets_buf, nLowerModules); + alpaka::wait(queue); } return tripletsInCPU; } diff --git a/SDL/Event.cuh b/SDL/Event.cuh index 8a84a449..8d694888 100644 --- a/SDL/Event.cuh +++ b/SDL/Event.cuh @@ -47,6 +47,7 @@ namespace SDL struct segments* segmentsInGPU; struct segmentsBuffer* segmentsBuffers; struct triplets* tripletsInGPU; + struct tripletsBuffer* tripletsBuffers; struct quintuplets* quintupletsInGPU; struct trackCandidates* trackCandidatesInGPU; struct pixelTriplets* pixelTripletsInGPU; @@ -57,7 +58,7 @@ namespace SDL hitsBuffer* hitsInCPU; miniDoublets* mdsInCPU; segmentsBuffer* segmentsInCPU; - triplets* tripletsInCPU; + tripletsBuffer* tripletsInCPU; trackCandidates* trackCandidatesInCPU; modules* modulesInCPU; modules* modulesInCPUFull; @@ -137,7 +138,7 @@ namespace SDL hitsBuffer* getHitsInCMSSW(); miniDoublets* getMiniDoublets(); segmentsBuffer* getSegments() ; - triplets* getTriplets(); + tripletsBuffer* getTriplets(); quintuplets* getQuintuplets(); trackCandidates* getTrackCandidates(); trackCandidates* getTrackCandidatesInCMSSW(); diff --git a/SDL/Hit.cuh b/SDL/Hit.cuh index 61a26cbd..8e69bc96 100644 --- a/SDL/Hit.cuh +++ b/SDL/Hit.cuh @@ -78,9 +78,9 @@ namespace SDL template hitsBuffer(unsigned int nModules, - unsigned int nMaxHits, - TDevAcc const & devAccIn, - TQueue& queue) : + unsigned int nMaxHits, + TDevAcc const & devAccIn, + TQueue& queue) : nHits_buf(allocBufWrapper(devAccIn, 1)), xs_buf(allocBufWrapper(devAccIn, nMaxHits)), ys_buf(allocBufWrapper(devAccIn, nMaxHits)), diff --git a/SDL/MiniDoublet.cuh b/SDL/MiniDoublet.cuh index 850f01d8..9f723e2d 100644 --- a/SDL/MiniDoublet.cuh +++ b/SDL/MiniDoublet.cuh @@ -3,7 +3,6 @@ #include "Constants.cuh" #include "EndcapGeometry.cuh" -#include "TiltedGeometry.h" #include "Module.cuh" #include "Hit.cuh" diff --git a/SDL/Quintuplet.cuh b/SDL/Quintuplet.cuh index dc7893a0..0f408a30 100644 --- a/SDL/Quintuplet.cuh +++ b/SDL/Quintuplet.cuh @@ -3,7 +3,6 @@ #include "Constants.cuh" #include "EndcapGeometry.cuh" -#include "TiltedGeometry.h" #include "Segment.cuh" #include "MiniDoublet.cuh" #include "Module.cuh" diff --git a/SDL/Segment.cuh b/SDL/Segment.cuh index 88f5718f..d6308cb0 100644 --- a/SDL/Segment.cuh +++ b/SDL/Segment.cuh @@ -3,7 +3,6 @@ #include "Constants.cuh" #include "EndcapGeometry.cuh" -#include "TiltedGeometry.h" #include "MiniDoublet.cuh" #include "Module.cuh" #include "Hit.cuh" diff --git a/SDL/Triplet.cu b/SDL/Triplet.cu deleted file mode 100644 index 218880e2..00000000 --- a/SDL/Triplet.cu +++ /dev/null @@ -1,176 +0,0 @@ -#include "Triplet.cuh" - -void SDL::createTripletsInExplicitMemory(struct triplets& tripletsInGPU, unsigned int maxTriplets, uint16_t nLowerModules, cudaStream_t stream) -{ -#ifdef CACHE_ALLOC - int dev; - cudaGetDevice(&dev); - tripletsInGPU.segmentIndices = (unsigned int*)cms::cuda::allocate_device(dev,maxTriplets * sizeof(unsigned int) *2,stream); - tripletsInGPU.lowerModuleIndices = (uint16_t*)cms::cuda::allocate_device(dev,maxTriplets * sizeof(uint16_t) *3,stream); - tripletsInGPU.betaIn = (FPX*)cms::cuda::allocate_device(dev,maxTriplets * sizeof(FPX) *3,stream); - tripletsInGPU.nTriplets = (int*)cms::cuda::allocate_device(dev,nLowerModules * sizeof(int),stream); - tripletsInGPU.totOccupancyTriplets = (int*)cms::cuda::allocate_device(dev,nLowerModules * sizeof(int),stream); - tripletsInGPU.partOfPT5 = (bool*)cms::cuda::allocate_device(dev, maxTriplets * sizeof(bool), stream); - tripletsInGPU.partOfPT3 = (bool*)cms::cuda::allocate_device(dev, maxTriplets * sizeof(bool), stream); - tripletsInGPU.partOfT5 = (bool*)cms::cuda::allocate_device(dev, maxTriplets * sizeof(bool), stream); - - tripletsInGPU.logicalLayers = (uint8_t*)cms::cuda::allocate_device(dev, maxTriplets * 3 * sizeof(uint8_t), stream); - tripletsInGPU.hitIndices = (unsigned int*)cms::cuda::allocate_device(dev, maxTriplets * 6 * sizeof(unsigned int), stream); - tripletsInGPU.nMemoryLocations = (unsigned int*)cms::cuda::allocate_device(dev, sizeof(unsigned int), stream); - -#ifdef CUT_VALUE_DEBUG - tripletsInGPU.zOut = (float*)cms::cuda::allocate_device(dev, maxTriplets * 4 * sizeof(float), stream); - tripletsInGPU.zLo = (float*)cms::cuda::allocate_device(dev, maxTriplets * sizeof(float), stream); - tripletsInGPU.zHi = (float*)cms::cuda::allocate_device(dev, maxTriplets * sizeof(float), stream); - tripletsInGPU.zLoPointed = (float*)cms::cuda::allocate_device(dev, maxTriplets * sizeof(float), stream); - tripletsInGPU.zHiPointed = (float*)cms::cuda::allocate_device(dev, maxTriplets * sizeof(float), stream); - tripletsInGPU.sdlCut = (float*)cms::cuda::allocate_device(dev, maxTriplets * sizeof(float), stream); - tripletsInGPU.betaInCut = (float*)cms::cuda::allocate_device(dev, maxTriplets * sizeof(float), stream); - tripletsInGPU.betaOutCut = (float*)cms::cuda::allocate_device(dev, maxTriplets * sizeof(float), stream); - tripletsInGPU.deltaBetaCut = (float*)cms::cuda::allocate_device(dev, maxTriplets * sizeof(float), stream); - tripletsInGPU.rtLo = (float*)cms::cuda::allocate_device(dev, maxTriplets * sizeof(float), stream); - tripletsInGPU.rtHi = (float*)cms::cuda::allocate_device(dev, maxTriplets * sizeof(float), stream); - tripletsInGPU.kZ = (float*)cms::cuda::allocate_device(dev, maxTriplets * sizeof(float), stream); - tripletsInGPU.rtOut = tripletsInGPU.zOut + maxTriplets; - tripletsInGPU.deltaPhiPos = tripletsInGPU.zOut + maxTriplets *2; - tripletsInGPU.deltaPhi = tripletsInGPU.zOut + maxTriplets *3; -#endif - -#else - cudaMalloc(&tripletsInGPU.segmentIndices, /*5*/2 * maxTriplets * sizeof(unsigned int)); - cudaMalloc(&tripletsInGPU.lowerModuleIndices, 3 * maxTriplets * sizeof(uint16_t)); - cudaMalloc(&tripletsInGPU.betaIn, maxTriplets * 3 * sizeof(FPX)); - cudaMalloc(&tripletsInGPU.nTriplets, nLowerModules * sizeof(int)); - cudaMalloc(&tripletsInGPU.totOccupancyTriplets, nLowerModules * sizeof(int)); - cudaMalloc(&tripletsInGPU.partOfPT5, maxTriplets * sizeof(bool)); - cudaMalloc(&tripletsInGPU.partOfPT3, maxTriplets * sizeof(bool)); - cudaMalloc(&tripletsInGPU.partOfT5, maxTriplets * sizeof(bool)); - - cudaMalloc(&tripletsInGPU.logicalLayers, maxTriplets * 3 * sizeof(uint8_t)); - cudaMalloc(&tripletsInGPU.hitIndices, maxTriplets * 6 * sizeof(unsigned int)); - cudaMalloc(&tripletsInGPU.nMemoryLocations, sizeof(unsigned int)); - -#ifdef CUT_VALUE_DEBUG - cudaMalloc(&tripletsInGPU.zOut, maxTriplets * 4*sizeof(unsigned int)); - cudaMalloc(&tripletsInGPU.zLo, maxTriplets * sizeof(float)); - cudaMalloc(&tripletsInGPU.zHi, maxTriplets * sizeof(float)); - cudaMalloc(&tripletsInGPU.zLoPointed, maxTriplets * sizeof(float)); - cudaMalloc(&tripletsInGPU.zHiPointed, maxTriplets * sizeof(float)); - cudaMalloc(&tripletsInGPU.sdlCut, maxTriplets * sizeof(float)); - cudaMalloc(&tripletsInGPU.betaInCut, maxTriplets * sizeof(float)); - cudaMalloc(&tripletsInGPU.betaOutCut, maxTriplets * sizeof(float)); - cudaMalloc(&tripletsInGPU.deltaBetaCut, maxTriplets * sizeof(float)); - cudaMalloc(&tripletsInGPU.rtLo, maxTriplets * sizeof(float)); - cudaMalloc(&tripletsInGPU.rtHi, maxTriplets * sizeof(float)); - cudaMalloc(&tripletsInGPU.kZ, maxTriplets * sizeof(float)); - - tripletsInGPU.rtOut = tripletsInGPU.zOut + maxTriplets; - tripletsInGPU.deltaPhiPos = tripletsInGPU.zOut + maxTriplets *2; - tripletsInGPU.deltaPhi = tripletsInGPU.zOut + maxTriplets *3; -#endif - -#endif - cudaMemsetAsync(tripletsInGPU.nTriplets,0,nLowerModules * sizeof(int),stream); - cudaMemsetAsync(tripletsInGPU.totOccupancyTriplets,0,nLowerModules * sizeof(int),stream); - cudaMemsetAsync(tripletsInGPU.partOfPT5,0,maxTriplets * sizeof(bool),stream); - cudaMemsetAsync(tripletsInGPU.partOfPT3,0,maxTriplets * sizeof(bool),stream); - cudaMemsetAsync(tripletsInGPU.partOfT5,0,maxTriplets * sizeof(bool),stream); - - cudaStreamSynchronize(stream); - - tripletsInGPU.betaOut = tripletsInGPU.betaIn + maxTriplets; - tripletsInGPU.pt_beta = tripletsInGPU.betaIn + maxTriplets * 2; -} - -SDL::triplets::triplets() -{ - segmentIndices = nullptr; - lowerModuleIndices = nullptr; - betaIn = nullptr; - betaOut = nullptr; - pt_beta = nullptr; - logicalLayers = nullptr; - hitIndices = nullptr; -#ifdef CUT_VALUE_DEBUG - zOut = nullptr; - rtOut = nullptr; - deltaPhiPos = nullptr; - deltaPhi = nullptr; - zLo = nullptr; - zHi = nullptr; - rtLo = nullptr; - rtHi = nullptr; - zLoPointed = nullptr; - zHiPointed = nullptr; - kZ = nullptr; - betaInCut = nullptr; - betaOutCut = nullptr; - deltaBetaCut = nullptr; - sdlCut = nullptr; -#endif -} - -SDL::triplets::~triplets() -{ -} - -void SDL::triplets::freeMemoryCache() -{ - int dev; - cudaGetDevice(&dev); - cms::cuda::free_device(dev,segmentIndices); - cms::cuda::free_device(dev,lowerModuleIndices); - cms::cuda::free_device(dev,betaIn); - cms::cuda::free_device(dev,nTriplets); - cms::cuda::free_device(dev,totOccupancyTriplets); - cms::cuda::free_device(dev, partOfPT5); - cms::cuda::free_device(dev, partOfPT3); - cms::cuda::free_device(dev, partOfT5); - cms::cuda::free_device(dev, logicalLayers); - cms::cuda::free_device(dev, hitIndices); - cms::cuda::free_device(dev, nMemoryLocations); -#ifdef CUT_VALUE_DEBUG - cms::cuda::free_device(dev, zOut); - cms::cuda::free_device(dev, zLo); - cms::cuda::free_device(dev, zHi); - cms::cuda::free_device(dev, zLoPointed); - cms::cuda::free_device(dev, zHiPointed); - cms::cuda::free_device(dev, sdlCut); - cms::cuda::free_device(dev, betaInCut); - cms::cuda::free_device(dev, betaOutCut); - cms::cuda::free_device(dev, deltaBetaCut); - cms::cuda::free_device(dev, rtLo); - cms::cuda::free_device(dev, rtHi); - cms::cuda::free_device(dev, kZ); -#endif -} - -void SDL::triplets::freeMemory(cudaStream_t stream) -{ - cudaFree(segmentIndices); - cudaFree(lowerModuleIndices); - cudaFree(nTriplets); - cudaFree(totOccupancyTriplets); - cudaFree(betaIn); - cudaFree(partOfPT5); - cudaFree(partOfPT3); - cudaFree(partOfT5); - cudaFree(logicalLayers); - cudaFree(hitIndices); - cudaFree(nMemoryLocations); -#ifdef CUT_VALUE_DEBUG - cudaFree(zOut); - cudaFree(zLo); - cudaFree(zHi); - cudaFree(rtLo); - cudaFree(rtHi); - cudaFree(zLoPointed); - cudaFree(zHiPointed); - cudaFree(kZ); - cudaFree(betaInCut); - cudaFree(betaOutCut); - cudaFree(deltaBetaCut); - cudaFree(sdlCut); -#endif - cudaStreamSynchronize(stream); -} \ No newline at end of file diff --git a/SDL/Triplet.cuh b/SDL/Triplet.cuh index 8a9fc96f..d5b5a81b 100644 --- a/SDL/Triplet.cuh +++ b/SDL/Triplet.cuh @@ -15,17 +15,13 @@ namespace SDL uint16_t* lowerModuleIndices; //3 of them now int* nTriplets; int* totOccupancyTriplets; - unsigned int* nMemoryLocations; - uint8_t* logicalLayers; unsigned int* hitIndices; - //delta beta = betaIn - betaOut FPX* betaIn; FPX* betaOut; FPX* pt_beta; - bool* partOfPT5; bool* partOfT5; bool* partOfPT3; @@ -34,10 +30,8 @@ namespace SDL //debug variables float* zOut; float* rtOut; - float* deltaPhiPos; float* deltaPhi; - float* zLo; float* zHi; float* zLoPointed; @@ -50,13 +44,120 @@ namespace SDL float* rtHi; float* kZ; #endif - triplets(); - ~triplets(); - void freeMemory(cudaStream_t stream); - void freeMemoryCache(); + template + void setData(TBuff& tripletsbuf) + { + segmentIndices = alpaka::getPtrNative(tripletsbuf.segmentIndices_buf); + lowerModuleIndices = alpaka::getPtrNative(tripletsbuf.lowerModuleIndices_buf); + nTriplets = alpaka::getPtrNative(tripletsbuf.nTriplets_buf); + totOccupancyTriplets = alpaka::getPtrNative(tripletsbuf.totOccupancyTriplets_buf); + nMemoryLocations = alpaka::getPtrNative(tripletsbuf.nMemoryLocations_buf); + logicalLayers = alpaka::getPtrNative(tripletsbuf.logicalLayers_buf); + hitIndices = alpaka::getPtrNative(tripletsbuf.hitIndices_buf); + betaIn = alpaka::getPtrNative(tripletsbuf.betaIn_buf); + betaOut = alpaka::getPtrNative(tripletsbuf.betaOut_buf); + pt_beta = alpaka::getPtrNative(tripletsbuf.pt_beta_buf); + partOfPT5 = alpaka::getPtrNative(tripletsbuf.partOfPT5_buf); + partOfT5 = alpaka::getPtrNative(tripletsbuf.partOfT5_buf); + partOfPT3 = alpaka::getPtrNative(tripletsbuf.partOfPT3_buf); +#ifdef CUT_VALUE_DEBUG + zOut = alpaka::getPtrNative(tripletsbuf.zOut_buf); + rtOut = alpaka::getPtrNative(tripletsbuf.rtOut_buf); + deltaPhiPos = alpaka::getPtrNative(tripletsbuf.deltaPhiPos_buf); + deltaPhi = alpaka::getPtrNative(tripletsbuf.deltaPhi_buf); + zLo = alpaka::getPtrNative(tripletsbuf.zLo_buf); + zHi = alpaka::getPtrNative(tripletsbuf.zHi_buf); + zLoPointed = alpaka::getPtrNative(tripletsbuf.zLoPointed_buf); + zHiPointed = alpaka::getPtrNative(tripletsbuf.zHiPointed_buf); + sdlCut = alpaka::getPtrNative(tripletsbuf.sdlCut_buf); + betaInCut = alpaka::getPtrNative(tripletsbuf.betaInCut_buf); + betaOutCut = alpaka::getPtrNative(tripletsbuf.betaOutCut_buf); + deltaBetaCut = alpaka::getPtrNative(tripletsbuf.deltaBetaCut_buf); + rtLo = alpaka::getPtrNative(tripletsbuf.rtLo_buf); + rtHi = alpaka::getPtrNative(tripletsbuf.rtHi_buf); + kZ = alpaka::getPtrNative(tripletsbuf.kZ_buf); +#endif + } }; - void createTripletsInExplicitMemory(struct triplets& tripletsInGPU, unsigned int maxTriplets, uint16_t nLowerModules,cudaStream_t stream); + template + struct tripletsBuffer : triplets + { + Buf segmentIndices_buf; + Buf lowerModuleIndices_buf; + Buf nTriplets_buf; + Buf totOccupancyTriplets_buf; + Buf nMemoryLocations_buf; + Buf logicalLayers_buf; + Buf hitIndices_buf; + Buf betaIn_buf; + Buf betaOut_buf; + Buf pt_beta_buf; + Buf partOfPT5_buf; + Buf partOfT5_buf; + Buf partOfPT3_buf; + +#ifdef CUT_VALUE_DEBUG + Buf zOut_buf; + Buf rtOut_buf; + Buf deltaPhiPos_buf; + Buf deltaPhi_buf; + Buf zLo_buf; + Buf zHi_buf; + Buf zLoPointed_buf; + Buf zHiPointed_buf; + Buf sdlCut_buf; + Buf betaInCut_buf; + Buf betaOutCut_buf; + Buf deltaBetaCut_buf; + Buf rtLo_buf; + Buf rtHi_buf; + Buf kZ_buf; +#endif + + template + tripletsBuffer(unsigned int maxTriplets, + unsigned int nLowerModules, + TDevAcc const & devAccIn, + TQueue& queue) : + segmentIndices_buf(allocBufWrapper(devAccIn, 2 * maxTriplets)), + lowerModuleIndices_buf(allocBufWrapper(devAccIn, 3 * maxTriplets)), + nTriplets_buf(allocBufWrapper(devAccIn, nLowerModules)), + totOccupancyTriplets_buf(allocBufWrapper(devAccIn, nLowerModules)), + nMemoryLocations_buf(allocBufWrapper(devAccIn, 1)), + logicalLayers_buf(allocBufWrapper(devAccIn, maxTriplets * 3)), + hitIndices_buf(allocBufWrapper(devAccIn, maxTriplets * 6)), + betaIn_buf(allocBufWrapper(devAccIn, maxTriplets)), + betaOut_buf(allocBufWrapper(devAccIn, maxTriplets)), + pt_beta_buf(allocBufWrapper(devAccIn, maxTriplets)), + partOfPT5_buf(allocBufWrapper(devAccIn, maxTriplets)), + partOfT5_buf(allocBufWrapper(devAccIn, maxTriplets)), + partOfPT3_buf(allocBufWrapper(devAccIn, maxTriplets)) +#ifdef CUT_VALUE_DEBUG + ,zOut_buf(allocBufWrapper(devAccIn, maxTriplets)), + rtOut_buf(allocBufWrapper(devAccIn, maxTriplets)), + deltaPhiPos_buf(allocBufWrapper(devAccIn, maxTriplets)), + deltaPhi_buf(allocBufWrapper(devAccIn, maxTriplets)), + zLo_buf(allocBufWrapper(devAccIn, maxTriplets)), + zHi_buf(allocBufWrapper(devAccIn, maxTriplets)), + zLoPointed_buf(allocBufWrapper(devAccIn, maxTriplets)), + zHiPointed_buf(allocBufWrapper(devAccIn, maxTriplets)), + sdlCut_buf(allocBufWrapper(devAccIn, maxTriplets)), + betaInCut_buf(allocBufWrapper(devAccIn, maxTriplets)), + betaOutCut_buf(allocBufWrapper(devAccIn, maxTriplets)), + deltaBetaCut_buf(allocBufWrapper(devAccIn, maxTriplets)), + rtLo_buf(allocBufWrapper(devAccIn, maxTriplets)), + rtHi_buf(allocBufWrapper(devAccIn, maxTriplets)), + kZ_buf(allocBufWrapper(devAccIn, maxTriplets)) +#endif + { + alpaka::memset(queue, nTriplets_buf, 0, nLowerModules); + alpaka::memset(queue, totOccupancyTriplets_buf, 0, nLowerModules); + alpaka::memset(queue, partOfPT5_buf, 0, maxTriplets); + alpaka::memset(queue, partOfT5_buf, 0, maxTriplets); + alpaka::memset(queue, partOfPT3_buf, 0, maxTriplets); + } + }; #ifdef CUT_VALUE_DEBUG ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float&zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ, unsigned int& tripletIndex) diff --git a/code/core/AccessHelper.cc b/code/core/AccessHelper.cc index 83e7f51d..763200ea 100644 --- a/code/core/AccessHelper.cc +++ b/code/core/AccessHelper.cc @@ -124,7 +124,7 @@ std::tuple, std::vector> getHitIdxsAndHi //____________________________________________________________________________________________ std::vector getLSsFromT3(SDL::Event* event, unsigned int T3) { - SDL::triplets& triplets_ = *(event->getTriplets()); + SDL::tripletsBuffer& triplets_ = *(event->getTriplets()); unsigned int LS_1 = triplets_.segmentIndices[2 * T3]; unsigned int LS_2 = triplets_.segmentIndices[2 * T3 + 1]; return {LS_1, LS_2}; diff --git a/code/core/write_sdl_ntuple.cc b/code/core/write_sdl_ntuple.cc index 15a8eb39..e1284b34 100644 --- a/code/core/write_sdl_ntuple.cc +++ b/code/core/write_sdl_ntuple.cc @@ -474,7 +474,7 @@ void setQuintupletOutputBranches(SDL::Event* event) void setPixelTripletOutputBranches(SDL::Event* event) { SDL::pixelTriplets& pixelTripletsInGPU = (*event->getPixelTriplets()); - SDL::triplets& tripletsInGPU = *(event->getTriplets()); + SDL::tripletsBuffer& tripletsInGPU = *(event->getTriplets()); SDL::modules& modulesInGPU = *(event->getModules()); SDL::segmentsBuffer& segmentsInGPU = *(event->getSegments()); SDL::hitsBuffer& hitsInGPU = *(event->getHits()); @@ -820,7 +820,7 @@ std::tuple, vector> pars { // Get relevant information SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates()); - SDL::triplets& tripletsInGPU = (*event->getTriplets()); + SDL::tripletsBuffer& tripletsInGPU = (*event->getTriplets()); SDL::segmentsBuffer& segmentsInGPU = (*event->getSegments()); SDL::hitsBuffer& hitsInGPU = (*event->getHits()); @@ -958,7 +958,7 @@ std::tuple, vector> pars { // Get relevant information SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates()); - SDL::triplets& tripletsInGPU = (*event->getTriplets()); + SDL::tripletsBuffer& tripletsInGPU = (*event->getTriplets()); SDL::segmentsBuffer& segmentsInGPU = (*event->getSegments()); SDL::hitsBuffer& hitsInGPU = (*event->getHits()); @@ -1005,7 +1005,7 @@ std::tuple, vector> pars std::tuple, vector> parseT5(SDL::Event* event, unsigned int idx) { SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates()); - SDL::triplets& tripletsInGPU = (*event->getTriplets()); + SDL::tripletsBuffer& tripletsInGPU = (*event->getTriplets()); SDL::hitsBuffer& hitsInGPU = (*event->getHits()); unsigned int T5 = trackCandidatesInGPU.directObjectIndices[idx]; std::vector T3s = getT3sFromT5(event, T5); @@ -1236,7 +1236,7 @@ void printpLSs(SDL::Event* event) //________________________________________________________________________________________________________________________________ void printT3s(SDL::Event* event) { - SDL::triplets& tripletsInGPU = (*event->getTriplets()); + SDL::tripletsBuffer& tripletsInGPU = (*event->getTriplets()); SDL::segmentsBuffer& segmentsInGPU = (*event->getSegments()); SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); SDL::hitsBuffer& hitsInGPU = (*event->getHits()); @@ -1279,7 +1279,7 @@ void printT3s(SDL::Event* event) void debugPrintOutlierMultiplicities(SDL::Event* event) { SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates()); - SDL::triplets& tripletsInGPU = (*event->getTriplets()); + SDL::tripletsBuffer& tripletsInGPU = (*event->getTriplets()); SDL::segmentsBuffer& segmentsInGPU = (*event->getSegments()); SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); SDL::modules& modulesInGPU = (*event->getModules()); From 8901df6be13034975a37e76625fb9b4ed0fc62e5 Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Wed, 7 Jun 2023 23:44:06 -0400 Subject: [PATCH 20/44] formatting fixes --- SDL/PixelTriplet.cuh | 24 ++++++++++++------------ SDL/Triplet.cuh | 28 ++++++++++++++-------------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/SDL/PixelTriplet.cuh b/SDL/PixelTriplet.cuh index 660aaeb7..422d8959 100644 --- a/SDL/PixelTriplet.cuh +++ b/SDL/PixelTriplet.cuh @@ -48,7 +48,7 @@ namespace SDL void createPixelTripletsInExplicitMemory(struct pixelTriplets& pixelTripletsinGPU, unsigned int maxPixelTriplets, cudaStream_t stream); - ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, struct pixelTriplets& pixelTripletsInGPU, unsigned int pixelSegmentIndex, unsigned int tripletIndex, float pixelRadius, float tripletRadius, float centerX, float centerY, float rPhiChiSquared, float rPhiChiSquaredInwards, float rzChiSquared, unsigned int pixelTripletIndex, float pt, float eta, float phi, float eta_pix, float phi_pix,float score) + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, struct triplets& tripletsInGPU, struct pixelTriplets& pixelTripletsInGPU, unsigned int pixelSegmentIndex, unsigned int tripletIndex, float pixelRadius, float tripletRadius, float centerX, float centerY, float rPhiChiSquared, float rPhiChiSquaredInwards, float rzChiSquared, unsigned int pixelTripletIndex, float pt, float eta, float phi, float eta_pix, float phi_pix,float score) { pixelTripletsInGPU.pixelSegmentIndices[pixelTripletIndex] = pixelSegmentIndex; pixelTripletsInGPU.tripletIndices[pixelTripletIndex] = tripletIndex; @@ -130,7 +130,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTrackletDefaultAlgopT3(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& pixelLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTrackletDefaultAlgopT3(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& pixelLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) { zLo = -999; zHi = -999; @@ -663,7 +663,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTripletDefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& pixelSegmentIndex, unsigned int tripletIndex, float& pixelRadius, float& pixelRadiusError, float& tripletRadius, float& centerX, float& centerY, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, bool runChiSquaredCuts = true) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTripletDefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& pixelSegmentIndex, unsigned int tripletIndex, float& pixelRadius, float& pixelRadiusError, float& tripletRadius, float& centerX, float& centerY, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, bool runChiSquaredCuts = true) { bool pass = true; @@ -768,7 +768,7 @@ namespace SDL struct SDL::modules& modulesInGPU, struct SDL::objectRanges& rangesInGPU, struct SDL::miniDoublets& mdsInGPU, - struct SDL::segments& segmentsInGPU, + struct segments& segmentsInGPU, struct SDL::triplets& tripletsInGPU, struct SDL::pixelTriplets& pixelTripletsInGPU, unsigned int* connectedPixelSize, @@ -911,7 +911,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPBB(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int thirdMDIndex, unsigned int& fourthMDIndex, float& /*z_OutLo*/, float& /*rt_OutLo*/, float& dPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaOutCut, float& deltaBetaCut) // pixel to BB and BE segments + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPBB(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int thirdMDIndex, unsigned int& fourthMDIndex, float& /*z_OutLo*/, float& /*rt_OutLo*/, float& dPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaOutCut, float& deltaBetaCut) // pixel to BB and BE segments { bool pass = true; @@ -1124,7 +1124,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPEE(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU ,struct SDL::segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& /*z_OutLo*/, float& /*rt_OutLo*/, float& deltaPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) // pixel to EE segments + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPEE(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU ,struct segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& /*z_OutLo*/, float& /*rt_OutLo*/, float& deltaPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) // pixel to EE segments { bool pass = true; bool isPS_OutLo = (modulesInGPU.moduleType[outerInnerLowerModuleIndex] == SDL::PS); @@ -1385,7 +1385,7 @@ namespace SDL void createPixelQuintupletsInExplicitMemory(struct SDL::pixelQuintuplets& pixelQuintupletsInGPU, unsigned int maxPixelQuintuplets, cudaStream_t stream); - ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelQuintupletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct quintuplets& quintupletsInGPU, struct pixelQuintuplets& pixelQuintupletsInGPU, unsigned int pixelIndex, unsigned int T5Index, unsigned int pixelQuintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float score, float eta, float phi, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY) + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelQuintupletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, struct quintuplets& quintupletsInGPU, struct pixelQuintuplets& pixelQuintupletsInGPU, unsigned int pixelIndex, unsigned int T5Index, unsigned int pixelQuintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float score, float eta, float phi, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY) { pixelQuintupletsInGPU.pixelIndices[pixelQuintupletIndex] = pixelIndex; pixelQuintupletsInGPU.T5Indices[pixelQuintupletIndex] = T5Index; @@ -1966,7 +1966,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelQuintupletDefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, struct quintuplets& quintupletsInGPU, unsigned int& pixelSegmentIndex, unsigned int& quintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY, unsigned int pixelSegmentArrayIndex) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelQuintupletDefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, struct triplets& tripletsInGPU, struct quintuplets& quintupletsInGPU, unsigned int& pixelSegmentIndex, unsigned int& quintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY, unsigned int pixelSegmentArrayIndex) { bool pass = true; @@ -2100,7 +2100,7 @@ namespace SDL TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, - struct SDL::segments& segmentsInGPU, + struct segments& segmentsInGPU, struct SDL::triplets& tripletsInGPU, struct SDL::quintuplets& quintupletsInGPU, struct SDL::pixelQuintuplets& pixelQuintupletsInGPU, @@ -2226,7 +2226,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgoPPBB(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int thirdMDIndex, unsigned int& fourthMDIndex, float& z_OutLo, float& rt_OutLo, float& dPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaOutCut, float& deltaBetaCut) // pixel to BB and BE segments + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgoPPBB(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int thirdMDIndex, unsigned int& fourthMDIndex, float& z_OutLo, float& rt_OutLo, float& dPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaOutCut, float& deltaBetaCut) // pixel to BB and BE segments { bool pass = true; @@ -2433,7 +2433,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgoPPEE(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU ,struct SDL::segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& z_OutLo, float& rt_OutLo, float& deltaPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) // pixel to EE segments + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgoPPEE(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU ,struct segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& z_OutLo, float& rt_OutLo, float& deltaPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) // pixel to EE segments { bool pass = true; bool isPS_OutLo = (modulesInGPU.moduleType[outerInnerLowerModuleIndex] == SDL::PS); @@ -2648,7 +2648,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& pixelLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& pixelLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) { zLo = -999; zHi = -999; diff --git a/SDL/Triplet.cuh b/SDL/Triplet.cuh index d5b5a81b..6df7b06a 100644 --- a/SDL/Triplet.cuh +++ b/SDL/Triplet.cuh @@ -160,9 +160,9 @@ namespace SDL }; #ifdef CUT_VALUE_DEBUG - ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float&zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ, unsigned int& tripletIndex) + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float&zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ, unsigned int& tripletIndex) #else - ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& betaIn, float& betaOut, float& pt_beta, unsigned int& tripletIndex) + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& betaIn, float& betaOut, float& pt_beta, unsigned int& tripletIndex) #endif { tripletsInGPU.segmentIndices[tripletIndex * 2] = innerSegmentIndex; @@ -209,7 +209,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passRZConstraint(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passRZConstraint(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex) { //get the rt and z const float& r1 = mdsInGPU.anchorRt[firstMDIndex]; @@ -290,7 +290,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut) { bool pass = true; bool isPSIn = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS); @@ -349,7 +349,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut) { bool pass = true; //unsigned int outerInnerLowerModuleIndex = middleLowerModuleIndex; @@ -428,7 +428,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut) { bool pass = true; bool isPSIn = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS); @@ -508,7 +508,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraint(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraint(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut) { short innerInnerLowerModuleSubdet = modulesInGPU.subdets[innerInnerLowerModuleIndex]; short middleLowerModuleSubdet = modulesInGPU.subdets[middleLowerModuleIndex]; @@ -589,7 +589,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoBBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoBBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut) { bool pass = true; @@ -785,7 +785,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoBBEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoBBEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) { bool pass = true; bool isPS_InLo = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS); @@ -1000,7 +1000,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoEEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoEEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) { bool pass = true; @@ -1209,7 +1209,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) { bool pass = false; @@ -1274,7 +1274,7 @@ namespace SDL }; template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletConstraintsAndAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float &zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletConstraintsAndAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float &zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) { bool pass = true; @@ -1301,7 +1301,7 @@ namespace SDL TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, - struct SDL::segments& segmentsInGPU, + struct segments& segmentsInGPU, struct SDL::triplets& tripletsInGPU, struct SDL::objectRanges& rangesInGPU, uint16_t *index_gpu, @@ -1376,7 +1376,7 @@ namespace SDL TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, - struct SDL::segments& segmentsInGPU) const + struct segments& segmentsInGPU) const { using Dim = alpaka::Dim; using Idx = alpaka::Idx; From 24a56cd64083886efd35b28ed5f4b8b7a6f15dbe Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Thu, 8 Jun 2023 10:36:00 -0400 Subject: [PATCH 21/44] move quintuplets to Alpaka memory --- SDL/Event.cu | 102 ++++++++---------------- SDL/Event.cuh | 5 +- SDL/Quintuplet.cu | 145 ---------------------------------- SDL/Quintuplet.cuh | 102 ++++++++++++++++++++++-- code/core/AccessHelper.cc | 2 +- code/core/write_sdl_ntuple.cc | 6 +- 6 files changed, 135 insertions(+), 227 deletions(-) delete mode 100644 SDL/Quintuplet.cu diff --git a/SDL/Event.cu b/SDL/Event.cu index 8139af57..65774128 100644 --- a/SDL/Event.cu +++ b/SDL/Event.cu @@ -60,13 +60,11 @@ SDL::Event::~Event() { #ifdef CACHE_ALLOC if(mdsInGPU){mdsInGPU->freeMemoryCache();} - if(quintupletsInGPU){quintupletsInGPU->freeMemoryCache();} if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemoryCache();} if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemoryCache();} if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemoryCache();} #else if(mdsInGPU){mdsInGPU->freeMemory(stream);} - if(quintupletsInGPU){quintupletsInGPU->freeMemory(stream);} if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemory(stream);} if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemory(stream);} if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemory(stream);} @@ -79,7 +77,7 @@ SDL::Event::~Event() if(hitsInGPU!= nullptr){delete hitsInGPU; delete hitsBuffers;} if(pixelTripletsInGPU!= nullptr){cms::cuda::free_host(pixelTripletsInGPU);} if(pixelQuintupletsInGPU!= nullptr){cms::cuda::free_host(pixelQuintupletsInGPU);} - if(quintupletsInGPU!= nullptr){cms::cuda::free_host(quintupletsInGPU);} + if(quintupletsInGPU!= nullptr){delete quintupletsInGPU; delete quintupletsBuffers;} if(hitsInCPU != nullptr) { @@ -110,17 +108,6 @@ SDL::Event::~Event() } if(quintupletsInCPU != nullptr) { - delete[] quintupletsInCPU->tripletIndices; - delete[] quintupletsInCPU->nQuintuplets; - delete[] quintupletsInCPU->totOccupancyQuintuplets; - delete[] quintupletsInCPU->lowerModuleIndices; - delete[] quintupletsInCPU->innerRadius; - delete[] quintupletsInCPU->outerRadius; - delete[] quintupletsInCPU->regressionRadius; - delete[] quintupletsInCPU->bridgeRadius; - delete[] quintupletsInCPU->chiSquared; - delete[] quintupletsInCPU->rzChiSquared; - delete[] quintupletsInCPU->nonAnchorChiSquared; delete quintupletsInCPU; } @@ -209,12 +196,10 @@ void SDL::Event::resetEvent() { #ifdef CACHE_ALLOC if(mdsInGPU){mdsInGPU->freeMemoryCache();} - if(quintupletsInGPU){quintupletsInGPU->freeMemoryCache();} if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemoryCache();} if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemoryCache();} if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemoryCache();} #else - if(quintupletsInGPU){quintupletsInGPU->freeMemory(stream);} if(mdsInGPU){mdsInGPU->freeMemory(stream);} if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemory(stream);} if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemory(stream);} @@ -249,7 +234,7 @@ void SDL::Event::resetEvent() segmentsInGPU = nullptr;} if(tripletsInGPU){delete tripletsInGPU; delete tripletsBuffers; tripletsInGPU = nullptr;} - if(quintupletsInGPU){cms::cuda::free_host(quintupletsInGPU); + if(quintupletsInGPU){delete quintupletsInGPU; delete quintupletsBuffers; quintupletsInGPU = nullptr;} if(trackCandidatesInGPU){cms::cuda::free_host(trackCandidatesInGPU); trackCandidatesInGPU = nullptr;} @@ -288,17 +273,6 @@ void SDL::Event::resetEvent() } if(quintupletsInCPU != nullptr) { - delete[] quintupletsInCPU->tripletIndices; - delete[] quintupletsInCPU->nQuintuplets; - delete[] quintupletsInCPU->totOccupancyQuintuplets; - delete[] quintupletsInCPU->lowerModuleIndices; - delete[] quintupletsInCPU->innerRadius; - delete[] quintupletsInCPU->outerRadius; - delete[] quintupletsInCPU->regressionRadius; - delete[] quintupletsInCPU->bridgeRadius; - delete[] quintupletsInCPU->chiSquared; - delete[] quintupletsInCPU->rzChiSquared; - delete[] quintupletsInCPU->nonAnchorChiSquared; delete quintupletsInCPU; quintupletsInCPU = nullptr; } @@ -1226,8 +1200,10 @@ void SDL::Event::createQuintuplets() if(quintupletsInGPU == nullptr) { - quintupletsInGPU = (SDL::quintuplets*)cms::cuda::allocate_host(sizeof(SDL::quintuplets), stream); - createQuintupletsInExplicitMemory(*quintupletsInGPU, nTotalQuintuplets, nLowerModules, nEligibleT5Modules,stream); + quintupletsInGPU = new SDL::quintuplets(); + quintupletsBuffers = new SDL::quintupletsBuffer(nTotalQuintuplets, nLowerModules, devAcc, queue); + quintupletsInGPU->setData(*quintupletsBuffers); + cudaMemcpyAsync(quintupletsInGPU->nMemoryLocations, &nTotalQuintuplets, sizeof(unsigned int), cudaMemcpyHostToDevice, stream); cudaStreamSynchronize(stream); } @@ -1914,51 +1890,35 @@ SDL::tripletsBuffer* SDL::Event::getTriplets() return tripletsInCPU; } -SDL::quintuplets* SDL::Event::getQuintuplets() +SDL::quintupletsBuffer* SDL::Event::getQuintuplets() { if(quintupletsInCPU == nullptr) { - quintupletsInCPU = new SDL::quintuplets; - uint16_t nEligibleT5Modules; - cudaMemcpyAsync(&nEligibleT5Modules, rangesInGPU->nEligibleT5Modules, sizeof(uint16_t), cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); - unsigned int nMemoryLocations; - cudaMemcpyAsync(&nMemoryLocations, quintupletsInGPU->nMemoryLocations, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream); - cudaStreamSynchronize(stream); - - quintupletsInCPU->nQuintuplets = new int[nLowerModules]; - quintupletsInCPU->totOccupancyQuintuplets = new int[nLowerModules]; - quintupletsInCPU->tripletIndices = new unsigned int[2 * nMemoryLocations]; - quintupletsInCPU->lowerModuleIndices = new uint16_t[5 * nMemoryLocations]; - quintupletsInCPU->innerRadius = new FPX[nMemoryLocations]; - quintupletsInCPU->outerRadius = new FPX[nMemoryLocations]; - quintupletsInCPU->bridgeRadius = new FPX[nMemoryLocations]; - - quintupletsInCPU->isDup = new bool[nMemoryLocations]; - quintupletsInCPU->score_rphisum = new FPX[nMemoryLocations]; - quintupletsInCPU->eta = new FPX[nMemoryLocations]; - quintupletsInCPU->phi = new FPX[nMemoryLocations]; - - quintupletsInCPU->chiSquared = new float[nMemoryLocations]; - quintupletsInCPU->nonAnchorChiSquared = new float[nMemoryLocations]; - quintupletsInCPU->rzChiSquared = new float[nMemoryLocations]; - - cudaMemcpyAsync(quintupletsInCPU->nQuintuplets, quintupletsInGPU->nQuintuplets, nLowerModules * sizeof(int), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(quintupletsInCPU->totOccupancyQuintuplets, quintupletsInGPU->totOccupancyQuintuplets, nLowerModules * sizeof(int), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(quintupletsInCPU->tripletIndices, quintupletsInGPU->tripletIndices, 2 * nMemoryLocations * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(quintupletsInCPU->lowerModuleIndices, quintupletsInGPU->lowerModuleIndices, 5 * nMemoryLocations * sizeof(uint16_t), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(quintupletsInCPU->innerRadius, quintupletsInGPU->innerRadius, nMemoryLocations * sizeof(FPX), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(quintupletsInCPU->bridgeRadius, quintupletsInGPU->bridgeRadius, nMemoryLocations * sizeof(FPX), cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(quintupletsInCPU->outerRadius, quintupletsInGPU->outerRadius, nMemoryLocations * sizeof(FPX), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(quintupletsInCPU->isDup, quintupletsInGPU->isDup, nMemoryLocations * sizeof(bool), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(quintupletsInCPU->score_rphisum, quintupletsInGPU->score_rphisum, nMemoryLocations * sizeof(FPX), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(quintupletsInCPU->eta, quintupletsInGPU->eta, nMemoryLocations * sizeof(FPX), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(quintupletsInCPU->phi, quintupletsInGPU->phi, nMemoryLocations * sizeof(FPX), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(quintupletsInCPU->chiSquared, quintupletsInGPU->chiSquared, nMemoryLocations * sizeof(float), cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(quintupletsInCPU->rzChiSquared, quintupletsInGPU->rzChiSquared, nMemoryLocations * sizeof(float), cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(quintupletsInCPU->nonAnchorChiSquared, quintupletsInGPU->nonAnchorChiSquared, nMemoryLocations * sizeof(float), cudaMemcpyDeviceToHost, stream); + // Get nMemoryLocations parameter to initilize host based quintupletsInCPU + auto nMemLocal_buf = allocBufWrapper(devHost, 1); + alpaka::memcpy(queue, nMemLocal_buf, quintupletsBuffers->nMemoryLocations_buf, 1); + alpaka::wait(queue); - cudaStreamSynchronize(stream); + unsigned int nMemLocal = *alpaka::getPtrNative(nMemLocal_buf); + quintupletsInCPU = new SDL::quintupletsBuffer(nMemLocal, nLowerModules, devHost, queue); + quintupletsInCPU->setData(*quintupletsInCPU); + + *alpaka::getPtrNative(quintupletsInCPU->nMemoryLocations_buf) = nMemLocal; + alpaka::memcpy(queue, quintupletsInCPU->nQuintuplets_buf, quintupletsBuffers->nQuintuplets_buf, nLowerModules); + alpaka::memcpy(queue, quintupletsInCPU->totOccupancyQuintuplets_buf, quintupletsBuffers->totOccupancyQuintuplets_buf, nLowerModules); + alpaka::memcpy(queue, quintupletsInCPU->tripletIndices_buf, quintupletsBuffers->tripletIndices_buf, 2 * nMemLocal); + alpaka::memcpy(queue, quintupletsInCPU->lowerModuleIndices_buf, quintupletsBuffers->lowerModuleIndices_buf, 5 * nMemLocal); + alpaka::memcpy(queue, quintupletsInCPU->innerRadius_buf, quintupletsBuffers->innerRadius_buf, nMemLocal); + alpaka::memcpy(queue, quintupletsInCPU->bridgeRadius_buf, quintupletsBuffers->bridgeRadius_buf, nMemLocal); + alpaka::memcpy(queue, quintupletsInCPU->outerRadius_buf, quintupletsBuffers->outerRadius_buf, nMemLocal); + alpaka::memcpy(queue, quintupletsInCPU->isDup_buf, quintupletsBuffers->isDup_buf, nMemLocal); + alpaka::memcpy(queue, quintupletsInCPU->score_rphisum_buf, quintupletsBuffers->score_rphisum_buf, nMemLocal); + alpaka::memcpy(queue, quintupletsInCPU->eta_buf, quintupletsBuffers->eta_buf, nMemLocal); + alpaka::memcpy(queue, quintupletsInCPU->phi_buf, quintupletsBuffers->phi_buf, nMemLocal); + alpaka::memcpy(queue, quintupletsInCPU->chiSquared_buf, quintupletsBuffers->chiSquared_buf, nMemLocal); + alpaka::memcpy(queue, quintupletsInCPU->rzChiSquared_buf, quintupletsBuffers->rzChiSquared_buf, nMemLocal); + alpaka::memcpy(queue, quintupletsInCPU->nonAnchorChiSquared_buf, quintupletsBuffers->nonAnchorChiSquared_buf, nMemLocal); + alpaka::wait(queue); } return quintupletsInCPU; } diff --git a/SDL/Event.cuh b/SDL/Event.cuh index 8d694888..db173f3e 100644 --- a/SDL/Event.cuh +++ b/SDL/Event.cuh @@ -49,6 +49,7 @@ namespace SDL struct triplets* tripletsInGPU; struct tripletsBuffer* tripletsBuffers; struct quintuplets* quintupletsInGPU; + struct quintupletsBuffer* quintupletsBuffers; struct trackCandidates* trackCandidatesInGPU; struct pixelTriplets* pixelTripletsInGPU; struct pixelQuintuplets* pixelQuintupletsInGPU; @@ -62,7 +63,7 @@ namespace SDL trackCandidates* trackCandidatesInCPU; modules* modulesInCPU; modules* modulesInCPUFull; - quintuplets* quintupletsInCPU; + quintupletsBuffer* quintupletsInCPU; pixelTriplets* pixelTripletsInCPU; pixelQuintuplets* pixelQuintupletsInCPU; @@ -139,7 +140,7 @@ namespace SDL miniDoublets* getMiniDoublets(); segmentsBuffer* getSegments() ; tripletsBuffer* getTriplets(); - quintuplets* getQuintuplets(); + quintupletsBuffer* getQuintuplets(); trackCandidates* getTrackCandidates(); trackCandidates* getTrackCandidatesInCMSSW(); pixelTriplets* getPixelTriplets(); diff --git a/SDL/Quintuplet.cu b/SDL/Quintuplet.cu deleted file mode 100644 index 2819cf6d..00000000 --- a/SDL/Quintuplet.cu +++ /dev/null @@ -1,145 +0,0 @@ -# include "Quintuplet.cuh" - -SDL::quintuplets::quintuplets() -{ - tripletIndices = nullptr; - lowerModuleIndices = nullptr; - nQuintuplets = nullptr; - totOccupancyQuintuplets = nullptr; - innerRadius = nullptr; - outerRadius = nullptr; - regressionRadius = nullptr; - isDup = nullptr; - TightCutFlag = nullptr; - partOfPT5 = nullptr; - pt = nullptr; - layer = nullptr; - regressionG = nullptr; - regressionF = nullptr; - logicalLayers = nullptr; - hitIndices = nullptr; - bridgeRadius = nullptr; - chiSquared = nullptr; - rzChiSquared = nullptr; - nonAnchorChiSquared = nullptr; -} - -SDL::quintuplets::~quintuplets() -{ -} - -void SDL::quintuplets::freeMemoryCache() -{ - int dev; - cudaGetDevice(&dev); - cms::cuda::free_device(dev, tripletIndices); - cms::cuda::free_device(dev, lowerModuleIndices); - cms::cuda::free_device(dev, nQuintuplets); - cms::cuda::free_device(dev, totOccupancyQuintuplets); - cms::cuda::free_device(dev, innerRadius); - cms::cuda::free_device(dev, outerRadius); - cms::cuda::free_device(dev, partOfPT5); - cms::cuda::free_device(dev, isDup); - cms::cuda::free_device(dev, TightCutFlag); - cms::cuda::free_device(dev, pt); - cms::cuda::free_device(dev, layer); - cms::cuda::free_device(dev, regressionG); - cms::cuda::free_device(dev, regressionF); - cms::cuda::free_device(dev, regressionRadius); - cms::cuda::free_device(dev, logicalLayers); - cms::cuda::free_device(dev, hitIndices); - cms::cuda::free_device(dev, nMemoryLocations); - cms::cuda::free_device(dev, bridgeRadius); - cms::cuda::free_device(dev, rzChiSquared); - cms::cuda::free_device(dev, chiSquared); - cms::cuda::free_device(dev, nonAnchorChiSquared); -} - -void SDL::quintuplets::freeMemory(cudaStream_t stream) -{ - cudaFree(tripletIndices); - cudaFree(lowerModuleIndices); - cudaFree(nQuintuplets); - cudaFree(totOccupancyQuintuplets); - cudaFree(innerRadius); - cudaFree(outerRadius); - cudaFree(regressionRadius); - cudaFree(partOfPT5); - cudaFree(isDup); - cudaFree(TightCutFlag); - cudaFree(pt); - cudaFree(layer); - cudaFree(regressionG); - cudaFree(regressionF); - cudaFree(logicalLayers); - cudaFree(hitIndices); - cudaFree(nMemoryLocations); - cudaFree(bridgeRadius); - cudaFree(rzChiSquared); - cudaFree(chiSquared); - cudaFree(nonAnchorChiSquared); - cudaStreamSynchronize(stream); -} - -void SDL::createQuintupletsInExplicitMemory(struct SDL::quintuplets& quintupletsInGPU, const unsigned int& nTotalQuintuplets, const uint16_t& nLowerModules, const uint16_t& nEligibleModules,cudaStream_t stream) -{ -#ifdef CACHE_ALLOC - int dev; - cudaGetDevice(&dev); - quintupletsInGPU.tripletIndices = (unsigned int*)cms::cuda::allocate_device(dev, 2 * nTotalQuintuplets * sizeof(unsigned int), stream); - quintupletsInGPU.lowerModuleIndices = (uint16_t*)cms::cuda::allocate_device(dev, 5 * nTotalQuintuplets * sizeof(uint16_t), stream); - quintupletsInGPU.nQuintuplets = (int*)cms::cuda::allocate_device(dev, nLowerModules * sizeof(int), stream); - quintupletsInGPU.totOccupancyQuintuplets = (int*)cms::cuda::allocate_device(dev, nLowerModules * sizeof(int), stream); - quintupletsInGPU.innerRadius = (FPX*)cms::cuda::allocate_device(dev, nTotalQuintuplets * sizeof(FPX), stream); - quintupletsInGPU.outerRadius = (FPX*)cms::cuda::allocate_device(dev, nTotalQuintuplets * sizeof(FPX), stream); - quintupletsInGPU.bridgeRadius = (FPX*)cms::cuda::allocate_device(dev, nTotalQuintuplets * sizeof(float), stream); - - quintupletsInGPU.pt = (FPX*)cms::cuda::allocate_device(dev, nTotalQuintuplets *4* sizeof(FPX), stream); - quintupletsInGPU.layer = (uint8_t*)cms::cuda::allocate_device(dev, nTotalQuintuplets * sizeof(uint8_t), stream); - quintupletsInGPU.isDup = (bool*)cms::cuda::allocate_device(dev, nTotalQuintuplets * sizeof(bool), stream); - quintupletsInGPU.TightCutFlag = (bool*)cms::cuda::allocate_device(dev, nTotalQuintuplets * sizeof(bool), stream); - quintupletsInGPU.partOfPT5 = (bool*)cms::cuda::allocate_device(dev, nTotalQuintuplets * sizeof(bool), stream); - quintupletsInGPU.regressionRadius = (float*)cms::cuda::allocate_device(dev, nTotalQuintuplets * sizeof(float), stream); - quintupletsInGPU.regressionG = (float*)cms::cuda::allocate_device(dev, nTotalQuintuplets * sizeof(float), stream); - quintupletsInGPU.regressionF = (float*)cms::cuda::allocate_device(dev, nTotalQuintuplets * sizeof(float), stream); - quintupletsInGPU.logicalLayers = (uint8_t*)cms::cuda::allocate_device(dev, nTotalQuintuplets * sizeof(uint8_t) * 5, stream); - quintupletsInGPU.hitIndices = (unsigned int*)cms::cuda::allocate_device(dev, nTotalQuintuplets * sizeof(unsigned int) * 10, stream); - quintupletsInGPU.nMemoryLocations = (unsigned int*)cms::cuda::allocate_device(dev, sizeof(unsigned int), stream); - - quintupletsInGPU.rzChiSquared = (float*)cms::cuda::allocate_device(dev, nTotalQuintuplets * sizeof(float), stream); - quintupletsInGPU.chiSquared = (float*)cms::cuda::allocate_device(dev, nTotalQuintuplets * sizeof(float), stream); - quintupletsInGPU.nonAnchorChiSquared = (float*)cms::cuda::allocate_device(dev, nTotalQuintuplets * sizeof(float), stream); -#else - cudaMalloc(&quintupletsInGPU.tripletIndices, 2 * nTotalQuintuplets * sizeof(unsigned int)); - cudaMalloc(&quintupletsInGPU.lowerModuleIndices, 5 * nTotalQuintuplets * sizeof(uint16_t)); - cudaMalloc(&quintupletsInGPU.nQuintuplets, nLowerModules * sizeof(int)); - cudaMalloc(&quintupletsInGPU.totOccupancyQuintuplets, nLowerModules * sizeof(int)); - cudaMalloc(&quintupletsInGPU.innerRadius, nTotalQuintuplets * sizeof(FPX)); - cudaMalloc(&quintupletsInGPU.outerRadius, nTotalQuintuplets * sizeof(FPX)); - cudaMalloc(&quintupletsInGPU.pt, nTotalQuintuplets *4* sizeof(FPX)); - cudaMalloc(&quintupletsInGPU.isDup, nTotalQuintuplets * sizeof(bool)); - cudaMalloc(&quintupletsInGPU.TightCutFlag, nTotalQuintuplets * sizeof(bool)); - cudaMalloc(&quintupletsInGPU.partOfPT5, nTotalQuintuplets * sizeof(bool)); - cudaMalloc(&quintupletsInGPU.layer, nTotalQuintuplets * sizeof(uint8_t)); - cudaMalloc(&quintupletsInGPU.regressionRadius, nTotalQuintuplets * sizeof(float)); - cudaMalloc(&quintupletsInGPU.regressionG, nTotalQuintuplets * sizeof(float)); - cudaMalloc(&quintupletsInGPU.regressionF, nTotalQuintuplets * sizeof(float)); - cudaMalloc(&quintupletsInGPU.logicalLayers, nTotalQuintuplets * 5 * sizeof(uint8_t)); - cudaMalloc(&quintupletsInGPU.hitIndices, nTotalQuintuplets * 10 * sizeof(unsigned int)); - cudaMalloc(&quintupletsInGPU.nMemoryLocations, sizeof(unsigned int)); - cudaMalloc(&quintupletsInGPU.bridgeRadius, nTotalQuintuplets * sizeof(float)); - cudaMalloc(&quintupletsInGPU.rzChiSquared, nTotalQuintuplets * sizeof(float)); - cudaMalloc(&quintupletsInGPU.chiSquared, nTotalQuintuplets * sizeof(float)); - cudaMalloc(&quintupletsInGPU.nonAnchorChiSquared, nTotalQuintuplets * sizeof(float)); - cudaMalloc(&quintupletsInGPU.nMemoryLocations, sizeof(unsigned int)); -#endif - cudaMemsetAsync(quintupletsInGPU.nQuintuplets,0,nLowerModules * sizeof(int),stream); - cudaMemsetAsync(quintupletsInGPU.totOccupancyQuintuplets,0,nLowerModules * sizeof(int),stream); - cudaMemsetAsync(quintupletsInGPU.isDup,0,nTotalQuintuplets * sizeof(bool),stream); - cudaMemsetAsync(quintupletsInGPU.TightCutFlag,0,nTotalQuintuplets * sizeof(bool),stream); - cudaMemsetAsync(quintupletsInGPU.partOfPT5,0,nTotalQuintuplets * sizeof(bool),stream); - cudaStreamSynchronize(stream); - quintupletsInGPU.eta = quintupletsInGPU.pt + nTotalQuintuplets; - quintupletsInGPU.phi = quintupletsInGPU.pt + 2*nTotalQuintuplets; - quintupletsInGPU.score_rphisum = quintupletsInGPU.pt + 3*nTotalQuintuplets; -} \ No newline at end of file diff --git a/SDL/Quintuplet.cuh b/SDL/Quintuplet.cuh index 0f408a30..3a8aa7e6 100644 --- a/SDL/Quintuplet.cuh +++ b/SDL/Quintuplet.cuh @@ -41,13 +41,105 @@ namespace SDL float* chiSquared; float* nonAnchorChiSquared; - quintuplets(); - ~quintuplets(); - void freeMemory(cudaStream_t stream); - void freeMemoryCache(); + template + void setData(TBuff& quintupletsbuf) + { + tripletIndices = alpaka::getPtrNative(quintupletsbuf.tripletIndices_buf); + lowerModuleIndices = alpaka::getPtrNative(quintupletsbuf.lowerModuleIndices_buf); + nQuintuplets = alpaka::getPtrNative(quintupletsbuf.nQuintuplets_buf); + totOccupancyQuintuplets = alpaka::getPtrNative(quintupletsbuf.totOccupancyQuintuplets_buf); + nMemoryLocations = alpaka::getPtrNative(quintupletsbuf.nMemoryLocations_buf); + innerRadius = alpaka::getPtrNative(quintupletsbuf.innerRadius_buf); + bridgeRadius = alpaka::getPtrNative(quintupletsbuf.bridgeRadius_buf); + outerRadius = alpaka::getPtrNative(quintupletsbuf.outerRadius_buf); + pt = alpaka::getPtrNative(quintupletsbuf.pt_buf); + eta = alpaka::getPtrNative(quintupletsbuf.eta_buf); + phi = alpaka::getPtrNative(quintupletsbuf.phi_buf); + score_rphisum = alpaka::getPtrNative(quintupletsbuf.score_rphisum_buf); + layer = alpaka::getPtrNative(quintupletsbuf.layer_buf); + isDup = alpaka::getPtrNative(quintupletsbuf.isDup_buf); + TightCutFlag = alpaka::getPtrNative(quintupletsbuf.TightCutFlag_buf); + partOfPT5 = alpaka::getPtrNative(quintupletsbuf.partOfPT5_buf); + regressionRadius = alpaka::getPtrNative(quintupletsbuf.regressionRadius_buf); + regressionG = alpaka::getPtrNative(quintupletsbuf.regressionG_buf); + regressionF = alpaka::getPtrNative(quintupletsbuf.regressionF_buf); + logicalLayers = alpaka::getPtrNative(quintupletsbuf.logicalLayers_buf); + hitIndices = alpaka::getPtrNative(quintupletsbuf.hitIndices_buf); + rzChiSquared = alpaka::getPtrNative(quintupletsbuf.rzChiSquared_buf); + chiSquared = alpaka::getPtrNative(quintupletsbuf.chiSquared_buf); + nonAnchorChiSquared = alpaka::getPtrNative(quintupletsbuf.nonAnchorChiSquared_buf); + } }; - void createQuintupletsInExplicitMemory(struct SDL::quintuplets& quintupletsInGPU, const unsigned int& maxQuintuplets, const uint16_t& nLowerModules, const uint16_t& nEligibleModules,cudaStream_t stream); + template + struct quintupletsBuffer : quintuplets + { + Buf tripletIndices_buf; + Buf lowerModuleIndices_buf; + Buf nQuintuplets_buf; + Buf totOccupancyQuintuplets_buf; + Buf nMemoryLocations_buf; + + Buf innerRadius_buf; + Buf bridgeRadius_buf; + Buf outerRadius_buf; + Buf pt_buf; + Buf eta_buf; + Buf phi_buf; + Buf score_rphisum_buf; + Buf layer_buf; + Buf isDup_buf; + Buf TightCutFlag_buf; + Buf partOfPT5_buf; + + Buf regressionRadius_buf; + Buf regressionG_buf; + Buf regressionF_buf; + + Buf logicalLayers_buf; + Buf hitIndices_buf; + Buf rzChiSquared_buf; + Buf chiSquared_buf; + Buf nonAnchorChiSquared_buf; + + template + quintupletsBuffer(unsigned int nTotalQuintuplets, + unsigned int nLowerModules, + TDevAcc const & devAccIn, + TQueue& queue) : + tripletIndices_buf(allocBufWrapper(devAccIn, 2 * nTotalQuintuplets)), + lowerModuleIndices_buf(allocBufWrapper(devAccIn, 5 * nTotalQuintuplets)), + nQuintuplets_buf(allocBufWrapper(devAccIn, nLowerModules)), + totOccupancyQuintuplets_buf(allocBufWrapper(devAccIn, nLowerModules)), + nMemoryLocations_buf(allocBufWrapper(devAccIn, 1)), + innerRadius_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)), + bridgeRadius_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)), + outerRadius_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)), + pt_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)), + eta_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)), + phi_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)), + score_rphisum_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)), + layer_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)), + isDup_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)), + TightCutFlag_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)), + partOfPT5_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)), + regressionRadius_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)), + regressionG_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)), + regressionF_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)), + logicalLayers_buf(allocBufWrapper(devAccIn, 5 * nTotalQuintuplets)), + hitIndices_buf(allocBufWrapper(devAccIn, 10 * nTotalQuintuplets)), + rzChiSquared_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)), + chiSquared_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)), + nonAnchorChiSquared_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)) + { + alpaka::memset(queue, nQuintuplets_buf, 0, nLowerModules); + alpaka::memset(queue, totOccupancyQuintuplets_buf, 0, nLowerModules); + alpaka::memset(queue, isDup_buf, 0, nTotalQuintuplets); + alpaka::memset(queue, TightCutFlag_buf, 0, nTotalQuintuplets); + alpaka::memset(queue, partOfPT5_buf, 0, nTotalQuintuplets); + alpaka::wait(queue); + } + }; ALPAKA_FN_ACC ALPAKA_FN_INLINE bool checkIntervalOverlap(const float& firstMin, const float& firstMax, const float& secondMin, const float& secondMax) { diff --git a/code/core/AccessHelper.cc b/code/core/AccessHelper.cc index 763200ea..5e95ed21 100644 --- a/code/core/AccessHelper.cc +++ b/code/core/AccessHelper.cc @@ -162,7 +162,7 @@ std::tuple, std::vector> getHitIdxsAndHi //____________________________________________________________________________________________ std::vector getT3sFromT5(SDL::Event* event, unsigned int T5) { - SDL::quintuplets& quintuplets_ = *(event->getQuintuplets()); + SDL::quintupletsBuffer& quintuplets_ = *(event->getQuintuplets()); unsigned int T3_1 = quintuplets_.tripletIndices[2 * T5]; unsigned int T3_2 = quintuplets_.tripletIndices[2 * T5 + 1]; return {T3_1, T3_2}; diff --git a/code/core/write_sdl_ntuple.cc b/code/core/write_sdl_ntuple.cc index e1284b34..abadd16b 100644 --- a/code/core/write_sdl_ntuple.cc +++ b/code/core/write_sdl_ntuple.cc @@ -306,7 +306,7 @@ void setPixelQuintupletOutputBranches(SDL::Event* event) { // ============ pT5 ============= SDL::pixelQuintuplets& pixelQuintupletsInGPU = (*event->getPixelQuintuplets()); - SDL::quintuplets& quintupletsInGPU = (*event->getQuintuplets()); + SDL::quintupletsBuffer& quintupletsInGPU = (*event->getQuintuplets()); SDL::segmentsBuffer& segmentsInGPU = (*event->getSegments()); SDL::modules& modulesInGPU = (*event->getModules()); int n_accepted_simtrk = ana.tx->getBranch>("sim_TC_matched").size(); @@ -391,7 +391,7 @@ void setPixelQuintupletOutputBranches(SDL::Event* event) //________________________________________________________________________________________________________________________________ void setQuintupletOutputBranches(SDL::Event* event) { - SDL::quintuplets& quintupletsInGPU = (*event->getQuintuplets()); + SDL::quintupletsBuffer& quintupletsInGPU = (*event->getQuintuplets()); SDL::objectRangesBuffer& rangesInGPU = (*event->getRanges()); SDL::modules& modulesInGPU = (*event->getModules()); const float kRinv1GeVf = (2.99792458e-3 * 3.8); @@ -409,7 +409,7 @@ void setQuintupletOutputBranches(SDL::Event* event) float pt = quintupletsInGPU.innerRadius[quintupletIndex] * kRinv1GeVf; float eta = __H2F(quintupletsInGPU.eta[quintupletIndex]); float phi = __H2F(quintupletsInGPU.phi[quintupletIndex]); - + std::vector hit_idx = getHitIdxsFromT5(event, quintupletIndex); std::vector hit_type = getHitTypesFromT5(event, quintupletIndex); std::vector module_idx = getModuleIdxsFromT5(event, quintupletIndex); From dbadfd9a79beafef0c6fbc1ba0e469caaa45f540 Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Thu, 8 Jun 2023 12:51:51 -0400 Subject: [PATCH 22/44] move trackcans to Alpaka memory --- SDL/Event.cu | 95 +++++++++++--------------- SDL/Event.cuh | 7 +- SDL/LST.cc | 2 +- SDL/TrackCandidate.cu | 123 ---------------------------------- SDL/TrackCandidate.cuh | 79 ++++++++++++++++++++-- code/core/AccessHelper.cc | 4 +- code/core/write_sdl_ntuple.cc | 16 ++--- 7 files changed, 128 insertions(+), 198 deletions(-) delete mode 100644 SDL/TrackCandidate.cu diff --git a/SDL/Event.cu b/SDL/Event.cu index 65774128..5a325624 100644 --- a/SDL/Event.cu +++ b/SDL/Event.cu @@ -62,18 +62,16 @@ SDL::Event::~Event() if(mdsInGPU){mdsInGPU->freeMemoryCache();} if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemoryCache();} if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemoryCache();} - if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemoryCache();} #else if(mdsInGPU){mdsInGPU->freeMemory(stream);} if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemory(stream);} if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemory(stream);} - if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemory(stream);} #endif if(rangesInGPU != nullptr){delete rangesInGPU; delete rangesBuffers;} if(mdsInGPU != nullptr){cms::cuda::free_host(mdsInGPU);} if(segmentsInGPU != nullptr){delete segmentsInGPU; delete segmentsBuffers;} if(tripletsInGPU!= nullptr){delete tripletsInGPU; delete tripletsBuffers;} - if(trackCandidatesInGPU!= nullptr){cms::cuda::free_host(trackCandidatesInGPU);} + if(trackCandidatesInGPU!= nullptr){delete trackCandidatesInGPU; delete trackCandidatesBuffers;} if(hitsInGPU!= nullptr){delete hitsInGPU; delete hitsBuffers;} if(pixelTripletsInGPU!= nullptr){cms::cuda::free_host(pixelTripletsInGPU);} if(pixelQuintupletsInGPU!= nullptr){cms::cuda::free_host(pixelQuintupletsInGPU);} @@ -141,11 +139,6 @@ SDL::Event::~Event() if(trackCandidatesInCPU != nullptr) { - delete[] trackCandidatesInCPU->objectIndices; - delete[] trackCandidatesInCPU->trackCandidateType; - delete[] trackCandidatesInCPU->nTrackCandidates; - delete[] trackCandidatesInCPU->hitIndices; - delete[] trackCandidatesInCPU->logicalLayers; delete trackCandidatesInCPU; } @@ -198,12 +191,10 @@ void SDL::Event::resetEvent() if(mdsInGPU){mdsInGPU->freeMemoryCache();} if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemoryCache();} if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemoryCache();} - if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemoryCache();} #else if(mdsInGPU){mdsInGPU->freeMemory(stream);} if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemory(stream);} if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemory(stream);} - if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemory(stream);} #endif //reset the arrays for(int i = 0; i<6; i++) @@ -236,7 +227,7 @@ void SDL::Event::resetEvent() tripletsInGPU = nullptr;} if(quintupletsInGPU){delete quintupletsInGPU; delete quintupletsBuffers; quintupletsInGPU = nullptr;} - if(trackCandidatesInGPU){cms::cuda::free_host(trackCandidatesInGPU); + if(trackCandidatesInGPU){delete trackCandidatesInGPU; delete trackCandidatesBuffers; trackCandidatesInGPU = nullptr;} if(pixelTripletsInGPU){cms::cuda::free_host(pixelTripletsInGPU); pixelTripletsInGPU = nullptr;} @@ -306,12 +297,6 @@ void SDL::Event::resetEvent() } if(trackCandidatesInCPU != nullptr) { - delete[] trackCandidatesInCPU->objectIndices; - delete[] trackCandidatesInCPU->trackCandidateType; - delete[] trackCandidatesInCPU->nTrackCandidates; - delete[] trackCandidatesInCPU->logicalLayers; - delete[] trackCandidatesInCPU->hitIndices; - delete[] trackCandidatesInCPU->lowerModuleIndices; delete trackCandidatesInCPU; trackCandidatesInCPU = nullptr; } @@ -796,7 +781,6 @@ void SDL::Event::createSegmentsWithModuleMap() } } - void SDL::Event::createTriplets() { if(tripletsInGPU == nullptr) @@ -908,8 +892,9 @@ void SDL::Event::createTrackCandidates() cudaMemcpyAsync(&nEligibleModules,rangesInGPU->nEligibleT5Modules,sizeof(uint16_t),cudaMemcpyDeviceToHost,stream); if(trackCandidatesInGPU == nullptr) { - trackCandidatesInGPU = (SDL::trackCandidates*)cms::cuda::allocate_host(sizeof(SDL::trackCandidates), stream); - createTrackCandidatesInExplicitMemory(*trackCandidatesInGPU, N_MAX_TRACK_CANDIDATES + N_MAX_PIXEL_TRACK_CANDIDATES,stream); + trackCandidatesInGPU = new SDL::trackCandidates(); + trackCandidatesBuffers = new SDL::trackCandidatesBuffer(N_MAX_TRACK_CANDIDATES + N_MAX_PIXEL_TRACK_CANDIDATES, devAcc, queue); + trackCandidatesInGPU->setData(*trackCandidatesBuffers); } Vec const threadsPerBlock_crossCleanpT3(static_cast(1), static_cast(16), static_cast(64)); @@ -1288,8 +1273,9 @@ void SDL::Event::createPixelQuintuplets() } if(trackCandidatesInGPU == nullptr) { - trackCandidatesInGPU = (SDL::trackCandidates*)cms::cuda::allocate_host(sizeof(SDL::trackCandidates), stream); - createTrackCandidatesInExplicitMemory(*trackCandidatesInGPU, N_MAX_TRACK_CANDIDATES + N_MAX_PIXEL_TRACK_CANDIDATES,stream); + trackCandidatesInGPU = new SDL::trackCandidates(); + trackCandidatesBuffers = new SDL::trackCandidatesBuffer(N_MAX_TRACK_CANDIDATES + N_MAX_PIXEL_TRACK_CANDIDATES, devAcc, queue); + trackCandidatesInGPU->setData(*trackCandidatesBuffers); } unsigned int pixelModuleIndex; @@ -1997,52 +1983,49 @@ SDL::pixelQuintuplets* SDL::Event::getPixelQuintuplets() return pixelQuintupletsInCPU; } -SDL::trackCandidates* SDL::Event::getTrackCandidates() +SDL::trackCandidatesBuffer* SDL::Event::getTrackCandidates() { if(trackCandidatesInCPU == nullptr) { - trackCandidatesInCPU = new SDL::trackCandidates; - trackCandidatesInCPU->nTrackCandidates = new int; - cudaMemcpyAsync(trackCandidatesInCPU->nTrackCandidates, trackCandidatesInGPU->nTrackCandidates, sizeof(int), cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); - int nTrackCandidates = *(trackCandidatesInCPU->nTrackCandidates); - - trackCandidatesInCPU->directObjectIndices = new unsigned int[nTrackCandidates]; - trackCandidatesInCPU->objectIndices = new unsigned int[2 * nTrackCandidates]; - trackCandidatesInCPU->trackCandidateType = new short[nTrackCandidates]; - trackCandidatesInCPU->hitIndices = new unsigned int[14 * nTrackCandidates]; - trackCandidatesInCPU->pixelSeedIndex = new int[nTrackCandidates]; - trackCandidatesInCPU->logicalLayers = new uint8_t[7 * nTrackCandidates]; - - cudaMemcpyAsync(trackCandidatesInCPU->hitIndices, trackCandidatesInGPU->hitIndices, 14 * nTrackCandidates * sizeof(unsigned int), cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(trackCandidatesInCPU->pixelSeedIndex, trackCandidatesInGPU->pixelSeedIndex, nTrackCandidates * sizeof(int), cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(trackCandidatesInCPU->logicalLayers, trackCandidatesInGPU->logicalLayers, 7 * nTrackCandidates * sizeof(uint8_t), cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(trackCandidatesInCPU->directObjectIndices, trackCandidatesInGPU->directObjectIndices, nTrackCandidates * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(trackCandidatesInCPU->objectIndices, trackCandidatesInGPU->objectIndices, 2 * nTrackCandidates * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(trackCandidatesInCPU->trackCandidateType, trackCandidatesInGPU->trackCandidateType, nTrackCandidates * sizeof(short), cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); + // Get nTrackLocal parameter to initialize host based trackCandidatesInCPU + auto nTrackLocal_buf = allocBufWrapper(devHost, 1); + alpaka::memcpy(queue, nTrackLocal_buf, trackCandidatesBuffers->nTrackCandidates_buf, 1); + alpaka::wait(queue); + + int nTrackLocal = *alpaka::getPtrNative(nTrackLocal_buf); + trackCandidatesInCPU = new SDL::trackCandidatesBuffer(N_MAX_TRACK_CANDIDATES + N_MAX_PIXEL_TRACK_CANDIDATES, devHost, queue); + trackCandidatesInCPU->setData(*trackCandidatesInCPU); + + *alpaka::getPtrNative(trackCandidatesInCPU->nTrackCandidates_buf) = nTrackLocal; + alpaka::memcpy(queue, trackCandidatesInCPU->hitIndices_buf, trackCandidatesBuffers->hitIndices_buf, 14 * nTrackLocal); + alpaka::memcpy(queue, trackCandidatesInCPU->pixelSeedIndex_buf, trackCandidatesBuffers->pixelSeedIndex_buf, nTrackLocal); + alpaka::memcpy(queue, trackCandidatesInCPU->logicalLayers_buf, trackCandidatesBuffers->logicalLayers_buf, 7 * nTrackLocal); + alpaka::memcpy(queue, trackCandidatesInCPU->directObjectIndices_buf, trackCandidatesBuffers->directObjectIndices_buf, nTrackLocal); + alpaka::memcpy(queue, trackCandidatesInCPU->objectIndices_buf, trackCandidatesBuffers->objectIndices_buf, 2 * nTrackLocal); + alpaka::memcpy(queue, trackCandidatesInCPU->trackCandidateType_buf, trackCandidatesBuffers->trackCandidateType_buf, nTrackLocal); + alpaka::wait(queue); } return trackCandidatesInCPU; } -SDL::trackCandidates* SDL::Event::getTrackCandidatesInCMSSW() +SDL::trackCandidatesBuffer* SDL::Event::getTrackCandidatesInCMSSW() { if(trackCandidatesInCPU == nullptr) { - trackCandidatesInCPU = new SDL::trackCandidates; - trackCandidatesInCPU->nTrackCandidates = new int; - cudaMemcpyAsync(trackCandidatesInCPU->nTrackCandidates, trackCandidatesInGPU->nTrackCandidates, sizeof(unsigned int), cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); - unsigned int nTrackCandidates = *(trackCandidatesInCPU->nTrackCandidates); + // Get nTrackLocal parameter to initialize host based trackCandidatesInCPU + auto nTrackLocal_buf = allocBufWrapper(devHost, 1); + alpaka::memcpy(queue, nTrackLocal_buf, trackCandidatesBuffers->nTrackCandidates_buf, 1); + alpaka::wait(queue); - trackCandidatesInCPU->trackCandidateType = new short[nTrackCandidates]; - trackCandidatesInCPU->hitIndices = new unsigned int[14 * nTrackCandidates]; - trackCandidatesInCPU->pixelSeedIndex = new int[nTrackCandidates]; + int nTrackLocal = *alpaka::getPtrNative(nTrackLocal_buf); + trackCandidatesInCPU = new SDL::trackCandidatesBuffer(N_MAX_TRACK_CANDIDATES + N_MAX_PIXEL_TRACK_CANDIDATES, devHost, queue); + trackCandidatesInCPU->setData(*trackCandidatesInCPU); - cudaMemcpyAsync(trackCandidatesInCPU->hitIndices, trackCandidatesInGPU->hitIndices, 14 * nTrackCandidates * sizeof(unsigned int), cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(trackCandidatesInCPU->pixelSeedIndex, trackCandidatesInGPU->pixelSeedIndex, nTrackCandidates * sizeof(int), cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(trackCandidatesInCPU->trackCandidateType, trackCandidatesInGPU->trackCandidateType, nTrackCandidates * sizeof(short), cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); + *alpaka::getPtrNative(trackCandidatesInCPU->nTrackCandidates_buf) = nTrackLocal; + alpaka::memcpy(queue, trackCandidatesInCPU->hitIndices_buf, trackCandidatesBuffers->hitIndices_buf, 14 * nTrackLocal); + alpaka::memcpy(queue, trackCandidatesInCPU->pixelSeedIndex_buf, trackCandidatesBuffers->pixelSeedIndex_buf, nTrackLocal); + alpaka::memcpy(queue, trackCandidatesInCPU->trackCandidateType_buf, trackCandidatesBuffers->trackCandidateType_buf, nTrackLocal); + alpaka::wait(queue); } return trackCandidatesInCPU; } diff --git a/SDL/Event.cuh b/SDL/Event.cuh index db173f3e..bbdd93f1 100644 --- a/SDL/Event.cuh +++ b/SDL/Event.cuh @@ -51,6 +51,7 @@ namespace SDL struct quintuplets* quintupletsInGPU; struct quintupletsBuffer* quintupletsBuffers; struct trackCandidates* trackCandidatesInGPU; + struct trackCandidatesBuffer* trackCandidatesBuffers; struct pixelTriplets* pixelTripletsInGPU; struct pixelQuintuplets* pixelQuintupletsInGPU; @@ -60,7 +61,7 @@ namespace SDL miniDoublets* mdsInCPU; segmentsBuffer* segmentsInCPU; tripletsBuffer* tripletsInCPU; - trackCandidates* trackCandidatesInCPU; + trackCandidatesBuffer* trackCandidatesInCPU; modules* modulesInCPU; modules* modulesInCPUFull; quintupletsBuffer* quintupletsInCPU; @@ -141,8 +142,8 @@ namespace SDL segmentsBuffer* getSegments() ; tripletsBuffer* getTriplets(); quintupletsBuffer* getQuintuplets(); - trackCandidates* getTrackCandidates(); - trackCandidates* getTrackCandidatesInCMSSW(); + trackCandidatesBuffer* getTrackCandidates(); + trackCandidatesBuffer* getTrackCandidatesInCMSSW(); pixelTriplets* getPixelTriplets(); modules* getModules(); modules* getFullModules(); diff --git a/SDL/LST.cc b/SDL/LST.cc index a46a6167..9f9930c3 100644 --- a/SDL/LST.cc +++ b/SDL/LST.cc @@ -404,7 +404,7 @@ void SDL::LST::getOutput(SDL::Event& event) { std::vector tc_trackCandidateType_; SDL::hitsBuffer& hitsInGPU = (*event.getHitsInCMSSW()); - SDL::trackCandidates& trackCandidatesInGPU = (*event.getTrackCandidatesInCMSSW()); + SDL::trackCandidatesBuffer& trackCandidatesInGPU = (*event.getTrackCandidatesInCMSSW()); unsigned int nTrackCandidates = *trackCandidatesInGPU.nTrackCandidates; for (unsigned int idx = 0; idx < nTrackCandidates; idx++) { diff --git a/SDL/TrackCandidate.cu b/SDL/TrackCandidate.cu deleted file mode 100644 index 7853de30..00000000 --- a/SDL/TrackCandidate.cu +++ /dev/null @@ -1,123 +0,0 @@ -#include "TrackCandidate.cuh" - -void SDL::createTrackCandidatesInExplicitMemory(struct trackCandidates& trackCandidatesInGPU, unsigned int maxTrackCandidates,cudaStream_t stream) -{ -#ifdef CACHE_ALLOC - int dev; - cudaGetDevice(&dev); - trackCandidatesInGPU.trackCandidateType = (short*)cms::cuda::allocate_device(dev,maxTrackCandidates * sizeof(short),stream); - trackCandidatesInGPU.directObjectIndices = (unsigned int*)cms::cuda::allocate_device(dev,maxTrackCandidates * sizeof(unsigned int),stream); - trackCandidatesInGPU.objectIndices = (unsigned int*)cms::cuda::allocate_device(dev,maxTrackCandidates * 2*sizeof(unsigned int),stream); - trackCandidatesInGPU.nTrackCandidates= (int*)cms::cuda::allocate_device(dev, sizeof(int),stream); - trackCandidatesInGPU.nTrackCandidatespT3= (int*)cms::cuda::allocate_device(dev, sizeof(int),stream); - trackCandidatesInGPU.nTrackCandidatesT5= (int*)cms::cuda::allocate_device(dev, sizeof(int),stream); - trackCandidatesInGPU.nTrackCandidatespT5= (int*)cms::cuda::allocate_device(dev, sizeof(int),stream); - trackCandidatesInGPU.nTrackCandidatespLS= (int*)cms::cuda::allocate_device(dev, sizeof(int),stream); - - trackCandidatesInGPU.logicalLayers = (uint8_t*)cms::cuda::allocate_device(dev, 7 * maxTrackCandidates * sizeof(uint8_t), stream); - trackCandidatesInGPU.lowerModuleIndices = (uint16_t*)cms::cuda::allocate_device(dev, 7 * maxTrackCandidates * sizeof(uint16_t), stream); - trackCandidatesInGPU.hitIndices = (unsigned int*)cms::cuda::allocate_device(dev, 14 * maxTrackCandidates * sizeof(unsigned int), stream); - trackCandidatesInGPU.pixelSeedIndex = (int*)cms::cuda::allocate_device(dev, maxTrackCandidates * sizeof(int), stream); - trackCandidatesInGPU.centerX = (FPX*)cms::cuda::allocate_device(dev, maxTrackCandidates * sizeof(FPX), stream); - trackCandidatesInGPU.centerY = (FPX*)cms::cuda::allocate_device(dev, maxTrackCandidates * sizeof(FPX), stream); - trackCandidatesInGPU.radius = (FPX*)cms::cuda::allocate_device(dev, maxTrackCandidates * sizeof(FPX), stream); - -#else - cudaMalloc(&trackCandidatesInGPU.trackCandidateType, maxTrackCandidates * sizeof(short)); - cudaMalloc(&trackCandidatesInGPU.directObjectIndices, maxTrackCandidates * sizeof(unsigned int)); - cudaMalloc(&trackCandidatesInGPU.objectIndices, 2 * maxTrackCandidates * sizeof(unsigned int)); - cudaMalloc(&trackCandidatesInGPU.nTrackCandidates, sizeof(int)); - cudaMalloc(&trackCandidatesInGPU.nTrackCandidatespT3, sizeof(int)); - cudaMalloc(&trackCandidatesInGPU.nTrackCandidatesT5, sizeof(int)); - cudaMalloc(&trackCandidatesInGPU.nTrackCandidatespT5, sizeof(int)); - cudaMalloc(&trackCandidatesInGPU.nTrackCandidatespLS, sizeof(int)); - - cudaMalloc(&trackCandidatesInGPU.logicalLayers, 7 * maxTrackCandidates * sizeof(uint8_t)); - cudaMalloc(&trackCandidatesInGPU.lowerModuleIndices, 7 * maxTrackCandidates * sizeof(uint16_t)); - cudaMalloc(&trackCandidatesInGPU.hitIndices, 14 * maxTrackCandidates * sizeof(unsigned int)); - cudaMalloc(&trackCandidatesInGPU.pixelSeedIndex, maxTrackCandidates * sizeof(int)); - cudaMalloc(&trackCandidatesInGPU.centerX, maxTrackCandidates * sizeof(FPX)); - cudaMalloc(&trackCandidatesInGPU.centerY, maxTrackCandidates * sizeof(FPX)); - cudaMalloc(&trackCandidatesInGPU.radius , maxTrackCandidates * sizeof(FPX)); -#endif - cudaMemsetAsync(trackCandidatesInGPU.nTrackCandidates,0, sizeof(int), stream); - cudaMemsetAsync(trackCandidatesInGPU.nTrackCandidatesT5,0, sizeof(int), stream); - cudaMemsetAsync(trackCandidatesInGPU.nTrackCandidatespT3,0, sizeof(int), stream); - cudaMemsetAsync(trackCandidatesInGPU.nTrackCandidatespT5,0, sizeof(int), stream); - cudaMemsetAsync(trackCandidatesInGPU.nTrackCandidatespLS,0, sizeof(int), stream); - cudaMemsetAsync(trackCandidatesInGPU.logicalLayers, 0, 7 * maxTrackCandidates * sizeof(uint8_t), stream); - cudaMemsetAsync(trackCandidatesInGPU.lowerModuleIndices, 0, 7 * maxTrackCandidates * sizeof(uint16_t), stream); - cudaMemsetAsync(trackCandidatesInGPU.hitIndices, 0, 14 * maxTrackCandidates * sizeof(unsigned int), stream); - cudaMemsetAsync(trackCandidatesInGPU.pixelSeedIndex, 0, maxTrackCandidates * sizeof(int), stream); - cudaStreamSynchronize(stream); -} - -SDL::trackCandidates::trackCandidates() -{ - trackCandidateType = nullptr; - directObjectIndices = nullptr; - objectIndices = nullptr; - nTrackCandidates = nullptr; - nTrackCandidatesT5 = nullptr; - nTrackCandidatespT3 = nullptr; - nTrackCandidatespT5 = nullptr; - nTrackCandidatespLS = nullptr; - - logicalLayers = nullptr; - hitIndices = nullptr; - pixelSeedIndex = nullptr; - lowerModuleIndices = nullptr; - centerX = nullptr; - centerY = nullptr; - radius = nullptr; -} - -SDL::trackCandidates::~trackCandidates() -{ -} - -void SDL::trackCandidates::freeMemoryCache() -{ - int dev; - cudaGetDevice(&dev); - //FIXME - //cudaFree(trackCandidateType); - cms::cuda::free_device(dev,directObjectIndices); - cms::cuda::free_device(dev,objectIndices); - cms::cuda::free_device(dev,trackCandidateType); - cms::cuda::free_device(dev,nTrackCandidates); - cms::cuda::free_device(dev,nTrackCandidatespT3); - cms::cuda::free_device(dev,nTrackCandidatesT5); - cms::cuda::free_device(dev,nTrackCandidatespT5); - cms::cuda::free_device(dev,nTrackCandidatespLS); - - cms::cuda::free_device(dev, logicalLayers); - cms::cuda::free_device(dev, hitIndices); - cms::cuda::free_device(dev, pixelSeedIndex); - cms::cuda::free_device(dev, lowerModuleIndices); - cms::cuda::free_device(dev, centerX); - cms::cuda::free_device(dev, centerY); - cms::cuda::free_device(dev, radius); -} - -void SDL::trackCandidates::freeMemory(cudaStream_t stream) -{ - cudaFree(trackCandidateType); - cudaFree(directObjectIndices); - cudaFree(objectIndices); - cudaFree(nTrackCandidates); - cudaFree(nTrackCandidatespT3); - cudaFree(nTrackCandidatesT5); - cudaFree(nTrackCandidatespT5); - cudaFree(nTrackCandidatespLS); - - cudaFree(logicalLayers); - cudaFree(hitIndices); - cudaFree(pixelSeedIndex); - cudaFree(lowerModuleIndices); - cudaFree(centerX); - cudaFree(centerY); - cudaFree(radius); - - cudaStreamSynchronize(stream); -} diff --git a/SDL/TrackCandidate.cuh b/SDL/TrackCandidate.cuh index d81a570d..c11ae247 100644 --- a/SDL/TrackCandidate.cuh +++ b/SDL/TrackCandidate.cuh @@ -32,13 +32,82 @@ namespace SDL FPX* centerY; FPX* radius; - trackCandidates(); - ~trackCandidates(); - void freeMemory(cudaStream_t stream); - void freeMemoryCache(); + template + void setData(TBuff& trackCandidatesbuf) + { + trackCandidateType = alpaka::getPtrNative(trackCandidatesbuf.trackCandidateType_buf); + directObjectIndices = alpaka::getPtrNative(trackCandidatesbuf.directObjectIndices_buf); + objectIndices = alpaka::getPtrNative(trackCandidatesbuf.objectIndices_buf); + nTrackCandidates = alpaka::getPtrNative(trackCandidatesbuf.nTrackCandidates_buf); + nTrackCandidatespT3 = alpaka::getPtrNative(trackCandidatesbuf.nTrackCandidatespT3_buf); + nTrackCandidatespT5 = alpaka::getPtrNative(trackCandidatesbuf.nTrackCandidatespT5_buf); + nTrackCandidatespLS = alpaka::getPtrNative(trackCandidatesbuf.nTrackCandidatespLS_buf); + nTrackCandidatesT5 = alpaka::getPtrNative(trackCandidatesbuf.nTrackCandidatesT5_buf); + + logicalLayers = alpaka::getPtrNative(trackCandidatesbuf.logicalLayers_buf); + hitIndices = alpaka::getPtrNative(trackCandidatesbuf.hitIndices_buf); + pixelSeedIndex = alpaka::getPtrNative(trackCandidatesbuf.pixelSeedIndex_buf); + lowerModuleIndices = alpaka::getPtrNative(trackCandidatesbuf.lowerModuleIndices_buf); + + centerX = alpaka::getPtrNative(trackCandidatesbuf.centerX_buf); + centerY = alpaka::getPtrNative(trackCandidatesbuf.centerY_buf); + radius = alpaka::getPtrNative(trackCandidatesbuf.radius_buf); + } }; - void createTrackCandidatesInExplicitMemory(struct trackCandidates& trackCandidatesInGPU, unsigned int maxTrackCandidates,cudaStream_t stream); + template + struct trackCandidatesBuffer : trackCandidates + { + Buf trackCandidateType_buf; + Buf directObjectIndices_buf; + Buf objectIndices_buf; + Buf nTrackCandidates_buf; + Buf nTrackCandidatespT3_buf; + Buf nTrackCandidatespT5_buf; + Buf nTrackCandidatespLS_buf; + Buf nTrackCandidatesT5_buf; + + Buf logicalLayers_buf; + Buf hitIndices_buf; + Buf pixelSeedIndex_buf; + Buf lowerModuleIndices_buf; + + Buf centerX_buf; + Buf centerY_buf; + Buf radius_buf; + + template + trackCandidatesBuffer(unsigned int maxTrackCandidates, + TDevAcc const & devAccIn, + TQueue& queue) : + trackCandidateType_buf(allocBufWrapper(devAccIn, maxTrackCandidates)), + directObjectIndices_buf(allocBufWrapper(devAccIn, maxTrackCandidates)), + objectIndices_buf(allocBufWrapper(devAccIn, 2 * maxTrackCandidates)), + nTrackCandidates_buf(allocBufWrapper(devAccIn, 1)), + nTrackCandidatespT3_buf(allocBufWrapper(devAccIn, 1)), + nTrackCandidatespT5_buf(allocBufWrapper(devAccIn, 1)), + nTrackCandidatespLS_buf(allocBufWrapper(devAccIn, 1)), + nTrackCandidatesT5_buf(allocBufWrapper(devAccIn, 1)), + logicalLayers_buf(allocBufWrapper(devAccIn, 7 * maxTrackCandidates)), + hitIndices_buf(allocBufWrapper(devAccIn, 14 * maxTrackCandidates)), + pixelSeedIndex_buf(allocBufWrapper(devAccIn, maxTrackCandidates)), + lowerModuleIndices_buf(allocBufWrapper(devAccIn, 7 * maxTrackCandidates)), + centerX_buf(allocBufWrapper(devAccIn, maxTrackCandidates)), + centerY_buf(allocBufWrapper(devAccIn, maxTrackCandidates)), + radius_buf(allocBufWrapper(devAccIn, maxTrackCandidates)) + { + alpaka::memset(queue, nTrackCandidates_buf, 0, 1); + alpaka::memset(queue, nTrackCandidatesT5_buf, 0, 1); + alpaka::memset(queue, nTrackCandidatespT3_buf, 0, 1); + alpaka::memset(queue, nTrackCandidatespT5_buf, 0, 1); + alpaka::memset(queue, nTrackCandidatespLS_buf, 0, 1); + alpaka::memset(queue, logicalLayers_buf, 0, 7 * maxTrackCandidates); + alpaka::memset(queue, lowerModuleIndices_buf, 0, 7 * maxTrackCandidates); + alpaka::memset(queue, hitIndices_buf, 0, 14 * maxTrackCandidates); + alpaka::memset(queue, pixelSeedIndex_buf, 0, maxTrackCandidates); + alpaka::wait(queue); + } + }; ALPAKA_FN_ACC ALPAKA_FN_INLINE void addpLSTrackCandidateToMemory(struct trackCandidates& trackCandidatesInGPU, unsigned int trackletIndex, unsigned int trackCandidateIndex, uint4 hitIndices, int pixelSeedIndex) { diff --git a/code/core/AccessHelper.cc b/code/core/AccessHelper.cc index 5e95ed21..bf6025db 100644 --- a/code/core/AccessHelper.cc +++ b/code/core/AccessHelper.cc @@ -452,7 +452,7 @@ std::tuple, std::vector> getHitIdxsAndHi std::vector getLSsFromTC(SDL::Event* event, unsigned int TC) { // Get the type of the track candidate - SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates()); + SDL::trackCandidatesBuffer& trackCandidatesInGPU = (*event->getTrackCandidates()); short type = trackCandidatesInGPU.trackCandidateType[TC]; unsigned int objidx = trackCandidatesInGPU.directObjectIndices[TC]; switch (type) @@ -468,7 +468,7 @@ std::vector getLSsFromTC(SDL::Event* event, unsigned int TC) std::tuple, std::vector> getHitIdxsAndHitTypesFromTC(SDL::Event* event, unsigned TC) { // Get the type of the track candidate - SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates()); + SDL::trackCandidatesBuffer& trackCandidatesInGPU = (*event->getTrackCandidates()); short type = trackCandidatesInGPU.trackCandidateType[TC]; unsigned int objidx = trackCandidatesInGPU.directObjectIndices[TC]; switch (type) diff --git a/code/core/write_sdl_ntuple.cc b/code/core/write_sdl_ntuple.cc index abadd16b..78abd3d2 100644 --- a/code/core/write_sdl_ntuple.cc +++ b/code/core/write_sdl_ntuple.cc @@ -232,7 +232,7 @@ void setOutputBranches(SDL::Event* event) std::vector> tc_matched_simIdx; // ============ Track candidates ============= - SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates()); + SDL::trackCandidatesBuffer& trackCandidatesInGPU = (*event->getTrackCandidates()); unsigned int nTrackCandidates = *trackCandidatesInGPU.nTrackCandidates; for (unsigned int idx = 0; idx < nTrackCandidates; idx++) { @@ -564,7 +564,7 @@ void setGnnNtupleBranches(SDL::Event* event) SDL::hitsBuffer& hitsInGPU = (*event->getHits()); SDL::modules& modulesInGPU = (*event->getModules()); SDL::objectRangesBuffer& rangesInGPU = (*event->getRanges()); - SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates()); + SDL::trackCandidatesBuffer& trackCandidatesInGPU = (*event->getTrackCandidates()); std::set mds_used_in_sg; std::map md_index_map; @@ -785,7 +785,7 @@ void setGnnNtupleMiniDoublet(SDL::Event* event, unsigned int MD) std::tuple> parseTrackCandidate(SDL::Event* event, unsigned int idx) { // Get the type of the track candidate - SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates()); + SDL::trackCandidatesBuffer& trackCandidatesInGPU = (*event->getTrackCandidates()); short type = trackCandidatesInGPU.trackCandidateType[idx]; enum @@ -819,7 +819,7 @@ std::tuple> parseTrackCandidate(SDL:: std::tuple, vector> parsepT5(SDL::Event* event, unsigned int idx) { // Get relevant information - SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates()); + SDL::trackCandidatesBuffer& trackCandidatesInGPU = (*event->getTrackCandidates()); SDL::tripletsBuffer& tripletsInGPU = (*event->getTriplets()); SDL::segmentsBuffer& segmentsInGPU = (*event->getSegments()); SDL::hitsBuffer& hitsInGPU = (*event->getHits()); @@ -957,7 +957,7 @@ std::tuple, vector> pars std::tuple, vector> parsepT3(SDL::Event* event, unsigned int idx) { // Get relevant information - SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates()); + SDL::trackCandidatesBuffer& trackCandidatesInGPU = (*event->getTrackCandidates()); SDL::tripletsBuffer& tripletsInGPU = (*event->getTriplets()); SDL::segmentsBuffer& segmentsInGPU = (*event->getSegments()); SDL::hitsBuffer& hitsInGPU = (*event->getHits()); @@ -1004,7 +1004,7 @@ std::tuple, vector> pars //________________________________________________________________________________________________________________________________ std::tuple, vector> parseT5(SDL::Event* event, unsigned int idx) { - SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates()); + SDL::trackCandidatesBuffer& trackCandidatesInGPU = (*event->getTrackCandidates()); SDL::tripletsBuffer& tripletsInGPU = (*event->getTriplets()); SDL::hitsBuffer& hitsInGPU = (*event->getHits()); unsigned int T5 = trackCandidatesInGPU.directObjectIndices[idx]; @@ -1058,7 +1058,7 @@ std::tuple, vector> pars //________________________________________________________________________________________________________________________________ std::tuple, vector> parsepLS(SDL::Event* event, unsigned int idx) { - SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates()); + SDL::trackCandidatesBuffer& trackCandidatesInGPU = (*event->getTrackCandidates()); SDL::segmentsBuffer& segmentsInGPU = (*event->getSegments()); // Getting pLS index @@ -1278,7 +1278,7 @@ void printT3s(SDL::Event* event) //________________________________________________________________________________________________________________________________ void debugPrintOutlierMultiplicities(SDL::Event* event) { - SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates()); + SDL::trackCandidatesBuffer& trackCandidatesInGPU = (*event->getTrackCandidates()); SDL::tripletsBuffer& tripletsInGPU = (*event->getTriplets()); SDL::segmentsBuffer& segmentsInGPU = (*event->getSegments()); SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); From 337b7b7168c4d796d19a7cf315b2127418a47ed8 Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Thu, 8 Jun 2023 15:47:01 -0400 Subject: [PATCH 23/44] move minidoublets to Alpaka memory --- SDL/Event.cu | 79 ++++++------------- SDL/Event.cuh | 7 +- SDL/MiniDoublet.cu | 139 --------------------------------- SDL/MiniDoublet.cuh | 143 ++++++++++++++++++++++++++++++---- code/core/AccessHelper.cc | 4 +- code/core/write_sdl_ntuple.cc | 16 ++-- 6 files changed, 169 insertions(+), 219 deletions(-) delete mode 100644 SDL/MiniDoublet.cu diff --git a/SDL/Event.cu b/SDL/Event.cu index 5a325624..21bfc6cc 100644 --- a/SDL/Event.cu +++ b/SDL/Event.cu @@ -59,16 +59,14 @@ SDL::Event::Event(cudaStream_t estream, bool verbose): queue(alpaka::getDevByIdx SDL::Event::~Event() { #ifdef CACHE_ALLOC - if(mdsInGPU){mdsInGPU->freeMemoryCache();} if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemoryCache();} if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemoryCache();} #else - if(mdsInGPU){mdsInGPU->freeMemory(stream);} if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemory(stream);} if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemory(stream);} #endif if(rangesInGPU != nullptr){delete rangesInGPU; delete rangesBuffers;} - if(mdsInGPU != nullptr){cms::cuda::free_host(mdsInGPU);} + if(mdsInGPU != nullptr){delete mdsInGPU; delete miniDoubletsBuffers;} if(segmentsInGPU != nullptr){delete segmentsInGPU; delete segmentsBuffers;} if(tripletsInGPU!= nullptr){delete tripletsInGPU; delete tripletsBuffers;} if(trackCandidatesInGPU!= nullptr){delete trackCandidatesInGPU; delete trackCandidatesBuffers;} @@ -85,21 +83,14 @@ SDL::Event::~Event() { delete rangesInCPU; } - if(mdsInCPU != nullptr) { - delete[] mdsInCPU->anchorHitIndices; - delete[] mdsInCPU->nMDs; - delete mdsInCPU->nMemoryLocations; - delete[] mdsInCPU->totOccupancyMDs; delete mdsInCPU; } - if(segmentsInCPU != nullptr) { delete segmentsInCPU; } - if(tripletsInCPU != nullptr) { delete tripletsInCPU; @@ -108,7 +99,6 @@ SDL::Event::~Event() { delete quintupletsInCPU; } - if(pixelTripletsInCPU != nullptr) { delete[] pixelTripletsInCPU->tripletIndices; @@ -122,7 +112,6 @@ SDL::Event::~Event() delete[] pixelTripletsInCPU->rPhiChiSquaredInwards; delete pixelTripletsInCPU; } - if(pixelQuintupletsInCPU != nullptr) { delete[] pixelQuintupletsInCPU->pixelIndices; @@ -136,12 +125,10 @@ SDL::Event::~Event() delete[] pixelQuintupletsInCPU->rPhiChiSquaredInwards; delete pixelQuintupletsInCPU; } - if(trackCandidatesInCPU != nullptr) { delete trackCandidatesInCPU; } - if(modulesInCPU != nullptr) { delete[] modulesInCPU->nLowerModules; @@ -177,8 +164,6 @@ SDL::Event::~Event() delete[] modulesInCPUFull->r; delete[] modulesInCPUFull->isInverted; delete[] modulesInCPUFull->isLower; - - delete[] modulesInCPUFull->moduleType; delete[] modulesInCPUFull->moduleLayerType; delete[] modulesInCPUFull; @@ -188,11 +173,9 @@ SDL::Event::~Event() void SDL::Event::resetEvent() { #ifdef CACHE_ALLOC - if(mdsInGPU){mdsInGPU->freeMemoryCache();} if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemoryCache();} if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemoryCache();} #else - if(mdsInGPU){mdsInGPU->freeMemory(stream);} if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemory(stream);} if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemory(stream);} #endif @@ -217,7 +200,7 @@ void SDL::Event::resetEvent() } if(hitsInGPU){delete hitsInGPU; delete hitsBuffers; hitsInGPU = nullptr;} - if(mdsInGPU){cms::cuda::free_host(mdsInGPU); + if(mdsInGPU){delete mdsInGPU; delete miniDoubletsBuffers; mdsInGPU = nullptr;} if(rangesInGPU){delete rangesInGPU; delete rangesBuffers; rangesInGPU = nullptr;} @@ -246,9 +229,6 @@ void SDL::Event::resetEvent() } if(mdsInCPU != nullptr) { - delete[] mdsInCPU->anchorHitIndices; - delete[] mdsInCPU->nMDs; - delete[] mdsInCPU->totOccupancyMDs; delete mdsInCPU; mdsInCPU = nullptr; } @@ -445,7 +425,6 @@ void SDL::Event::addPixelSegmentToEvent(std::vector hitIndices0,st if(mdsInGPU == nullptr) { - mdsInGPU = (SDL::miniDoublets*)cms::cuda::allocate_host(sizeof(SDL::miniDoublets), stream); unsigned int nTotalMDs; cudaMemsetAsync(&rangesInGPU->miniDoubletModuleOccupancy[nLowerModules],N_MAX_PIXEL_MD_PER_MODULES, sizeof(unsigned int),stream); @@ -465,9 +444,11 @@ void SDL::Event::addPixelSegmentToEvent(std::vector hitIndices0,st cudaMemcpyAsync(&nTotalMDs,rangesInGPU->device_nTotalMDs,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream); cudaStreamSynchronize(stream); - nTotalMDs+= N_MAX_PIXEL_MD_PER_MODULES; + nTotalMDs += N_MAX_PIXEL_MD_PER_MODULES; - createMDsInExplicitMemory(*mdsInGPU, nTotalMDs, nLowerModules, N_MAX_PIXEL_MD_PER_MODULES,stream); + mdsInGPU = new SDL::miniDoublets(); + miniDoubletsBuffers = new SDL::miniDoubletsBuffer(nTotalMDs, nLowerModules, N_MAX_PIXEL_MD_PER_MODULES, devAcc, queue); + mdsInGPU->setData(*miniDoubletsBuffers); cudaMemcpyAsync(mdsInGPU->nMemoryLocations, &nTotalMDs, sizeof(unsigned int), cudaMemcpyHostToDevice, stream); cudaStreamSynchronize(stream); @@ -661,11 +642,10 @@ void SDL::Event::createMiniDoublets() if(mdsInGPU == nullptr) { - mdsInGPU = (SDL::miniDoublets*)cms::cuda::allocate_host(sizeof(SDL::miniDoublets), stream); - //FIXME: Add memory locations for pixel MDs - createMDsInExplicitMemory(*mdsInGPU, nTotalMDs, nLowerModules, N_MAX_PIXEL_MD_PER_MODULES, stream); + mdsInGPU = new SDL::miniDoublets(); + miniDoubletsBuffers = new SDL::miniDoubletsBuffer(nTotalMDs, nLowerModules, N_MAX_PIXEL_MD_PER_MODULES, devAcc, queue); + mdsInGPU->setData(*miniDoubletsBuffers); } - cudaStreamSynchronize(stream); int maxThreadsPerModule=0; int* module_hitRanges; @@ -912,7 +892,6 @@ void SDL::Event::createTrackCandidates() *pixelQuintupletsInGPU)); alpaka::enqueue(queue, crossCleanpT3Task); - alpaka::wait(queue); //adding objects Vec const threadsPerBlock_addpT3asTrackCandidatesInGPU(static_cast(1), static_cast(1), static_cast(512)); @@ -930,7 +909,6 @@ void SDL::Event::createTrackCandidates() *rangesInGPU)); alpaka::enqueue(queue, addpT3asTrackCandidatesInGPUTask); - alpaka::wait(queue); Vec const threadsPerBlockRemoveDupQuints(static_cast(1), static_cast(16), static_cast(32)); Vec const blocksPerGridRemoveDupQuints(static_cast(1), static_cast(max(nEligibleModules/16,1)), static_cast(max(nEligibleModules/32,1))); @@ -944,7 +922,6 @@ void SDL::Event::createTrackCandidates() *rangesInGPU)); alpaka::enqueue(queue, removeDupQuintupletsInGPUBeforeTCTask); - alpaka::wait(queue); Vec const threadsPerBlock_crossCleanT5(static_cast(32), static_cast(1), static_cast(32)); Vec const blocksPerGrid_crossCleanT5(static_cast((13296/32) + 1), static_cast(1), static_cast(MAX_BLOCKS)); @@ -961,7 +938,6 @@ void SDL::Event::createTrackCandidates() *rangesInGPU)); alpaka::enqueue(queue, crossCleanT5Task); - alpaka::wait(queue); Vec const threadsPerBlock_addT5asTrackCandidateInGPU(static_cast(1), static_cast(8), static_cast(128)); Vec const blocksPerGrid_addT5asTrackCandidateInGPU(static_cast(1), static_cast(8), static_cast(10)); @@ -977,7 +953,6 @@ void SDL::Event::createTrackCandidates() *rangesInGPU)); alpaka::enqueue(queue, addT5asTrackCandidateInGPUTask); - alpaka::wait(queue); Vec const threadsPerBlockCheckHitspLS(static_cast(1), static_cast(16), static_cast(16)); Vec const blocksPerGridCheckHitspLS(static_cast(1), static_cast(MAX_BLOCKS*4), static_cast(MAX_BLOCKS/4)); @@ -992,7 +967,6 @@ void SDL::Event::createTrackCandidates() true)); alpaka::enqueue(queue, checkHitspLSTask); - alpaka::wait(queue); Vec const threadsPerBlock_crossCleanpLS(static_cast(1), static_cast(16), static_cast(32)); Vec const blocksPerGrid_crossCleanpLS(static_cast(1), static_cast(4), static_cast(20)); @@ -1012,7 +986,6 @@ void SDL::Event::createTrackCandidates() *quintupletsInGPU)); alpaka::enqueue(queue, crossCleanpLSTask); - alpaka::wait(queue); Vec const threadsPerBlock_addpLSasTrackCandidateInGPU(static_cast(1), static_cast(1), static_cast(384)); Vec const blocksPerGrid_addpLSasTrackCandidateInGPU(static_cast(1), static_cast(1), static_cast(MAX_BLOCKS)); @@ -1778,28 +1751,26 @@ SDL::objectRangesBuffer* SDL::Event::getRanges() return rangesInCPU; } -SDL::miniDoublets* SDL::Event::getMiniDoublets() +SDL::miniDoubletsBuffer* SDL::Event::getMiniDoublets() { if(mdsInCPU == nullptr) { - mdsInCPU = new SDL::miniDoublets; - mdsInCPU->nMDs = new int[nLowerModules+1]; + // Get nMemoryLocations parameter to initialize host based mdsInCPU + auto nMemLocal_buf = allocBufWrapper(devHost, 1); + alpaka::memcpy(queue, nMemLocal_buf, miniDoubletsBuffers->nMemoryLocations_buf, 1); + alpaka::wait(queue); - //compute memory locations - mdsInCPU->nMemoryLocations = new unsigned int; - cudaMemcpyAsync(mdsInCPU->nMemoryLocations, mdsInGPU->nMemoryLocations, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream); - cudaStreamSynchronize(stream); - mdsInCPU->totOccupancyMDs = new int[nLowerModules+1]; - - mdsInCPU->anchorHitIndices = new unsigned int[*(mdsInCPU->nMemoryLocations)]; - mdsInCPU->outerHitIndices = new unsigned int[*(mdsInCPU->nMemoryLocations)]; - mdsInCPU->dphichanges = new float[*(mdsInCPU->nMemoryLocations)]; - cudaMemcpyAsync(mdsInCPU->anchorHitIndices, mdsInGPU->anchorHitIndices, *(mdsInCPU->nMemoryLocations) * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(mdsInCPU->outerHitIndices, mdsInGPU->outerHitIndices, *(mdsInCPU->nMemoryLocations) * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(mdsInCPU->dphichanges, mdsInGPU->dphichanges, *(mdsInCPU->nMemoryLocations) * sizeof(float), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(mdsInCPU->nMDs, mdsInGPU->nMDs, (nLowerModules+1) * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(mdsInCPU->totOccupancyMDs, mdsInGPU->totOccupancyMDs, (nLowerModules+1) * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); + unsigned int nMemLocal = *alpaka::getPtrNative(nMemLocal_buf); + mdsInCPU = new SDL::miniDoubletsBuffer(nMemLocal, nLowerModules, N_MAX_PIXEL_MD_PER_MODULES, devHost, queue); + mdsInCPU->setData(*mdsInCPU); + + *alpaka::getPtrNative(mdsInCPU->nMemoryLocations_buf) = nMemLocal; + alpaka::memcpy(queue, mdsInCPU->anchorHitIndices_buf, miniDoubletsBuffers->anchorHitIndices_buf, nMemLocal); + alpaka::memcpy(queue, mdsInCPU->outerHitIndices_buf, miniDoubletsBuffers->outerHitIndices_buf, nMemLocal); + alpaka::memcpy(queue, mdsInCPU->dphichanges_buf, miniDoubletsBuffers->dphichanges_buf, nMemLocal); + alpaka::memcpy(queue, mdsInCPU->nMDs_buf, miniDoubletsBuffers->nMDs_buf, (nLowerModules+1)); + alpaka::memcpy(queue, mdsInCPU->totOccupancyMDs_buf, miniDoubletsBuffers->totOccupancyMDs_buf, (nLowerModules+1)); + alpaka::wait(queue); } return mdsInCPU; } diff --git a/SDL/Event.cuh b/SDL/Event.cuh index bbdd93f1..31d3da3a 100644 --- a/SDL/Event.cuh +++ b/SDL/Event.cuh @@ -36,7 +36,7 @@ namespace SDL std::array n_quintuplets_by_layer_barrel_; std::array n_quintuplets_by_layer_endcap_; - //CUDA stuff + //Device stuff int dev; int nTotalSegments; struct objectRanges* rangesInGPU; @@ -44,6 +44,7 @@ namespace SDL struct hits* hitsInGPU; struct hitsBuffer* hitsBuffers; struct miniDoublets* mdsInGPU; + struct miniDoubletsBuffer* miniDoubletsBuffers; struct segments* segmentsInGPU; struct segmentsBuffer* segmentsBuffers; struct triplets* tripletsInGPU; @@ -58,7 +59,7 @@ namespace SDL //CPU interface stuff objectRangesBuffer* rangesInCPU; hitsBuffer* hitsInCPU; - miniDoublets* mdsInCPU; + miniDoubletsBuffer* mdsInCPU; segmentsBuffer* segmentsInCPU; tripletsBuffer* tripletsInCPU; trackCandidatesBuffer* trackCandidatesInCPU; @@ -138,7 +139,7 @@ namespace SDL objectRangesBuffer* getRanges(); hitsBuffer* getHits(); hitsBuffer* getHitsInCMSSW(); - miniDoublets* getMiniDoublets(); + miniDoubletsBuffer* getMiniDoublets(); segmentsBuffer* getSegments() ; tripletsBuffer* getTriplets(); quintupletsBuffer* getQuintuplets(); diff --git a/SDL/MiniDoublet.cu b/SDL/MiniDoublet.cu deleted file mode 100644 index 3fd6d23a..00000000 --- a/SDL/MiniDoublet.cu +++ /dev/null @@ -1,139 +0,0 @@ -#include "MiniDoublet.cuh" - -//FIXME:Add memory locations for the pixel MDs here! -void SDL::createMDsInExplicitMemory(struct miniDoublets& mdsInGPU, unsigned int nMemoryLocations, uint16_t nLowerModules, unsigned int maxPixelMDs,cudaStream_t stream) -{ -#ifdef CACHE_ALLOC - int dev; - cudaGetDevice(&dev); - mdsInGPU.anchorHitIndices = (unsigned int*)cms::cuda::allocate_device(dev,nMemoryLocations * 2 * sizeof(unsigned int), stream); - mdsInGPU.moduleIndices = (uint16_t*)cms::cuda::allocate_device(dev, nMemoryLocations * sizeof(uint16_t), stream); - mdsInGPU.dphichanges = (float*)cms::cuda::allocate_device(dev,nMemoryLocations*9*sizeof(float),stream); - mdsInGPU.nMDs = (int*)cms::cuda::allocate_device(dev, (nLowerModules + 1) *sizeof(int),stream); - mdsInGPU.totOccupancyMDs = (int*)cms::cuda::allocate_device(dev, (nLowerModules + 1) *sizeof(int),stream); - mdsInGPU.anchorX = (float*)cms::cuda::allocate_device(dev, nMemoryLocations * 6 * sizeof(float), stream); - mdsInGPU.anchorHighEdgeX = (float*)cms::cuda::allocate_device(dev, nMemoryLocations * 4 * sizeof(float), stream); - mdsInGPU.outerX = (float*)cms::cuda::allocate_device(dev, nMemoryLocations * 6 * sizeof(float), stream); - mdsInGPU.outerHighEdgeX = (float*)cms::cuda::allocate_device(dev, nMemoryLocations * 4 * sizeof(float), stream); - mdsInGPU.nMemoryLocations = (unsigned int*)cms::cuda::allocate_device(dev, sizeof(unsigned int), stream); -#else - cudaMalloc(&mdsInGPU.anchorHitIndices, nMemoryLocations * 2 * sizeof(unsigned int)); - cudaMalloc(&mdsInGPU.moduleIndices, nMemoryLocations * sizeof(uint16_t)); - cudaMalloc(&mdsInGPU.dphichanges, nMemoryLocations *9* sizeof(float)); - cudaMalloc(&mdsInGPU.nMDs, (nLowerModules + 1) * sizeof(int)); - cudaMalloc(&mdsInGPU.totOccupancyMDs, (nLowerModules + 1) * sizeof(int)); - cudaMalloc(&mdsInGPU.anchorX, nMemoryLocations * 6 * sizeof(float)); - cudaMalloc(&mdsInGPU.anchorHighEdgeX, nMemoryLocations * 4 * sizeof(float)); - cudaMalloc(&mdsInGPU.outerX, nMemoryLocations * 6 * sizeof(float)); - cudaMalloc(&mdsInGPU.outerHighEdgeX, nMemoryLocations * 4 * sizeof(float)); - cudaMalloc(&mdsInGPU.nMemoryLocations, sizeof(unsigned int)); -#endif - cudaMemsetAsync(mdsInGPU.nMDs,0, (nLowerModules + 1) *sizeof(int),stream); - cudaMemsetAsync(mdsInGPU.totOccupancyMDs,0, (nLowerModules + 1) *sizeof(int),stream); - cudaStreamSynchronize(stream); - - mdsInGPU.outerHitIndices = mdsInGPU.anchorHitIndices + nMemoryLocations; - mdsInGPU.dzs = mdsInGPU.dphichanges + nMemoryLocations; - mdsInGPU.dphis = mdsInGPU.dphichanges + 2*nMemoryLocations; - mdsInGPU.shiftedXs = mdsInGPU.dphichanges + 3*nMemoryLocations; - mdsInGPU.shiftedYs = mdsInGPU.dphichanges + 4*nMemoryLocations; - mdsInGPU.shiftedZs = mdsInGPU.dphichanges + 5*nMemoryLocations; - mdsInGPU.noShiftedDzs = mdsInGPU.dphichanges + 6*nMemoryLocations; - mdsInGPU.noShiftedDphis = mdsInGPU.dphichanges + 7*nMemoryLocations; - mdsInGPU.noShiftedDphiChanges = mdsInGPU.dphichanges + 8*nMemoryLocations; - - mdsInGPU.anchorY = mdsInGPU.anchorX + nMemoryLocations; - mdsInGPU.anchorZ = mdsInGPU.anchorX + 2 * nMemoryLocations; - mdsInGPU.anchorRt = mdsInGPU.anchorX + 3 * nMemoryLocations; - mdsInGPU.anchorPhi = mdsInGPU.anchorX + 4 * nMemoryLocations; - mdsInGPU.anchorEta = mdsInGPU.anchorX + 5 * nMemoryLocations; - - mdsInGPU.anchorHighEdgeY = mdsInGPU.anchorHighEdgeX + nMemoryLocations; - mdsInGPU.anchorLowEdgeX = mdsInGPU.anchorHighEdgeX + 2 * nMemoryLocations; - mdsInGPU.anchorLowEdgeY = mdsInGPU.anchorHighEdgeX + 3 * nMemoryLocations; - - mdsInGPU.outerY = mdsInGPU.outerX + nMemoryLocations; - mdsInGPU.outerZ = mdsInGPU.outerX + 2 * nMemoryLocations; - mdsInGPU.outerRt = mdsInGPU.outerX + 3 * nMemoryLocations; - mdsInGPU.outerPhi = mdsInGPU.outerX + 4 * nMemoryLocations; - mdsInGPU.outerEta = mdsInGPU.outerX + 5 * nMemoryLocations; - - mdsInGPU.outerHighEdgeY = mdsInGPU.outerHighEdgeX + nMemoryLocations; - mdsInGPU.outerLowEdgeX = mdsInGPU.outerHighEdgeX + 2 * nMemoryLocations; - mdsInGPU.outerLowEdgeY = mdsInGPU.outerHighEdgeX + 3 * nMemoryLocations; -} - -SDL::miniDoublets::miniDoublets() -{ - anchorHitIndices = nullptr; - outerHitIndices = nullptr; - moduleIndices = nullptr; - nMDs = nullptr; - totOccupancyMDs = nullptr; - dphichanges = nullptr; - - dzs = nullptr; - dphis = nullptr; - - shiftedXs = nullptr; - shiftedYs = nullptr; - shiftedZs = nullptr; - noShiftedDzs = nullptr; - noShiftedDphis = nullptr; - noShiftedDphiChanges = nullptr; - - anchorX = nullptr; - anchorY = nullptr; - anchorZ = nullptr; - anchorRt = nullptr; - anchorPhi = nullptr; - anchorEta = nullptr; - anchorHighEdgeX = nullptr; - anchorHighEdgeY = nullptr; - anchorLowEdgeX = nullptr; - anchorLowEdgeY = nullptr; - outerX = nullptr; - outerY = nullptr; - outerZ = nullptr; - outerRt = nullptr; - outerPhi = nullptr; - outerEta = nullptr; - outerHighEdgeX = nullptr; - outerHighEdgeY = nullptr; - outerLowEdgeX = nullptr; - outerLowEdgeY = nullptr; -} - -SDL::miniDoublets::~miniDoublets() -{ -} - -void SDL::miniDoublets::freeMemoryCache() -{ - int dev; - cudaGetDevice(&dev); - cms::cuda::free_device(dev,anchorHitIndices); - cms::cuda::free_device(dev, moduleIndices); - cms::cuda::free_device(dev,dphichanges); - cms::cuda::free_device(dev,nMDs); - cms::cuda::free_device(dev,totOccupancyMDs); - cms::cuda::free_device(dev, anchorX); - cms::cuda::free_device(dev, anchorHighEdgeX); - cms::cuda::free_device(dev, outerX); - cms::cuda::free_device(dev, outerHighEdgeX); - cms::cuda::free_device(dev, nMemoryLocations); -} - -void SDL::miniDoublets::freeMemory(cudaStream_t stream) -{ - cudaFree(anchorHitIndices); - cudaFree(moduleIndices); - cudaFree(nMDs); - cudaFree(totOccupancyMDs); - cudaFree(dphichanges); - cudaFree(anchorX); - cudaFree(anchorHighEdgeX); - cudaFree(outerX); - cudaFree(outerHighEdgeX); - cudaFree(nMemoryLocations); -} diff --git a/SDL/MiniDoublet.cuh b/SDL/MiniDoublet.cuh index 9f723e2d..7b80cb28 100644 --- a/SDL/MiniDoublet.cuh +++ b/SDL/MiniDoublet.cuh @@ -29,7 +29,6 @@ namespace SDL float* noShiftedDphis; //if shifted module float* noShiftedDphiChanges; //if shifted module - //hit stuff float* anchorX; float* anchorY; float* anchorZ; @@ -52,20 +51,138 @@ namespace SDL float* outerLowEdgeX; float* outerLowEdgeY; -#ifdef CUT_VALUE_DEBUG - //CUT VALUES - float* dzCuts; - float* drtCuts; - float* drts; - float* miniCuts; -#endif - miniDoublets(); - ~miniDoublets(); - void freeMemory(cudaStream_t stream); - void freeMemoryCache(); + template + void setData(TBuf& mdsbuf) + { + nMemoryLocations = alpaka::getPtrNative(mdsbuf.nMemoryLocations_buf); + anchorHitIndices = alpaka::getPtrNative(mdsbuf.anchorHitIndices_buf); + outerHitIndices = alpaka::getPtrNative(mdsbuf.outerHitIndices_buf); + moduleIndices = alpaka::getPtrNative(mdsbuf.moduleIndices_buf); + nMDs = alpaka::getPtrNative(mdsbuf.nMDs_buf); + totOccupancyMDs = alpaka::getPtrNative(mdsbuf.totOccupancyMDs_buf); + dphichanges = alpaka::getPtrNative(mdsbuf.dphichanges_buf); + dzs = alpaka::getPtrNative(mdsbuf.dzs_buf); + dphis = alpaka::getPtrNative(mdsbuf.dphis_buf); + shiftedXs = alpaka::getPtrNative(mdsbuf.shiftedXs_buf); + shiftedYs = alpaka::getPtrNative(mdsbuf.shiftedYs_buf); + shiftedZs = alpaka::getPtrNative(mdsbuf.shiftedZs_buf); + noShiftedDzs = alpaka::getPtrNative(mdsbuf.noShiftedDzs_buf); + noShiftedDphis = alpaka::getPtrNative(mdsbuf.noShiftedDphis_buf); + noShiftedDphiChanges = alpaka::getPtrNative(mdsbuf.noShiftedDphiChanges_buf); + anchorX = alpaka::getPtrNative(mdsbuf.anchorX_buf); + anchorY = alpaka::getPtrNative(mdsbuf.anchorY_buf); + anchorZ = alpaka::getPtrNative(mdsbuf.anchorZ_buf); + anchorRt = alpaka::getPtrNative(mdsbuf.anchorRt_buf); + anchorPhi = alpaka::getPtrNative(mdsbuf.anchorPhi_buf); + anchorEta = alpaka::getPtrNative(mdsbuf.anchorEta_buf); + anchorHighEdgeX = alpaka::getPtrNative(mdsbuf.anchorHighEdgeX_buf); + anchorHighEdgeY = alpaka::getPtrNative(mdsbuf.anchorHighEdgeY_buf); + anchorLowEdgeX = alpaka::getPtrNative(mdsbuf.anchorLowEdgeX_buf); + anchorLowEdgeY = alpaka::getPtrNative(mdsbuf.anchorLowEdgeY_buf); + outerX = alpaka::getPtrNative(mdsbuf.outerX_buf); + outerY = alpaka::getPtrNative(mdsbuf.outerY_buf); + outerZ = alpaka::getPtrNative(mdsbuf.outerZ_buf); + outerRt = alpaka::getPtrNative(mdsbuf.outerRt_buf); + outerPhi = alpaka::getPtrNative(mdsbuf.outerPhi_buf); + outerEta = alpaka::getPtrNative(mdsbuf.outerEta_buf); + outerHighEdgeX = alpaka::getPtrNative(mdsbuf.outerHighEdgeX_buf); + outerHighEdgeY = alpaka::getPtrNative(mdsbuf.outerHighEdgeY_buf); + outerLowEdgeX = alpaka::getPtrNative(mdsbuf.outerLowEdgeX_buf); + outerLowEdgeY = alpaka::getPtrNative(mdsbuf.outerLowEdgeY_buf); + } }; - void createMDsInExplicitMemory(struct miniDoublets& mdsInGPU, unsigned int maxMDs,uint16_t nLowerModules, unsigned int maxPixelMDs,cudaStream_t stream); + template + struct miniDoubletsBuffer : miniDoublets + { + Buf nMemoryLocations_buf; + + Buf anchorHitIndices_buf; + Buf outerHitIndices_buf; + Buf moduleIndices_buf; + Buf nMDs_buf; + Buf totOccupancyMDs_buf; + Buf dphichanges_buf; + + Buf dzs_buf; + Buf dphis_buf; + + Buf shiftedXs_buf; + Buf shiftedYs_buf; + Buf shiftedZs_buf; + Buf noShiftedDzs_buf; + Buf noShiftedDphis_buf; + Buf noShiftedDphiChanges_buf; + + Buf anchorX_buf; + Buf anchorY_buf; + Buf anchorZ_buf; + Buf anchorRt_buf; + Buf anchorPhi_buf; + Buf anchorEta_buf; + Buf anchorHighEdgeX_buf; + Buf anchorHighEdgeY_buf; + Buf anchorLowEdgeX_buf; + Buf anchorLowEdgeY_buf; + + Buf outerX_buf; + Buf outerY_buf; + Buf outerZ_buf; + Buf outerRt_buf; + Buf outerPhi_buf; + Buf outerEta_buf; + Buf outerHighEdgeX_buf; + Buf outerHighEdgeY_buf; + Buf outerLowEdgeX_buf; + Buf outerLowEdgeY_buf; + + template + miniDoubletsBuffer(unsigned int nMemoryLocations, + uint16_t nLowerModules, + unsigned int maxPixelMDs, + TDevAcc const & devAccIn, + TQueue& queue) : + nMemoryLocations_buf(allocBufWrapper(devAccIn, 1)), + anchorHitIndices_buf(allocBufWrapper(devAccIn, nMemoryLocations)), + outerHitIndices_buf(allocBufWrapper(devAccIn, nMemoryLocations)), + moduleIndices_buf(allocBufWrapper(devAccIn, nMemoryLocations)), + nMDs_buf(allocBufWrapper(devAccIn, nLowerModules+1)), + totOccupancyMDs_buf(allocBufWrapper(devAccIn, nLowerModules+1)), + dphichanges_buf(allocBufWrapper(devAccIn, nMemoryLocations)), + dzs_buf(allocBufWrapper(devAccIn, nMemoryLocations)), + dphis_buf(allocBufWrapper(devAccIn, nMemoryLocations)), + shiftedXs_buf(allocBufWrapper(devAccIn, nMemoryLocations)), + shiftedYs_buf(allocBufWrapper(devAccIn, nMemoryLocations)), + shiftedZs_buf(allocBufWrapper(devAccIn, nMemoryLocations)), + noShiftedDzs_buf(allocBufWrapper(devAccIn, nMemoryLocations)), + noShiftedDphis_buf(allocBufWrapper(devAccIn, nMemoryLocations)), + noShiftedDphiChanges_buf(allocBufWrapper(devAccIn, nMemoryLocations)), + anchorX_buf(allocBufWrapper(devAccIn, nMemoryLocations)), + anchorY_buf(allocBufWrapper(devAccIn, nMemoryLocations)), + anchorZ_buf(allocBufWrapper(devAccIn, nMemoryLocations)), + anchorRt_buf(allocBufWrapper(devAccIn, nMemoryLocations)), + anchorPhi_buf(allocBufWrapper(devAccIn, nMemoryLocations)), + anchorEta_buf(allocBufWrapper(devAccIn, nMemoryLocations)), + anchorHighEdgeX_buf(allocBufWrapper(devAccIn, nMemoryLocations)), + anchorHighEdgeY_buf(allocBufWrapper(devAccIn, nMemoryLocations)), + anchorLowEdgeX_buf(allocBufWrapper(devAccIn, nMemoryLocations)), + anchorLowEdgeY_buf(allocBufWrapper(devAccIn, nMemoryLocations)), + outerX_buf(allocBufWrapper(devAccIn, nMemoryLocations)), + outerY_buf(allocBufWrapper(devAccIn, nMemoryLocations)), + outerZ_buf(allocBufWrapper(devAccIn, nMemoryLocations)), + outerRt_buf(allocBufWrapper(devAccIn, nMemoryLocations)), + outerPhi_buf(allocBufWrapper(devAccIn, nMemoryLocations)), + outerEta_buf(allocBufWrapper(devAccIn, nMemoryLocations)), + outerHighEdgeX_buf(allocBufWrapper(devAccIn, nMemoryLocations * 4)), + outerHighEdgeY_buf(allocBufWrapper(devAccIn, nMemoryLocations * 4)), + outerLowEdgeX_buf(allocBufWrapper(devAccIn, nMemoryLocations * 4)), + outerLowEdgeY_buf(allocBufWrapper(devAccIn, nMemoryLocations * 4)) + { + alpaka::memset(queue, nMDs_buf, 0, nLowerModules+1); + alpaka::memset(queue, totOccupancyMDs_buf, 0, nLowerModules+1); + alpaka::wait(queue); + } + }; ALPAKA_FN_ACC ALPAKA_FN_INLINE void addMDToMemory(struct miniDoublets& mdsInGPU, struct SDL::hits& hitsInGPU, struct modules& modulesInGPU, unsigned int lowerHitIdx, unsigned int upperHitIdx, uint16_t& lowerModuleIdx, float dz, float dPhi, float dPhiChange, float shiftedX, float shiftedY, float shiftedZ, float noShiftedDz, float noShiftedDphi, float noShiftedDPhiChange, unsigned int idx) { diff --git a/code/core/AccessHelper.cc b/code/core/AccessHelper.cc index bf6025db..60984428 100644 --- a/code/core/AccessHelper.cc +++ b/code/core/AccessHelper.cc @@ -29,7 +29,7 @@ std::tuple, std::vector> convertHitsToHi std::vector getPixelHitsFrompLS(SDL::Event* event, unsigned int pLS) { SDL::segmentsBuffer& segments_ = *(event->getSegments()); - SDL::miniDoublets& miniDoublets_ = *(event->getMiniDoublets()); + SDL::miniDoubletsBuffer& miniDoublets_ = *(event->getMiniDoublets()); SDL::objectRangesBuffer& rangesInGPU = (*event->getRanges()); SDL::modules& modulesInGPU = (*event->getModules()); const unsigned int pLS_offset = rangesInGPU.segmentModuleIndices[*(modulesInGPU.nLowerModules)]; @@ -77,7 +77,7 @@ std::tuple, std::vector> getHitIdxsAndHi //____________________________________________________________________________________________ std::vector getHitsFromMD(SDL::Event* event, unsigned int MD) { - SDL::miniDoublets& miniDoublets_ = *(event->getMiniDoublets()); + SDL::miniDoubletsBuffer& miniDoublets_ = *(event->getMiniDoublets()); unsigned int hit_1 = miniDoublets_.anchorHitIndices[MD]; unsigned int hit_2 = miniDoublets_.outerHitIndices [MD]; return {hit_1, hit_2}; diff --git a/code/core/write_sdl_ntuple.cc b/code/core/write_sdl_ntuple.cc index 78abd3d2..3a580c15 100644 --- a/code/core/write_sdl_ntuple.cc +++ b/code/core/write_sdl_ntuple.cc @@ -560,7 +560,7 @@ void setGnnNtupleBranches(SDL::Event* event) { // Get relevant information SDL::segmentsBuffer& segmentsInGPU = (*event->getSegments()); - SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); + SDL::miniDoubletsBuffer& miniDoubletsInGPU = (*event->getMiniDoublets()); SDL::hitsBuffer& hitsInGPU = (*event->getHits()); SDL::modules& modulesInGPU = (*event->getModules()); SDL::objectRangesBuffer& rangesInGPU = (*event->getRanges()); @@ -716,7 +716,7 @@ void setGnnNtupleBranches(SDL::Event* event) void setGnnNtupleMiniDoublet(SDL::Event* event, unsigned int MD) { // Get relevant information - SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); + SDL::miniDoubletsBuffer& miniDoubletsInGPU = (*event->getMiniDoublets()); SDL::hitsBuffer& hitsInGPU = (*event->getHits()); // Get the hit indices @@ -1121,7 +1121,7 @@ void printHitMultiplicities(SDL::Event* event) //________________________________________________________________________________________________________________________________ void printMiniDoubletMultiplicities(SDL::Event* event) { - SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); + SDL::miniDoubletsBuffer& miniDoubletsInGPU = (*event->getMiniDoublets()); SDL::modules& modulesInGPU = (*event->getModules()); int nMiniDoublets = 0; @@ -1150,7 +1150,7 @@ void printAllObjects(SDL::Event* event) //________________________________________________________________________________________________________________________________ void printMDs(SDL::Event* event) { - SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); + SDL::miniDoubletsBuffer& miniDoubletsInGPU = (*event->getMiniDoublets()); SDL::hitsBuffer& hitsInGPU = (*event->getHits()); SDL::modules& modulesInGPU = (*event->getModules()); SDL::objectRangesBuffer& rangesInGPU = (*event->getRanges()); @@ -1174,7 +1174,7 @@ void printMDs(SDL::Event* event) void printLSs(SDL::Event* event) { SDL::segmentsBuffer& segmentsInGPU = (*event->getSegments()); - SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); + SDL::miniDoubletsBuffer& miniDoubletsInGPU = (*event->getMiniDoublets()); SDL::hitsBuffer& hitsInGPU = (*event->getHits()); SDL::modules& modulesInGPU = (*event->getModules()); SDL::objectRangesBuffer& rangesInGPU = (*event->getRanges()); @@ -1207,7 +1207,7 @@ void printLSs(SDL::Event* event) void printpLSs(SDL::Event* event) { SDL::segmentsBuffer& segmentsInGPU = (*event->getSegments()); - SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); + SDL::miniDoubletsBuffer& miniDoubletsInGPU = (*event->getMiniDoublets()); SDL::hitsBuffer& hitsInGPU = (*event->getHits()); SDL::modules& modulesInGPU = (*event->getModules()); SDL::objectRangesBuffer& rangesInGPU = (*event->getRanges()); @@ -1238,7 +1238,7 @@ void printT3s(SDL::Event* event) { SDL::tripletsBuffer& tripletsInGPU = (*event->getTriplets()); SDL::segmentsBuffer& segmentsInGPU = (*event->getSegments()); - SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); + SDL::miniDoubletsBuffer& miniDoubletsInGPU = (*event->getMiniDoublets()); SDL::hitsBuffer& hitsInGPU = (*event->getHits()); SDL::modules& modulesInGPU = (*event->getModules()); int nTriplets = 0; @@ -1281,7 +1281,7 @@ void debugPrintOutlierMultiplicities(SDL::Event* event) SDL::trackCandidatesBuffer& trackCandidatesInGPU = (*event->getTrackCandidates()); SDL::tripletsBuffer& tripletsInGPU = (*event->getTriplets()); SDL::segmentsBuffer& segmentsInGPU = (*event->getSegments()); - SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets()); + SDL::miniDoubletsBuffer& miniDoubletsInGPU = (*event->getMiniDoublets()); SDL::modules& modulesInGPU = (*event->getModules()); SDL::objectRangesBuffer& rangesInGPU = (*event->getRanges()); //int nTrackCandidates = 0; From ac2853735eba7aabc614496fd094c0e613bc6f0a Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Thu, 8 Jun 2023 16:11:51 -0400 Subject: [PATCH 24/44] remove unused input to mds --- SDL/Event.cu | 6 +++--- SDL/MiniDoublet.cuh | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/SDL/Event.cu b/SDL/Event.cu index 21bfc6cc..80f4637b 100644 --- a/SDL/Event.cu +++ b/SDL/Event.cu @@ -447,7 +447,7 @@ void SDL::Event::addPixelSegmentToEvent(std::vector hitIndices0,st nTotalMDs += N_MAX_PIXEL_MD_PER_MODULES; mdsInGPU = new SDL::miniDoublets(); - miniDoubletsBuffers = new SDL::miniDoubletsBuffer(nTotalMDs, nLowerModules, N_MAX_PIXEL_MD_PER_MODULES, devAcc, queue); + miniDoubletsBuffers = new SDL::miniDoubletsBuffer(nTotalMDs, nLowerModules, devAcc, queue); mdsInGPU->setData(*miniDoubletsBuffers); cudaMemcpyAsync(mdsInGPU->nMemoryLocations, &nTotalMDs, sizeof(unsigned int), cudaMemcpyHostToDevice, stream); @@ -643,7 +643,7 @@ void SDL::Event::createMiniDoublets() if(mdsInGPU == nullptr) { mdsInGPU = new SDL::miniDoublets(); - miniDoubletsBuffers = new SDL::miniDoubletsBuffer(nTotalMDs, nLowerModules, N_MAX_PIXEL_MD_PER_MODULES, devAcc, queue); + miniDoubletsBuffers = new SDL::miniDoubletsBuffer(nTotalMDs, nLowerModules, devAcc, queue); mdsInGPU->setData(*miniDoubletsBuffers); } @@ -1761,7 +1761,7 @@ SDL::miniDoubletsBuffer* SDL::Event::getMiniDoublets() alpaka::wait(queue); unsigned int nMemLocal = *alpaka::getPtrNative(nMemLocal_buf); - mdsInCPU = new SDL::miniDoubletsBuffer(nMemLocal, nLowerModules, N_MAX_PIXEL_MD_PER_MODULES, devHost, queue); + mdsInCPU = new SDL::miniDoubletsBuffer(nMemLocal, nLowerModules, devHost, queue); mdsInCPU->setData(*mdsInCPU); *alpaka::getPtrNative(mdsInCPU->nMemoryLocations_buf) = nMemLocal; diff --git a/SDL/MiniDoublet.cuh b/SDL/MiniDoublet.cuh index 7b80cb28..9a77fe58 100644 --- a/SDL/MiniDoublet.cuh +++ b/SDL/MiniDoublet.cuh @@ -139,7 +139,6 @@ namespace SDL template miniDoubletsBuffer(unsigned int nMemoryLocations, uint16_t nLowerModules, - unsigned int maxPixelMDs, TDevAcc const & devAccIn, TQueue& queue) : nMemoryLocations_buf(allocBufWrapper(devAccIn, 1)), From 15341fdc7ddd377a3a13974d67ce18acaffd4c7a Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Thu, 8 Jun 2023 16:51:13 -0400 Subject: [PATCH 25/44] fix overallocation bug --- SDL/MiniDoublet.cuh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/SDL/MiniDoublet.cuh b/SDL/MiniDoublet.cuh index 9a77fe58..a75dcfb0 100644 --- a/SDL/MiniDoublet.cuh +++ b/SDL/MiniDoublet.cuh @@ -172,10 +172,10 @@ namespace SDL outerRt_buf(allocBufWrapper(devAccIn, nMemoryLocations)), outerPhi_buf(allocBufWrapper(devAccIn, nMemoryLocations)), outerEta_buf(allocBufWrapper(devAccIn, nMemoryLocations)), - outerHighEdgeX_buf(allocBufWrapper(devAccIn, nMemoryLocations * 4)), - outerHighEdgeY_buf(allocBufWrapper(devAccIn, nMemoryLocations * 4)), - outerLowEdgeX_buf(allocBufWrapper(devAccIn, nMemoryLocations * 4)), - outerLowEdgeY_buf(allocBufWrapper(devAccIn, nMemoryLocations * 4)) + outerHighEdgeX_buf(allocBufWrapper(devAccIn, nMemoryLocations)), + outerHighEdgeY_buf(allocBufWrapper(devAccIn, nMemoryLocations)), + outerLowEdgeX_buf(allocBufWrapper(devAccIn, nMemoryLocations)), + outerLowEdgeY_buf(allocBufWrapper(devAccIn, nMemoryLocations)) { alpaka::memset(queue, nMDs_buf, 0, nLowerModules+1); alpaka::memset(queue, totOccupancyMDs_buf, 0, nLowerModules+1); From 5bf92688e1362ec9dd4e16d4f76411bae073114f Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Thu, 8 Jun 2023 22:13:52 -0400 Subject: [PATCH 26/44] move pixels over to Alpaka memory --- SDL/Event.cu | 169 ++++++++---------------- SDL/Event.cuh | 10 +- SDL/PixelTriplet.cu | 242 ---------------------------------- SDL/PixelTriplet.cuh | 178 +++++++++++++++++++++++-- code/core/AccessHelper.cc | 6 +- code/core/write_sdl_ntuple.cc | 4 +- 6 files changed, 227 insertions(+), 382 deletions(-) delete mode 100644 SDL/PixelTriplet.cu diff --git a/SDL/Event.cu b/SDL/Event.cu index 80f4637b..c9a5871d 100644 --- a/SDL/Event.cu +++ b/SDL/Event.cu @@ -58,21 +58,14 @@ SDL::Event::Event(cudaStream_t estream, bool verbose): queue(alpaka::getDevByIdx SDL::Event::~Event() { -#ifdef CACHE_ALLOC - if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemoryCache();} - if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemoryCache();} -#else - if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemory(stream);} - if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemory(stream);} -#endif if(rangesInGPU != nullptr){delete rangesInGPU; delete rangesBuffers;} if(mdsInGPU != nullptr){delete mdsInGPU; delete miniDoubletsBuffers;} if(segmentsInGPU != nullptr){delete segmentsInGPU; delete segmentsBuffers;} if(tripletsInGPU!= nullptr){delete tripletsInGPU; delete tripletsBuffers;} if(trackCandidatesInGPU!= nullptr){delete trackCandidatesInGPU; delete trackCandidatesBuffers;} if(hitsInGPU!= nullptr){delete hitsInGPU; delete hitsBuffers;} - if(pixelTripletsInGPU!= nullptr){cms::cuda::free_host(pixelTripletsInGPU);} - if(pixelQuintupletsInGPU!= nullptr){cms::cuda::free_host(pixelQuintupletsInGPU);} + if(pixelTripletsInGPU!= nullptr){delete pixelTripletsInGPU; delete pixelTripletsBuffers;} + if(pixelQuintupletsInGPU!= nullptr){delete pixelQuintupletsInGPU; delete pixelQuintupletsBuffers;} if(quintupletsInGPU!= nullptr){delete quintupletsInGPU; delete quintupletsBuffers;} if(hitsInCPU != nullptr) @@ -101,28 +94,10 @@ SDL::Event::~Event() } if(pixelTripletsInCPU != nullptr) { - delete[] pixelTripletsInCPU->tripletIndices; - delete[] pixelTripletsInCPU->pixelSegmentIndices; - delete[] pixelTripletsInCPU->pixelRadius; - delete[] pixelTripletsInCPU->tripletRadius; - delete pixelTripletsInCPU->nPixelTriplets; - delete pixelTripletsInCPU->totOccupancyPixelTriplets; - delete[] pixelTripletsInCPU->rzChiSquared; - delete[] pixelTripletsInCPU->rPhiChiSquared; - delete[] pixelTripletsInCPU->rPhiChiSquaredInwards; delete pixelTripletsInCPU; } if(pixelQuintupletsInCPU != nullptr) { - delete[] pixelQuintupletsInCPU->pixelIndices; - delete[] pixelQuintupletsInCPU->T5Indices; - delete[] pixelQuintupletsInCPU->isDup; - delete[] pixelQuintupletsInCPU->score; - delete pixelQuintupletsInCPU->nPixelQuintuplets; - delete pixelQuintupletsInCPU->totOccupancyPixelQuintuplets; - delete[] pixelQuintupletsInCPU->rzChiSquared; - delete[] pixelQuintupletsInCPU->rPhiChiSquared; - delete[] pixelQuintupletsInCPU->rPhiChiSquaredInwards; delete pixelQuintupletsInCPU; } if(trackCandidatesInCPU != nullptr) @@ -172,13 +147,6 @@ SDL::Event::~Event() void SDL::Event::resetEvent() { -#ifdef CACHE_ALLOC - if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemoryCache();} - if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemoryCache();} -#else - if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemory(stream);} - if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemory(stream);} -#endif //reset the arrays for(int i = 0; i<6; i++) { @@ -212,9 +180,9 @@ void SDL::Event::resetEvent() quintupletsInGPU = nullptr;} if(trackCandidatesInGPU){delete trackCandidatesInGPU; delete trackCandidatesBuffers; trackCandidatesInGPU = nullptr;} - if(pixelTripletsInGPU){cms::cuda::free_host(pixelTripletsInGPU); + if(pixelTripletsInGPU){delete pixelTripletsInGPU; delete pixelTripletsBuffers; pixelTripletsInGPU = nullptr;} - if(pixelQuintupletsInGPU){cms::cuda::free_host(pixelQuintupletsInGPU); + if(pixelQuintupletsInGPU){delete pixelQuintupletsInGPU; delete pixelQuintupletsBuffers; pixelQuintupletsInGPU = nullptr;} if(hitsInCPU != nullptr) @@ -249,29 +217,11 @@ void SDL::Event::resetEvent() } if(pixelTripletsInCPU != nullptr) { - delete[] pixelTripletsInCPU->tripletIndices; - delete[] pixelTripletsInCPU->pixelSegmentIndices; - delete[] pixelTripletsInCPU->pixelRadius; - delete[] pixelTripletsInCPU->tripletRadius; - delete pixelTripletsInCPU->nPixelTriplets; - delete pixelTripletsInCPU->totOccupancyPixelTriplets; - delete[] pixelTripletsInCPU->rzChiSquared; - delete[] pixelTripletsInCPU->rPhiChiSquared; - delete[] pixelTripletsInCPU->rPhiChiSquaredInwards; delete pixelTripletsInCPU; pixelTripletsInCPU = nullptr; } if(pixelQuintupletsInCPU != nullptr) { - delete[] pixelQuintupletsInCPU->pixelIndices; - delete[] pixelQuintupletsInCPU->T5Indices; - delete[] pixelQuintupletsInCPU->isDup; - delete[] pixelQuintupletsInCPU->score; - delete pixelQuintupletsInCPU->nPixelQuintuplets; - delete pixelQuintupletsInCPU->totOccupancyPixelQuintuplets; - delete[] pixelQuintupletsInCPU->rzChiSquared; - delete[] pixelQuintupletsInCPU->rPhiChiSquared; - delete[] pixelQuintupletsInCPU->rPhiChiSquaredInwards; delete pixelQuintupletsInCPU; pixelQuintupletsInCPU = nullptr; } @@ -1007,11 +957,11 @@ void SDL::Event::createPixelTriplets() { if(pixelTripletsInGPU == nullptr) { - pixelTripletsInGPU = (SDL::pixelTriplets*)cms::cuda::allocate_host(sizeof(SDL::pixelTriplets), stream); + pixelTripletsInGPU = new SDL::pixelTriplets(); + pixelTripletsBuffers = new SDL::pixelTripletsBuffer(N_MAX_PIXEL_TRIPLETS, devAcc, queue); + pixelTripletsInGPU->setData(*pixelTripletsBuffers); } - createPixelTripletsInExplicitMemory(*pixelTripletsInGPU, N_MAX_PIXEL_TRIPLETS,stream); - unsigned int pixelModuleIndex = nLowerModules; int* superbins; int8_t* pixelTypes; @@ -1241,8 +1191,9 @@ void SDL::Event::createPixelQuintuplets() { if(pixelQuintupletsInGPU == nullptr) { - pixelQuintupletsInGPU = (SDL::pixelQuintuplets*)cms::cuda::allocate_host(sizeof(SDL::pixelQuintuplets), stream); - createPixelQuintupletsInExplicitMemory(*pixelQuintupletsInGPU, N_MAX_PIXEL_QUINTUPLETS,stream); + pixelQuintupletsInGPU = new SDL::pixelQuintuplets(); + pixelQuintupletsBuffers = new SDL::pixelQuintupletsBuffer(N_MAX_PIXEL_QUINTUPLETS, devAcc, queue); + pixelQuintupletsInGPU->setData(*pixelQuintupletsBuffers); } if(trackCandidatesInGPU == nullptr) { @@ -1880,76 +1831,60 @@ SDL::quintupletsBuffer* SDL::Event::getQuintuplets() return quintupletsInCPU; } -SDL::pixelTriplets* SDL::Event::getPixelTriplets() +SDL::pixelTripletsBuffer* SDL::Event::getPixelTriplets() { if(pixelTripletsInCPU == nullptr) { - pixelTripletsInCPU = new SDL::pixelTriplets; + // Get nMemoryLocations parameter to initilize host based quintupletsInCPU + auto nPixelTriplets_buf = allocBufWrapper(devHost, 1); + alpaka::memcpy(queue, nPixelTriplets_buf, pixelTripletsBuffers->nPixelTriplets_buf, 1); + alpaka::wait(queue); - pixelTripletsInCPU->nPixelTriplets = new int; - pixelTripletsInCPU->totOccupancyPixelTriplets = new int; - cudaMemcpyAsync(pixelTripletsInCPU->nPixelTriplets, pixelTripletsInGPU->nPixelTriplets, sizeof(int), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(pixelTripletsInCPU->totOccupancyPixelTriplets, pixelTripletsInGPU->totOccupancyPixelTriplets, sizeof(int), cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); - unsigned int nPixelTriplets = *(pixelTripletsInCPU->nPixelTriplets); - pixelTripletsInCPU->tripletIndices = new unsigned int[nPixelTriplets]; - pixelTripletsInCPU->pixelSegmentIndices = new unsigned int[nPixelTriplets]; - pixelTripletsInCPU->pixelRadius = new FPX[nPixelTriplets]; - pixelTripletsInCPU->tripletRadius = new FPX[nPixelTriplets]; - pixelTripletsInCPU->isDup = new bool[nPixelTriplets]; - pixelTripletsInCPU->eta = new FPX[nPixelTriplets]; - pixelTripletsInCPU->phi = new FPX[nPixelTriplets]; - pixelTripletsInCPU->score =new FPX[nPixelTriplets]; - pixelTripletsInCPU->rzChiSquared = new float[nPixelTriplets]; - pixelTripletsInCPU->rPhiChiSquared = new float[nPixelTriplets]; - pixelTripletsInCPU->rPhiChiSquaredInwards = new float[nPixelTriplets]; - - cudaMemcpyAsync(pixelTripletsInCPU->rzChiSquared, pixelTripletsInGPU->rzChiSquared, nPixelTriplets * sizeof(float), cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(pixelTripletsInCPU->rPhiChiSquared, pixelTripletsInGPU->rPhiChiSquared, nPixelTriplets * sizeof(float), cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(pixelTripletsInCPU->rPhiChiSquaredInwards, pixelTripletsInGPU->rPhiChiSquaredInwards, nPixelTriplets * sizeof(float), cudaMemcpyDeviceToHost, stream); - - cudaMemcpyAsync(pixelTripletsInCPU->tripletIndices, pixelTripletsInGPU->tripletIndices, nPixelTriplets * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(pixelTripletsInCPU->pixelSegmentIndices, pixelTripletsInGPU->pixelSegmentIndices, nPixelTriplets * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(pixelTripletsInCPU->pixelRadius, pixelTripletsInGPU->pixelRadius, nPixelTriplets * sizeof(FPX), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(pixelTripletsInCPU->tripletRadius, pixelTripletsInGPU->tripletRadius, nPixelTriplets * sizeof(FPX), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(pixelTripletsInCPU->isDup, pixelTripletsInGPU->isDup, nPixelTriplets * sizeof(bool), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(pixelTripletsInCPU->eta, pixelTripletsInGPU->eta, nPixelTriplets * sizeof(FPX), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(pixelTripletsInCPU->phi, pixelTripletsInGPU->phi, nPixelTriplets * sizeof(FPX), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(pixelTripletsInCPU->score, pixelTripletsInGPU->score, nPixelTriplets * sizeof(FPX), cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); + int nPixelTriplets = *alpaka::getPtrNative(nPixelTriplets_buf); + pixelTripletsInCPU = new SDL::pixelTripletsBuffer(nPixelTriplets, devHost, queue); + pixelTripletsInCPU->setData(*pixelTripletsInCPU); + + *alpaka::getPtrNative(pixelTripletsInCPU->nPixelTriplets_buf) = nPixelTriplets; + alpaka::memcpy(queue, pixelTripletsInCPU->totOccupancyPixelTriplets_buf, pixelTripletsBuffers->totOccupancyPixelTriplets_buf, 1); + alpaka::memcpy(queue, pixelTripletsInCPU->rzChiSquared_buf, pixelTripletsBuffers->rzChiSquared_buf, nPixelTriplets); + alpaka::memcpy(queue, pixelTripletsInCPU->rPhiChiSquared_buf, pixelTripletsBuffers->rPhiChiSquared_buf, nPixelTriplets); + alpaka::memcpy(queue, pixelTripletsInCPU->rPhiChiSquaredInwards_buf, pixelTripletsBuffers->rPhiChiSquaredInwards_buf, nPixelTriplets); + alpaka::memcpy(queue, pixelTripletsInCPU->tripletIndices_buf, pixelTripletsBuffers->tripletIndices_buf, nPixelTriplets); + alpaka::memcpy(queue, pixelTripletsInCPU->pixelSegmentIndices_buf, pixelTripletsBuffers->pixelSegmentIndices_buf, nPixelTriplets); + alpaka::memcpy(queue, pixelTripletsInCPU->pixelRadius_buf, pixelTripletsBuffers->pixelRadius_buf, nPixelTriplets); + alpaka::memcpy(queue, pixelTripletsInCPU->tripletRadius_buf, pixelTripletsBuffers->tripletRadius_buf, nPixelTriplets); + alpaka::memcpy(queue, pixelTripletsInCPU->isDup_buf, pixelTripletsBuffers->isDup_buf, nPixelTriplets); + alpaka::memcpy(queue, pixelTripletsInCPU->eta_buf, pixelTripletsBuffers->eta_buf, nPixelTriplets); + alpaka::memcpy(queue, pixelTripletsInCPU->phi_buf, pixelTripletsBuffers->phi_buf, nPixelTriplets); + alpaka::memcpy(queue, pixelTripletsInCPU->score_buf, pixelTripletsBuffers->score_buf, nPixelTriplets); + alpaka::wait(queue); } return pixelTripletsInCPU; } -SDL::pixelQuintuplets* SDL::Event::getPixelQuintuplets() +SDL::pixelQuintupletsBuffer* SDL::Event::getPixelQuintuplets() { if(pixelQuintupletsInCPU == nullptr) { - pixelQuintupletsInCPU = new SDL::pixelQuintuplets; + // Get nMemoryLocations parameter to initilize host based quintupletsInCPU + auto nPixelQuintuplets_buf = allocBufWrapper(devHost, 1); + alpaka::memcpy(queue, nPixelQuintuplets_buf, pixelQuintupletsBuffers->nPixelQuintuplets_buf, 1); + alpaka::wait(queue); - pixelQuintupletsInCPU->nPixelQuintuplets = new int; - pixelQuintupletsInCPU->totOccupancyPixelQuintuplets = new int; - cudaMemcpyAsync(pixelQuintupletsInCPU->nPixelQuintuplets, pixelQuintupletsInGPU->nPixelQuintuplets, sizeof(int), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(pixelQuintupletsInCPU->totOccupancyPixelQuintuplets, pixelQuintupletsInGPU->totOccupancyPixelQuintuplets, sizeof(int), cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); - int nPixelQuintuplets = *(pixelQuintupletsInCPU->nPixelQuintuplets); - - pixelQuintupletsInCPU->pixelIndices = new unsigned int[nPixelQuintuplets]; - pixelQuintupletsInCPU->T5Indices = new unsigned int[nPixelQuintuplets]; - pixelQuintupletsInCPU->isDup = new bool[nPixelQuintuplets]; - pixelQuintupletsInCPU->score = new FPX[nPixelQuintuplets]; - pixelQuintupletsInCPU->rzChiSquared = new float[nPixelQuintuplets]; - pixelQuintupletsInCPU->rPhiChiSquared = new float[nPixelQuintuplets]; - pixelQuintupletsInCPU->rPhiChiSquaredInwards = new float[nPixelQuintuplets]; - - cudaMemcpyAsync(pixelQuintupletsInCPU->rzChiSquared, pixelQuintupletsInGPU->rzChiSquared, nPixelQuintuplets * sizeof(float), cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(pixelQuintupletsInCPU->rPhiChiSquared, pixelQuintupletsInGPU->rPhiChiSquared, nPixelQuintuplets * sizeof(float), cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(pixelQuintupletsInCPU->rPhiChiSquaredInwards, pixelQuintupletsInGPU->rPhiChiSquaredInwards, nPixelQuintuplets * sizeof(float), cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(pixelQuintupletsInCPU->pixelIndices, pixelQuintupletsInGPU->pixelIndices, nPixelQuintuplets * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(pixelQuintupletsInCPU->T5Indices, pixelQuintupletsInGPU->T5Indices, nPixelQuintuplets * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(pixelQuintupletsInCPU->isDup, pixelQuintupletsInGPU->isDup, nPixelQuintuplets * sizeof(bool), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(pixelQuintupletsInCPU->score, pixelQuintupletsInGPU->score, nPixelQuintuplets * sizeof(FPX), cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); + int nPixelQuintuplets = *alpaka::getPtrNative(nPixelQuintuplets_buf); + pixelQuintupletsInCPU = new SDL::pixelQuintupletsBuffer(nPixelQuintuplets, devHost, queue); + pixelQuintupletsInCPU->setData(*pixelQuintupletsInCPU); + + *alpaka::getPtrNative(pixelQuintupletsInCPU->nPixelQuintuplets_buf) = nPixelQuintuplets; + alpaka::memcpy(queue, pixelQuintupletsInCPU->totOccupancyPixelQuintuplets_buf, pixelQuintupletsBuffers->totOccupancyPixelQuintuplets_buf, 1); + alpaka::memcpy(queue, pixelQuintupletsInCPU->rzChiSquared_buf, pixelQuintupletsBuffers->rzChiSquared_buf, nPixelQuintuplets); + alpaka::memcpy(queue, pixelQuintupletsInCPU->rPhiChiSquared_buf, pixelQuintupletsBuffers->rPhiChiSquared_buf, nPixelQuintuplets); + alpaka::memcpy(queue, pixelQuintupletsInCPU->rPhiChiSquaredInwards_buf, pixelQuintupletsBuffers->rPhiChiSquaredInwards_buf, nPixelQuintuplets); + alpaka::memcpy(queue, pixelQuintupletsInCPU->pixelIndices_buf, pixelQuintupletsBuffers->pixelIndices_buf, nPixelQuintuplets); + alpaka::memcpy(queue, pixelQuintupletsInCPU->T5Indices_buf, pixelQuintupletsBuffers->T5Indices_buf, nPixelQuintuplets); + alpaka::memcpy(queue, pixelQuintupletsInCPU->isDup_buf, pixelQuintupletsBuffers->isDup_buf, nPixelQuintuplets); + alpaka::memcpy(queue, pixelQuintupletsInCPU->score_buf, pixelQuintupletsBuffers->score_buf, nPixelQuintuplets); + alpaka::wait(queue); } return pixelQuintupletsInCPU; } diff --git a/SDL/Event.cuh b/SDL/Event.cuh index 31d3da3a..b512b469 100644 --- a/SDL/Event.cuh +++ b/SDL/Event.cuh @@ -54,7 +54,9 @@ namespace SDL struct trackCandidates* trackCandidatesInGPU; struct trackCandidatesBuffer* trackCandidatesBuffers; struct pixelTriplets* pixelTripletsInGPU; + struct pixelTripletsBuffer* pixelTripletsBuffers; struct pixelQuintuplets* pixelQuintupletsInGPU; + struct pixelQuintupletsBuffer* pixelQuintupletsBuffers; //CPU interface stuff objectRangesBuffer* rangesInCPU; @@ -66,8 +68,8 @@ namespace SDL modules* modulesInCPU; modules* modulesInCPUFull; quintupletsBuffer* quintupletsInCPU; - pixelTriplets* pixelTripletsInCPU; - pixelQuintuplets* pixelQuintupletsInCPU; + pixelTripletsBuffer* pixelTripletsInCPU; + pixelQuintupletsBuffer* pixelQuintupletsInCPU; int* superbinCPU; int8_t* pixelTypeCPU; @@ -145,10 +147,10 @@ namespace SDL quintupletsBuffer* getQuintuplets(); trackCandidatesBuffer* getTrackCandidates(); trackCandidatesBuffer* getTrackCandidatesInCMSSW(); - pixelTriplets* getPixelTriplets(); + pixelTripletsBuffer* getPixelTriplets(); + pixelQuintupletsBuffer* getPixelQuintuplets(); modules* getModules(); modules* getFullModules(); - pixelQuintuplets* getPixelQuintuplets(); }; //global stuff diff --git a/SDL/PixelTriplet.cu b/SDL/PixelTriplet.cu deleted file mode 100644 index dc0a2496..00000000 --- a/SDL/PixelTriplet.cu +++ /dev/null @@ -1,242 +0,0 @@ -#include "PixelTriplet.cuh" - -SDL::pixelTriplets::pixelTriplets() -{ - pixelSegmentIndices = nullptr; - tripletIndices = nullptr; - nPixelTriplets = nullptr; - totOccupancyPixelTriplets = nullptr; - pixelRadius = nullptr; - tripletRadius = nullptr; - pt = nullptr; - isDup = nullptr; - partOfPT5 = nullptr; - centerX = nullptr; - centerY = nullptr; - hitIndices = nullptr; - lowerModuleIndices = nullptr; - logicalLayers = nullptr; - rzChiSquared = nullptr; - rPhiChiSquared = nullptr; - rPhiChiSquaredInwards = nullptr; -} - -void SDL::pixelTriplets::freeMemoryCache() -{ - int dev; - cudaGetDevice(&dev); - cms::cuda::free_device(dev,pixelSegmentIndices); - cms::cuda::free_device(dev,tripletIndices); - cms::cuda::free_device(dev,nPixelTriplets); - cms::cuda::free_device(dev,totOccupancyPixelTriplets); - cms::cuda::free_device(dev,pixelRadius); - cms::cuda::free_device(dev,tripletRadius); - cms::cuda::free_device(dev,pt); - cms::cuda::free_device(dev,isDup); - cms::cuda::free_device(dev,partOfPT5); - cms::cuda::free_device(dev, centerX); - cms::cuda::free_device(dev, centerY); - cms::cuda::free_device(dev, hitIndices); - cms::cuda::free_device(dev, logicalLayers); - cms::cuda::free_device(dev, lowerModuleIndices); - cms::cuda::free_device(dev, rPhiChiSquared); - cms::cuda::free_device(dev, rPhiChiSquaredInwards); - cms::cuda::free_device(dev, rzChiSquared); -} - -void SDL::pixelTriplets::freeMemory(cudaStream_t stream) -{ - cudaFree(pixelSegmentIndices); - cudaFree(tripletIndices); - cudaFree(nPixelTriplets); - cudaFree(totOccupancyPixelTriplets); - cudaFree(pixelRadius); - cudaFree(tripletRadius); - cudaFree(pt); - cudaFree(isDup); - cudaFree(partOfPT5); - cudaFree(centerX); - cudaFree(centerY); - cudaFree(logicalLayers); - cudaFree(hitIndices); - cudaFree(lowerModuleIndices); - cudaFree(rPhiChiSquared); - cudaFree(rPhiChiSquaredInwards); - cudaFree(rzChiSquared); -} - -SDL::pixelTriplets::~pixelTriplets() -{ -} - -void SDL::createPixelTripletsInExplicitMemory(struct pixelTriplets& pixelTripletsInGPU, unsigned int maxPixelTriplets, cudaStream_t stream) -{ -#ifdef CACHE_ALLOC - int dev; - cudaGetDevice(&dev); - pixelTripletsInGPU.pixelSegmentIndices =(unsigned int*)cms::cuda::allocate_device(dev,maxPixelTriplets * sizeof(unsigned int),stream); - pixelTripletsInGPU.tripletIndices =(unsigned int*)cms::cuda::allocate_device(dev,maxPixelTriplets * sizeof(unsigned int),stream); - pixelTripletsInGPU.nPixelTriplets =(int*)cms::cuda::allocate_device(dev,sizeof(int),stream); - pixelTripletsInGPU.totOccupancyPixelTriplets =(int*)cms::cuda::allocate_device(dev,sizeof(int),stream); - pixelTripletsInGPU.pixelRadius =(FPX*)cms::cuda::allocate_device(dev,maxPixelTriplets * sizeof(FPX),stream); - pixelTripletsInGPU.tripletRadius =(FPX*)cms::cuda::allocate_device(dev,maxPixelTriplets * sizeof(FPX),stream); - pixelTripletsInGPU.pt =(FPX*)cms::cuda::allocate_device(dev,maxPixelTriplets * 6*sizeof(FPX),stream); - pixelTripletsInGPU.isDup =(bool*)cms::cuda::allocate_device(dev,maxPixelTriplets * sizeof(bool),stream); - pixelTripletsInGPU.partOfPT5 =(bool*)cms::cuda::allocate_device(dev,maxPixelTriplets * sizeof(bool),stream); - pixelTripletsInGPU.centerX = (FPX*)cms::cuda::allocate_device(dev, maxPixelTriplets * sizeof(FPX), stream); - pixelTripletsInGPU.centerY = (FPX*)cms::cuda::allocate_device(dev, maxPixelTriplets * sizeof(FPX), stream); - pixelTripletsInGPU.lowerModuleIndices = (uint16_t*)cms::cuda::allocate_device(dev, maxPixelTriplets * sizeof(uint16_t) * 5, stream); - pixelTripletsInGPU.hitIndices = (unsigned int*)cms::cuda::allocate_device(dev, maxPixelTriplets * sizeof(unsigned int) * 10, stream); - pixelTripletsInGPU.logicalLayers = (uint8_t*)cms::cuda::allocate_device(dev, maxPixelTriplets * sizeof(uint8_t) * 5, stream); - - pixelTripletsInGPU.rPhiChiSquared = (float*)cms::cuda::allocate_device(dev, maxPixelTriplets * sizeof(float), stream); - pixelTripletsInGPU.rPhiChiSquaredInwards = (float*)cms::cuda::allocate_device(dev, maxPixelTriplets * sizeof(float), stream); - pixelTripletsInGPU.rzChiSquared = (float*)cms::cuda::allocate_device(dev, maxPixelTriplets * sizeof(float), stream); -#else - cudaMalloc(&pixelTripletsInGPU.pixelSegmentIndices, maxPixelTriplets * sizeof(unsigned int)); - cudaMalloc(&pixelTripletsInGPU.tripletIndices, maxPixelTriplets * sizeof(unsigned int)); - cudaMalloc(&pixelTripletsInGPU.nPixelTriplets, sizeof(int)); - cudaMalloc(&pixelTripletsInGPU.totOccupancyPixelTriplets, sizeof(int)); - cudaMalloc(&pixelTripletsInGPU.pixelRadius, maxPixelTriplets * sizeof(FPX)); - cudaMalloc(&pixelTripletsInGPU.tripletRadius, maxPixelTriplets * sizeof(FPX)); - cudaMalloc(&pixelTripletsInGPU.pt, maxPixelTriplets * 6*sizeof(FPX)); - cudaMalloc(&pixelTripletsInGPU.isDup, maxPixelTriplets * sizeof(bool)); - cudaMalloc(&pixelTripletsInGPU.partOfPT5, maxPixelTriplets * sizeof(bool)); - cudaMalloc(&pixelTripletsInGPU.centerX, maxPixelTriplets * sizeof(FPX)); - cudaMalloc(&pixelTripletsInGPU.centerY, maxPixelTriplets * sizeof(FPX)); - cudaMalloc(&pixelTripletsInGPU.logicalLayers, maxPixelTriplets * sizeof(uint8_t) * 5); - cudaMalloc(&pixelTripletsInGPU.hitIndices, maxPixelTriplets * sizeof(unsigned int) * 10); - cudaMalloc(&pixelTripletsInGPU.lowerModuleIndices, maxPixelTriplets * sizeof(uint16_t) * 5); - cudaMalloc(&pixelTripletsInGPU.rPhiChiSquared, maxPixelTriplets * sizeof(float)); - cudaMalloc(&pixelTripletsInGPU.rPhiChiSquaredInwards, maxPixelTriplets * sizeof(float)); - cudaMalloc(&pixelTripletsInGPU.rzChiSquared, maxPixelTriplets * sizeof(float)); -#endif - cudaMemsetAsync(pixelTripletsInGPU.nPixelTriplets, 0, sizeof(int),stream); - cudaMemsetAsync(pixelTripletsInGPU.totOccupancyPixelTriplets, 0, sizeof(int),stream); - cudaMemsetAsync(pixelTripletsInGPU.partOfPT5, 0, maxPixelTriplets*sizeof(bool),stream); - cudaStreamSynchronize(stream); - - pixelTripletsInGPU.eta = pixelTripletsInGPU.pt + maxPixelTriplets; - pixelTripletsInGPU.phi = pixelTripletsInGPU.pt + maxPixelTriplets * 2; - pixelTripletsInGPU.eta_pix = pixelTripletsInGPU.pt + maxPixelTriplets *3; - pixelTripletsInGPU.phi_pix = pixelTripletsInGPU.pt + maxPixelTriplets * 4; - pixelTripletsInGPU.score = pixelTripletsInGPU.pt + maxPixelTriplets * 5; -} - -SDL::pixelQuintuplets::pixelQuintuplets() -{ - pixelIndices = nullptr; - T5Indices = nullptr; - nPixelQuintuplets = nullptr; - totOccupancyPixelQuintuplets = nullptr; - isDup = nullptr; - score = nullptr; - pixelRadius = nullptr; - quintupletRadius = nullptr; - centerX = nullptr; - centerY = nullptr; - logicalLayers = nullptr; - hitIndices = nullptr; - lowerModuleIndices = nullptr; -} - -SDL::pixelQuintuplets::~pixelQuintuplets() -{ -} - -void SDL::pixelQuintuplets::freeMemoryCache() -{ - int dev; - cudaGetDevice(&dev); - cms::cuda::free_device(dev,pixelIndices); - cms::cuda::free_device(dev,T5Indices); - cms::cuda::free_device(dev,nPixelQuintuplets); - cms::cuda::free_device(dev,totOccupancyPixelQuintuplets); - cms::cuda::free_device(dev,isDup); - cms::cuda::free_device(dev,score); - cms::cuda::free_device(dev,eta); - cms::cuda::free_device(dev,phi); - cms::cuda::free_device(dev, hitIndices); - cms::cuda::free_device(dev, logicalLayers); - cms::cuda::free_device(dev, lowerModuleIndices); - cms::cuda::free_device(dev, centerX); - cms::cuda::free_device(dev, centerY); - cms::cuda::free_device(dev, pixelRadius); - cms::cuda::free_device(dev, quintupletRadius); - cms::cuda::free_device(dev, rzChiSquared); - cms::cuda::free_device(dev, rPhiChiSquared); - cms::cuda::free_device(dev, rPhiChiSquaredInwards); -} - -void SDL::pixelQuintuplets::freeMemory(cudaStream_t stream) -{ - cudaFree(pixelIndices); - cudaFree(T5Indices); - cudaFree(nPixelQuintuplets); - cudaFree(totOccupancyPixelQuintuplets); - cudaFree(isDup); - cudaFree(score); - cudaFree(eta); - cudaFree(phi); - - cudaFree(logicalLayers); - cudaFree(hitIndices); - cudaFree(lowerModuleIndices); - cudaFree(pixelRadius); - cudaFree(quintupletRadius); - cudaFree(centerX); - cudaFree(centerY); - cudaFree(rzChiSquared); - cudaFree(rPhiChiSquared); - cudaFree(rPhiChiSquaredInwards); - cudaStreamSynchronize(stream); -} - -void SDL::createPixelQuintupletsInExplicitMemory(struct SDL::pixelQuintuplets& pixelQuintupletsInGPU, unsigned int maxPixelQuintuplets,cudaStream_t stream) -{ -#ifdef CACHE_ALLOC - int dev; - cudaGetDevice(&dev); - pixelQuintupletsInGPU.pixelIndices = (unsigned int*)cms::cuda::allocate_device(dev,maxPixelQuintuplets * sizeof(unsigned int),stream); - pixelQuintupletsInGPU.T5Indices = (unsigned int*)cms::cuda::allocate_device(dev,maxPixelQuintuplets * sizeof(unsigned int),stream); - pixelQuintupletsInGPU.nPixelQuintuplets = (int*)cms::cuda::allocate_device(dev,sizeof(int),stream); - pixelQuintupletsInGPU.totOccupancyPixelQuintuplets = (int*)cms::cuda::allocate_device(dev,sizeof(unsigned int),stream); - pixelQuintupletsInGPU.isDup = (bool*)cms::cuda::allocate_device(dev,maxPixelQuintuplets * sizeof(bool),stream); - pixelQuintupletsInGPU.score = (FPX*)cms::cuda::allocate_device(dev,maxPixelQuintuplets * sizeof(FPX),stream); - pixelQuintupletsInGPU.eta = (FPX*)cms::cuda::allocate_device(dev,maxPixelQuintuplets * sizeof(FPX),stream); - pixelQuintupletsInGPU.phi = (FPX*)cms::cuda::allocate_device(dev,maxPixelQuintuplets * sizeof(FPX),stream); - pixelQuintupletsInGPU.hitIndices = (unsigned int*)cms::cuda::allocate_device(dev, maxPixelQuintuplets * 14 * sizeof(unsigned int), stream); - pixelQuintupletsInGPU.logicalLayers = (uint8_t*)cms::cuda::allocate_device(dev, maxPixelQuintuplets * 7 * sizeof(uint8_t), stream); - pixelQuintupletsInGPU.lowerModuleIndices = (uint16_t*)cms::cuda::allocate_device(dev, maxPixelQuintuplets * 7 * sizeof(uint16_t), stream); - pixelQuintupletsInGPU.centerX = (FPX*)cms::cuda::allocate_device(dev, maxPixelQuintuplets * sizeof(FPX), stream); - pixelQuintupletsInGPU.centerY = (FPX*)cms::cuda::allocate_device(dev, maxPixelQuintuplets * sizeof(FPX), stream); - pixelQuintupletsInGPU.pixelRadius = (FPX*)cms::cuda::allocate_device(dev, maxPixelQuintuplets * sizeof(FPX), stream); - pixelQuintupletsInGPU.quintupletRadius = (FPX*)cms::cuda::allocate_device(dev, maxPixelQuintuplets * sizeof(FPX), stream); - pixelQuintupletsInGPU.rzChiSquared = (float*)cms::cuda::allocate_device(dev, maxPixelQuintuplets * sizeof(float), stream); - pixelQuintupletsInGPU.rPhiChiSquared = (float*)cms::cuda::allocate_device(dev, maxPixelQuintuplets * sizeof(float), stream); - pixelQuintupletsInGPU.rPhiChiSquaredInwards = (float*)cms::cuda::allocate_device(dev, maxPixelQuintuplets * sizeof(float), stream); -#else - cudaMalloc(&pixelQuintupletsInGPU.pixelIndices, maxPixelQuintuplets * sizeof(unsigned int)); - cudaMalloc(&pixelQuintupletsInGPU.T5Indices, maxPixelQuintuplets * sizeof(unsigned int)); - cudaMalloc(&pixelQuintupletsInGPU.nPixelQuintuplets, sizeof(int)); - cudaMalloc(&pixelQuintupletsInGPU.totOccupancyPixelQuintuplets, sizeof(int)); - cudaMalloc(&pixelQuintupletsInGPU.isDup, maxPixelQuintuplets * sizeof(bool)); - cudaMalloc(&pixelQuintupletsInGPU.score, maxPixelQuintuplets * sizeof(FPX)); - cudaMalloc(&pixelQuintupletsInGPU.eta , maxPixelQuintuplets * sizeof(FPX)); - cudaMalloc(&pixelQuintupletsInGPU.phi , maxPixelQuintuplets * sizeof(FPX)); - - cudaMalloc(&pixelQuintupletsInGPU.logicalLayers, maxPixelQuintuplets * 7 *sizeof(uint8_t)); - cudaMalloc(&pixelQuintupletsInGPU.hitIndices, maxPixelQuintuplets * 14 * sizeof(unsigned int)); - cudaMalloc(&pixelQuintupletsInGPU.lowerModuleIndices, maxPixelQuintuplets * 7 * sizeof(uint16_t)); - cudaMalloc(&pixelQuintupletsInGPU.pixelRadius, maxPixelQuintuplets * sizeof(FPX)); - cudaMalloc(&pixelQuintupletsInGPU.quintupletRadius, maxPixelQuintuplets * sizeof(FPX)); - cudaMalloc(&pixelQuintupletsInGPU.centerX, maxPixelQuintuplets * sizeof(FPX)); - cudaMalloc(&pixelQuintupletsInGPU.centerY, maxPixelQuintuplets * sizeof(FPX)); - cudaMalloc(&pixelQuintupletsInGPU.rzChiSquared, maxPixelQuintuplets * sizeof(unsigned int)); - cudaMalloc(&pixelQuintupletsInGPU.rPhiChiSquared, maxPixelQuintuplets * sizeof(unsigned int)); - cudaMalloc(&pixelQuintupletsInGPU.rPhiChiSquaredInwards, maxPixelQuintuplets * sizeof(unsigned int)); -#endif - cudaMemsetAsync(pixelQuintupletsInGPU.nPixelQuintuplets, 0, sizeof(int),stream); - cudaMemsetAsync(pixelQuintupletsInGPU.totOccupancyPixelQuintuplets, 0, sizeof(int),stream); - cudaStreamSynchronize(stream); -} diff --git a/SDL/PixelTriplet.cuh b/SDL/PixelTriplet.cuh index 422d8959..c40f8283 100644 --- a/SDL/PixelTriplet.cuh +++ b/SDL/PixelTriplet.cuh @@ -11,12 +11,13 @@ namespace SDL { - struct pixelTriplets //one pixel segment, one outer tracker triplet! + // One pixel segment, one outer tracker triplet! + struct pixelTriplets { unsigned int* pixelSegmentIndices; unsigned int* tripletIndices; - int* nPixelTriplets; //size 1 - int* totOccupancyPixelTriplets; //size 1 + int* nPixelTriplets; + int* totOccupancyPixelTriplets; float* pixelRadiusError; float* rPhiChiSquared; @@ -40,13 +41,96 @@ namespace SDL FPX* centerX; FPX* centerY; - pixelTriplets(); - ~pixelTriplets(); - void freeMemory(cudaStream_t stream); - void freeMemoryCache(); + template + void setData(TBuff& pixelTripletsBuffer) + { + pixelSegmentIndices = alpaka::getPtrNative(pixelTripletsBuffer.pixelSegmentIndices_buf); + tripletIndices = alpaka::getPtrNative(pixelTripletsBuffer.tripletIndices_buf); + nPixelTriplets = alpaka::getPtrNative(pixelTripletsBuffer.nPixelTriplets_buf); + totOccupancyPixelTriplets = alpaka::getPtrNative(pixelTripletsBuffer.totOccupancyPixelTriplets_buf); + pixelRadius = alpaka::getPtrNative(pixelTripletsBuffer.pixelRadius_buf); + tripletRadius = alpaka::getPtrNative(pixelTripletsBuffer.tripletRadius_buf); + pt = alpaka::getPtrNative(pixelTripletsBuffer.pt_buf); + eta = alpaka::getPtrNative(pixelTripletsBuffer.eta_buf); + phi = alpaka::getPtrNative(pixelTripletsBuffer.phi_buf); + eta_pix = alpaka::getPtrNative(pixelTripletsBuffer.eta_pix_buf); + phi_pix = alpaka::getPtrNative(pixelTripletsBuffer.phi_pix_buf); + score = alpaka::getPtrNative(pixelTripletsBuffer.score_buf); + isDup = alpaka::getPtrNative(pixelTripletsBuffer.isDup_buf); + partOfPT5 = alpaka::getPtrNative(pixelTripletsBuffer.partOfPT5_buf); + logicalLayers = alpaka::getPtrNative(pixelTripletsBuffer.logicalLayers_buf); + hitIndices = alpaka::getPtrNative(pixelTripletsBuffer.hitIndices_buf); + lowerModuleIndices = alpaka::getPtrNative(pixelTripletsBuffer.lowerModuleIndices_buf); + centerX = alpaka::getPtrNative(pixelTripletsBuffer.centerX_buf); + centerY = alpaka::getPtrNative(pixelTripletsBuffer.centerY_buf); + pixelRadiusError = alpaka::getPtrNative(pixelTripletsBuffer.pixelRadiusError_buf); + rPhiChiSquared = alpaka::getPtrNative(pixelTripletsBuffer.rPhiChiSquared_buf); + rPhiChiSquaredInwards = alpaka::getPtrNative(pixelTripletsBuffer.rPhiChiSquaredInwards_buf); + rzChiSquared = alpaka::getPtrNative(pixelTripletsBuffer.rzChiSquared_buf); + } }; - void createPixelTripletsInExplicitMemory(struct pixelTriplets& pixelTripletsinGPU, unsigned int maxPixelTriplets, cudaStream_t stream); + template + struct pixelTripletsBuffer : pixelTriplets + { + Buf pixelSegmentIndices_buf; + Buf tripletIndices_buf; + Buf nPixelTriplets_buf; + Buf totOccupancyPixelTriplets_buf; + Buf pixelRadius_buf; + Buf tripletRadius_buf; + Buf pt_buf; + Buf eta_buf; + Buf phi_buf; + Buf eta_pix_buf; + Buf phi_pix_buf; + Buf score_buf; + Buf isDup_buf; + Buf partOfPT5_buf; + Buf logicalLayers_buf; + Buf hitIndices_buf; + Buf lowerModuleIndices_buf; + Buf centerX_buf; + Buf centerY_buf; + Buf pixelRadiusError_buf; + Buf rPhiChiSquared_buf; + Buf rPhiChiSquaredInwards_buf; + Buf rzChiSquared_buf; + + template + pixelTripletsBuffer(unsigned int maxPixelTriplets, + TDevAcc const & devAccIn, + TQueue& queue) : + pixelSegmentIndices_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), + tripletIndices_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), + nPixelTriplets_buf(allocBufWrapper(devAccIn, 1)), + totOccupancyPixelTriplets_buf(allocBufWrapper(devAccIn, 1)), + pixelRadius_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), + tripletRadius_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), + pt_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), + eta_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), + phi_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), + eta_pix_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), + phi_pix_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), + score_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), + isDup_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), + partOfPT5_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), + logicalLayers_buf(allocBufWrapper(devAccIn, maxPixelTriplets*5)), + hitIndices_buf(allocBufWrapper(devAccIn, maxPixelTriplets*10)), + lowerModuleIndices_buf(allocBufWrapper(devAccIn, maxPixelTriplets*5)), + centerX_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), + centerY_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), + pixelRadiusError_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), + rPhiChiSquared_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), + rPhiChiSquaredInwards_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), + rzChiSquared_buf(allocBufWrapper(devAccIn, maxPixelTriplets)) + { + alpaka::memset(queue, nPixelTriplets_buf, 0, 1); + alpaka::memset(queue, totOccupancyPixelTriplets_buf, 0, 1); + alpaka::memset(queue, partOfPT5_buf, 0, maxPixelTriplets); + alpaka::wait(queue); + } + }; ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, struct triplets& tripletsInGPU, struct pixelTriplets& pixelTripletsInGPU, unsigned int pixelSegmentIndex, unsigned int tripletIndex, float pixelRadius, float tripletRadius, float centerX, float centerY, float rPhiChiSquared, float rPhiChiSquaredInwards, float rzChiSquared, unsigned int pixelTripletIndex, float pt, float eta, float phi, float eta_pix, float phi_pix,float score) { @@ -1376,14 +1460,80 @@ namespace SDL float* rPhiChiSquared; float* rPhiChiSquaredInwards; - pixelQuintuplets(); - ~pixelQuintuplets(); - void freeMemory(cudaStream_t stream); - void freeMemoryCache(); - + template + void setData(TBuff& pixelQuintupletsBuffer) + { + pixelIndices = alpaka::getPtrNative(pixelQuintupletsBuffer.pixelIndices_buf); + T5Indices = alpaka::getPtrNative(pixelQuintupletsBuffer.T5Indices_buf); + nPixelQuintuplets = alpaka::getPtrNative(pixelQuintupletsBuffer.nPixelQuintuplets_buf); + totOccupancyPixelQuintuplets = alpaka::getPtrNative(pixelQuintupletsBuffer.totOccupancyPixelQuintuplets_buf); + isDup = alpaka::getPtrNative(pixelQuintupletsBuffer.isDup_buf); + score = alpaka::getPtrNative(pixelQuintupletsBuffer.score_buf); + eta = alpaka::getPtrNative(pixelQuintupletsBuffer.eta_buf); + phi = alpaka::getPtrNative(pixelQuintupletsBuffer.phi_buf); + logicalLayers = alpaka::getPtrNative(pixelQuintupletsBuffer.logicalLayers_buf); + hitIndices = alpaka::getPtrNative(pixelQuintupletsBuffer.hitIndices_buf); + lowerModuleIndices = alpaka::getPtrNative(pixelQuintupletsBuffer.lowerModuleIndices_buf); + pixelRadius = alpaka::getPtrNative(pixelQuintupletsBuffer.pixelRadius_buf); + quintupletRadius = alpaka::getPtrNative(pixelQuintupletsBuffer.quintupletRadius_buf); + centerX = alpaka::getPtrNative(pixelQuintupletsBuffer.centerX_buf); + centerY = alpaka::getPtrNative(pixelQuintupletsBuffer.centerY_buf); + rzChiSquared = alpaka::getPtrNative(pixelQuintupletsBuffer.rzChiSquared_buf); + rPhiChiSquared = alpaka::getPtrNative(pixelQuintupletsBuffer.rPhiChiSquared_buf); + rPhiChiSquaredInwards = alpaka::getPtrNative(pixelQuintupletsBuffer.rPhiChiSquaredInwards_buf); + } }; - void createPixelQuintupletsInExplicitMemory(struct SDL::pixelQuintuplets& pixelQuintupletsInGPU, unsigned int maxPixelQuintuplets, cudaStream_t stream); + template + struct pixelQuintupletsBuffer : pixelQuintuplets + { + Buf pixelIndices_buf; + Buf T5Indices_buf; + Buf nPixelQuintuplets_buf; + Buf totOccupancyPixelQuintuplets_buf; + Buf isDup_buf; + Buf score_buf; + Buf eta_buf; + Buf phi_buf; + Buf logicalLayers_buf; + Buf hitIndices_buf; + Buf lowerModuleIndices_buf; + Buf pixelRadius_buf; + Buf quintupletRadius_buf; + Buf centerX_buf; + Buf centerY_buf; + Buf rzChiSquared_buf; + Buf rPhiChiSquared_buf; + Buf rPhiChiSquaredInwards_buf; + + template + pixelQuintupletsBuffer(unsigned int maxPixelQuintuplets, + TDevAcc const & devAccIn, + TQueue& queue) : + pixelIndices_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets)), + T5Indices_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets)), + nPixelQuintuplets_buf(allocBufWrapper(devAccIn, 1)), + totOccupancyPixelQuintuplets_buf(allocBufWrapper(devAccIn, 1)), + isDup_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets)), + score_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets)), + eta_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets)), + phi_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets)), + logicalLayers_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets*7)), + hitIndices_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets*14)), + lowerModuleIndices_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets*7)), + pixelRadius_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets)), + quintupletRadius_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets)), + centerX_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets)), + centerY_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets)), + rzChiSquared_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets)), + rPhiChiSquared_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets)), + rPhiChiSquaredInwards_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets)) + { + alpaka::memset(queue, nPixelQuintuplets_buf, 0, 1); + alpaka::memset(queue, totOccupancyPixelQuintuplets_buf, 0, 1); + alpaka::wait(queue); + } + }; ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelQuintupletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, struct quintuplets& quintupletsInGPU, struct pixelQuintuplets& pixelQuintupletsInGPU, unsigned int pixelIndex, unsigned int T5Index, unsigned int pixelQuintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float score, float eta, float phi, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY) { diff --git a/code/core/AccessHelper.cc b/code/core/AccessHelper.cc index 60984428..f139982b 100644 --- a/code/core/AccessHelper.cc +++ b/code/core/AccessHelper.cc @@ -241,7 +241,7 @@ std::tuple, std::vector> getHitIdxsAndHi //____________________________________________________________________________________________ unsigned int getPixelLSFrompT3(SDL::Event* event, unsigned int pT3) { - SDL::pixelTriplets& pixelTriplets_ = *(event->getPixelTriplets()); + SDL::pixelTripletsBuffer& pixelTriplets_ = *(event->getPixelTriplets()); SDL::objectRangesBuffer& rangesInGPU = (*event->getRanges()); SDL::modules& modulesInGPU = (*event->getModules()); const unsigned int pLS_offset = rangesInGPU.segmentModuleIndices[*(modulesInGPU.nLowerModules)]; @@ -341,7 +341,7 @@ std::tuple, std::vector> getHitIdxsAndHi //____________________________________________________________________________________________ unsigned int getPixelLSFrompT5(SDL::Event* event, unsigned int pT5) { - SDL::pixelQuintuplets& pixelQuintuplets_ = *(event->getPixelQuintuplets()); + SDL::pixelQuintupletsBuffer& pixelQuintuplets_ = *(event->getPixelQuintuplets()); SDL::objectRangesBuffer& rangesInGPU = (*event->getRanges()); SDL::modules& modulesInGPU = (*event->getModules()); const unsigned int pLS_offset = rangesInGPU.segmentModuleIndices[*(modulesInGPU.nLowerModules)]; @@ -351,7 +351,7 @@ unsigned int getPixelLSFrompT5(SDL::Event* event, unsigned int pT5) //____________________________________________________________________________________________ unsigned int getT5FrompT5(SDL::Event* event, unsigned int pT5) { - SDL::pixelQuintuplets& pixelQuintuplets_ = *(event->getPixelQuintuplets()); + SDL::pixelQuintupletsBuffer& pixelQuintuplets_ = *(event->getPixelQuintuplets()); return pixelQuintuplets_.T5Indices[pT5]; } diff --git a/code/core/write_sdl_ntuple.cc b/code/core/write_sdl_ntuple.cc index 3a580c15..33f90068 100644 --- a/code/core/write_sdl_ntuple.cc +++ b/code/core/write_sdl_ntuple.cc @@ -305,7 +305,7 @@ void setOptionalOutputBranches(SDL::Event* event) void setPixelQuintupletOutputBranches(SDL::Event* event) { // ============ pT5 ============= - SDL::pixelQuintuplets& pixelQuintupletsInGPU = (*event->getPixelQuintuplets()); + SDL::pixelQuintupletsBuffer& pixelQuintupletsInGPU = (*event->getPixelQuintuplets()); SDL::quintupletsBuffer& quintupletsInGPU = (*event->getQuintuplets()); SDL::segmentsBuffer& segmentsInGPU = (*event->getSegments()); SDL::modules& modulesInGPU = (*event->getModules()); @@ -473,7 +473,7 @@ void setQuintupletOutputBranches(SDL::Event* event) //________________________________________________________________________________________________________________________________ void setPixelTripletOutputBranches(SDL::Event* event) { - SDL::pixelTriplets& pixelTripletsInGPU = (*event->getPixelTriplets()); + SDL::pixelTripletsBuffer& pixelTripletsInGPU = (*event->getPixelTriplets()); SDL::tripletsBuffer& tripletsInGPU = *(event->getTriplets()); SDL::modules& modulesInGPU = *(event->getModules()); SDL::segmentsBuffer& segmentsInGPU = *(event->getSegments()); From edcd381311a2321094a3a578b866881e9adc1084 Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Sun, 11 Jun 2023 15:39:53 -0400 Subject: [PATCH 27/44] move pixelmap + various host/copies + general cleanup --- SDL/Constants.cuh | 2 + SDL/EndcapGeometry.cuh | 2 + SDL/Event.cu | 414 ++++++++++++++++++++--------------------- SDL/Event.cuh | 2 +- SDL/Kernels.cuh | 11 +- SDL/MiniDoublet.cuh | 69 +++---- SDL/Module.cu | 126 +++++-------- SDL/Module.cuh | 28 ++- SDL/PixelTriplet.cuh | 2 + SDL/Quintuplet.cuh | 3 + SDL/Segment.cuh | 3 + SDL/TrackCandidate.cuh | 7 + SDL/Triplet.cuh | 3 + 13 files changed, 341 insertions(+), 331 deletions(-) diff --git a/SDL/Constants.cuh b/SDL/Constants.cuh index 66a874a7..f6adc16a 100644 --- a/SDL/Constants.cuh +++ b/SDL/Constants.cuh @@ -102,6 +102,8 @@ const unsigned int N_MAX_TRACK_CANDIDATE_EXTENSIONS = 200000; const unsigned int N_MAX_TRACK_EXTENSIONS_PER_TC = 30; const unsigned int N_MAX_T3T3_TRACK_EXTENSIONS = 40000; +const unsigned int size_superbins = 45000; + namespace SDL { //defining the constant host device variables right up here diff --git a/SDL/EndcapGeometry.cuh b/SDL/EndcapGeometry.cuh index 28364aed..4ad71b40 100644 --- a/SDL/EndcapGeometry.cuh +++ b/SDL/EndcapGeometry.cuh @@ -9,6 +9,8 @@ #include #include +#include "Constants.cuh" + namespace SDL { class EndcapGeometry diff --git a/SDL/Event.cu b/SDL/Event.cu index c9a5871d..27a96ac7 100644 --- a/SDL/Event.cu +++ b/SDL/Event.cu @@ -1,7 +1,7 @@ #include "Event.cuh" struct SDL::modules* SDL::modulesInGPU = nullptr; -struct SDL::pixelMap* SDL::pixelMapping = nullptr; +std::unique_ptr SDL::pixelMapping = std::make_unique(); uint16_t SDL::nModules; uint16_t SDL::nLowerModules; @@ -279,18 +279,15 @@ void SDL::initModules(const char* moduleMetaDataFilePath) if(modulesInGPU == nullptr) { cudaMallocHost(&modulesInGPU, sizeof(struct SDL::modules)); - cudaMallocHost(&pixelMapping, sizeof(struct SDL::pixelMap)); //nModules gets filled here loadModulesFromFile(*modulesInGPU,nModules,nLowerModules, *pixelMapping, default_stream, moduleMetaDataFilePath); - cudaStreamSynchronize(default_stream); } } void SDL::cleanModules() { - freeModules(*modulesInGPU, *pixelMapping); + freeModules(*modulesInGPU); cudaFreeHost(modulesInGPU); - cudaFreeHost(pixelMapping); } void SDL::Event::addHitToEvent(std::vector x, std::vector y, std::vector z, std::vector detId, std::vector idxInNtuple) @@ -298,7 +295,7 @@ void SDL::Event::addHitToEvent(std::vector x, std::vector y, std:: // Use the actual number of hits instead of a max. const int nHits = x.size(); - // Needed for the memcpy to hitsInGPU below. + // Needed for the memcpy to hitsInGPU below. Will be replaced with a View. auto nHits_buf = allocBufWrapper(devHost, 1); *alpaka::getPtrNative(nHits_buf) = nHits; @@ -375,7 +372,6 @@ void SDL::Event::addPixelSegmentToEvent(std::vector hitIndices0,st if(mdsInGPU == nullptr) { - unsigned int nTotalMDs; cudaMemsetAsync(&rangesInGPU->miniDoubletModuleOccupancy[nLowerModules],N_MAX_PIXEL_MD_PER_MODULES, sizeof(unsigned int),stream); Vec const threadsPerBlockCreateMD(static_cast(1), static_cast(1), static_cast(1024)); @@ -392,6 +388,7 @@ void SDL::Event::addPixelSegmentToEvent(std::vector hitIndices0,st alpaka::enqueue(queue, createMDArrayRangesGPUTask); alpaka::wait(queue); + unsigned int nTotalMDs; cudaMemcpyAsync(&nTotalMDs,rangesInGPU->device_nTotalMDs,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream); cudaStreamSynchronize(stream); nTotalMDs += N_MAX_PIXEL_MD_PER_MODULES; @@ -527,6 +524,7 @@ void SDL::Event::addMiniDoubletsToEventExplicit() } } + cms::cuda::free_host(nMDsCPU); cms::cuda::free_host(module_subdets); cms::cuda::free_host(module_layers); @@ -561,6 +559,7 @@ void SDL::Event::addSegmentsToEventExplicit() } } } + cms::cuda::free_host(nSegmentsCPU); cms::cuda::free_host(module_subdets); cms::cuda::free_host(module_layers); @@ -568,8 +567,6 @@ void SDL::Event::addSegmentsToEventExplicit() void SDL::Event::createMiniDoublets() { - //hardcoded range numbers for this will come from studies! - unsigned int nTotalMDs; cudaMemsetAsync(&rangesInGPU->miniDoubletModuleOccupancy[nLowerModules],N_MAX_PIXEL_MD_PER_MODULES, sizeof(unsigned int),stream); Vec const threadsPerBlockCreateMD(static_cast(1), static_cast(1), static_cast(1024)); @@ -586,9 +583,14 @@ void SDL::Event::createMiniDoublets() alpaka::enqueue(queue, createMDArrayRangesGPUTask); alpaka::wait(queue); - cudaMemcpyAsync(&nTotalMDs,rangesInGPU->device_nTotalMDs,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); - nTotalMDs+=N_MAX_PIXEL_MD_PER_MODULES; + auto nTotalMDs_buf = allocBufWrapper(devHost, 1); + + alpaka::memcpy(queue, nTotalMDs_buf, rangesBuffers->device_nTotalMDs_buf, 1); + alpaka::wait(queue); + + unsigned int nTotalMDs = *alpaka::getPtrNative(nTotalMDs_buf); + + nTotalMDs += N_MAX_PIXEL_MD_PER_MODULES; if(mdsInGPU == nullptr) { @@ -597,38 +599,6 @@ void SDL::Event::createMiniDoublets() mdsInGPU->setData(*miniDoubletsBuffers); } - int maxThreadsPerModule=0; - int* module_hitRanges; - module_hitRanges = (int*)cms::cuda::allocate_host(nModules* 2*sizeof(int), stream); - cudaMemcpyAsync(module_hitRanges,hitsInGPU->hitRanges,nModules*2*sizeof(int),cudaMemcpyDeviceToHost,stream); - bool* module_isLower; - module_isLower = (bool*)cms::cuda::allocate_host(nModules*sizeof(bool), stream); - cudaMemcpyAsync(module_isLower,modulesInGPU->isLower,nModules*sizeof(bool),cudaMemcpyDeviceToHost,stream); - bool* module_isInverted; - module_isInverted = (bool*)cms::cuda::allocate_host(nModules*sizeof(bool), stream); - cudaMemcpyAsync(module_isInverted,modulesInGPU->isInverted,nModules*sizeof(bool),cudaMemcpyDeviceToHost,stream); - int* module_partnerModuleIndices; - module_partnerModuleIndices = (int*)cms::cuda::allocate_host(nLowerModules * sizeof(unsigned int), stream); - cudaMemcpyAsync(module_partnerModuleIndices, modulesInGPU->partnerModuleIndices, nLowerModules * sizeof(unsigned int), cudaMemcpyDeviceToHost, stream); - cudaStreamSynchronize(stream); - - for (uint16_t lowerModuleIndex=0; lowerModuleIndex (nLowerHits*nUpperHits) ? maxThreadsPerModule : nLowerHits*nUpperHits; - } - } - cms::cuda::free_host(module_hitRanges); - cms::cuda::free_host(module_partnerModuleIndices); - cms::cuda::free_host(module_isLower); - cms::cuda::free_host(module_isInverted); - Vec const threadsPerBlockCreateMDInGPU(static_cast(1), static_cast(16), static_cast(32)); Vec const blocksPerGridCreateMDInGPU(static_cast(1), static_cast(MAX_BLOCKS), static_cast(1)); WorkDiv const createMiniDoubletsInGPUv2_workDiv(blocksPerGridCreateMDInGPU, threadsPerBlockCreateMDInGPU, elementsPerThread); @@ -715,8 +685,6 @@ void SDL::Event::createTriplets() { if(tripletsInGPU == nullptr) { - unsigned int maxTriplets; - Vec const threadsPerBlockCreateTrip(static_cast(1), static_cast(1), static_cast(1024)); Vec const blocksPerGridCreateTrip(static_cast(1), static_cast(1), static_cast(1)); WorkDiv const createTripletArrayRanges_workDiv(blocksPerGridCreateTrip, threadsPerBlockCreateTrip, elementsPerThread); @@ -732,33 +700,43 @@ void SDL::Event::createTriplets() alpaka::enqueue(queue, createTripletArrayRangesTask); alpaka::wait(queue); - cudaMemcpyAsync(&maxTriplets,rangesInGPU->device_nTotalTrips,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); + // TODO: Why are we pulling this back down only to put it back on the device in a new struct? + auto maxTriplets_buf = allocBufWrapper(devHost, 1); + + alpaka::memcpy(queue, maxTriplets_buf, rangesBuffers->device_nTotalTrips_buf, 1); + alpaka::wait(queue); tripletsInGPU = new SDL::triplets(); - tripletsBuffers = new SDL::tripletsBuffer(maxTriplets, nLowerModules, devAcc, queue); + tripletsBuffers = new SDL::tripletsBuffer(*alpaka::getPtrNative(maxTriplets_buf), nLowerModules, devAcc, queue); tripletsInGPU->setData(*tripletsBuffers); - cudaMemcpyAsync(tripletsInGPU->nMemoryLocations, &maxTriplets, sizeof(unsigned int), cudaMemcpyHostToDevice, stream); - cudaStreamSynchronize(stream); + alpaka::memcpy(queue, tripletsBuffers->nMemoryLocations_buf, maxTriplets_buf, 1); + alpaka::wait(queue); } - //TODO:Move this also inside the ranges function - uint16_t nonZeroModules=0; - unsigned int max_InnerSeg=0; - uint16_t *index = (uint16_t*)malloc(nLowerModules*sizeof(unsigned int)); - uint16_t *index_gpu; - index_gpu = (uint16_t*)cms::cuda::allocate_device(dev, nLowerModules*sizeof(uint16_t), stream); - unsigned int *nSegments = (unsigned int*)malloc(nLowerModules*sizeof(unsigned int)); - cudaMemcpyAsync((void *)nSegments, segmentsInGPU->nSegments, nLowerModules*sizeof(unsigned int), cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); + uint16_t nonZeroModules = 0; + unsigned int max_InnerSeg = 0; + + // Allocate host index + auto index_buf = allocBufWrapper(devHost, nLowerModules); + uint16_t *index = alpaka::getPtrNative(index_buf); + + // Allocate device index + auto index_gpu_buf = allocBufWrapper(devAcc, nLowerModules); + + // Allocate and copy nSegments from device to host + auto nSegments_buf = allocBufWrapper(devHost, nLowerModules); + alpaka::memcpy(queue, nSegments_buf, segmentsBuffers->nSegments_buf, nLowerModules); + alpaka::wait(queue); + int *nSegments = alpaka::getPtrNative(nSegments_buf); + uint16_t* module_nConnectedModules; module_nConnectedModules = (uint16_t*)cms::cuda::allocate_host(nLowerModules* sizeof(uint16_t), stream); cudaMemcpyAsync(module_nConnectedModules,modulesInGPU->nConnectedModules,nLowerModules*sizeof(uint16_t),cudaMemcpyDeviceToHost,stream); cudaStreamSynchronize(stream); - for (uint16_t innerLowerModuleIndex = 0; innerLowerModuleIndex (1), static_cast(16), static_cast(16)); Vec const blocksPerGridCreateTrip(static_cast(MAX_BLOCKS), static_cast(1), static_cast(1)); @@ -786,7 +767,7 @@ void SDL::Event::createTriplets() *segmentsInGPU, *tripletsInGPU, *rangesInGPU, - index_gpu, + alpaka::getPtrNative(index_gpu_buf), nonZeroModules)); alpaka::enqueue(queue, createTripletsInGPUv2Task); @@ -806,10 +787,6 @@ void SDL::Event::createTriplets() alpaka::enqueue(queue, addTripletRangesToEventExplicitTask); alpaka::wait(queue); - free(nSegments); - free(index); - cms::cuda::free_device(dev, index_gpu); - if(addObjects) { addTripletsToEventExplicit(); @@ -818,8 +795,6 @@ void SDL::Event::createTriplets() void SDL::Event::createTrackCandidates() { - uint16_t nEligibleModules; - cudaMemcpyAsync(&nEligibleModules,rangesInGPU->nEligibleT5Modules,sizeof(uint16_t),cudaMemcpyDeviceToHost,stream); if(trackCandidatesInGPU == nullptr) { trackCandidatesInGPU = new SDL::trackCandidates(); @@ -827,6 +802,11 @@ void SDL::Event::createTrackCandidates() trackCandidatesInGPU->setData(*trackCandidatesBuffers); } + // Pull nEligibleT5Modules from the device. + auto nEligibleModules_buf = allocBufWrapper(devHost, 1); + alpaka::memcpy(queue, nEligibleModules_buf, rangesBuffers->nEligibleT5Modules_buf, 1); + uint16_t nEligibleModules = *alpaka::getPtrNative(nEligibleModules_buf); + Vec const threadsPerBlock_crossCleanpT3(static_cast(1), static_cast(16), static_cast(64)); Vec const blocksPerGrid_crossCleanpT3(static_cast(1), static_cast(4), static_cast(20)); WorkDiv const crossCleanpT3_workDiv(blocksPerGrid_crossCleanpT3, blocksPerGrid_crossCleanpT3, elementsPerThread); @@ -843,7 +823,6 @@ void SDL::Event::createTrackCandidates() alpaka::enqueue(queue, crossCleanpT3Task); - //adding objects Vec const threadsPerBlock_addpT3asTrackCandidatesInGPU(static_cast(1), static_cast(1), static_cast(512)); Vec const blocksPerGrid_addpT3asTrackCandidatesInGPU(static_cast(1), static_cast(1), static_cast(1)); WorkDiv const addpT3asTrackCandidatesInGPU_workDiv(blocksPerGrid_addpT3asTrackCandidatesInGPU, threadsPerBlock_addpT3asTrackCandidatesInGPU, elementsPerThread); @@ -962,30 +941,27 @@ void SDL::Event::createPixelTriplets() pixelTripletsInGPU->setData(*pixelTripletsBuffers); } - unsigned int pixelModuleIndex = nLowerModules; - int* superbins; - int8_t* pixelTypes; - unsigned int *nTriplets; - unsigned int nInnerSegments = 0; - cudaMemcpyAsync(&nInnerSegments, &(segmentsInGPU->nSegments[pixelModuleIndex]), sizeof(int), cudaMemcpyDeviceToHost,stream); - nTriplets = (unsigned int*)cms::cuda::allocate_host(nLowerModules * sizeof(unsigned int), stream); - cudaMemcpyAsync(nTriplets, tripletsInGPU->nTriplets, nLowerModules * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream); - superbins = (int*)cms::cuda::allocate_host(N_MAX_PIXEL_SEGMENTS_PER_MODULE*sizeof(int), stream); - pixelTypes = (int8_t*)cms::cuda::allocate_host(N_MAX_PIXEL_SEGMENTS_PER_MODULE*sizeof(int8_t), stream); - - cudaMemcpyAsync(superbins,segmentsInGPU->superbin,N_MAX_PIXEL_SEGMENTS_PER_MODULE*sizeof(int),cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(pixelTypes,segmentsInGPU->pixelType,N_MAX_PIXEL_SEGMENTS_PER_MODULE*sizeof(int8_t),cudaMemcpyDeviceToHost,stream); - - unsigned int* connectedPixelSize_host; - unsigned int* connectedPixelIndex_host; - connectedPixelSize_host = (unsigned int*)cms::cuda::allocate_host(nInnerSegments* sizeof(unsigned int), stream); - connectedPixelIndex_host = (unsigned int*)cms::cuda::allocate_host(nInnerSegments* sizeof(unsigned int), stream); - unsigned int* connectedPixelSize_dev; - unsigned int* connectedPixelIndex_dev; - connectedPixelSize_dev = (unsigned int*)cms::cuda::allocate_device(dev, nInnerSegments*sizeof(unsigned int), stream); - connectedPixelIndex_dev = (unsigned int*)cms::cuda::allocate_device(dev, nInnerSegments*sizeof(unsigned int), stream); + unsigned int nInnerSegments; + cudaMemcpyAsync(&nInnerSegments, &(segmentsInGPU->nSegments[nLowerModules]), sizeof(int), cudaMemcpyDeviceToHost,stream); + + auto superbins_buf = allocBufWrapper(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE); + auto pixelTypes_buf = allocBufWrapper(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE); + + alpaka::memcpy(queue, superbins_buf, segmentsBuffers->superbin_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE); + alpaka::memcpy(queue, pixelTypes_buf, segmentsBuffers->pixelType_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE); + alpaka::wait(queue); + + auto connectedPixelSize_host_buf = allocBufWrapper(devHost, nInnerSegments); + auto connectedPixelIndex_host_buf = allocBufWrapper(devHost, nInnerSegments); + auto connectedPixelSize_dev_buf = allocBufWrapper(devAcc, nInnerSegments); + auto connectedPixelIndex_dev_buf = allocBufWrapper(devAcc, nInnerSegments); + + int* superbins = alpaka::getPtrNative(superbins_buf); + int8_t* pixelTypes = alpaka::getPtrNative(pixelTypes_buf); + unsigned int* connectedPixelSize_host = alpaka::getPtrNative(connectedPixelSize_host_buf); + unsigned int* connectedPixelIndex_host = alpaka::getPtrNative(connectedPixelIndex_host_buf); + alpaka::wait(queue); - cudaStreamSynchronize(stream); int pixelIndexOffsetPos = pixelMapping->connectedPixelsIndex[44999] + pixelMapping->connectedPixelsSizes[44999]; int pixelIndexOffsetNeg = pixelMapping->connectedPixelsIndexPos[44999] + pixelMapping->connectedPixelsSizes[44999] + pixelIndexOffsetPos; @@ -993,8 +969,8 @@ void SDL::Event::createPixelTriplets() // the current selection still leaves a significant fraction of unmatchable pLSs for (unsigned int i = 0; i < nInnerSegments; i++) {// loop over # pLS - int8_t pixelType = pixelTypes[i];// get pixel type for this pLS - int superbin = superbins[i]; //get superbin for this pixel + int8_t pixelType = pixelTypes[i];// Get pixel type for this pLS + int superbin = superbins[i]; // Get superbin for this pixel if((superbin < 0) or (superbin >= 45000) or (pixelType > 2) or (pixelType < 0)) { connectedPixelSize_host[i] = 0; @@ -1002,37 +978,30 @@ void SDL::Event::createPixelTriplets() continue; } - if(pixelType ==0) - { // used pixel type to select correct size-index arrays - connectedPixelSize_host[i] = pixelMapping->connectedPixelsSizes[superbin]; //number of connected modules to this pixel + // Used pixel type to select correct size-index arrays + if(pixelType == 0) + { + connectedPixelSize_host[i] = pixelMapping->connectedPixelsSizes[superbin]; // number of connected modules to this pixel auto connectedIdxBase = pixelMapping->connectedPixelsIndex[superbin]; - connectedPixelIndex_host[i] = connectedIdxBase;// index to get start of connected modules for this superbin in map - // printf("i %d out of nInnerSegments %d type %d superbin %d connectedPixelIndex %d connectedPixelSize %d\n", - // i, nInnerSegments, pixelType, superbin, connectedPixelIndex_host[i], connectedPixelSize_host[i]); + connectedPixelIndex_host[i] = connectedIdxBase; // index to get start of connected modules for this superbin in map } - else if(pixelType ==1) + else if(pixelType == 1) { - connectedPixelSize_host[i] = pixelMapping->connectedPixelsSizesPos[superbin]; //number of pixel connected modules + connectedPixelSize_host[i] = pixelMapping->connectedPixelsSizesPos[superbin]; // number of pixel connected modules auto connectedIdxBase = pixelMapping->connectedPixelsIndexPos[superbin]+pixelIndexOffsetPos; - connectedPixelIndex_host[i] = connectedIdxBase;// index to get start of connected pixel modules + connectedPixelIndex_host[i] = connectedIdxBase; // index to get start of connected pixel modules } - else if(pixelType ==2) + else if(pixelType == 2) { - connectedPixelSize_host[i] = pixelMapping->connectedPixelsSizesNeg[superbin]; //number of pixel connected modules + connectedPixelSize_host[i] = pixelMapping->connectedPixelsSizesNeg[superbin]; // number of pixel connected modules auto connectedIdxBase = pixelMapping->connectedPixelsIndexNeg[superbin] + pixelIndexOffsetNeg; - connectedPixelIndex_host[i] = connectedIdxBase;// index to get start of connected pixel modules + connectedPixelIndex_host[i] = connectedIdxBase; // index to get start of connected pixel modules } } - cudaMemcpyAsync(connectedPixelSize_dev, connectedPixelSize_host, nInnerSegments*sizeof(unsigned int), cudaMemcpyHostToDevice,stream); - cudaMemcpyAsync(connectedPixelIndex_dev, connectedPixelIndex_host, nInnerSegments*sizeof(unsigned int), cudaMemcpyHostToDevice,stream); - cudaStreamSynchronize(stream); - - cms::cuda::free_host(connectedPixelSize_host); - cms::cuda::free_host(connectedPixelIndex_host); - cms::cuda::free_host(superbins); - cms::cuda::free_host(pixelTypes); - cms::cuda::free_host(nTriplets); + alpaka::memcpy(queue, connectedPixelSize_dev_buf, connectedPixelSize_host_buf, nInnerSegments); + alpaka::memcpy(queue, connectedPixelIndex_dev_buf, connectedPixelIndex_host_buf, nInnerSegments); + alpaka::wait(queue); Vec const threadsPerBlock(static_cast(1), static_cast(4), static_cast(32)); Vec const blocksPerGrid(static_cast(16 /* above median of connected modules*/), static_cast(4096), static_cast(1)); @@ -1048,21 +1017,20 @@ void SDL::Event::createPixelTriplets() *segmentsInGPU, *tripletsInGPU, *pixelTripletsInGPU, - connectedPixelSize_dev, - connectedPixelIndex_dev, + alpaka::getPtrNative(connectedPixelSize_dev_buf), + alpaka::getPtrNative(connectedPixelIndex_dev_buf), nInnerSegments)); alpaka::enqueue(queue, createPixelTripletsInGPUFromMapv2Task); alpaka::wait(queue); - cms::cuda::free_device(dev, connectedPixelSize_dev); - cms::cuda::free_device(dev, connectedPixelIndex_dev); - #ifdef Warnings - int nPixelTriplets; - cudaMemcpyAsync(&nPixelTriplets, pixelTripletsInGPU->nPixelTriplets, sizeof(int), cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); - std::cout<<"number of pixel triplets = "<(devHost, 1); + + alpaka::memcpy(queue, nPixelTriplets_buf, pixelTripletsBuffers->nPixelTriplets_buf, 1); + alpaka::wait(queue); + + std::cout << "number of pixel triplets = " << *alpaka::getPtrNative(nPixelTriplets_buf) << std::endl; #endif //pT3s can be cleaned here because they're not used in making pT5s! @@ -1084,9 +1052,6 @@ void SDL::Event::createPixelTriplets() void SDL::Event::createQuintuplets() { - uint16_t nEligibleT5Modules = 0; - unsigned int nTotalQuintuplets; - Vec const threadsPerBlockCreateQuints(static_cast(1), static_cast(1), static_cast(1024)); Vec const blocksPerGridCreateQuints(static_cast(1), static_cast(1), static_cast(1)); WorkDiv const createEligibleModulesListForQuintupletsGPU_workDiv(blocksPerGridCreateQuints, threadsPerBlockCreateQuints, elementsPerThread); @@ -1102,9 +1067,15 @@ void SDL::Event::createQuintuplets() alpaka::enqueue(queue, createEligibleModulesListForQuintupletsGPUTask); alpaka::wait(queue); - cudaMemcpyAsync(&nEligibleT5Modules,rangesInGPU->nEligibleT5Modules,sizeof(uint16_t),cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(&nTotalQuintuplets,rangesInGPU->device_nTotalQuints,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); + auto nEligibleT5Modules_buf = allocBufWrapper(devHost, 1); + auto nTotalQuintuplets_buf = allocBufWrapper(devHost, 1); + + alpaka::memcpy(queue, nEligibleT5Modules_buf, rangesBuffers->nEligibleT5Modules_buf, 1); + alpaka::memcpy(queue, nTotalQuintuplets_buf, rangesBuffers->device_nTotalQuints_buf, 1); + alpaka::wait(queue); + + uint16_t nEligibleT5Modules = *alpaka::getPtrNative(nEligibleT5Modules_buf); + unsigned int nTotalQuintuplets = *alpaka::getPtrNative(nTotalQuintuplets_buf); if(quintupletsInGPU == nullptr) { @@ -1112,8 +1083,8 @@ void SDL::Event::createQuintuplets() quintupletsBuffers = new SDL::quintupletsBuffer(nTotalQuintuplets, nLowerModules, devAcc, queue); quintupletsInGPU->setData(*quintupletsBuffers); - cudaMemcpyAsync(quintupletsInGPU->nMemoryLocations, &nTotalQuintuplets, sizeof(unsigned int), cudaMemcpyHostToDevice, stream); - cudaStreamSynchronize(stream); + alpaka::memcpy(queue, quintupletsBuffers->nMemoryLocations_buf, nTotalQuintuplets_buf, 1); + alpaka::wait(queue); } Vec const threadsPerBlockQuints(static_cast(1), static_cast(8), static_cast(32)); @@ -1200,64 +1171,57 @@ void SDL::Event::createPixelQuintuplets() trackCandidatesInGPU = new SDL::trackCandidates(); trackCandidatesBuffers = new SDL::trackCandidatesBuffer(N_MAX_TRACK_CANDIDATES + N_MAX_PIXEL_TRACK_CANDIDATES, devAcc, queue); trackCandidatesInGPU->setData(*trackCandidatesBuffers); - } - - unsigned int pixelModuleIndex; - int* superbins; - int8_t* pixelTypes; - int *nQuintuplets; + } - unsigned int* connectedPixelSize_host; - unsigned int* connectedPixelIndex_host; - unsigned int* connectedPixelSize_dev; - unsigned int* connectedPixelIndex_dev; + unsigned int nInnerSegments; + cudaMemcpyAsync(&nInnerSegments, &(segmentsInGPU->nSegments[nLowerModules]), sizeof(unsigned int), cudaMemcpyDeviceToHost,stream); - nQuintuplets = (int*)cms::cuda::allocate_host(nLowerModules * sizeof(int), stream); - cudaMemcpyAsync(nQuintuplets, quintupletsInGPU->nQuintuplets, nLowerModules * sizeof(int), cudaMemcpyDeviceToHost,stream); + auto superbins_buf = allocBufWrapper(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE); + auto pixelTypes_buf = allocBufWrapper(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE); - superbins = (int*)cms::cuda::allocate_host(N_MAX_PIXEL_SEGMENTS_PER_MODULE*sizeof(int), stream); - pixelTypes = (int8_t*)cms::cuda::allocate_host(N_MAX_PIXEL_SEGMENTS_PER_MODULE*sizeof(int8_t), stream); + alpaka::memcpy(queue, superbins_buf, segmentsBuffers->superbin_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE); + alpaka::memcpy(queue, pixelTypes_buf, segmentsBuffers->pixelType_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE); + alpaka::wait(queue); - cudaMemcpyAsync(superbins,segmentsInGPU->superbin,N_MAX_PIXEL_SEGMENTS_PER_MODULE*sizeof(int),cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(pixelTypes,segmentsInGPU->pixelType,N_MAX_PIXEL_SEGMENTS_PER_MODULE*sizeof(int8_t),cudaMemcpyDeviceToHost,stream); + auto connectedPixelSize_host_buf = allocBufWrapper(devHost, nInnerSegments); + auto connectedPixelIndex_host_buf = allocBufWrapper(devHost, nInnerSegments); + auto connectedPixelSize_dev_buf = allocBufWrapper(devAcc, nInnerSegments); + auto connectedPixelIndex_dev_buf = allocBufWrapper(devAcc, nInnerSegments); - cudaStreamSynchronize(stream); - pixelModuleIndex = nLowerModules; - unsigned int nInnerSegments = 0; - cudaMemcpyAsync(&nInnerSegments, &(segmentsInGPU->nSegments[pixelModuleIndex]), sizeof(unsigned int), cudaMemcpyDeviceToHost,stream); - connectedPixelSize_host = (unsigned int*)cms::cuda::allocate_host(nInnerSegments* sizeof(unsigned int), stream); - connectedPixelIndex_host = (unsigned int*)cms::cuda::allocate_host(nInnerSegments* sizeof(unsigned int), stream); - connectedPixelSize_dev = (unsigned int*)cms::cuda::allocate_device(dev,nInnerSegments* sizeof(unsigned int),stream); - connectedPixelIndex_dev = (unsigned int*)cms::cuda::allocate_device(dev,nInnerSegments* sizeof(unsigned int),stream); - cudaStreamSynchronize(stream); + int* superbins = alpaka::getPtrNative(superbins_buf); + int8_t* pixelTypes = alpaka::getPtrNative(pixelTypes_buf); + unsigned int* connectedPixelSize_host = alpaka::getPtrNative(connectedPixelSize_host_buf); + unsigned int* connectedPixelIndex_host = alpaka::getPtrNative(connectedPixelIndex_host_buf); + alpaka::wait(queue); int pixelIndexOffsetPos = pixelMapping->connectedPixelsIndex[44999] + pixelMapping->connectedPixelsSizes[44999]; int pixelIndexOffsetNeg = pixelMapping->connectedPixelsIndexPos[44999] + pixelMapping->connectedPixelsSizes[44999] + pixelIndexOffsetPos; + // Loop over # pLS for (unsigned int i = 0; i < nInnerSegments; i++) - {// loop over # pLS - int8_t pixelType = pixelTypes[i];// get pixel type for this pLS - int superbin = superbins[i]; //get superbin for this pixel + { + int8_t pixelType = pixelTypes[i];// Get pixel type for this pLS + int superbin = superbins[i]; // Get superbin for this pixel if((superbin < 0) or (superbin >= 45000) or (pixelType > 2) or (pixelType < 0)) { connectedPixelIndex_host[i] = 0; connectedPixelSize_host[i] = 0; continue; } - - if(pixelType ==0) - { // used pixel type to select correct size-index arrays + // Used pixel type to select correct size-index arrays + if(pixelType == 0) + { connectedPixelSize_host[i] = pixelMapping->connectedPixelsSizes[superbin]; //number of connected modules to this pixel unsigned int connectedIdxBase = pixelMapping->connectedPixelsIndex[superbin]; connectedPixelIndex_host[i] = connectedIdxBase; } - else if(pixelType ==1) + else if(pixelType == 1) { connectedPixelSize_host[i] = pixelMapping->connectedPixelsSizesPos[superbin]; //number of pixel connected modules unsigned int connectedIdxBase = pixelMapping->connectedPixelsIndexPos[superbin]+pixelIndexOffsetPos; connectedPixelIndex_host[i] = connectedIdxBase; } - else if(pixelType ==2) + else if(pixelType == 2) { connectedPixelSize_host[i] = pixelMapping->connectedPixelsSizesNeg[superbin]; //number of pixel connected modules unsigned int connectedIdxBase = pixelMapping->connectedPixelsIndexNeg[superbin] + pixelIndexOffsetNeg; @@ -1265,9 +1229,9 @@ void SDL::Event::createPixelQuintuplets() } } - cudaMemcpyAsync(connectedPixelSize_dev, connectedPixelSize_host, nInnerSegments*sizeof(unsigned int), cudaMemcpyHostToDevice,stream); - cudaMemcpyAsync(connectedPixelIndex_dev, connectedPixelIndex_host, nInnerSegments*sizeof(unsigned int), cudaMemcpyHostToDevice,stream); - cudaStreamSynchronize(stream); + alpaka::memcpy(queue, connectedPixelSize_dev_buf, connectedPixelSize_host_buf, nInnerSegments); + alpaka::memcpy(queue, connectedPixelIndex_dev_buf, connectedPixelIndex_host_buf, nInnerSegments); + alpaka::wait(queue); Vec const threadsPerBlockCreatePixQuints(static_cast(1), static_cast(16), static_cast(16)); Vec const blocksPerGridCreatePixQuints(static_cast(16), static_cast(MAX_BLOCKS), static_cast(1)); @@ -1283,21 +1247,12 @@ void SDL::Event::createPixelQuintuplets() *tripletsInGPU, *quintupletsInGPU, *pixelQuintupletsInGPU, - connectedPixelSize_dev, - connectedPixelIndex_dev, + alpaka::getPtrNative(connectedPixelSize_dev_buf), + alpaka::getPtrNative(connectedPixelIndex_dev_buf), nInnerSegments, *rangesInGPU)); alpaka::enqueue(queue, createPixelQuintupletsInGPUFromMapv2Task); - alpaka::wait(queue); - - cms::cuda::free_host(superbins); - cms::cuda::free_host(pixelTypes); - cms::cuda::free_host(nQuintuplets); - cms::cuda::free_host(connectedPixelSize_host); - cms::cuda::free_host(connectedPixelIndex_host); - cms::cuda::free_device(dev, connectedPixelSize_dev); - cms::cuda::free_device(dev, connectedPixelIndex_dev); Vec const threadsPerBlockDupPix(static_cast(1), static_cast(16), static_cast(16)); Vec const blocksPerGridDupPix(static_cast(1), static_cast(MAX_BLOCKS), static_cast(1)); @@ -1311,7 +1266,6 @@ void SDL::Event::createPixelQuintuplets() false)); alpaka::enqueue(queue, removeDupPixelQuintupletsInGPUFromMapTask); - alpaka::wait(queue); Vec const threadsPerBlockAddpT5asTrackCan(static_cast(1), static_cast(1), static_cast(256)); Vec const blocksPerGridAddpT5asTrackCan(static_cast(1), static_cast(1), static_cast(1)); @@ -1329,12 +1283,15 @@ void SDL::Event::createPixelQuintuplets() alpaka::enqueue(queue, addpT5asTrackCandidateInGPUTask); alpaka::wait(queue); + #ifdef Warnings - int nPixelQuintuplets; - cudaMemcpyAsync(&nPixelQuintuplets, &(pixelQuintupletsInGPU->nPixelQuintuplets), sizeof(int), cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); - std::cout<<"number of pixel quintuplets = "<(devHost, 1); + + alpaka::memcpy(queue, nPixelQuintuplets_buf, pixelQuintupletsBuffers->nPixelQuintuplets_buf, 1); + alpaka::wait(queue); + + std::cout << "number of pixel quintuplets = " << *alpaka::getPtrNative(nPixelQuintuplets_buf) << std::endl; +#endif } void SDL::Event::addQuintupletsToEventExplicit() @@ -1541,17 +1498,25 @@ unsigned int SDL::Event::getNumberOfTripletsByLayerEndcap(unsigned int layer) int SDL::Event::getNumberOfPixelTriplets() { - int nPixelTriplets; - cudaMemcpyAsync(&nPixelTriplets, pixelTripletsInGPU->nPixelTriplets, sizeof(int), cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); + auto nPixelTriplets_buf = allocBufWrapper(devHost, 1); + + alpaka::memcpy(queue, nPixelTriplets_buf, pixelTripletsBuffers->nPixelTriplets_buf, 1); + alpaka::wait(queue); + + int nPixelTriplets = *alpaka::getPtrNative(nPixelTriplets_buf); + return nPixelTriplets; } int SDL::Event::getNumberOfPixelQuintuplets() { - int nPixelQuintuplets; - cudaMemcpyAsync(&nPixelQuintuplets, pixelQuintupletsInGPU->nPixelQuintuplets, sizeof(int), cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); + auto nPixelQuintuplets_buf = allocBufWrapper(devHost, 1); + + alpaka::memcpy(queue, nPixelQuintuplets_buf, pixelQuintupletsBuffers->nPixelQuintuplets_buf, 1); + alpaka::wait(queue); + + int nPixelQuintuplets = *alpaka::getPtrNative(nPixelQuintuplets_buf); + return nPixelQuintuplets; } @@ -1589,57 +1554,78 @@ unsigned int SDL::Event::getNumberOfQuintupletsByLayerEndcap(unsigned int layer) } int SDL::Event::getNumberOfTrackCandidates() -{ - int nTrackCandidates; - cudaMemcpyAsync(&nTrackCandidates, trackCandidatesInGPU->nTrackCandidates, sizeof(int), cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); +{ + auto nTrackCandidates_buf = allocBufWrapper(devHost, 1); + + alpaka::memcpy(queue, nTrackCandidates_buf, trackCandidatesBuffers->nTrackCandidates_buf, 1); + alpaka::wait(queue); + + int nTrackCandidates = *alpaka::getPtrNative(nTrackCandidates_buf); return nTrackCandidates; } int SDL::Event::getNumberOfPT5TrackCandidates() { - int nTrackCandidatesPT5; - cudaMemcpyAsync(&nTrackCandidatesPT5, trackCandidatesInGPU->nTrackCandidatespT5, sizeof(int), cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); + auto nTrackCandidatesPT5_buf = allocBufWrapper(devHost, 1); + + alpaka::memcpy(queue, nTrackCandidatesPT5_buf, trackCandidatesBuffers->nTrackCandidatespT5_buf, 1); + alpaka::wait(queue); + + int nTrackCandidatesPT5 = *alpaka::getPtrNative(nTrackCandidatesPT5_buf); return nTrackCandidatesPT5; } int SDL::Event::getNumberOfPT3TrackCandidates() { - int nTrackCandidatesPT3; - cudaMemcpyAsync(&nTrackCandidatesPT3, trackCandidatesInGPU->nTrackCandidatespT3, sizeof(int), cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); + auto nTrackCandidatesPT3_buf = allocBufWrapper(devHost, 1); + + alpaka::memcpy(queue, nTrackCandidatesPT3_buf, trackCandidatesBuffers->nTrackCandidatespT3_buf, 1); + alpaka::wait(queue); + + int nTrackCandidatesPT3 = *alpaka::getPtrNative(nTrackCandidatesPT3_buf); return nTrackCandidatesPT3; } int SDL::Event::getNumberOfPLSTrackCandidates() { - unsigned int nTrackCandidatesPLS; - cudaMemcpyAsync(&nTrackCandidatesPLS, trackCandidatesInGPU->nTrackCandidatespLS, sizeof(int), cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); + auto nTrackCandidatesPLS_buf = allocBufWrapper(devHost, 1); + + alpaka::memcpy(queue, nTrackCandidatesPLS_buf, trackCandidatesBuffers->nTrackCandidatespLS_buf, 1); + alpaka::wait(queue); + + unsigned int nTrackCandidatesPLS = *alpaka::getPtrNative(nTrackCandidatesPLS_buf); return nTrackCandidatesPLS; } int SDL::Event::getNumberOfPixelTrackCandidates() { - int nTrackCandidates; - int nTrackCandidatesT5; - cudaMemcpyAsync(&nTrackCandidates, trackCandidatesInGPU->nTrackCandidates, sizeof(int), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(&nTrackCandidatesT5, trackCandidatesInGPU->nTrackCandidatesT5, sizeof(int), cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); + auto nTrackCandidates_buf = allocBufWrapper(devHost, 1); + auto nTrackCandidatesT5_buf = allocBufWrapper(devHost, 1); + + alpaka::memcpy(queue, nTrackCandidates_buf, trackCandidatesBuffers->nTrackCandidates_buf, 1); + alpaka::memcpy(queue, nTrackCandidatesT5_buf, trackCandidatesBuffers->nTrackCandidatesT5_buf, 1); + alpaka::wait(queue); + + int nTrackCandidates = *alpaka::getPtrNative(nTrackCandidates_buf); + int nTrackCandidatesT5 = *alpaka::getPtrNative(nTrackCandidatesT5_buf); return nTrackCandidates - nTrackCandidatesT5; } int SDL::Event::getNumberOfT5TrackCandidates() { - int nTrackCandidatesT5; - cudaMemcpyAsync(&nTrackCandidatesT5, trackCandidatesInGPU->nTrackCandidatesT5, sizeof(int), cudaMemcpyDeviceToHost,stream); - return nTrackCandidatesT5; + auto nTrackCandidatesT5_buf = allocBufWrapper(devHost, 1); + + alpaka::memcpy(queue, nTrackCandidatesT5_buf, trackCandidatesBuffers->nTrackCandidatesT5_buf, 1); + alpaka::wait(queue); + + int nTrackCandidatesT5 = *alpaka::getPtrNative(nTrackCandidatesT5_buf); + + return nTrackCandidatesT5; } SDL::hitsBuffer* SDL::Event::getHits() //std::shared_ptr should take care of garbage collection diff --git a/SDL/Event.cuh b/SDL/Event.cuh index b512b469..f5a671bf 100644 --- a/SDL/Event.cuh +++ b/SDL/Event.cuh @@ -161,6 +161,6 @@ namespace SDL void initModules(const char* moduleMetaDataFilePath="data/centroid.txt"); //read from file and init void cleanModules(); void initModulesHost(); //read from file and init - extern struct pixelMap* pixelMapping; + extern std::unique_ptr pixelMapping; } #endif diff --git a/SDL/Kernels.cuh b/SDL/Kernels.cuh index 8fd7d952..51ff3e95 100644 --- a/SDL/Kernels.cuh +++ b/SDL/Kernels.cuh @@ -182,15 +182,15 @@ namespace SDL int nMatched = 0; for (int i = 0; i < 6; i++) { - bool matched = false; + bool tmatched = false; for (int j = 0; j < 6; j++) { if(hits1[i] == hits2[j]) { - matched = true; break; + tmatched = true; break; } } - if(matched) + if(tmatched) { nMatched++; } @@ -202,6 +202,7 @@ namespace SDL struct removeDupQuintupletsInGPUAfterBuild { + ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -272,6 +273,7 @@ namespace SDL struct removeDupQuintupletsInGPUBeforeTC { + ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -357,6 +359,7 @@ namespace SDL struct removeDupPixelTripletsInGPUFromMap { + ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -404,6 +407,7 @@ namespace SDL struct removeDupPixelQuintupletsInGPUFromMap { + ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -448,6 +452,7 @@ namespace SDL struct checkHitspLS { + ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, diff --git a/SDL/MiniDoublet.cuh b/SDL/MiniDoublet.cuh index a75dcfb0..74897eb3 100644 --- a/SDL/MiniDoublet.cuh +++ b/SDL/MiniDoublet.cuh @@ -137,45 +137,45 @@ namespace SDL Buf outerLowEdgeY_buf; template - miniDoubletsBuffer(unsigned int nMemoryLocations, + miniDoubletsBuffer(unsigned int nMemoryLoc, uint16_t nLowerModules, TDevAcc const & devAccIn, TQueue& queue) : nMemoryLocations_buf(allocBufWrapper(devAccIn, 1)), - anchorHitIndices_buf(allocBufWrapper(devAccIn, nMemoryLocations)), - outerHitIndices_buf(allocBufWrapper(devAccIn, nMemoryLocations)), - moduleIndices_buf(allocBufWrapper(devAccIn, nMemoryLocations)), + anchorHitIndices_buf(allocBufWrapper(devAccIn, nMemoryLoc)), + outerHitIndices_buf(allocBufWrapper(devAccIn, nMemoryLoc)), + moduleIndices_buf(allocBufWrapper(devAccIn, nMemoryLoc)), nMDs_buf(allocBufWrapper(devAccIn, nLowerModules+1)), totOccupancyMDs_buf(allocBufWrapper(devAccIn, nLowerModules+1)), - dphichanges_buf(allocBufWrapper(devAccIn, nMemoryLocations)), - dzs_buf(allocBufWrapper(devAccIn, nMemoryLocations)), - dphis_buf(allocBufWrapper(devAccIn, nMemoryLocations)), - shiftedXs_buf(allocBufWrapper(devAccIn, nMemoryLocations)), - shiftedYs_buf(allocBufWrapper(devAccIn, nMemoryLocations)), - shiftedZs_buf(allocBufWrapper(devAccIn, nMemoryLocations)), - noShiftedDzs_buf(allocBufWrapper(devAccIn, nMemoryLocations)), - noShiftedDphis_buf(allocBufWrapper(devAccIn, nMemoryLocations)), - noShiftedDphiChanges_buf(allocBufWrapper(devAccIn, nMemoryLocations)), - anchorX_buf(allocBufWrapper(devAccIn, nMemoryLocations)), - anchorY_buf(allocBufWrapper(devAccIn, nMemoryLocations)), - anchorZ_buf(allocBufWrapper(devAccIn, nMemoryLocations)), - anchorRt_buf(allocBufWrapper(devAccIn, nMemoryLocations)), - anchorPhi_buf(allocBufWrapper(devAccIn, nMemoryLocations)), - anchorEta_buf(allocBufWrapper(devAccIn, nMemoryLocations)), - anchorHighEdgeX_buf(allocBufWrapper(devAccIn, nMemoryLocations)), - anchorHighEdgeY_buf(allocBufWrapper(devAccIn, nMemoryLocations)), - anchorLowEdgeX_buf(allocBufWrapper(devAccIn, nMemoryLocations)), - anchorLowEdgeY_buf(allocBufWrapper(devAccIn, nMemoryLocations)), - outerX_buf(allocBufWrapper(devAccIn, nMemoryLocations)), - outerY_buf(allocBufWrapper(devAccIn, nMemoryLocations)), - outerZ_buf(allocBufWrapper(devAccIn, nMemoryLocations)), - outerRt_buf(allocBufWrapper(devAccIn, nMemoryLocations)), - outerPhi_buf(allocBufWrapper(devAccIn, nMemoryLocations)), - outerEta_buf(allocBufWrapper(devAccIn, nMemoryLocations)), - outerHighEdgeX_buf(allocBufWrapper(devAccIn, nMemoryLocations)), - outerHighEdgeY_buf(allocBufWrapper(devAccIn, nMemoryLocations)), - outerLowEdgeX_buf(allocBufWrapper(devAccIn, nMemoryLocations)), - outerLowEdgeY_buf(allocBufWrapper(devAccIn, nMemoryLocations)) + dphichanges_buf(allocBufWrapper(devAccIn, nMemoryLoc)), + dzs_buf(allocBufWrapper(devAccIn, nMemoryLoc)), + dphis_buf(allocBufWrapper(devAccIn, nMemoryLoc)), + shiftedXs_buf(allocBufWrapper(devAccIn, nMemoryLoc)), + shiftedYs_buf(allocBufWrapper(devAccIn, nMemoryLoc)), + shiftedZs_buf(allocBufWrapper(devAccIn, nMemoryLoc)), + noShiftedDzs_buf(allocBufWrapper(devAccIn, nMemoryLoc)), + noShiftedDphis_buf(allocBufWrapper(devAccIn, nMemoryLoc)), + noShiftedDphiChanges_buf(allocBufWrapper(devAccIn, nMemoryLoc)), + anchorX_buf(allocBufWrapper(devAccIn, nMemoryLoc)), + anchorY_buf(allocBufWrapper(devAccIn, nMemoryLoc)), + anchorZ_buf(allocBufWrapper(devAccIn, nMemoryLoc)), + anchorRt_buf(allocBufWrapper(devAccIn, nMemoryLoc)), + anchorPhi_buf(allocBufWrapper(devAccIn, nMemoryLoc)), + anchorEta_buf(allocBufWrapper(devAccIn, nMemoryLoc)), + anchorHighEdgeX_buf(allocBufWrapper(devAccIn, nMemoryLoc)), + anchorHighEdgeY_buf(allocBufWrapper(devAccIn, nMemoryLoc)), + anchorLowEdgeX_buf(allocBufWrapper(devAccIn, nMemoryLoc)), + anchorLowEdgeY_buf(allocBufWrapper(devAccIn, nMemoryLoc)), + outerX_buf(allocBufWrapper(devAccIn, nMemoryLoc)), + outerY_buf(allocBufWrapper(devAccIn, nMemoryLoc)), + outerZ_buf(allocBufWrapper(devAccIn, nMemoryLoc)), + outerRt_buf(allocBufWrapper(devAccIn, nMemoryLoc)), + outerPhi_buf(allocBufWrapper(devAccIn, nMemoryLoc)), + outerEta_buf(allocBufWrapper(devAccIn, nMemoryLoc)), + outerHighEdgeX_buf(allocBufWrapper(devAccIn, nMemoryLoc)), + outerHighEdgeY_buf(allocBufWrapper(devAccIn, nMemoryLoc)), + outerLowEdgeX_buf(allocBufWrapper(devAccIn, nMemoryLoc)), + outerLowEdgeY_buf(allocBufWrapper(devAccIn, nMemoryLoc)) { alpaka::memset(queue, nMDs_buf, 0, nLowerModules+1); alpaka::memset(queue, totOccupancyMDs_buf, 0, nLowerModules+1); @@ -770,6 +770,7 @@ namespace SDL struct createMiniDoubletsInGPUv2 { + ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -843,6 +844,7 @@ namespace SDL struct createMDArrayRangesGPU { + ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -912,6 +914,7 @@ namespace SDL struct addMiniDoubletRangesToEventExplicit { + ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, diff --git a/SDL/Module.cu b/SDL/Module.cu index 01c0b162..649995ba 100644 --- a/SDL/Module.cu +++ b/SDL/Module.cu @@ -39,7 +39,7 @@ void SDL::createModulesInExplicitMemory(struct modules& modulesInGPU,unsigned in cudaStreamSynchronize(stream); } -void SDL::freeModules(struct modules& modulesInGPU, struct pixelMap& pixelMapping) +void SDL::freeModules(struct modules& modulesInGPU) { cudaFree(modulesInGPU.detIds); cudaFree(modulesInGPU.moduleMap); @@ -65,13 +65,6 @@ void SDL::freeModules(struct modules& modulesInGPU, struct pixelMap& pixelMappin cudaFree(modulesInGPU.moduleLayerType); cudaFree(modulesInGPU.connectedPixels); cudaFree(modulesInGPU.partnerModuleIndices); - - cudaFreeHost(pixelMapping.connectedPixelsSizes); - cudaFreeHost(pixelMapping.connectedPixelsSizesPos); - cudaFreeHost(pixelMapping.connectedPixelsSizesNeg); - cudaFreeHost(pixelMapping.connectedPixelsIndex); - cudaFreeHost(pixelMapping.connectedPixelsIndexPos); - cudaFreeHost(pixelMapping.connectedPixelsIndexNeg); } void SDL::loadModulesFromFile(struct modules& modulesInGPU, uint16_t& nModules, uint16_t& nLowerModules, struct pixelMap& pixelMapping,cudaStream_t stream, const char* moduleMetaDataFilePath) @@ -130,41 +123,43 @@ void SDL::loadModulesFromFile(struct modules& modulesInGPU, uint16_t& nModules, nModules = counter; //std::cout<<"Number of modules = "<(devHost, nModules); + auto layers_buf = allocBufWrapper(devHost, nModules); + auto rings_buf = allocBufWrapper(devHost, nModules); + auto rods_buf = allocBufWrapper(devHost, nModules); + auto modules_buf = allocBufWrapper(devHost, nModules); + auto subdets_buf = allocBufWrapper(devHost, nModules); + auto sides_buf = allocBufWrapper(devHost, nModules); + auto eta_buf = allocBufWrapper(devHost, nModules); + auto r_buf = allocBufWrapper(devHost, nModules); + auto isInverted_buf = allocBufWrapper(devHost, nModules); + auto isLower_buf = allocBufWrapper(devHost, nModules); + auto isAnchor_buf = allocBufWrapper(devHost, nModules); + auto moduleType_buf = allocBufWrapper(devHost, nModules); + auto moduleLayerType_buf = allocBufWrapper(devHost, nModules); + auto slopes_buf = allocBufWrapper(devHost, nModules); + auto drdzs_buf = allocBufWrapper(devHost, nModules); + auto partnerModuleIndices_buf = allocBufWrapper(devHost, nModules); + + // Getting the underlying data pointers + unsigned int* host_detIds = alpaka::getPtrNative(detIds_buf); + short* host_layers = alpaka::getPtrNative(layers_buf); + short* host_rings = alpaka::getPtrNative(rings_buf); + short* host_rods = alpaka::getPtrNative(rods_buf); + short* host_modules = alpaka::getPtrNative(modules_buf); + short* host_subdets = alpaka::getPtrNative(subdets_buf); + short* host_sides = alpaka::getPtrNative(sides_buf); + float* host_eta = alpaka::getPtrNative(eta_buf); + float* host_r = alpaka::getPtrNative(r_buf); + bool* host_isInverted = alpaka::getPtrNative(isInverted_buf); + bool* host_isLower = alpaka::getPtrNative(isLower_buf); + bool* host_isAnchor = alpaka::getPtrNative(isAnchor_buf); + ModuleType* host_moduleType = alpaka::getPtrNative(moduleType_buf); + ModuleLayerType* host_moduleLayerType = alpaka::getPtrNative(moduleLayerType_buf); + float* host_slopes = alpaka::getPtrNative(slopes_buf); + float* host_drdzs = alpaka::getPtrNative(drdzs_buf); + uint16_t* host_partnerModuleIndices = alpaka::getPtrNative(partnerModuleIndices_buf); //reassign detIdToIndex indices here nLowerModules = (nModules - 1) / 2; @@ -304,24 +299,6 @@ void SDL::loadModulesFromFile(struct modules& modulesInGPU, uint16_t& nModules, cudaMemcpyAsync(modulesInGPU.partnerModuleIndices, host_partnerModuleIndices, sizeof(uint16_t) * nModules, cudaMemcpyHostToDevice, stream); cudaStreamSynchronize(stream); - cms::cuda::free_host(host_detIds); - cms::cuda::free_host(host_layers); - cms::cuda::free_host(host_rings); - cms::cuda::free_host(host_rods); - cms::cuda::free_host(host_modules); - cms::cuda::free_host(host_subdets); - cms::cuda::free_host(host_sides); - cms::cuda::free_host(host_eta); - cms::cuda::free_host(host_r); - cms::cuda::free_host(host_isInverted); - cms::cuda::free_host(host_isLower); - cms::cuda::free_host(host_isAnchor); - cms::cuda::free_host(host_moduleType); - cms::cuda::free_host(host_moduleLayerType); - cms::cuda::free_host(host_slopes); - cms::cuda::free_host(host_drdzs); - cms::cuda::free_host(host_partnerModuleIndices); - fillConnectedModuleArrayExplicit(modulesInGPU,nModules,stream); fillMapArraysExplicit(modulesInGPU, nModules, stream); fillPixelMap(modulesInGPU,pixelMapping,stream); @@ -344,21 +321,14 @@ void SDL::fillConnectedModuleArray(struct modules& modulesInGPU, unsigned int nM void SDL::fillPixelMap(struct modules& modulesInGPU, struct pixelMap& pixelMapping,cudaStream_t stream) { - int size_superbins = 45000; //changed to 45000 to reduce memory useage on GPU std::vector connectedModuleDetIds; std::vector connectedModuleDetIds_pos; std::vector connectedModuleDetIds_neg; - cudaMallocHost(&pixelMapping.connectedPixelsIndex,size_superbins * sizeof(unsigned int)); - cudaMallocHost(&pixelMapping.connectedPixelsSizes,size_superbins * sizeof(unsigned int)); - cudaMallocHost(&pixelMapping.connectedPixelsIndexPos,size_superbins * sizeof(unsigned int)); - cudaMallocHost(&pixelMapping.connectedPixelsSizesPos,size_superbins * sizeof(unsigned int)); - cudaMallocHost(&pixelMapping.connectedPixelsIndexNeg,size_superbins * sizeof(unsigned int)); - cudaMallocHost(&pixelMapping.connectedPixelsSizesNeg,size_superbins * sizeof(unsigned int)); - - int totalSizes=0; - int totalSizes_pos=0; - int totalSizes_neg=0; - for(int isuperbin =0; isuperbin connectedModuleDetIds_pLStoLayer1Subdet5 = SDL::moduleConnectionMap_pLStoLayer1Subdet5.getConnectedModuleDetIds(isuperbin+size_superbins);// index adjustment to get high values std::vector connectedModuleDetIds_pLStoLayer2Subdet5 = SDL::moduleConnectionMap_pLStoLayer2Subdet5.getConnectedModuleDetIds(isuperbin+size_superbins);// from the high pt bins @@ -375,7 +345,7 @@ void SDL::fillPixelMap(struct modules& modulesInGPU, struct pixelMap& pixelMappi connectedModuleDetIds.insert(connectedModuleDetIds.end(),connectedModuleDetIds_pLStoLayer3Subdet4.begin(),connectedModuleDetIds_pLStoLayer3Subdet4.end()); connectedModuleDetIds.insert(connectedModuleDetIds.end(),connectedModuleDetIds_pLStoLayer4Subdet4.begin(),connectedModuleDetIds_pLStoLayer4Subdet4.end()); - int sizes =0; + int sizes = 0; sizes += connectedModuleDetIds_pLStoLayer1Subdet5.size(); sizes += connectedModuleDetIds_pLStoLayer2Subdet5.size(); sizes += connectedModuleDetIds_pLStoLayer3Subdet5.size(); @@ -402,7 +372,7 @@ void SDL::fillPixelMap(struct modules& modulesInGPU, struct pixelMap& pixelMappi connectedModuleDetIds_pos.insert(connectedModuleDetIds_pos.end(),connectedModuleDetIds_pLStoLayer3Subdet4_pos.begin(),connectedModuleDetIds_pLStoLayer3Subdet4_pos.end()); connectedModuleDetIds_pos.insert(connectedModuleDetIds_pos.end(),connectedModuleDetIds_pLStoLayer4Subdet4_pos.begin(),connectedModuleDetIds_pLStoLayer4Subdet4_pos.end()); - int sizes_pos =0; + int sizes_pos = 0; sizes_pos += connectedModuleDetIds_pLStoLayer1Subdet5_pos.size(); sizes_pos += connectedModuleDetIds_pLStoLayer2Subdet5_pos.size(); sizes_pos += connectedModuleDetIds_pLStoLayer3Subdet5_pos.size(); @@ -446,15 +416,15 @@ void SDL::fillPixelMap(struct modules& modulesInGPU, struct pixelMap& pixelMappi connectedPixels = (unsigned int*)cms::cuda::allocate_host((totalSizes+totalSizes_pos+totalSizes_neg) * sizeof(unsigned int), stream); cudaMalloc(&modulesInGPU.connectedPixels,(totalSizes+totalSizes_pos+totalSizes_neg)* sizeof(unsigned int)); - for(int icondet=0; icondet< totalSizes; icondet++) + for(int icondet = 0; icondet < totalSizes; icondet++) { connectedPixels[icondet] = (*detIdToIndex)[connectedModuleDetIds[icondet]]; } - for(int icondet=0; icondet< totalSizes_pos; icondet++) + for(int icondet = 0; icondet < totalSizes_pos; icondet++) { connectedPixels[icondet+totalSizes] = (*detIdToIndex)[connectedModuleDetIds_pos[icondet]]; } - for(int icondet=0; icondet< totalSizes_neg; icondet++) + for(int icondet = 0; icondet < totalSizes_neg; icondet++) { connectedPixels[icondet+totalSizes+totalSizes_pos] = (*detIdToIndex)[connectedModuleDetIds_neg[icondet]]; } diff --git a/SDL/Module.cuh b/SDL/Module.cuh index d4e1457f..3967c764 100644 --- a/SDL/Module.cuh +++ b/SDL/Module.cuh @@ -230,8 +230,17 @@ namespace SDL unsigned int* connectedPixelsSizesNeg; }; + // PixelMap is never allocated on the device. + // This is also not passed to any of the kernels, so we can combine the structs. struct pixelMap { + Buf connectedPixelsIndex_buf; + Buf connectedPixelsSizes_buf; + Buf connectedPixelsIndexPos_buf; + Buf connectedPixelsSizesPos_buf; + Buf connectedPixelsIndexNeg_buf; + Buf connectedPixelsSizesNeg_buf; + unsigned int* connectedPixelsIndex; unsigned int* connectedPixelsSizes; unsigned int* connectedPixelsIndexPos; @@ -239,8 +248,23 @@ namespace SDL unsigned int* connectedPixelsIndexNeg; unsigned int* connectedPixelsSizesNeg; - int* superbin; int* pixelType; + + pixelMap(unsigned int sizef = size_superbins) : + connectedPixelsIndex_buf(allocBufWrapper(devHost, sizef)), + connectedPixelsSizes_buf(allocBufWrapper(devHost, sizef)), + connectedPixelsIndexPos_buf(allocBufWrapper(devHost, sizef)), + connectedPixelsSizesPos_buf(allocBufWrapper(devHost, sizef)), + connectedPixelsIndexNeg_buf(allocBufWrapper(devHost, sizef)), + connectedPixelsSizesNeg_buf(allocBufWrapper(devHost, sizef)) + { + connectedPixelsIndex = alpaka::getPtrNative(connectedPixelsIndex_buf); + connectedPixelsSizes = alpaka::getPtrNative(connectedPixelsSizes_buf); + connectedPixelsIndexPos = alpaka::getPtrNative(connectedPixelsIndexPos_buf); + connectedPixelsSizesPos = alpaka::getPtrNative(connectedPixelsSizesPos_buf); + connectedPixelsIndexNeg = alpaka::getPtrNative(connectedPixelsIndexNeg_buf); + connectedPixelsSizesNeg = alpaka::getPtrNative(connectedPixelsSizesNeg_buf); + } }; extern std::map * detIdToIndex; @@ -251,7 +275,7 @@ namespace SDL void loadModulesFromFile(struct modules& modulesInGPU, uint16_t& nModules,uint16_t& nLowerModules,struct pixelMap& pixelMapping,cudaStream_t stream, const char* moduleMetaDataFilePath="data/centroid.txt"); void createModulesInExplicitMemory(struct modules& modulesInGPU,unsigned int nModules,cudaStream_t stream); - void freeModules(struct modules& modulesInGPU,struct pixelMap& pixelMapping); + void freeModules(struct modules& modulesInGPU); void fillPixelMap(struct modules& modulesInGPU,struct pixelMap& pixelMapping,cudaStream_t stream); void fillConnectedModuleArrayExplicit(struct modules& modulesInGPU, unsigned int nModules,cudaStream_t stream); void fillMapArraysExplicit(struct modules& modulesInGPU, unsigned int nModules,cudaStream_t stream); diff --git a/SDL/PixelTriplet.cuh b/SDL/PixelTriplet.cuh index c40f8283..033dea78 100644 --- a/SDL/PixelTriplet.cuh +++ b/SDL/PixelTriplet.cuh @@ -846,6 +846,7 @@ namespace SDL struct createPixelTripletsInGPUFromMapv2 { + ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -2245,6 +2246,7 @@ namespace SDL struct createPixelQuintupletsInGPUFromMapv2 { + ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, diff --git a/SDL/Quintuplet.cuh b/SDL/Quintuplet.cuh index 3a8aa7e6..6bf87ca0 100644 --- a/SDL/Quintuplet.cuh +++ b/SDL/Quintuplet.cuh @@ -2164,6 +2164,7 @@ namespace SDL struct createQuintupletsInGPUv2 { + ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -2258,6 +2259,7 @@ namespace SDL struct createEligibleModulesListForQuintupletsGPU { + ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -2333,6 +2335,7 @@ namespace SDL struct addQuintupletRangesToEventExplicit { + ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, diff --git a/SDL/Segment.cuh b/SDL/Segment.cuh index d6308cb0..c2f9aef5 100644 --- a/SDL/Segment.cuh +++ b/SDL/Segment.cuh @@ -678,6 +678,7 @@ namespace SDL struct createSegmentsInGPUv2 { + ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -754,6 +755,7 @@ namespace SDL struct createSegmentArrayRanges { + ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -831,6 +833,7 @@ namespace SDL struct addSegmentRangesToEventExplicit { + ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, diff --git a/SDL/TrackCandidate.cuh b/SDL/TrackCandidate.cuh index c11ae247..9abaa754 100644 --- a/SDL/TrackCandidate.cuh +++ b/SDL/TrackCandidate.cuh @@ -190,6 +190,7 @@ namespace SDL struct crossCleanpT3 { + ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -238,6 +239,7 @@ namespace SDL struct crossCleanT5 { + ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -304,6 +306,7 @@ namespace SDL // This will eliminate the need for another kernel just for adding the pLS, because we can __syncthreads() struct crossCleanpLS { + ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -390,6 +393,7 @@ namespace SDL struct addpT3asTrackCandidatesInGPU { + ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -425,6 +429,7 @@ namespace SDL struct addT5asTrackCandidateInGPU { + ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -462,6 +467,7 @@ namespace SDL struct addpLSasTrackCandidateInGPU { + ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -491,6 +497,7 @@ namespace SDL struct addpT5asTrackCandidateInGPU { + ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, diff --git a/SDL/Triplet.cuh b/SDL/Triplet.cuh index 6df7b06a..045e2ee2 100644 --- a/SDL/Triplet.cuh +++ b/SDL/Triplet.cuh @@ -1296,6 +1296,7 @@ namespace SDL struct createTripletsInGPUv2 { + ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -1371,6 +1372,7 @@ namespace SDL struct createTripletArrayRanges { + ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -1447,6 +1449,7 @@ namespace SDL struct addTripletRangesToEventExplicit { + ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, From 86f750b686548140038aad302886b25c087c5838 Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Tue, 13 Jun 2023 10:19:22 -0400 Subject: [PATCH 28/44] move endcap maps to Alpaka temp --- SDL/Constants.cuh | 3 +++ SDL/EndcapGeometry.cu | 44 +++++++++++++++++++----------------------- SDL/EndcapGeometry.cuh | 10 +++++----- SDL/Event.cu | 4 ++-- bin/sdl.cc | 3 --- 5 files changed, 30 insertions(+), 34 deletions(-) diff --git a/SDL/Constants.cuh b/SDL/Constants.cuh index f6adc16a..3f5d0c3e 100644 --- a/SDL/Constants.cuh +++ b/SDL/Constants.cuh @@ -104,6 +104,9 @@ const unsigned int N_MAX_T3T3_TRACK_EXTENSIONS = 40000; const unsigned int size_superbins = 45000; +// Temporary fix for endcap buffer allocation. +const unsigned int endcap_size = 9105; + namespace SDL { //defining the constant host device variables right up here diff --git a/SDL/EndcapGeometry.cu b/SDL/EndcapGeometry.cu index a2bf3d8e..44d31faa 100644 --- a/SDL/EndcapGeometry.cu +++ b/SDL/EndcapGeometry.cu @@ -2,11 +2,15 @@ SDL::EndcapGeometry SDL::endcapGeometry; -SDL::EndcapGeometry::EndcapGeometry() +SDL::EndcapGeometry::EndcapGeometry(unsigned int sizef) : + geoMapDetId_buf(allocBufWrapper(devAcc, sizef)), + geoMapPhi_buf(allocBufWrapper(devAcc, sizef)) { } -SDL::EndcapGeometry::EndcapGeometry(std::string filename) +SDL::EndcapGeometry::EndcapGeometry(std::string filename, unsigned int sizef) : + geoMapDetId_buf(allocBufWrapper(devAcc, sizef)), + geoMapPhi_buf(allocBufWrapper(devAcc, sizef)) { load(filename); } @@ -58,30 +62,23 @@ void SDL::EndcapGeometry::load(std::string filename) centroid_phis_[detid] = cr; centroid_zs_[detid] = cz; } - CreateGeoMapArraysExplicit(); + fillGeoMapArraysExplicit(); } -void SDL::freeEndCapMapMemory() +void SDL::EndcapGeometry::fillGeoMapArraysExplicit() { - cudaFree(SDL::endcapGeometry.geoMapPhi); - cudaFree(SDL::endcapGeometry.geoMapDetId); -} + QueueAcc queue(devAcc); -void SDL::EndcapGeometry::CreateGeoMapArraysExplicit() -{ int phi_size = centroid_phis_.size(); - cudaMalloc(&geoMapPhi, phi_size * sizeof(float)); - cudaMalloc(&geoMapDetId, phi_size * sizeof(unsigned int)); -} -void SDL::EndcapGeometry::fillGeoMapArraysExplicit() -{ - float* mapPhi; - unsigned int* mapDetId; - int phi_size = centroid_phis_.size(); - cudaMallocHost(&mapPhi, phi_size * sizeof(float)); - cudaMallocHost(&mapDetId, phi_size * sizeof(unsigned int)); + // Allocate buffers on host + auto mapPhi_host_buf = allocBufWrapper(devHost, phi_size); + auto mapDetId_host_buf = allocBufWrapper(devHost, phi_size); + + // Access the raw pointers of the buffers + float* mapPhi = alpaka::getPtrNative(mapPhi_host_buf); + unsigned int* mapDetId = alpaka::getPtrNative(mapDetId_host_buf); unsigned int counter = 0; for(auto it = centroid_phis_.begin(); it != centroid_phis_.end(); ++it) @@ -95,11 +92,10 @@ void SDL::EndcapGeometry::fillGeoMapArraysExplicit() nEndCapMap = counter; - cudaMemcpy(geoMapPhi, mapPhi, phi_size*sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(geoMapDetId, mapDetId, phi_size*sizeof(unsigned int), cudaMemcpyHostToDevice); - - cudaFreeHost(mapPhi); - cudaFreeHost(mapDetId); + // Copy data from host to device buffers + alpaka::memcpy(queue, geoMapPhi_buf, mapPhi_host_buf, phi_size); + alpaka::memcpy(queue, geoMapDetId_buf, mapDetId_host_buf, phi_size); + alpaka::wait(queue); } float SDL::EndcapGeometry::getAverageR2(unsigned int detid) diff --git a/SDL/EndcapGeometry.cuh b/SDL/EndcapGeometry.cuh index 4ad71b40..f9f33c33 100644 --- a/SDL/EndcapGeometry.cuh +++ b/SDL/EndcapGeometry.cuh @@ -15,7 +15,6 @@ namespace SDL { class EndcapGeometry { - private: std::map avgr2s_; std::map yls_; // lower hits @@ -27,12 +26,13 @@ namespace SDL std::map centroid_zs_; // centroid z public: - unsigned int* geoMapDetId; - float* geoMapPhi; + Buf geoMapDetId_buf; + Buf geoMapPhi_buf; + unsigned int nEndCapMap; - EndcapGeometry(); - EndcapGeometry(std::string filename); + EndcapGeometry(unsigned int sizef = endcap_size); + EndcapGeometry(std::string filename, unsigned int sizef = endcap_size); ~EndcapGeometry(); void load(std::string); diff --git a/SDL/Event.cu b/SDL/Event.cu index 27a96ac7..3c8aa2c1 100644 --- a/SDL/Event.cu +++ b/SDL/Event.cu @@ -338,8 +338,8 @@ void SDL::Event::addHitToEvent(std::vector x, std::vector y, std:: TwoS, nModules, SDL::endcapGeometry.nEndCapMap, - SDL::endcapGeometry.geoMapDetId, - SDL::endcapGeometry.geoMapPhi, + alpaka::getPtrNative(SDL::endcapGeometry.geoMapDetId_buf), + alpaka::getPtrNative(SDL::endcapGeometry.geoMapPhi_buf), *modulesInGPU, *hitsInGPU, nHits)); diff --git a/bin/sdl.cc b/bin/sdl.cc index 3088e24b..beb15dc0 100644 --- a/bin/sdl.cc +++ b/bin/sdl.cc @@ -350,7 +350,6 @@ void run_sdl() // Looping input file while (ana.looper.nextEvent()) { - // if (ana.looper.getCurrentEventIndex() ==49) {continue;} if (ana.verbose >= 1) std::cout << "PreLoading event number = " << ana.looper.getCurrentEventIndex() << std::endl; @@ -390,7 +389,6 @@ void run_sdl() cudaStreamCreateWithFlags(&streams[s], cudaStreamNonBlocking); SDL::Event *event = new SDL::Event(streams[s],ana.verbose>=2); - ; //(streams[omp_get_thread_num()]); events.push_back(event); } @@ -509,7 +507,6 @@ void run_sdl() printTimingInformation(timevec, full_elapsed, avg_elapsed); SDL::cleanModules(); - SDL::freeEndCapMapMemory(); if (ana.do_write_ntuple) { From 6f2af678c4544280095af087ea54f797caff0261 Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Tue, 13 Jun 2023 21:37:49 -0400 Subject: [PATCH 29/44] move modules to Alpaka memory temp --- SDL/CachingDeviceAllocator.h | 722 ---------------------------------- SDL/CachingHostAllocator.h | 661 ------------------------------- SDL/CachingManagedAllocator.h | 662 ------------------------------- SDL/Constants.cuh | 8 + SDL/Event.cu | 424 +++++++------------- SDL/Event.cuh | 18 +- SDL/Module.cu | 559 +------------------------- SDL/Module.cuh | 688 +++++++++++++++++++++++++++++--- SDL/allocate.cc | 66 ---- SDL/allocate.h | 21 - SDL/cudaCheck.h | 61 --- SDL/deviceCount.h | 18 - SDL/getCachingAllocator.h | 75 ---- bin/sdl.cc | 2 - code/core/AccessHelper.cc | 6 +- code/core/write_sdl_ntuple.cc | 22 +- 16 files changed, 790 insertions(+), 3223 deletions(-) delete mode 100644 SDL/CachingDeviceAllocator.h delete mode 100644 SDL/CachingHostAllocator.h delete mode 100644 SDL/CachingManagedAllocator.h delete mode 100644 SDL/allocate.cc delete mode 100644 SDL/allocate.h delete mode 100644 SDL/cudaCheck.h delete mode 100644 SDL/deviceCount.h delete mode 100644 SDL/getCachingAllocator.h diff --git a/SDL/CachingDeviceAllocator.h b/SDL/CachingDeviceAllocator.h deleted file mode 100644 index 666186f7..00000000 --- a/SDL/CachingDeviceAllocator.h +++ /dev/null @@ -1,722 +0,0 @@ -#ifndef HeterogenousCore_CUDAUtilities_src_CachingDeviceAllocator_h -#define HeterogenousCore_CUDAUtilities_src_CachingDeviceAllocator_h - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * Forked to CMSSW by Matti Kortelainen - */ - -/****************************************************************************** - * Simple caching allocator for device memory allocations. The allocator is - * thread-safe and capable of managing device allocations on multiple devices. - ******************************************************************************/ - -#include -#include -#include - -#include -#include -//#include -//#include - -/// CUB namespace -namespace notcub { - - /** - * \addtogroup UtilMgmt - * @{ - */ - - /****************************************************************************** - * CachingDeviceAllocator (host use) - ******************************************************************************/ - - /** - * \brief A simple caching allocator for device memory allocations. - * - * \par Overview - * The allocator is thread-safe and stream-safe and is capable of managing cached - * device allocations on multiple devices. It behaves as follows: - * - * \par - * - Allocations from the allocator are associated with an \p active_stream. Once freed, - * the allocation becomes available immediately for reuse within the \p active_stream - * with which it was associated with during allocation, and it becomes available for - * reuse within other streams when all prior work submitted to \p active_stream has completed. - * - Allocations are categorized and cached by bin size. A new allocation request of - * a given size will only consider cached allocations within the corresponding bin. - * - Bin limits progress geometrically in accordance with the growth factor - * \p bin_growth provided during construction. Unused device allocations within - * a larger bin cache are not reused for allocation requests that categorize to - * smaller bin sizes. - * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to - * (\p bin_growth ^ \p min_bin). - * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest - * bin and are simply freed when they are deallocated instead of being returned - * to a bin-cache. - * - %If the total storage of cached allocations on a given device will exceed - * \p max_cached_bytes, allocations for that device are simply freed when they are - * deallocated instead of being returned to their bin-cache. - * - * \par - * For example, the default-constructed CachingDeviceAllocator is configured with: - * - \p bin_growth = 8 - * - \p min_bin = 3 - * - \p max_bin = 7 - * - \p max_cached_bytes = 6MB - 1B - * - * \par - * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB - * and sets a maximum of 6,291,455 cached bytes per device - * - */ - struct CachingDeviceAllocator { - //--------------------------------------------------------------------- - // Constants - //--------------------------------------------------------------------- - - /// Out-of-bounds bin - static const unsigned int INVALID_BIN = (unsigned int)-1; - - /// Invalid size - static const size_t INVALID_SIZE = (size_t)-1; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - /// Invalid device ordinal - static const int INVALID_DEVICE_ORDINAL = -1; - - //--------------------------------------------------------------------- - // Type definitions and helper types - //--------------------------------------------------------------------- - - /** - * Descriptor for device memory allocations - */ - struct BlockDescriptor { - void *d_ptr; // Device pointer - size_t bytes; // Size of allocation in bytes - unsigned int bin; // Bin enumeration - int device; // device ordinal - cudaStream_t associated_stream; // Associated associated_stream - cudaEvent_t ready_event; // Signal when associated stream has run to the point at which this block was freed - - // Constructor (suitable for searching maps for a specific block, given its pointer and device) - BlockDescriptor(void *d_ptr1, int device1) - : d_ptr(d_ptr1), bytes(0), bin(INVALID_BIN), device(device1), associated_stream(nullptr), ready_event(nullptr) {} - - // Constructor (suitable for searching maps for a range of suitable blocks, given a device) - BlockDescriptor(int device1) - : d_ptr(nullptr), - bytes(0), - bin(INVALID_BIN), - device(device1), - associated_stream(nullptr), - ready_event(nullptr) {} - - // Comparison functor for comparing device pointers - static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) { - if (a.device == b.device) - return (a.d_ptr < b.d_ptr); - else - return (a.device < b.device); - } - - // Comparison functor for comparing allocation sizes - static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) { - if (a.device == b.device) - return (a.bytes < b.bytes); - else - return (a.device < b.device); - } - }; - - /// BlockDescriptor comparator function interface - typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &); - - class TotalBytes { - public: - size_t free; - size_t live; - TotalBytes() { free = live = 0; } - }; - - /// Set type for cached blocks (ordered by size) - typedef std::multiset CachedBlocks; - - /// Set type for live blocks (ordered by ptr) - typedef std::multiset BusyBlocks; - - /// Map type of device ordinals to the number of cached bytes cached by each device - typedef std::map GpuCachedBytes; - - //--------------------------------------------------------------------- - // Utility functions - //--------------------------------------------------------------------- - - /** - * Integer pow function for unsigned base and exponent - */ - static unsigned int IntPow(unsigned int base, unsigned int exp) { - unsigned int retval = 1; - while (exp > 0) { - if (exp & 1) { - retval = retval * base; // multiply the result by the current base - } - base = base * base; // square the base - exp = exp >> 1; // divide the exponent in half - } - return retval; - } - - /** - * Round up to the nearest power-of - */ - void NearestPowerOf(unsigned int &power, size_t &rounded_bytes, unsigned int base, size_t value) { - power = 0; - rounded_bytes = 1; - - if (value * base < value) { - // Overflow - power = sizeof(size_t) * 8; - rounded_bytes = size_t(0) - 1; - return; - } - - while (rounded_bytes < value) { - rounded_bytes *= base; - power++; - } - } - - //--------------------------------------------------------------------- - // Fields - //--------------------------------------------------------------------- - - cub::Mutex mutex; /// Mutex for thread-safety - - unsigned int bin_growth; /// Geometric growth factor for bin-sizes - unsigned int min_bin; /// Minimum bin enumeration - unsigned int max_bin; /// Maximum bin enumeration - - size_t min_bin_bytes; /// Minimum bin size - size_t max_bin_bytes; /// Maximum bin size - size_t max_cached_bytes; /// Maximum aggregate cached bytes per device - - const bool - skip_cleanup; /// Whether or not to skip a call to FreeAllCached() when destructor is called. (The CUDA runtime may have already shut down for statically declared allocators) - bool debug; /// Whether or not to print (de)allocation events to stdout - - GpuCachedBytes cached_bytes; /// Map of device ordinal to aggregate cached bytes on that device - CachedBlocks cached_blocks; /// Set of cached device allocations available for reuse - BusyBlocks live_blocks; /// Set of live device allocations currently in use - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - //--------------------------------------------------------------------- - // Methods - //--------------------------------------------------------------------- - - /** - * \brief Constructor. - */ - CachingDeviceAllocator( - unsigned int bin_growthx, ///< Geometric growth factor for bin-sizes - unsigned int min_binx = 1, ///< Minimum bin (default is bin_growth ^ 1) - unsigned int max_binx = INVALID_BIN, ///< Maximum bin (default is no max bin) - size_t max_cached_bytesx = INVALID_SIZE, ///< Maximum aggregate cached bytes per device (default is no limit) - bool skip_cleanupx = - false, ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate) - bool debugx = false) ///< Whether or not to print (de)allocation events to stdout (default is no stderr output) - : bin_growth(bin_growthx), - min_bin(min_binx), - max_bin(max_binx), - min_bin_bytes(IntPow(bin_growthx, min_binx)), - max_bin_bytes(IntPow(bin_growthx, max_binx)), - max_cached_bytes(max_cached_bytesx), - skip_cleanup(skip_cleanupx), - debug(debugx), - cached_blocks(BlockDescriptor::SizeCompare), - live_blocks(BlockDescriptor::PtrCompare) {} - - /** - * \brief Default constructor. - * - * Configured with: - * \par - * - \p bin_growth = 8 - * - \p min_bin = 3 - * - \p max_bin = 7 - * - \p max_cached_bytes = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes - * - * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and - * sets a maximum of 6,291,455 cached bytes per device - */ - CachingDeviceAllocator(bool skip_cleanupx = false, bool debugx = false) - : bin_growth(8), - min_bin(3), - max_bin(7), - min_bin_bytes(IntPow(bin_growth, min_bin)), - max_bin_bytes(IntPow(bin_growth, max_bin)), - max_cached_bytes((max_bin_bytes * 3) - 1), - skip_cleanup(skip_cleanupx), - debug(debugx), - cached_blocks(BlockDescriptor::SizeCompare), - live_blocks(BlockDescriptor::PtrCompare) {} - - /** - * \brief Sets the limit on the number bytes this allocator is allowed to cache per device. - * - * Changing the ceiling of cached bytes does not cause any allocations (in-use or - * cached-in-reserve) to be freed. See \p FreeAllCached(). - */ - cudaError_t SetMaxCachedBytes(size_t max_cached_bytesx) { - // Lock - mutex.Lock(); - - if (debug) - _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", - (long long)this->max_cached_bytes, - (long long)max_cached_bytesx); - - this->max_cached_bytes = max_cached_bytesx; - - // Unlock - mutex.Unlock(); - - return cudaSuccess; - } - - /** - * \brief Provides a suitable allocation of device memory for the given size on the specified device. - * - * Once freed, the allocation becomes available immediately for reuse within the \p active_stream - * with which it was associated with during allocation, and it becomes available for reuse within other - * streams when all prior work submitted to \p active_stream has completed. - */ - cudaError_t DeviceAllocate( - int device, ///< [in] Device on which to place the allocation - void **d_ptr, ///< [out] Reference to pointer to the allocation - size_t bytes, ///< [in] Minimum number of bytes for the allocation - cudaStream_t active_stream = nullptr) ///< [in] The stream to be associated with this allocation - { - *d_ptr = nullptr; - int entrypoint_device = INVALID_DEVICE_ORDINAL; - cudaError_t error = cudaSuccess; - - if (device == INVALID_DEVICE_ORDINAL) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) - return error; - device = entrypoint_device; - } - - // Create a block descriptor for the requested allocation - bool found = false; - BlockDescriptor search_key(device); - search_key.associated_stream = active_stream; - NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes); - - if (search_key.bin > max_bin) { - // Bin is greater than our maximum bin: allocate the request - // exactly and give out-of-bounds bin. It will not be cached - // for reuse when returned. - search_key.bin = INVALID_BIN; - search_key.bytes = bytes; - } else { - // Search for a suitable cached allocation: lock - mutex.Lock(); - - if (search_key.bin < min_bin) { - // Bin is less than minimum bin: round up - search_key.bin = min_bin; - search_key.bytes = min_bin_bytes; - } - - // Iterate through the range of cached blocks on the same device in the same bin - CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key); - while ((block_itr != cached_blocks.end()) && (block_itr->device == device) && - (block_itr->bin == search_key.bin)) { - // To prevent races with reusing blocks returned by the host but still - // in use by the device, only consider cached blocks that are - // either (from the active stream) or (from an idle stream) - if ((active_stream == block_itr->associated_stream) || - (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) { - // Reuse existing cache block. Insert into live blocks. - found = true; - search_key = *block_itr; - search_key.associated_stream = active_stream; - live_blocks.insert(search_key); - - // Remove from free blocks - cached_bytes[device].free -= search_key.bytes; - cached_bytes[device].live += search_key.bytes; - - if (debug) - // CMS: improved debug message - _CubLog( - "\tDevice %d reused cached block at %p (%lld bytes) for stream %lld, event %lld (previously " - "associated with stream %lld, event %lld).\n", - device, - search_key.d_ptr, - (long long)search_key.bytes, - (long long)search_key.associated_stream, - (long long)search_key.ready_event, - (long long)block_itr->associated_stream, - (long long)block_itr->ready_event); - - cached_blocks.erase(block_itr); - - break; - } - block_itr++; - } - - // Done searching: unlock - mutex.Unlock(); - } - - // Allocate the block if necessary - if (!found) { - // Set runtime's current device to specified device (entrypoint may not be set) - if (device != entrypoint_device) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) - return error; - if (CubDebug(error = cudaSetDevice(device))) - return error; - } - - // Attempt to allocate - if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation) { - // The allocation attempt failed: free all cached blocks on device and retry - if (debug) - _CubLog( - "\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations", - device, - (long long)search_key.bytes, - (long long)search_key.associated_stream); - - error = cudaSuccess; // Reset the error we will return - cudaGetLastError(); // Reset CUDART's error - - // Lock - mutex.Lock(); - - // Iterate the range of free blocks on the same device - BlockDescriptor free_key(device); - CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key); - - while ((block_itr != cached_blocks.end()) && (block_itr->device == device)) { - // No need to worry about synchronization with the device: cudaFree is - // blocking and will synchronize across all kernels executing - // on the current device - - // Free device memory and destroy stream event. - if (CubDebug(error = cudaFree(block_itr->d_ptr))) - break; - if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) - break; - - // Reduce balance and erase entry - cached_bytes[device].free -= block_itr->bytes; - - if (debug) - _CubLog( - "\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks " - "(%lld bytes) outstanding.\n", - device, - (long long)block_itr->bytes, - (long long)cached_blocks.size(), - (long long)cached_bytes[device].free, - (long long)live_blocks.size(), - (long long)cached_bytes[device].live); - - cached_blocks.erase(block_itr); - - block_itr++; - } - - // Unlock - mutex.Unlock(); - - // Return under error - if (error) - return error; - - // Try to allocate again - if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) - return error; - } - - // Create ready event - if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) - return error; - - // Insert into live blocks - mutex.Lock(); - live_blocks.insert(search_key); - cached_bytes[device].live += search_key.bytes; - mutex.Unlock(); - - if (debug) - // CMS: improved debug message - _CubLog( - "\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld, event %lld).\n", - device, - search_key.d_ptr, - (long long)search_key.bytes, - (long long)search_key.associated_stream, - (long long)search_key.ready_event); - - // Attempt to revert back to previous device if necessary - if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) - return error; - } - } - - // Copy device pointer to output parameter - *d_ptr = search_key.d_ptr; - - if (debug) - _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", - (long long)cached_blocks.size(), - (long long)cached_bytes[device].free, - (long long)live_blocks.size(), - (long long)cached_bytes[device].live); - - return error; - } - - /** - * \brief Provides a suitable allocation of device memory for the given size on the current device. - * - * Once freed, the allocation becomes available immediately for reuse within the \p active_stream - * with which it was associated with during allocation, and it becomes available for reuse within other - * streams when all prior work submitted to \p active_stream has completed. - */ - cudaError_t DeviceAllocate( - void **d_ptr, ///< [out] Reference to pointer to the allocation - size_t bytes, ///< [in] Minimum number of bytes for the allocation - cudaStream_t active_stream = nullptr) ///< [in] The stream to be associated with this allocation - { - return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream); - } - - /** - * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator. - * - * Once freed, the allocation becomes available immediately for reuse within the \p active_stream - * with which it was associated with during allocation, and it becomes available for reuse within other - * streams when all prior work submitted to \p active_stream has completed. - */ - cudaError_t DeviceFree(int device, void *d_ptr) { - int entrypoint_device = INVALID_DEVICE_ORDINAL; - cudaError_t error = cudaSuccess; - - //if (device == INVALID_DEVICE_ORDINAL) { - // if (CubDebug(error = cudaGetDevice(&entrypoint_device))) - // return error; - // device = entrypoint_device; - //} - - // Lock - mutex.Lock(); - - // Find corresponding block descriptor - bool recached = false; - BlockDescriptor search_key(d_ptr, device); - BusyBlocks::iterator block_itr = live_blocks.find(search_key); - - if (block_itr != live_blocks.end()) { - // Remove from live blocks - search_key = *block_itr; - live_blocks.erase(block_itr); - cached_bytes[device].live -= search_key.bytes; - - // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold - if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)) { - // Insert returned allocation into free blocks - recached = true; - cached_blocks.insert(search_key); - cached_bytes[device].free += search_key.bytes; - - if (debug) - // CMS: improved debug message - _CubLog( - "\tDevice %d returned %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available " - "blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n", - device, - (long long)search_key.bytes, - d_ptr, - (long long)search_key.associated_stream, - (long long)search_key.ready_event, - (long long)cached_blocks.size(), - (long long)cached_bytes[device].free, - (long long)live_blocks.size(), - (long long)cached_bytes[device].live); - } - } - - // First set to specified device (entrypoint may not be set) - if (device != entrypoint_device) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) - return error; - if (CubDebug(error = cudaSetDevice(device))) - return error; - } - - if (recached) { - // Insert the ready event in the associated stream (must have current device set properly) - if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) - return error; - } - - // Unlock - mutex.Unlock(); - - if (!recached) { - // Free the allocation from the runtime and cleanup the event. - if (CubDebug(error = cudaFree(d_ptr))){ - return error; - } - if (CubDebug(error = cudaEventDestroy(search_key.ready_event))){ - return error; - } - - if (debug) - // CMS: improved debug message - _CubLog( - "\tDevice %d freed %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available " - "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", - device, - (long long)search_key.bytes, - d_ptr, - (long long)search_key.associated_stream, - (long long)search_key.ready_event, - (long long)cached_blocks.size(), - (long long)cached_bytes[device].free, - (long long)live_blocks.size(), - (long long)cached_bytes[device].live); - } - - // Reset device - if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) - return error; - } - - return error; - } - - /** - * \brief Frees a live allocation of device memory on the current device, returning it to the allocator. - * - * Once freed, the allocation becomes available immediately for reuse within the \p active_stream - * with which it was associated with during allocation, and it becomes available for reuse within other - * streams when all prior work submitted to \p active_stream has completed. - */ - cudaError_t DeviceFree(void *d_ptr) { return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr); } - - /** - * \brief Frees all cached device allocations on all devices - */ - cudaError_t FreeAllCached() { - cudaError_t error = cudaSuccess; - int entrypoint_device = INVALID_DEVICE_ORDINAL; - int current_device = INVALID_DEVICE_ORDINAL; - - mutex.Lock(); - - while (!cached_blocks.empty()) { - // Get first block - CachedBlocks::iterator begin = cached_blocks.begin(); - - // Get entry-point device ordinal if necessary - if (entrypoint_device == INVALID_DEVICE_ORDINAL) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) - break; - } - - // Set current device ordinal if necessary - if (begin->device != current_device) { - if (CubDebug(error = cudaSetDevice(begin->device))) - break; - current_device = begin->device; - } - - // Free device memory - if (CubDebug(error = cudaFree(begin->d_ptr))) - break; - if (CubDebug(error = cudaEventDestroy(begin->ready_event))) - break; - - // Reduce balance and erase entry - cached_bytes[current_device].free -= begin->bytes; - - if (debug) - _CubLog( - "\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld " - "bytes) outstanding.\n", - current_device, - (long long)begin->bytes, - (long long)cached_blocks.size(), - (long long)cached_bytes[current_device].free, - (long long)live_blocks.size(), - (long long)cached_bytes[current_device].live); - - cached_blocks.erase(begin); - } - - mutex.Unlock(); - - // Attempt to revert back to entry-point device if necessary - if (entrypoint_device != INVALID_DEVICE_ORDINAL) { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) - return error; - } - - return error; - } - - /** - * \brief Destructor - */ - virtual ~CachingDeviceAllocator() { - if (!skip_cleanup) - FreeAllCached(); - } - }; - - /** @} */ // end group UtilMgmt - -} // namespace notcub - -#endif diff --git a/SDL/CachingHostAllocator.h b/SDL/CachingHostAllocator.h deleted file mode 100644 index c5ad255b..00000000 --- a/SDL/CachingHostAllocator.h +++ /dev/null @@ -1,661 +0,0 @@ -#ifndef HeterogenousCore_CUDAUtilities_src_CachingHostAllocator_h -#define HeterogenousCore_CUDAUtilities_src_CachingHostAllocator_h - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * Modified to cache pinned host allocations by Matti Kortelainen - */ - -/****************************************************************************** - * Simple caching allocator for pinned host memory allocations. The allocator is - * thread-safe. - ******************************************************************************/ - -#include -#include -#include - -#include -#include - -/// CUB namespace -namespace notcub { - - /** - * \addtogroup UtilMgmt - * @{ - */ - - /****************************************************************************** - * CachingHostAllocator (host use) - ******************************************************************************/ - - /** - * \brief A simple caching allocator pinned host memory allocations. - * - * \par Overview - * The allocator is thread-safe. It behaves as follows: - * - * I presume the CUDA stream-safeness is not useful as to read/write - * from/to the pinned host memory one needs to synchronize anyway. The - * difference wrt. device memory is that in the CPU all operations to - * the device memory are scheduled via the CUDA stream, while for the - * host memory one can perform operations directly. - * - * \par - * - Allocations are categorized and cached by bin size. A new allocation request of - * a given size will only consider cached allocations within the corresponding bin. - * - Bin limits progress geometrically in accordance with the growth factor - * \p bin_growth provided during construction. Unused host allocations within - * a larger bin cache are not reused for allocation requests that categorize to - * smaller bin sizes. - * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to - * (\p bin_growth ^ \p min_bin). - * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest - * bin and are simply freed when they are deallocated instead of being returned - * to a bin-cache. - * - %If the total storage of cached allocations will exceed - * \p max_cached_bytes, allocations are simply freed when they are - * deallocated instead of being returned to their bin-cache. - * - * \par - * For example, the default-constructed CachingHostAllocator is configured with: - * - \p bin_growth = 8 - * - \p min_bin = 3 - * - \p max_bin = 7 - * - \p max_cached_bytes = 6MB - 1B - * - * \par - * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB - * and sets a maximum of 6,291,455 cached bytes - * - */ - struct CachingHostAllocator { - //--------------------------------------------------------------------- - // Constants - //--------------------------------------------------------------------- - - /// Out-of-bounds bin - static const unsigned int INVALID_BIN = (unsigned int)-1; - - /// Invalid size - static const size_t INVALID_SIZE = (size_t)-1; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - /// Invalid device ordinal - static const int INVALID_DEVICE_ORDINAL = -1; - - //--------------------------------------------------------------------- - // Type definitions and helper types - //--------------------------------------------------------------------- - - /** - * Descriptor for pinned host memory allocations - */ - struct BlockDescriptor { - void *d_ptr; // Host pointer - size_t bytes; // Size of allocation in bytes - unsigned int bin; // Bin enumeration - int device; // device ordinal - cudaStream_t associated_stream; // Associated associated_stream - cudaEvent_t ready_event; // Signal when associated stream has run to the point at which this block was freed - - // Constructor (suitable for searching maps for a specific block, given its pointer) - BlockDescriptor(void *d_ptrx) - : d_ptr(d_ptrx), - bytes(0), - bin(INVALID_BIN), - device(INVALID_DEVICE_ORDINAL), - associated_stream(nullptr), - ready_event(nullptr) {} - - // Constructor (suitable for searching maps for a range of suitable blocks) - BlockDescriptor() - : d_ptr(nullptr), - bytes(0), - bin(INVALID_BIN), - device(INVALID_DEVICE_ORDINAL), - associated_stream(nullptr), - ready_event(nullptr) {} - - // Comparison functor for comparing host pointers - static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) { return (a.d_ptr < b.d_ptr); } - - // Comparison functor for comparing allocation sizes - static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) { return (a.bytes < b.bytes); } - }; - - /// BlockDescriptor comparator function interface - typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &); - - class TotalBytes { - public: - size_t free; - size_t live; - TotalBytes() { free = live = 0; } - }; - - /// Set type for cached blocks (ordered by size) - typedef std::multiset CachedBlocks; - - /// Set type for live blocks (ordered by ptr) - typedef std::multiset BusyBlocks; - - //--------------------------------------------------------------------- - // Utility functions - //--------------------------------------------------------------------- - - /** - * Integer pow function for unsigned base and exponent - */ - static unsigned int IntPow(unsigned int base, unsigned int exp) { - unsigned int retval = 1; - while (exp > 0) { - if (exp & 1) { - retval = retval * base; // multiply the result by the current base - } - base = base * base; // square the base - exp = exp >> 1; // divide the exponent in half - } - return retval; - } - - /** - * Round up to the nearest power-of - */ - void NearestPowerOf(unsigned int &power, size_t &rounded_bytes, unsigned int base, size_t value) { - power = 0; - rounded_bytes = 1; - - if (value * base < value) { - // Overflow - power = sizeof(size_t) * 8; - rounded_bytes = size_t(0) - 1; - return; - } - - while (rounded_bytes < value) { - rounded_bytes *= base; - power++; - } - } - - //--------------------------------------------------------------------- - // Fields - //--------------------------------------------------------------------- - - cub::Mutex mutex; /// Mutex for thread-safety - - unsigned int bin_growth; /// Geometric growth factor for bin-sizes - unsigned int min_bin; /// Minimum bin enumeration - unsigned int max_bin; /// Maximum bin enumeration - - size_t min_bin_bytes; /// Minimum bin size - size_t max_bin_bytes; /// Maximum bin size - size_t max_cached_bytes; /// Maximum aggregate cached bytes - - const bool - skip_cleanup; /// Whether or not to skip a call to FreeAllCached() when destructor is called. (The CUDA runtime may have already shut down for statically declared allocators) - bool debug; /// Whether or not to print (de)allocation events to stdout - - TotalBytes cached_bytes; /// Aggregate cached bytes - CachedBlocks cached_blocks; /// Set of cached pinned host allocations available for reuse - BusyBlocks live_blocks; /// Set of live pinned host allocations currently in use - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - //--------------------------------------------------------------------- - // Methods - //--------------------------------------------------------------------- - - /** - * \brief Constructor. - */ - CachingHostAllocator( - unsigned int bin_growthx, ///< Geometric growth factor for bin-sizes - unsigned int min_binx = 1, ///< Minimum bin (default is bin_growth ^ 1) - unsigned int max_binx = INVALID_BIN, ///< Maximum bin (default is no max bin) - size_t max_cached_bytesx = INVALID_SIZE, ///< Maximum aggregate cached bytes (default is no limit) - bool skip_cleanupx = - false, ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate) - bool debugx = false) ///< Whether or not to print (de)allocation events to stdout (default is no stderr output) - : bin_growth(bin_growthx), - min_bin(min_binx), - max_bin(max_binx), - min_bin_bytes(IntPow(bin_growthx, min_binx)), - max_bin_bytes(IntPow(bin_growthx, max_binx)), - max_cached_bytes(max_cached_bytesx), - skip_cleanup(skip_cleanupx), - debug(debugx), - cached_blocks(BlockDescriptor::SizeCompare), - live_blocks(BlockDescriptor::PtrCompare) {} - - /** - * \brief Default constructor. - * - * Configured with: - * \par - * - \p bin_growth = 8 - * - \p min_bin = 3 - * - \p max_bin = 7 - * - \p max_cached_bytes = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes - * - * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and - * sets a maximum of 6,291,455 cached bytes - */ - CachingHostAllocator(bool skip_cleanupx = false, bool debugx = false) - : bin_growth(8), - min_bin(3), - max_bin(7), - min_bin_bytes(IntPow(bin_growth, min_bin)), - max_bin_bytes(IntPow(bin_growth, max_bin)), - max_cached_bytes((max_bin_bytes * 3) - 1), - skip_cleanup(skip_cleanupx), - debug(debugx), - cached_blocks(BlockDescriptor::SizeCompare), - live_blocks(BlockDescriptor::PtrCompare) {} - - /** - * \brief Sets the limit on the number bytes this allocator is allowed to cache - * - * Changing the ceiling of cached bytes does not cause any allocations (in-use or - * cached-in-reserve) to be freed. See \p FreeAllCached(). - */ - void SetMaxCachedBytes(size_t max_cached_bytesx) { - // Lock - mutex.Lock(); - - if (debug) - _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", - (long long)this->max_cached_bytes, - (long long)max_cached_bytesx); - - this->max_cached_bytes = max_cached_bytesx; - - // Unlock - mutex.Unlock(); - } - - /** - * \brief Provides a suitable allocation of pinned host memory for the given size. - * - * Once freed, the allocation becomes available immediately for reuse. - */ - cudaError_t HostAllocate( - void **d_ptr, ///< [out] Reference to pointer to the allocation - size_t bytes, ///< [in] Minimum number of bytes for the allocation - cudaStream_t active_stream = nullptr) ///< [in] The stream to be associated with this allocation - { - *d_ptr = nullptr; - int device = INVALID_DEVICE_ORDINAL; - cudaError_t error = cudaSuccess; - - if (CubDebug(error = cudaGetDevice(&device))) - return error; - - // Create a block descriptor for the requested allocation - bool found = false; - BlockDescriptor search_key; - search_key.device = device; - search_key.associated_stream = active_stream; - NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes); - - if (search_key.bin > max_bin) { - // Bin is greater than our maximum bin: allocate the request - // exactly and give out-of-bounds bin. It will not be cached - // for reuse when returned. - search_key.bin = INVALID_BIN; - search_key.bytes = bytes; - } else { - // Search for a suitable cached allocation: lock - mutex.Lock(); - - if (search_key.bin < min_bin) { - // Bin is less than minimum bin: round up - search_key.bin = min_bin; - search_key.bytes = min_bin_bytes; - } - - // Iterate through the range of cached blocks in the same bin - CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key); - while ((block_itr != cached_blocks.end()) && (block_itr->bin == search_key.bin)) { - // To prevent races with reusing blocks returned by the host but still - // in use for transfers, only consider cached blocks that are from an idle stream - if (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady) { - // Reuse existing cache block. Insert into live blocks. - found = true; - search_key = *block_itr; - search_key.associated_stream = active_stream; - if (search_key.device != device) { - // If "associated" device changes, need to re-create the event on the right device - if (CubDebug(error = cudaSetDevice(search_key.device))) - return error; - if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) - return error; - if (CubDebug(error = cudaSetDevice(device))) - return error; - if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) - return error; - search_key.device = device; - } - - live_blocks.insert(search_key); - - // Remove from free blocks - cached_bytes.free -= search_key.bytes; - cached_bytes.live += search_key.bytes; - - if (debug) - _CubLog( - "\tHost reused cached block at %p (%lld bytes) for stream %lld, event %lld on device %lld " - "(previously associated with stream %lld, event %lld).\n", - search_key.d_ptr, - (long long)search_key.bytes, - (long long)search_key.associated_stream, - (long long)search_key.ready_event, - (long long)search_key.device, - (long long)block_itr->associated_stream, - (long long)block_itr->ready_event); - - cached_blocks.erase(block_itr); - - break; - } - block_itr++; - } - - // Done searching: unlock - mutex.Unlock(); - } - - // Allocate the block if necessary - if (!found) { - // Attempt to allocate - // TODO: eventually support allocation flags - if (CubDebug(error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault)) == - cudaErrorMemoryAllocation) { - // The allocation attempt failed: free all cached blocks on device and retry - if (debug) - _CubLog( - "\tHost failed to allocate %lld bytes for stream %lld on device %lld, retrying after freeing cached " - "allocations", - (long long)search_key.bytes, - (long long)search_key.associated_stream, - (long long)search_key.device); - - error = cudaSuccess; // Reset the error we will return - cudaGetLastError(); // Reset CUDART's error - - // Lock - mutex.Lock(); - - // Iterate the range of free blocks - CachedBlocks::iterator block_itr = cached_blocks.begin(); - - while ((block_itr != cached_blocks.end())) { - // No need to worry about synchronization with the device: cudaFree is - // blocking and will synchronize across all kernels executing - // on the current device - - // Free pinned host memory. - if (CubDebug(error = cudaFreeHost(block_itr->d_ptr))) - break; - if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) - break; - - // Reduce balance and erase entry - cached_bytes.free -= block_itr->bytes; - - if (debug) - _CubLog( - "\tHost freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld " - "bytes) outstanding.\n", - (long long)block_itr->bytes, - (long long)cached_blocks.size(), - (long long)cached_bytes.free, - (long long)live_blocks.size(), - (long long)cached_bytes.live); - - cached_blocks.erase(block_itr); - - block_itr++; - } - - // Unlock - mutex.Unlock(); - - // Return under error - if (error) - return error; - - // Try to allocate again - if (CubDebug(error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault))) - return error; - } - - // Create ready event - if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) - return error; - - // Insert into live blocks - mutex.Lock(); - live_blocks.insert(search_key); - cached_bytes.live += search_key.bytes; - mutex.Unlock(); - - if (debug) - _CubLog( - "\tHost allocated new host block at %p (%lld bytes associated with stream %lld, event %lld on device " - "%lld).\n", - search_key.d_ptr, - (long long)search_key.bytes, - (long long)search_key.associated_stream, - (long long)search_key.ready_event, - (long long)search_key.device); - } - - // Copy host pointer to output parameter - *d_ptr = search_key.d_ptr; - - if (debug) - _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", - (long long)cached_blocks.size(), - (long long)cached_bytes.free, - (long long)live_blocks.size(), - (long long)cached_bytes.live); - - return error; - } - - /** - * \brief Frees a live allocation of pinned host memory, returning it to the allocator. - * - * Once freed, the allocation becomes available immediately for reuse. - */ - cudaError_t HostFree(void *d_ptr) { - int entrypoint_device = INVALID_DEVICE_ORDINAL; - cudaError_t error = cudaSuccess; - - // Lock - mutex.Lock(); - - // Find corresponding block descriptor - bool recached = false; - BlockDescriptor search_key(d_ptr); - BusyBlocks::iterator block_itr = live_blocks.find(search_key); - if (block_itr != live_blocks.end()) { - // Remove from live blocks - search_key = *block_itr; - live_blocks.erase(block_itr); - cached_bytes.live -= search_key.bytes; - - // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold - if ((search_key.bin != INVALID_BIN) && (cached_bytes.free + search_key.bytes <= max_cached_bytes)) { - // Insert returned allocation into free blocks - recached = true; - cached_blocks.insert(search_key); - cached_bytes.free += search_key.bytes; - - if (debug) - _CubLog( - "\tHost returned %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld " - "available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n", - (long long)search_key.bytes, - (long long)search_key.associated_stream, - (long long)search_key.ready_event, - (long long)search_key.device, - (long long)cached_blocks.size(), - (long long)cached_bytes.free, - (long long)live_blocks.size(), - (long long)cached_bytes.live); - } - } - - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) - return error; - if (entrypoint_device != search_key.device) { - if (CubDebug(error = cudaSetDevice(search_key.device))) - return error; - } - - if (recached) { - // Insert the ready event in the associated stream (must have current device set properly) - if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) - return error; - } - - // Unlock - mutex.Unlock(); - - if (!recached) { - // Free the allocation from the runtime and cleanup the event. - if (CubDebug(error = cudaFreeHost(d_ptr))) - return error; - if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) - return error; - - if (debug) - _CubLog( - "\tHost freed %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld available " - "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", - (long long)search_key.bytes, - (long long)search_key.associated_stream, - (long long)search_key.ready_event, - (long long)search_key.device, - (long long)cached_blocks.size(), - (long long)cached_bytes.free, - (long long)live_blocks.size(), - (long long)cached_bytes.live); - } - - // Reset device - if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != search_key.device)) { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) - return error; - } - - return error; - } - - /** - * \brief Frees all cached pinned host allocations - */ - cudaError_t FreeAllCached() { - cudaError_t error = cudaSuccess; - int entrypoint_device = INVALID_DEVICE_ORDINAL; - int current_device = INVALID_DEVICE_ORDINAL; - - mutex.Lock(); - - while (!cached_blocks.empty()) { - // Get first block - CachedBlocks::iterator begin = cached_blocks.begin(); - - // Get entry-point device ordinal if necessary - if (entrypoint_device == INVALID_DEVICE_ORDINAL) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) - break; - } - - // Set current device ordinal if necessary - if (begin->device != current_device) { - if (CubDebug(error = cudaSetDevice(begin->device))) - break; - current_device = begin->device; - } - - // Free host memory - if (CubDebug(error = cudaFreeHost(begin->d_ptr))) - break; - if (CubDebug(error = cudaEventDestroy(begin->ready_event))) - break; - - // Reduce balance and erase entry - cached_bytes.free -= begin->bytes; - - if (debug) - _CubLog( - "\tHost freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld " - "bytes) outstanding.\n", - (long long)begin->bytes, - (long long)cached_blocks.size(), - (long long)cached_bytes.free, - (long long)live_blocks.size(), - (long long)cached_bytes.live); - - cached_blocks.erase(begin); - } - - mutex.Unlock(); - - // Attempt to revert back to entry-point device if necessary - if (entrypoint_device != INVALID_DEVICE_ORDINAL) { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) - return error; - } - - return error; - } - - /** - * \brief Destructor - */ - ~CachingHostAllocator() { - if (!skip_cleanup) - FreeAllCached(); - } - }; - - /** @} */ // end group UtilMgmt - -} // namespace notcub - -#endif diff --git a/SDL/CachingManagedAllocator.h b/SDL/CachingManagedAllocator.h deleted file mode 100644 index 6830be63..00000000 --- a/SDL/CachingManagedAllocator.h +++ /dev/null @@ -1,662 +0,0 @@ -#ifndef HeterogenousCore_CUDAUtilities_src_CachingManagedAllocator_h -#define HeterogenousCore_CUDAUtilities_src_CachingManagedAllocator_h - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * Modified to cache managed memory allocations by Matti Kortelainen - */ - -/****************************************************************************** - * Simple caching allocator for managed memory allocations. The allocator is - * thread-safe. - ******************************************************************************/ - -#include -#include -#include - -#include -#include -//#include -//#include - -/// CUB namespace -namespace notcub { - - /** - * \addtogroup UtilMgmt - * @{ - */ - - /****************************************************************************** - * CachingManagedAllocator (host use) - ******************************************************************************/ - - /** - * \brief A simple caching allocator managed memory allocations. - * - * \par Overview - * The allocator is thread-safe. It behaves as follows: - * - * I presume the CUDA stream-safeness is not useful as to read/write - * from/to the managed memory one needs to synchronize anyway. The - * difference wrt. device memory is that in the CPU all operations to - * the device memory are scheduled via the CUDA stream, while for the - * managed memory one can perform operations directly. - * - * \par - * - Allocations are categorized and cached by bin size. A new allocation request of - * a given size will only consider cached allocations within the corresponding bin. - * - Bin limits progress geometrically in accordance with the growth factor - * \p bin_growth provided during construction. Unused host allocations within - * a larger bin cache are not reused for allocation requests that categorize to - * smaller bin sizes. - * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to - * (\p bin_growth ^ \p min_bin). - * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest - * bin and are simply freed when they are deallocated instead of being returned - * to a bin-cache. - * - %If the total storage of cached allocations will exceed - * \p max_cached_bytes, allocations are simply freed when they are - * deallocated instead of being returned to their bin-cache. - * - * \par - * For example, the default-constructed CachingHostAllocator is configured with: - * - \p bin_growth = 8 - * - \p min_bin = 3 - * - \p max_bin = 7 - * - \p max_cached_bytes = 6MB - 1B - * - * \par - * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB - * and sets a maximum of 6,291,455 cached bytes - * - */ - struct CachingManagedAllocator { - //--------------------------------------------------------------------- - // Constants - //--------------------------------------------------------------------- - - /// Out-of-bounds bin - static const unsigned int INVALID_BIN = (unsigned int)-1; - - /// Invalid size - static const size_t INVALID_SIZE = (size_t)-1; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - /// Invalid device ordinal - static const int INVALID_DEVICE_ORDINAL = -1; - - //--------------------------------------------------------------------- - // Type definitions and helper types - //--------------------------------------------------------------------- - - /** - * Descriptor for pinned managed memory allocations - */ - struct BlockDescriptor { - void *d_ptr; // Managed pointer - size_t bytes; // Size of allocation in bytes - unsigned int bin; // Bin enumeration - int device; // device ordinal - cudaStream_t associated_stream; // Associated associated_stream - cudaEvent_t ready_event; // Signal when associated stream has run to the point at which this block was freed - - // Constructor (suitable for searching maps for a specific block, given its pointer) - BlockDescriptor(void *d_ptrx) - : d_ptr(d_ptrx), - bytes(0), - bin(INVALID_BIN), - device(INVALID_DEVICE_ORDINAL), - associated_stream(nullptr), - ready_event(nullptr) {} - - // Constructor (suitable for searching maps for a range of suitable blocks) - BlockDescriptor() - : d_ptr(nullptr), - bytes(0), - bin(INVALID_BIN), - device(INVALID_DEVICE_ORDINAL), - associated_stream(nullptr), - ready_event(nullptr) {} - - // Comparison functor for comparing managed pointers - static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) { return (a.d_ptr < b.d_ptr); } - - // Comparison functor for comparing allocation sizes - static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) { return (a.bytes < b.bytes); } - }; - - /// BlockDescriptor comparator function interface - typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &); - - class TotalBytes { - public: - size_t free; - size_t live; - TotalBytes() { free = live = 0; } - }; - - /// Set type for cached blocks (ordered by size) - typedef std::multiset CachedBlocks; - - /// Set type for live blocks (ordered by ptr) - typedef std::multiset BusyBlocks; - - //--------------------------------------------------------------------- - // Utility functions - //--------------------------------------------------------------------- - - /** - * Integer pow function for unsigned base and exponent - */ - static unsigned int IntPow(unsigned int base, unsigned int exp) { - unsigned int retval = 1; - while (exp > 0) { - if (exp & 1) { - retval = retval * base; // multiply the result by the current base - } - base = base * base; // square the base - exp = exp >> 1; // divide the exponent in half - } - return retval; - } - - /** - * Round up to the nearest power-of - */ - void NearestPowerOf(unsigned int &power, size_t &rounded_bytes, unsigned int base, size_t value) { - power = 0; - rounded_bytes = 1; - - if (value * base < value) { - // Overflow - power = sizeof(size_t) * 8; - rounded_bytes = size_t(0) - 1; - return; - } - - while (rounded_bytes < value) { - rounded_bytes *= base; - power++; - } - } - - //--------------------------------------------------------------------- - // Fields - //--------------------------------------------------------------------- - - cub::Mutex mutex; /// Mutex for thread-safety - - unsigned int bin_growth; /// Geometric growth factor for bin-sizes - unsigned int min_bin; /// Minimum bin enumeration - unsigned int max_bin; /// Maximum bin enumeration - - size_t min_bin_bytes; /// Minimum bin size - size_t max_bin_bytes; /// Maximum bin size - size_t max_cached_bytes; /// Maximum aggregate cached bytes - - const bool - skip_cleanup; /// Whether or not to skip a call to FreeAllCached() when destructor is called. (The CUDA runtime may have already shut down for statically declared allocators) - bool debug; /// Whether or not to print (de)allocation events to stdout - - TotalBytes cached_bytes; /// Aggregate cached bytes - CachedBlocks cached_blocks; /// Set of cached managed memory allocations available for reuse - BusyBlocks live_blocks; /// Set of live managed memory allocations currently in use - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - //--------------------------------------------------------------------- - // Methods - //--------------------------------------------------------------------- - - /** - * \brief Constructor. - */ - CachingManagedAllocator( - unsigned int bin_growthx, ///< Geometric growth factor for bin-sizes - unsigned int min_binx = 1, ///< Minimum bin (default is bin_growth ^ 1) - unsigned int max_binx = INVALID_BIN, ///< Maximum bin (default is no max bin) - size_t max_cached_bytesx = INVALID_SIZE, ///< Maximum aggregate cached bytes (default is no limit) - bool skip_cleanupx = - false, ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate) - bool debugx = false) ///< Whether or not to print (de)allocation events to stdout (default is no stderr output) - : bin_growth(bin_growthx), - min_bin(min_binx), - max_bin(max_binx), - min_bin_bytes(IntPow(bin_growthx, min_binx)), - max_bin_bytes(IntPow(bin_growthx, max_binx)), - max_cached_bytes(max_cached_bytesx), - skip_cleanup(skip_cleanupx), - debug(debugx), - cached_blocks(BlockDescriptor::SizeCompare), - live_blocks(BlockDescriptor::PtrCompare) {} - - /** - * \brief Default constructor. - * - * Configured with: - * \par - * - \p bin_growth = 8 - * - \p min_bin = 3 - * - \p max_bin = 7 - * - \p max_cached_bytes = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes - * - * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and - * sets a maximum of 6,291,455 cached bytes - */ - CachingManagedAllocator(bool skip_cleanupx = false, bool debugx = false) - : bin_growth(8), - min_bin(3), - max_bin(7), - min_bin_bytes(IntPow(bin_growth, min_bin)), - max_bin_bytes(IntPow(bin_growth, max_bin)), - max_cached_bytes((max_bin_bytes * 3) - 1), - skip_cleanup(skip_cleanupx), - debug(debugx), - cached_blocks(BlockDescriptor::SizeCompare), - live_blocks(BlockDescriptor::PtrCompare) {} - - /** - * \brief Sets the limit on the number bytes this allocator is allowed to cache - * - * Changing the ceiling of cached bytes does not cause any allocations (in-use or - * cached-in-reserve) to be freed. See \p FreeAllCached(). - */ - void SetMaxCachedBytes(size_t max_cached_bytesx) { - // Lock - mutex.Lock(); - - if (debug) - _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", - (long long)this->max_cached_bytes, - (long long)max_cached_bytesx); - - this->max_cached_bytes = max_cached_bytesx; - - // Unlock - mutex.Unlock(); - } - - /** - * \brief Provides a suitable allocation of managed memory for the given size. - * - * Once freed, the allocation becomes available immediately for reuse. - */ - cudaError_t ManagedAllocate( - void **d_ptr, ///< [out] Reference to pointer to the allocation - size_t bytes, ///< [in] Minimum number of bytes for the allocation - cudaStream_t active_stream = nullptr) ///< [in] The stream to be associated with this allocation - { - *d_ptr = nullptr; - int device = INVALID_DEVICE_ORDINAL; - cudaError_t error = cudaSuccess; - - if (CubDebug(error = cudaGetDevice(&device))) - return error; - - // Create a block descriptor for the requested allocation - bool found = false; - BlockDescriptor search_key; - search_key.device = device; - search_key.associated_stream = active_stream; - NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes); - - if (search_key.bin > max_bin) { - // Bin is greater than our maximum bin: allocate the request - // exactly and give out-of-bounds bin. It will not be cached - // for reuse when returned. - search_key.bin = INVALID_BIN; - search_key.bytes = bytes; - } else { - // Search for a suitable cached allocation: lock - mutex.Lock(); - - if (search_key.bin < min_bin) { - // Bin is less than minimum bin: round up - search_key.bin = min_bin; - search_key.bytes = min_bin_bytes; - } - - // Iterate through the range of cached blocks in the same bin - CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key); - while ((block_itr != cached_blocks.end()) && (block_itr->bin == search_key.bin)) { - // To prevent races with reusing blocks returned by the host but still - // in use for transfers, only consider cached blocks that are from an idle stream - if (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady) { - // Reuse existing cache block. Insert into live blocks. - found = true; - search_key = *block_itr; - search_key.associated_stream = active_stream; - if (search_key.device != device) { - // If "associated" device changes, need to re-create the event on the right device - if (CubDebug(error = cudaSetDevice(search_key.device))) - return error; - if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) - return error; - if (CubDebug(error = cudaSetDevice(device))) - return error; - if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) - return error; - search_key.device = device; - } - - live_blocks.insert(search_key); - - // Remove from free blocks - cached_bytes.free -= search_key.bytes; - cached_bytes.live += search_key.bytes; - - if (debug) - _CubLog( - "\tHost reused cached block at %p (%lld bytes) for stream %lld, event %lld on device %lld " - "(previously associated with stream %lld, event %lld).\n", - search_key.d_ptr, - (long long)search_key.bytes, - (long long)search_key.associated_stream, - (long long)search_key.ready_event, - (long long)search_key.device, - (long long)block_itr->associated_stream, - (long long)block_itr->ready_event); - - cached_blocks.erase(block_itr); - - break; - } - block_itr++; - } - - // Done searching: unlock - mutex.Unlock(); - } - - // Allocate the block if necessary - if (!found) { - // Attempt to allocate - // TODO: eventually support allocation flags - if (CubDebug(error = cudaMallocManaged(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation) { - // The allocation attempt failed: free all cached blocks on device and retry - if (debug) - _CubLog( - "\tHost failed to allocate %lld bytes for stream %lld on device %lld, retrying after freeing cached " - "allocations", - (long long)search_key.bytes, - (long long)search_key.associated_stream, - (long long)search_key.device); - - error = cudaSuccess; // Reset the error we will return - cudaGetLastError(); // Reset CUDART's error - - // Lock - mutex.Lock(); - - // Iterate the range of free blocks - CachedBlocks::iterator block_itr = cached_blocks.begin(); - - while ((block_itr != cached_blocks.end())) { - // No need to worry about synchronization with the device: cudaFree is - // blocking and will synchronize across all kernels executing - // on the current device - - // Free managed memory. - if (CubDebug(error = cudaFree(block_itr->d_ptr))) - break; - if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) - break; - - // Reduce balance and erase entry - cached_bytes.free -= block_itr->bytes; - - if (debug) - _CubLog( - "\tHost freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld " - "bytes) outstanding.\n", - (long long)block_itr->bytes, - (long long)cached_blocks.size(), - (long long)cached_bytes.free, - (long long)live_blocks.size(), - (long long)cached_bytes.live); - - cached_blocks.erase(block_itr); - - block_itr++; - } - - // Unlock - mutex.Unlock(); - - // Return under error - if (error) - return error; - - // Try to allocate again - if (CubDebug(error = cudaMallocManaged(&search_key.d_ptr, search_key.bytes))) - return error; - } - - // Create ready event - if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) - return error; - - // Insert into live blocks - mutex.Lock(); - live_blocks.insert(search_key); - cached_bytes.live += search_key.bytes; - mutex.Unlock(); - - if (debug) - _CubLog( - "\tHost allocated new host block at %p (%lld bytes associated with stream %lld, event %lld on device " - "%lld).\n", - search_key.d_ptr, - (long long)search_key.bytes, - (long long)search_key.associated_stream, - (long long)search_key.ready_event, - (long long)search_key.device); - } - - // Copy host pointer to output parameter - *d_ptr = search_key.d_ptr; - - if (debug) - _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", - (long long)cached_blocks.size(), - (long long)cached_bytes.free, - (long long)live_blocks.size(), - (long long)cached_bytes.live); - - return error; - } - - /** - * \brief Frees a live allocation of managed memory, returning it to the allocator. - * - * Once freed, the allocation becomes available immediately for reuse. - */ - cudaError_t ManagedFree(void *d_ptr) { - int entrypoint_device = INVALID_DEVICE_ORDINAL; - cudaError_t error = cudaSuccess; - - // Lock - mutex.Lock(); - - // Find corresponding block descriptor - bool recached = false; - BlockDescriptor search_key(d_ptr); - BusyBlocks::iterator block_itr = live_blocks.find(search_key); - if (block_itr != live_blocks.end()) { - // Remove from live blocks - search_key = *block_itr; - live_blocks.erase(block_itr); - cached_bytes.live -= search_key.bytes; - - // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold - if ((search_key.bin != INVALID_BIN) && (cached_bytes.free + search_key.bytes <= max_cached_bytes)) { - // Insert returned allocation into free blocks - recached = true; - cached_blocks.insert(search_key); - cached_bytes.free += search_key.bytes; - - if (debug) - _CubLog( - "\tHost returned %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld " - "available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n", - (long long)search_key.bytes, - (long long)search_key.associated_stream, - (long long)search_key.ready_event, - (long long)search_key.device, - (long long)cached_blocks.size(), - (long long)cached_bytes.free, - (long long)live_blocks.size(), - (long long)cached_bytes.live); - } - } - - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) - return error; - if (entrypoint_device != search_key.device) { - if (CubDebug(error = cudaSetDevice(search_key.device))) - return error; - } - - if (recached) { - // Insert the ready event in the associated stream (must have current device set properly) - if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) - return error; - } - - // Unlock - mutex.Unlock(); - - if (!recached) { - // Free the allocation from the runtime and cleanup the event. - if (CubDebug(error = cudaFree(d_ptr))) - return error; - if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) - return error; - - if (debug) - _CubLog( - "\tHost freed %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld available " - "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", - (long long)search_key.bytes, - (long long)search_key.associated_stream, - (long long)search_key.ready_event, - (long long)search_key.device, - (long long)cached_blocks.size(), - (long long)cached_bytes.free, - (long long)live_blocks.size(), - (long long)cached_bytes.live); - } - - // Reset device - if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != search_key.device)) { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) - return error; - } - - return error; - } - - /** - * \brief Frees all cached managed memory allocations - */ - cudaError_t FreeAllCached() { - cudaError_t error = cudaSuccess; - int entrypoint_device = INVALID_DEVICE_ORDINAL; - int current_device = INVALID_DEVICE_ORDINAL; - - mutex.Lock(); - - while (!cached_blocks.empty()) { - // Get first block - CachedBlocks::iterator begin = cached_blocks.begin(); - - // Get entry-point device ordinal if necessary - if (entrypoint_device == INVALID_DEVICE_ORDINAL) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) - break; - } - - // Set current device ordinal if necessary - if (begin->device != current_device) { - if (CubDebug(error = cudaSetDevice(begin->device))) - break; - current_device = begin->device; - } - - // Free managed memory - if (CubDebug(error = cudaFree(begin->d_ptr))) - break; - if (CubDebug(error = cudaEventDestroy(begin->ready_event))) - break; - - // Reduce balance and erase entry - cached_bytes.free -= begin->bytes; - - if (debug) - _CubLog( - "\tHost freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld " - "bytes) outstanding.\n", - (long long)begin->bytes, - (long long)cached_blocks.size(), - (long long)cached_bytes.free, - (long long)live_blocks.size(), - (long long)cached_bytes.live); - - cached_blocks.erase(begin); - } - - mutex.Unlock(); - - // Attempt to revert back to entry-point device if necessary - if (entrypoint_device != INVALID_DEVICE_ORDINAL) { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) - return error; - } - - return error; - } - - /** - * \brief Destructor - */ - ~CachingManagedAllocator() { - if (!skip_cleanup) - FreeAllCached(); - } - }; - - /** @} */ // end group UtilMgmt - -} // namespace notcub - -#endif diff --git a/SDL/Constants.cuh b/SDL/Constants.cuh index 3f5d0c3e..49412d68 100644 --- a/SDL/Constants.cuh +++ b/SDL/Constants.cuh @@ -4,6 +4,10 @@ #include #include +// CUDA headers. Will be removed soon. +#include +#include + #ifdef FP16_Base //This changes pT5 and pT3 and T3 completely. T5 for non regression parameters #define __F2H __float2half #define __H2F __half2float @@ -107,6 +111,10 @@ const unsigned int size_superbins = 45000; // Temporary fix for endcap buffer allocation. const unsigned int endcap_size = 9105; +// Temporary fix for module buffer allocation. +const unsigned int modules_size = 26401; +const unsigned int pix_tot = 1796504; + namespace SDL { //defining the constant host device variables right up here diff --git a/SDL/Event.cu b/SDL/Event.cu index 3c8aa2c1..eab44436 100644 --- a/SDL/Event.cu +++ b/SDL/Event.cu @@ -1,16 +1,13 @@ #include "Event.cuh" -struct SDL::modules* SDL::modulesInGPU = nullptr; -std::unique_ptr SDL::pixelMapping = std::make_unique(); +std::shared_ptr SDL::modulesInGPU = std::make_shared(); +std::shared_ptr> SDL::modulesBuffers = std::make_shared>(devAcc); +std::shared_ptr SDL::pixelMapping = std::make_shared(); uint16_t SDL::nModules; uint16_t SDL::nLowerModules; SDL::Event::Event(cudaStream_t estream, bool verbose): queue(alpaka::getDevByIdx(0u)) { - int version; - int driver; - cudaRuntimeGetVersion(&version); - cudaDriverGetVersion(&driver); stream = estream; addObjects = verbose; hitsInGPU = nullptr; @@ -36,7 +33,7 @@ SDL::Event::Event(cudaStream_t estream, bool verbose): queue(alpaka::getDevByIdx pixelQuintupletsInCPU = nullptr; //reset the arrays - for(int i = 0; i<6; i++) + for(int i = 0; i < 6; i++) { n_hits_by_layer_barrel_[i] = 0; n_minidoublets_by_layer_barrel_[i] = 0; @@ -44,7 +41,7 @@ SDL::Event::Event(cudaStream_t estream, bool verbose): queue(alpaka::getDevByIdx n_triplets_by_layer_barrel_[i] = 0; n_trackCandidates_by_layer_barrel_[i] = 0; n_quintuplets_by_layer_barrel_[i] = 0; - if(i<5) + if(i < 5) { n_hits_by_layer_endcap_[i] = 0; n_minidoublets_by_layer_endcap_[i] = 0; @@ -56,99 +53,10 @@ SDL::Event::Event(cudaStream_t estream, bool verbose): queue(alpaka::getDevByIdx } } -SDL::Event::~Event() -{ - if(rangesInGPU != nullptr){delete rangesInGPU; delete rangesBuffers;} - if(mdsInGPU != nullptr){delete mdsInGPU; delete miniDoubletsBuffers;} - if(segmentsInGPU != nullptr){delete segmentsInGPU; delete segmentsBuffers;} - if(tripletsInGPU!= nullptr){delete tripletsInGPU; delete tripletsBuffers;} - if(trackCandidatesInGPU!= nullptr){delete trackCandidatesInGPU; delete trackCandidatesBuffers;} - if(hitsInGPU!= nullptr){delete hitsInGPU; delete hitsBuffers;} - if(pixelTripletsInGPU!= nullptr){delete pixelTripletsInGPU; delete pixelTripletsBuffers;} - if(pixelQuintupletsInGPU!= nullptr){delete pixelQuintupletsInGPU; delete pixelQuintupletsBuffers;} - if(quintupletsInGPU!= nullptr){delete quintupletsInGPU; delete quintupletsBuffers;} - - if(hitsInCPU != nullptr) - { - delete hitsInCPU; - } - if(rangesInCPU != nullptr) - { - delete rangesInCPU; - } - if(mdsInCPU != nullptr) - { - delete mdsInCPU; - } - if(segmentsInCPU != nullptr) - { - delete segmentsInCPU; - } - if(tripletsInCPU != nullptr) - { - delete tripletsInCPU; - } - if(quintupletsInCPU != nullptr) - { - delete quintupletsInCPU; - } - if(pixelTripletsInCPU != nullptr) - { - delete pixelTripletsInCPU; - } - if(pixelQuintupletsInCPU != nullptr) - { - delete pixelQuintupletsInCPU; - } - if(trackCandidatesInCPU != nullptr) - { - delete trackCandidatesInCPU; - } - if(modulesInCPU != nullptr) - { - delete[] modulesInCPU->nLowerModules; - delete[] modulesInCPU->nModules; - delete[] modulesInCPU->detIds; - delete[] modulesInCPU->isLower; - delete[] modulesInCPU->layers; - delete[] modulesInCPU->subdets; - delete[] modulesInCPU->rings; - delete[] modulesInCPU->rods; - delete[] modulesInCPU->modules; - delete[] modulesInCPU->sides; - delete[] modulesInCPU->eta; - delete[] modulesInCPU->r; - delete[] modulesInCPU; - } - if(modulesInCPUFull != nullptr) - { - delete[] modulesInCPUFull->detIds; - delete[] modulesInCPUFull->moduleMap; - delete[] modulesInCPUFull->nConnectedModules; - delete[] modulesInCPUFull->drdzs; - delete[] modulesInCPUFull->slopes; - delete[] modulesInCPUFull->nModules; - delete[] modulesInCPUFull->nLowerModules; - delete[] modulesInCPUFull->layers; - delete[] modulesInCPUFull->rings; - delete[] modulesInCPUFull->modules; - delete[] modulesInCPUFull->rods; - delete[] modulesInCPUFull->subdets; - delete[] modulesInCPUFull->sides; - delete[] modulesInCPUFull->eta; - delete[] modulesInCPUFull->r; - delete[] modulesInCPUFull->isInverted; - delete[] modulesInCPUFull->isLower; - delete[] modulesInCPUFull->moduleType; - delete[] modulesInCPUFull->moduleLayerType; - delete[] modulesInCPUFull; - } -} - void SDL::Event::resetEvent() { //reset the arrays - for(int i = 0; i<6; i++) + for(int i = 0; i < 6; i++) { n_hits_by_layer_barrel_[i] = 0; n_minidoublets_by_layer_barrel_[i] = 0; @@ -156,7 +64,7 @@ void SDL::Event::resetEvent() n_triplets_by_layer_barrel_[i] = 0; n_trackCandidates_by_layer_barrel_[i] = 0; n_quintuplets_by_layer_barrel_[i] = 0; - if(i<5) + if(i < 5) { n_hits_by_layer_endcap_[i] = 0; n_minidoublets_by_layer_endcap_[i] = 0; @@ -232,43 +140,12 @@ void SDL::Event::resetEvent() } if(modulesInCPU != nullptr) { - delete[] modulesInCPU->nLowerModules; - delete[] modulesInCPU->nModules; - delete[] modulesInCPU->detIds; - delete[] modulesInCPU->isLower; - delete[] modulesInCPU->layers; - delete[] modulesInCPU->subdets; - delete[] modulesInCPU->rings; - delete[] modulesInCPU->rods; - delete[] modulesInCPU->modules; - delete[] modulesInCPU->sides; - delete[] modulesInCPU->eta; - delete[] modulesInCPU->r; - delete[] modulesInCPU; + delete modulesInCPU; modulesInCPU = nullptr; } if(modulesInCPUFull != nullptr) { - delete[] modulesInCPUFull->detIds; - delete[] modulesInCPUFull->moduleMap; - delete[] modulesInCPUFull->nConnectedModules; - delete[] modulesInCPUFull->drdzs; - delete[] modulesInCPUFull->slopes; - delete[] modulesInCPUFull->nModules; - delete[] modulesInCPUFull->nLowerModules; - delete[] modulesInCPUFull->layers; - delete[] modulesInCPUFull->rings; - delete[] modulesInCPUFull->modules; - delete[] modulesInCPUFull->rods; - delete[] modulesInCPUFull->sides; - delete[] modulesInCPUFull->subdets; - delete[] modulesInCPUFull->eta; - delete[] modulesInCPUFull->r; - delete[] modulesInCPUFull->isInverted; - delete[] modulesInCPUFull->isLower; - delete[] modulesInCPUFull->moduleType; - delete[] modulesInCPUFull->moduleLayerType; - delete[] modulesInCPUFull; + delete modulesInCPUFull; modulesInCPUFull = nullptr; } } @@ -276,18 +153,20 @@ void SDL::Event::resetEvent() void SDL::initModules(const char* moduleMetaDataFilePath) { cudaStream_t default_stream = 0; - if(modulesInGPU == nullptr) - { - cudaMallocHost(&modulesInGPU, sizeof(struct SDL::modules)); - //nModules gets filled here - loadModulesFromFile(*modulesInGPU,nModules,nLowerModules, *pixelMapping, default_stream, moduleMetaDataFilePath); - } -} - -void SDL::cleanModules() -{ - freeModules(*modulesInGPU); - cudaFreeHost(modulesInGPU); + QueueAcc queue(devAcc); + + // Set the relevant data pointers. + modulesInGPU->setData(*modulesBuffers); + + // nModules gets filled here + loadModulesFromFile(modulesInGPU.get(), + modulesBuffers.get(), + nModules, + nLowerModules, + *pixelMapping, + default_stream, + queue, + moduleMetaDataFilePath); } void SDL::Event::addHitToEvent(std::vector x, std::vector y, std::vector z, std::vector detId, std::vector idxInNtuple) @@ -299,9 +178,6 @@ void SDL::Event::addHitToEvent(std::vector x, std::vector y, std:: auto nHits_buf = allocBufWrapper(devHost, 1); *alpaka::getPtrNative(nHits_buf) = nHits; - // Get current device for future use. - cudaGetDevice(&dev); - // Initialize space on device/host for next event. if (hitsInGPU == nullptr) { @@ -492,22 +368,23 @@ void SDL::Event::addPixelSegmentToEvent(std::vector hitIndices0,st void SDL::Event::addMiniDoubletsToEventExplicit() { - unsigned int* nMDsCPU; - nMDsCPU = (unsigned int*)cms::cuda::allocate_host(nLowerModules * sizeof(unsigned int), stream); - cudaMemcpyAsync(nMDsCPU,mdsInGPU->nMDs,nLowerModules*sizeof(unsigned int),cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); + auto nMDsCPU_buf = allocBufWrapper(devHost, nLowerModules); + alpaka::memcpy(queue, nMDsCPU_buf, miniDoubletsBuffers->nMDs_buf, nLowerModules); - short* module_subdets; - module_subdets = (short*)cms::cuda::allocate_host(nLowerModules* sizeof(short), stream); - cudaMemcpyAsync(module_subdets,modulesInGPU->subdets,nLowerModules*sizeof(short),cudaMemcpyDeviceToHost,stream); - short* module_layers; - module_layers = (short*)cms::cuda::allocate_host(nLowerModules * sizeof(short), stream); - cudaMemcpyAsync(module_layers,modulesInGPU->layers,nLowerModules*sizeof(short),cudaMemcpyDeviceToHost,stream); - int* module_hitRanges; - module_hitRanges = (int*)cms::cuda::allocate_host(nLowerModules* 2*sizeof(int), stream); - cudaMemcpyAsync(module_hitRanges,hitsInGPU->hitRanges,nLowerModules*2*sizeof(int),cudaMemcpyDeviceToHost,stream); + auto module_subdets_buf = allocBufWrapper(devHost, nLowerModules); + alpaka::memcpy(queue, module_subdets_buf, modulesBuffers->subdets_buf, nLowerModules); - cudaStreamSynchronize(stream); + auto module_layers_buf = allocBufWrapper(devHost, nLowerModules); + alpaka::memcpy(queue, module_layers_buf, modulesBuffers->layers_buf, nLowerModules); + + auto module_hitRanges_buf = allocBufWrapper(devHost, nLowerModules*2); + alpaka::memcpy(queue, module_hitRanges_buf, hitsBuffers->hitRanges_buf, nLowerModules*2); + + alpaka::wait(queue); + int* nMDsCPU = alpaka::getPtrNative(nMDsCPU_buf); + short* module_subdets = alpaka::getPtrNative(module_subdets_buf); + short* module_layers = alpaka::getPtrNative(module_layers_buf); + int* module_hitRanges = alpaka::getPtrNative(module_hitRanges_buf); for(unsigned int i = 0; inSegments,nLowerModules*sizeof(unsigned int),cudaMemcpyDeviceToHost,stream); + auto nSegmentsCPU_buf = allocBufWrapper(devHost, nLowerModules); + alpaka::memcpy(queue, nSegmentsCPU_buf, segmentsBuffers->nSegments_buf, nLowerModules); - short* module_subdets; - module_subdets = (short*)cms::cuda::allocate_host(nLowerModules* sizeof(short), stream); - cudaMemcpyAsync(module_subdets,modulesInGPU->subdets,nLowerModules*sizeof(short),cudaMemcpyDeviceToHost,stream); - short* module_layers; - module_layers = (short*)cms::cuda::allocate_host(nLowerModules * sizeof(short), stream); - cudaMemcpyAsync(module_layers,modulesInGPU->layers,nLowerModules*sizeof(short),cudaMemcpyDeviceToHost,stream); + auto module_subdets_buf = allocBufWrapper(devHost, nLowerModules); + alpaka::memcpy(queue, module_subdets_buf, modulesBuffers->subdets_buf, nLowerModules); + + auto module_layers_buf = allocBufWrapper(devHost, nLowerModules); + alpaka::memcpy(queue, module_layers_buf, modulesBuffers->layers_buf, nLowerModules); + + alpaka::wait(queue); + int* nSegmentsCPU = alpaka::getPtrNative(nSegmentsCPU_buf); + short* module_subdets = alpaka::getPtrNative(module_subdets_buf); + short* module_layers = alpaka::getPtrNative(module_layers_buf); - cudaStreamSynchronize(stream); for(unsigned int i = 0; i(devHost, nLowerModules); uint16_t *index = alpaka::getPtrNative(index_buf); - + // Allocate device index auto index_gpu_buf = allocBufWrapper(devAcc, nLowerModules); - + // Allocate and copy nSegments from device to host auto nSegments_buf = allocBufWrapper(devHost, nLowerModules); alpaka::memcpy(queue, nSegments_buf, segmentsBuffers->nSegments_buf, nLowerModules); alpaka::wait(queue); int *nSegments = alpaka::getPtrNative(nSegments_buf); - - uint16_t* module_nConnectedModules; - module_nConnectedModules = (uint16_t*)cms::cuda::allocate_host(nLowerModules* sizeof(uint16_t), stream); - cudaMemcpyAsync(module_nConnectedModules,modulesInGPU->nConnectedModules,nLowerModules*sizeof(uint16_t),cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); + + // Allocate and copy module_nConnectedModules from device to host + auto module_nConnectedModules_buf = allocBufWrapper(devHost, nLowerModules); + alpaka::memcpy(queue, module_nConnectedModules_buf, modulesBuffers->nConnectedModules_buf, nLowerModules); + alpaka::wait(queue); + + uint16_t* module_nConnectedModules = alpaka::getPtrNative(module_nConnectedModules_buf); for (uint16_t innerLowerModuleIndex = 0; innerLowerModuleIndex < nLowerModules; innerLowerModuleIndex++) { @@ -752,8 +624,6 @@ void SDL::Event::createTriplets() alpaka::memcpy(queue, index_gpu_buf, index_buf, nonZeroModules); alpaka::wait(queue); - cms::cuda::free_host(module_nConnectedModules); - Vec const threadsPerBlockCreateTrip(static_cast(1), static_cast(16), static_cast(16)); Vec const blocksPerGridCreateTrip(static_cast(MAX_BLOCKS), static_cast(1), static_cast(1)); WorkDiv const createTripletsInGPUv2_workDiv(blocksPerGridCreateTrip, threadsPerBlockCreateTrip, elementsPerThread); @@ -1296,20 +1166,24 @@ void SDL::Event::createPixelQuintuplets() void SDL::Event::addQuintupletsToEventExplicit() { - unsigned int* nQuintupletsCPU; - nQuintupletsCPU = (unsigned int*)cms::cuda::allocate_host(nLowerModules * sizeof(unsigned int), stream); - cudaMemcpyAsync(nQuintupletsCPU,quintupletsInGPU->nQuintuplets,nLowerModules*sizeof(unsigned int),cudaMemcpyDeviceToHost,stream); - - short* module_subdets; - module_subdets = (short*)cms::cuda::allocate_host(nModules* sizeof(short), stream); - cudaMemcpyAsync(module_subdets,modulesInGPU->subdets,nModules*sizeof(short),cudaMemcpyDeviceToHost,stream); - short* module_layers; - module_layers = (short*)cms::cuda::allocate_host(nLowerModules * sizeof(short), stream); - cudaMemcpyAsync(module_layers,modulesInGPU->layers,nLowerModules*sizeof(short),cudaMemcpyDeviceToHost,stream); - int* module_quintupletModuleIndices; - module_quintupletModuleIndices = (int*)cms::cuda::allocate_host(nLowerModules * sizeof(int), stream); - cudaMemcpyAsync(module_quintupletModuleIndices, rangesInGPU->quintupletModuleIndices, nLowerModules * sizeof(int), cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); + auto nQuintupletsCPU_buf = allocBufWrapper(devHost, nLowerModules); + alpaka::memcpy(queue, nQuintupletsCPU_buf, quintupletsBuffers->nQuintuplets_buf, nLowerModules); + + auto module_subdets_buf = allocBufWrapper(devHost, nModules); + alpaka::memcpy(queue, module_subdets_buf, modulesBuffers->subdets_buf, nModules); + + auto module_layers_buf = allocBufWrapper(devHost, nLowerModules); + alpaka::memcpy(queue, module_layers_buf, modulesBuffers->layers_buf, nLowerModules); + + auto module_quintupletModuleIndices_buf = allocBufWrapper(devHost, nLowerModules); + alpaka::memcpy(queue, module_quintupletModuleIndices_buf, rangesBuffers->quintupletModuleIndices_buf, nLowerModules); + + alpaka::wait(queue); + int* nQuintupletsCPU = alpaka::getPtrNative(nQuintupletsCPU_buf); + short* module_subdets = alpaka::getPtrNative(module_subdets_buf); + short* module_layers = alpaka::getPtrNative(module_layers_buf); + int* module_quintupletModuleIndices = alpaka::getPtrNative(module_quintupletModuleIndices_buf); + for(uint16_t i = 0; inTriplets,nLowerModules*sizeof(unsigned int),cudaMemcpyDeviceToHost,stream); + auto nTripletsCPU_buf = allocBufWrapper(devHost, nLowerModules); + alpaka::memcpy(queue, nTripletsCPU_buf, tripletsBuffers->nTriplets_buf, nLowerModules); - short* module_subdets; - module_subdets = (short*)cms::cuda::allocate_host(nLowerModules* sizeof(short), stream); - cudaMemcpyAsync(module_subdets,modulesInGPU->subdets,nLowerModules*sizeof(short),cudaMemcpyDeviceToHost,stream); - short* module_layers; - module_layers = (short*)cms::cuda::allocate_host(nLowerModules * sizeof(short), stream); - cudaMemcpyAsync(module_layers,modulesInGPU->layers,nLowerModules*sizeof(short),cudaMemcpyDeviceToHost,stream); + auto module_subdets_buf = allocBufWrapper(devHost, nLowerModules); + alpaka::memcpy(queue, module_subdets_buf, modulesBuffers->subdets_buf, nLowerModules); + + auto module_layers_buf = allocBufWrapper(devHost, nLowerModules); + alpaka::memcpy(queue, module_layers_buf, modulesBuffers->layers_buf, nLowerModules); + + alpaka::wait(queue); + int* nTripletsCPU = alpaka::getPtrNative(nTripletsCPU_buf); + short* module_subdets = alpaka::getPtrNative(module_subdets_buf); + short* module_layers = alpaka::getPtrNative(module_layers_buf); - cudaStreamSynchronize(stream); for(uint16_t i = 0; i* SDL::Event::getTrackCandidatesInCMSS return trackCandidatesInCPU; } -SDL::modules* SDL::Event::getFullModules() +SDL::modulesBuffer* SDL::Event::getFullModules() { if(modulesInCPUFull == nullptr) { - modulesInCPUFull = new SDL::modules; - - modulesInCPUFull->detIds = new unsigned int[nModules]; - modulesInCPUFull->moduleMap = new uint16_t[40*nModules]; - modulesInCPUFull->nConnectedModules = new uint16_t[nModules]; - modulesInCPUFull->drdzs = new float[nModules]; - modulesInCPUFull->slopes = new float[nModules]; - modulesInCPUFull->nModules = new uint16_t[1]; - modulesInCPUFull->nLowerModules = new uint16_t[1]; - modulesInCPUFull->layers = new short[nModules]; - modulesInCPUFull->rings = new short[nModules]; - modulesInCPUFull->modules = new short[nModules]; - modulesInCPUFull->rods = new short[nModules]; - modulesInCPUFull->subdets = new short[nModules]; - modulesInCPUFull->sides = new short[nModules]; - modulesInCPUFull->isInverted = new bool[nModules]; - modulesInCPUFull->isLower = new bool[nModules]; - - modulesInCPUFull->moduleType = new ModuleType[nModules]; - modulesInCPUFull->moduleLayerType = new ModuleLayerType[nModules]; - cudaMemcpyAsync(modulesInCPUFull->detIds,modulesInGPU->detIds,nModules*sizeof(unsigned int),cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(modulesInCPUFull->moduleMap,modulesInGPU->moduleMap,40*nModules*sizeof(unsigned int),cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(modulesInCPUFull->nConnectedModules,modulesInGPU->nConnectedModules,nModules*sizeof(unsigned int),cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(modulesInCPUFull->drdzs,modulesInGPU->drdzs,sizeof(float)*nModules,cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(modulesInCPUFull->slopes,modulesInGPU->slopes,sizeof(float)*nModules,cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(modulesInCPUFull->nLowerModules,modulesInGPU->nLowerModules,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(modulesInCPUFull->layers,modulesInGPU->layers,nModules*sizeof(short),cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(modulesInCPUFull->rings,modulesInGPU->rings,sizeof(short)*nModules,cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(modulesInCPUFull->modules,modulesInGPU->modules,sizeof(short)*nModules,cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(modulesInCPUFull->rods,modulesInGPU->rods,sizeof(short)*nModules,cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(modulesInCPUFull->subdets,modulesInGPU->subdets,sizeof(short)*nModules,cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(modulesInCPUFull->sides,modulesInGPU->sides,sizeof(short)*nModules,cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(modulesInCPUFull->isInverted,modulesInGPU->isInverted,sizeof(bool)*nModules,cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(modulesInCPUFull->isLower,modulesInGPU->isLower,sizeof(bool)*nModules,cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(modulesInCPUFull->moduleType,modulesInGPU->moduleType,sizeof(ModuleType)*nModules,cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(modulesInCPUFull->moduleLayerType,modulesInGPU->moduleLayerType,sizeof(ModuleLayerType)*nModules,cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); + // The last input here is just a small placeholder for the allocation. + modulesInCPUFull = new SDL::modulesBuffer(devHost, nModules, 1); + modulesInCPUFull->setData(*modulesInCPUFull); + + alpaka::memcpy(queue, modulesInCPUFull->detIds_buf, modulesBuffers->detIds_buf, nModules); + alpaka::memcpy(queue, modulesInCPUFull->moduleMap_buf, modulesBuffers->moduleMap_buf, 40 * nModules); + alpaka::memcpy(queue, modulesInCPUFull->nConnectedModules_buf, modulesBuffers->nConnectedModules_buf, nModules); + alpaka::memcpy(queue, modulesInCPUFull->drdzs_buf, modulesBuffers->drdzs_buf, nModules); + alpaka::memcpy(queue, modulesInCPUFull->slopes_buf, modulesBuffers->slopes_buf, nModules); + alpaka::memcpy(queue, modulesInCPUFull->nLowerModules_buf, modulesBuffers->nLowerModules_buf, 1); + alpaka::memcpy(queue, modulesInCPUFull->nModules_buf, modulesBuffers->nModules_buf, 1); + alpaka::memcpy(queue, modulesInCPUFull->layers_buf, modulesBuffers->layers_buf, nModules); + alpaka::memcpy(queue, modulesInCPUFull->rings_buf, modulesBuffers->rings_buf, nModules); + alpaka::memcpy(queue, modulesInCPUFull->modules_buf, modulesBuffers->modules_buf, nModules); + alpaka::memcpy(queue, modulesInCPUFull->rods_buf, modulesBuffers->rods_buf, nModules); + alpaka::memcpy(queue, modulesInCPUFull->subdets_buf, modulesBuffers->subdets_buf, nModules); + alpaka::memcpy(queue, modulesInCPUFull->sides_buf, modulesBuffers->sides_buf, nModules); + alpaka::memcpy(queue, modulesInCPUFull->isInverted_buf, modulesBuffers->isInverted_buf, nModules); + alpaka::memcpy(queue, modulesInCPUFull->isLower_buf, modulesBuffers->isLower_buf, nModules); + alpaka::memcpy(queue, modulesInCPUFull->moduleType_buf, modulesBuffers->moduleType_buf, nModules); + alpaka::memcpy(queue, modulesInCPUFull->moduleLayerType_buf, modulesBuffers->moduleLayerType_buf, nModules); + alpaka::wait(queue); } return modulesInCPUFull; } -SDL::modules* SDL::Event::getModules() +SDL::modulesBuffer* SDL::Event::getModules() { if(modulesInCPU == nullptr) { - modulesInCPU = new SDL::modules; - modulesInCPU->nLowerModules = new uint16_t[1]; - modulesInCPU->nModules = new uint16_t[1]; - modulesInCPU->detIds = new unsigned int[nModules]; - modulesInCPU->isLower = new bool[nModules]; - modulesInCPU->layers = new short[nModules]; - modulesInCPU->subdets = new short[nModules]; - modulesInCPU->rings = new short[nModules]; - modulesInCPU->rods = new short[nModules]; - modulesInCPU->modules = new short[nModules]; - modulesInCPU->sides = new short[nModules]; - modulesInCPU->eta = new float[nModules]; - modulesInCPU->r = new float[nModules]; - modulesInCPU->moduleType = new ModuleType[nModules]; - - cudaMemcpyAsync(modulesInCPU->nLowerModules, modulesInGPU->nLowerModules, sizeof(uint16_t), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(modulesInCPU->nModules, modulesInGPU->nModules, sizeof(uint16_t), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(modulesInCPU->detIds, modulesInGPU->detIds, nModules * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(modulesInCPU->isLower, modulesInGPU->isLower, nModules * sizeof(bool), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(modulesInCPU->layers, modulesInGPU->layers, nModules * sizeof(short), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(modulesInCPU->subdets, modulesInGPU->subdets, nModules * sizeof(short), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(modulesInCPU->rings, modulesInGPU->rings, nModules * sizeof(short), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(modulesInCPU->rods, modulesInGPU->rods, nModules * sizeof(short), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(modulesInCPU->modules, modulesInGPU->modules, nModules * sizeof(short), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(modulesInCPU->sides, modulesInGPU->sides, nModules * sizeof(short), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(modulesInCPU->eta, modulesInGPU->eta, nModules * sizeof(short), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(modulesInCPU->r, modulesInGPU->r, nModules * sizeof(short), cudaMemcpyDeviceToHost,stream); - cudaMemcpyAsync(modulesInCPU->moduleType, modulesInGPU->moduleType, nModules * sizeof(ModuleType), cudaMemcpyDeviceToHost, stream); - cudaStreamSynchronize(stream); + // The last input here is just a small placeholder for the allocation. + modulesInCPU = new SDL::modulesBuffer(devHost, nModules, 1); + modulesInCPU->setData(*modulesInCPU); + + alpaka::memcpy(queue, modulesInCPU->nLowerModules_buf, modulesBuffers->nLowerModules_buf, 1); + alpaka::memcpy(queue, modulesInCPU->nModules_buf, modulesBuffers->nModules_buf, 1); + alpaka::memcpy(queue, modulesInCPU->detIds_buf, modulesBuffers->detIds_buf, nModules); + alpaka::memcpy(queue, modulesInCPU->isLower_buf, modulesBuffers->isLower_buf, nModules); + alpaka::memcpy(queue, modulesInCPU->layers_buf, modulesBuffers->layers_buf, nModules); + alpaka::memcpy(queue, modulesInCPU->subdets_buf, modulesBuffers->subdets_buf, nModules); + alpaka::memcpy(queue, modulesInCPU->rings_buf, modulesBuffers->rings_buf, nModules); + alpaka::memcpy(queue, modulesInCPU->rods_buf, modulesBuffers->rods_buf, nModules); + alpaka::memcpy(queue, modulesInCPU->modules_buf, modulesBuffers->modules_buf, nModules); + alpaka::memcpy(queue, modulesInCPU->sides_buf, modulesBuffers->sides_buf, nModules); + alpaka::memcpy(queue, modulesInCPU->eta_buf, modulesBuffers->eta_buf, nModules); + alpaka::memcpy(queue, modulesInCPU->r_buf, modulesBuffers->r_buf, nModules); + alpaka::memcpy(queue, modulesInCPU->moduleType_buf, modulesBuffers->moduleType_buf, nModules); + alpaka::wait(queue); } return modulesInCPU; -} +} \ No newline at end of file diff --git a/SDL/Event.cuh b/SDL/Event.cuh index f5a671bf..9b70014f 100644 --- a/SDL/Event.cuh +++ b/SDL/Event.cuh @@ -12,8 +12,6 @@ #include "TrackCandidate.cuh" #include "Constants.cuh" -#include "allocate.h" - namespace SDL { class Event @@ -37,7 +35,6 @@ namespace SDL std::array n_quintuplets_by_layer_endcap_; //Device stuff - int dev; int nTotalSegments; struct objectRanges* rangesInGPU; struct objectRangesBuffer* rangesBuffers; @@ -65,8 +62,8 @@ namespace SDL segmentsBuffer* segmentsInCPU; tripletsBuffer* tripletsInCPU; trackCandidatesBuffer* trackCandidatesInCPU; - modules* modulesInCPU; - modules* modulesInCPUFull; + modulesBuffer* modulesInCPU; + modulesBuffer* modulesInCPUFull; quintupletsBuffer* quintupletsInCPU; pixelTripletsBuffer* pixelTripletsInCPU; pixelQuintupletsBuffer* pixelQuintupletsInCPU; @@ -75,7 +72,6 @@ namespace SDL int8_t* pixelTypeCPU; public: Event(cudaStream_t estream,bool verbose); - ~Event(); void resetEvent(); void addHitToEvent(std::vector x, std::vector y, std::vector z, std::vector detId, std::vector idxInNtuple); //call the appropriate hit function, then increment the counter here @@ -149,18 +145,18 @@ namespace SDL trackCandidatesBuffer* getTrackCandidatesInCMSSW(); pixelTripletsBuffer* getPixelTriplets(); pixelQuintupletsBuffer* getPixelQuintuplets(); - modules* getModules(); - modules* getFullModules(); + modulesBuffer* getModules(); + modulesBuffer* getFullModules(); }; //global stuff - extern struct modules* modulesInGPU; - extern struct modules* modulesInHost; + extern std::shared_ptr modulesInGPU; + extern std::shared_ptr> modulesBuffers; extern uint16_t nModules; extern uint16_t nLowerModules; void initModules(const char* moduleMetaDataFilePath="data/centroid.txt"); //read from file and init void cleanModules(); void initModulesHost(); //read from file and init - extern std::unique_ptr pixelMapping; + extern std::shared_ptr pixelMapping; } #endif diff --git a/SDL/Module.cu b/SDL/Module.cu index 649995ba..259e8b8a 100644 --- a/SDL/Module.cu +++ b/SDL/Module.cu @@ -1,564 +1,9 @@ #include "Module.cuh" +// TODO: Change this to remove it from global scope. std::map *SDL::detIdToIndex; std::map *SDL::module_x; std::map *SDL::module_y; std::map *SDL::module_z; std::map *SDL::module_type; // 23 : Ph2PSP, 24 : Ph2PSS, 25 : Ph2SS -// https://github.com/cms-sw/cmssw/blob/5e809e8e0a625578aa265dc4b128a93830cb5429/Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h#L29 - -void SDL::createModulesInExplicitMemory(struct modules& modulesInGPU,unsigned int nModules,cudaStream_t stream) -{ - /* modules stucture object will be created in Event.cu*/ - cudaMalloc(&(modulesInGPU.detIds),nModules * sizeof(unsigned int)); - cudaMalloc(&modulesInGPU.moduleMap,nModules * 40 * sizeof(uint16_t)); - cudaMalloc(&modulesInGPU.mapIdx, nModules*sizeof(uint16_t)); - cudaMalloc(&modulesInGPU.mapdetId, nModules*sizeof(unsigned int)); - cudaMalloc(&modulesInGPU.nConnectedModules,nModules * sizeof(uint16_t)); - cudaMalloc(&modulesInGPU.drdzs,nModules * sizeof(float)); - cudaMalloc(&modulesInGPU.slopes,nModules * sizeof(float)); - cudaMalloc(&modulesInGPU.nModules,sizeof(uint16_t)); - cudaMalloc(&modulesInGPU.nLowerModules,sizeof(uint16_t)); - cudaMalloc(&modulesInGPU.partnerModuleIndices, nModules * sizeof(uint16_t)); - - cudaMalloc(&modulesInGPU.layers,nModules * sizeof(short)); - cudaMalloc(&modulesInGPU.rings,nModules * sizeof(short)); - cudaMalloc(&modulesInGPU.modules,nModules * sizeof(short)); - cudaMalloc(&modulesInGPU.rods,nModules * sizeof(short)); - cudaMalloc(&modulesInGPU.subdets,nModules * sizeof(short)); - cudaMalloc(&modulesInGPU.sides,nModules * sizeof(short)); - cudaMalloc(&modulesInGPU.eta,nModules * sizeof(float)); - cudaMalloc(&modulesInGPU.r,nModules * sizeof(float)); - cudaMalloc(&modulesInGPU.isInverted, nModules * sizeof(bool)); - cudaMalloc(&modulesInGPU.isLower, nModules * sizeof(bool)); - cudaMalloc(&modulesInGPU.isAnchor, nModules * sizeof(bool)); - cudaMalloc(&modulesInGPU.moduleType,nModules * sizeof(ModuleType)); - cudaMalloc(&modulesInGPU.moduleLayerType,nModules * sizeof(ModuleLayerType)); - - cudaMemcpyAsync(modulesInGPU.nModules,&nModules,sizeof(uint16_t),cudaMemcpyHostToDevice,stream); - cudaStreamSynchronize(stream); -} - -void SDL::freeModules(struct modules& modulesInGPU) -{ - cudaFree(modulesInGPU.detIds); - cudaFree(modulesInGPU.moduleMap); - cudaFree(modulesInGPU.mapIdx); - cudaFree(modulesInGPU.mapdetId); - cudaFree(modulesInGPU.nConnectedModules); - cudaFree(modulesInGPU.drdzs); - cudaFree(modulesInGPU.slopes); - cudaFree(modulesInGPU.nModules); - cudaFree(modulesInGPU.nLowerModules); - cudaFree(modulesInGPU.layers); - cudaFree(modulesInGPU.rings); - cudaFree(modulesInGPU.modules); - cudaFree(modulesInGPU.rods); - cudaFree(modulesInGPU.subdets); - cudaFree(modulesInGPU.sides); - cudaFree(modulesInGPU.eta); - cudaFree(modulesInGPU.r); - cudaFree(modulesInGPU.isInverted); - cudaFree(modulesInGPU.isLower); - cudaFree(modulesInGPU.isAnchor); - cudaFree(modulesInGPU.moduleType); - cudaFree(modulesInGPU.moduleLayerType); - cudaFree(modulesInGPU.connectedPixels); - cudaFree(modulesInGPU.partnerModuleIndices); -} - -void SDL::loadModulesFromFile(struct modules& modulesInGPU, uint16_t& nModules, uint16_t& nLowerModules, struct pixelMap& pixelMapping,cudaStream_t stream, const char* moduleMetaDataFilePath) -{ - detIdToIndex = new std::map; - module_x = new std::map; - module_y = new std::map; - module_z = new std::map; - module_type = new std::map; - - /*modules structure object will be created in Event.cu*/ - /* Load the whole text file into the map first*/ - - std::ifstream ifile; - ifile.open(moduleMetaDataFilePath); - if(!ifile.is_open()) - { - std::cout<<"ERROR! module list file not present!"<4) - break; - } - - } - (*detIdToIndex)[1] = counter; //pixel module is the last module in the module list - counter++; - nModules = counter; - //std::cout<<"Number of modules = "<(devHost, nModules); - auto layers_buf = allocBufWrapper(devHost, nModules); - auto rings_buf = allocBufWrapper(devHost, nModules); - auto rods_buf = allocBufWrapper(devHost, nModules); - auto modules_buf = allocBufWrapper(devHost, nModules); - auto subdets_buf = allocBufWrapper(devHost, nModules); - auto sides_buf = allocBufWrapper(devHost, nModules); - auto eta_buf = allocBufWrapper(devHost, nModules); - auto r_buf = allocBufWrapper(devHost, nModules); - auto isInverted_buf = allocBufWrapper(devHost, nModules); - auto isLower_buf = allocBufWrapper(devHost, nModules); - auto isAnchor_buf = allocBufWrapper(devHost, nModules); - auto moduleType_buf = allocBufWrapper(devHost, nModules); - auto moduleLayerType_buf = allocBufWrapper(devHost, nModules); - auto slopes_buf = allocBufWrapper(devHost, nModules); - auto drdzs_buf = allocBufWrapper(devHost, nModules); - auto partnerModuleIndices_buf = allocBufWrapper(devHost, nModules); - - // Getting the underlying data pointers - unsigned int* host_detIds = alpaka::getPtrNative(detIds_buf); - short* host_layers = alpaka::getPtrNative(layers_buf); - short* host_rings = alpaka::getPtrNative(rings_buf); - short* host_rods = alpaka::getPtrNative(rods_buf); - short* host_modules = alpaka::getPtrNative(modules_buf); - short* host_subdets = alpaka::getPtrNative(subdets_buf); - short* host_sides = alpaka::getPtrNative(sides_buf); - float* host_eta = alpaka::getPtrNative(eta_buf); - float* host_r = alpaka::getPtrNative(r_buf); - bool* host_isInverted = alpaka::getPtrNative(isInverted_buf); - bool* host_isLower = alpaka::getPtrNative(isLower_buf); - bool* host_isAnchor = alpaka::getPtrNative(isAnchor_buf); - ModuleType* host_moduleType = alpaka::getPtrNative(moduleType_buf); - ModuleLayerType* host_moduleLayerType = alpaka::getPtrNative(moduleLayerType_buf); - float* host_slopes = alpaka::getPtrNative(slopes_buf); - float* host_drdzs = alpaka::getPtrNative(drdzs_buf); - uint16_t* host_partnerModuleIndices = alpaka::getPtrNative(partnerModuleIndices_buf); - - //reassign detIdToIndex indices here - nLowerModules = (nModules - 1) / 2; - uint16_t lowerModuleCounter = 0; - uint16_t upperModuleCounter = nLowerModules + 1; - //0 to nLowerModules - 1 => only lower modules, nLowerModules - pixel module, nLowerModules + 1 to nModules => upper modules - for(auto it = (*detIdToIndex).begin(); it != (*detIdToIndex).end(); it++) - { - unsigned int detId = it->first; - float m_x = (*module_x)[detId]; - float m_y = (*module_y)[detId]; - float m_z = (*module_z)[detId]; - unsigned int m_t = (*module_type)[detId]; - - float eta,r; - - uint16_t index; - unsigned short layer,ring,rod,module,subdet,side; - bool isInverted, isLower; - if(detId == 1) - { - layer = 0; - ring = 0; - rod = 0; - module = 0; - subdet = 0; - side = 0; - isInverted = false; - isLower = false; - } - else - { - setDerivedQuantities(detId,layer,ring,rod,module,subdet,side,m_x,m_y,m_z,eta,r); - isInverted = modulesInGPU.parseIsInverted(subdet, side, module, layer); - isLower = modulesInGPU.parseIsLower(isInverted, detId); - } - if(isLower) - { - index = lowerModuleCounter; - lowerModuleCounter++; - } - else if(detId != 1) - { - index = upperModuleCounter; - upperModuleCounter++; - } - else - { - index = nLowerModules; //pixel - } - //reassigning indices! - (*detIdToIndex)[detId] = index; - host_detIds[index] = detId; - host_layers[index] = layer; - host_rings[index] = ring; - host_rods[index] = rod; - host_modules[index] = module; - host_subdets[index] = subdet; - host_sides[index] = side; - host_eta[index] = eta; - host_r[index] = r; - host_isInverted[index] = isInverted; - host_isLower[index] = isLower; - - //assigning other variables! - if(detId == 1) - { - host_moduleType[index] = PixelModule; - host_moduleLayerType[index] = SDL::InnerPixelLayer; - host_slopes[index] = 0; - host_drdzs[index] = 0; - host_isAnchor[index] = false; - } - else - { - host_moduleType[index] = ( m_t == 25 ? SDL::TwoS : SDL::PS ); - host_moduleLayerType[index] = ( m_t == 23 ? SDL::Pixel : SDL::Strip ); - - if(host_moduleType[index] == SDL::PS and host_moduleLayerType[index] == SDL::Pixel) - { - host_isAnchor[index] = true; - } - else if(host_moduleType[index] == SDL::TwoS and host_isLower[index]) - { - host_isAnchor[index] = true; - } - else - { - host_isAnchor[index] = false; - } - - host_slopes[index] = (subdet == Endcap) ? endcapGeometry.getSlopeLower(detId) : tiltedGeometry.getSlope(detId); - host_drdzs[index] = (subdet == Barrel) ? tiltedGeometry.getDrDz(detId) : 0; - } - } - - //partner module stuff, and slopes and drdz move around - for(auto it = (*detIdToIndex).begin(); it != (*detIdToIndex).end(); it++) - { - auto& detId = it->first; - auto& index = it->second; - if(detId != 1) - { - host_partnerModuleIndices[index] = (*detIdToIndex)[modulesInGPU.parsePartnerModuleId(detId, host_isLower[index], host_isInverted[index])]; - //add drdz and slope importing stuff here! - if(host_drdzs[index] == 0) - { - host_drdzs[index] = host_drdzs[host_partnerModuleIndices[index]]; - } - if(host_slopes[index] == 0) - { - host_slopes[index] = host_slopes[host_partnerModuleIndices[index]]; - } - } - } - - cudaMemcpyAsync(modulesInGPU.nLowerModules,&nLowerModules,sizeof(uint16_t),cudaMemcpyHostToDevice,stream); - cudaMemcpyAsync(modulesInGPU.detIds,host_detIds,nModules*sizeof(unsigned int),cudaMemcpyHostToDevice,stream); - - cudaMemcpyAsync(modulesInGPU.layers,host_layers,nModules*sizeof(short),cudaMemcpyHostToDevice,stream); - cudaMemcpyAsync(modulesInGPU.rings,host_rings,sizeof(short)*nModules,cudaMemcpyHostToDevice,stream); - cudaMemcpyAsync(modulesInGPU.rods,host_rods,sizeof(short)*nModules,cudaMemcpyHostToDevice,stream); - cudaMemcpyAsync(modulesInGPU.modules,host_modules,sizeof(short)*nModules,cudaMemcpyHostToDevice,stream); - cudaMemcpyAsync(modulesInGPU.subdets,host_subdets,sizeof(short)*nModules,cudaMemcpyHostToDevice,stream); - cudaMemcpyAsync(modulesInGPU.sides,host_sides,sizeof(short)*nModules,cudaMemcpyHostToDevice,stream); - cudaMemcpyAsync(modulesInGPU.eta,host_eta,sizeof(float)*nModules,cudaMemcpyHostToDevice,stream); - cudaMemcpyAsync(modulesInGPU.r,host_r,sizeof(float)*nModules,cudaMemcpyHostToDevice,stream); - cudaMemcpyAsync(modulesInGPU.isInverted,host_isInverted,sizeof(bool)*nModules,cudaMemcpyHostToDevice,stream); - cudaMemcpyAsync(modulesInGPU.isLower,host_isLower,sizeof(bool)*nModules,cudaMemcpyHostToDevice,stream); - - cudaMemcpyAsync(modulesInGPU.moduleType,host_moduleType,sizeof(ModuleType)*nModules,cudaMemcpyHostToDevice,stream); - cudaMemcpyAsync(modulesInGPU.moduleLayerType,host_moduleLayerType,sizeof(ModuleLayerType)*nModules,cudaMemcpyHostToDevice,stream); - cudaMemcpyAsync(modulesInGPU.slopes,host_slopes,sizeof(float)*nModules,cudaMemcpyHostToDevice,stream); - cudaMemcpyAsync(modulesInGPU.isAnchor, host_isAnchor, sizeof(bool) * nModules, cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(modulesInGPU.drdzs,host_drdzs,sizeof(float)*nModules,cudaMemcpyHostToDevice,stream); - - cudaMemcpyAsync(modulesInGPU.partnerModuleIndices, host_partnerModuleIndices, sizeof(uint16_t) * nModules, cudaMemcpyHostToDevice, stream); - cudaStreamSynchronize(stream); - - fillConnectedModuleArrayExplicit(modulesInGPU,nModules,stream); - fillMapArraysExplicit(modulesInGPU, nModules, stream); - fillPixelMap(modulesInGPU,pixelMapping,stream); -} - -void SDL::fillConnectedModuleArray(struct modules& modulesInGPU, unsigned int nModules) -{ - for(auto it = (*detIdToIndex).begin(); it != (*detIdToIndex).end(); ++it) - { - unsigned int detId = it->first; - uint16_t index = it->second; - auto& connectedModules = moduleConnectionMap.getConnectedModuleDetIds(detId); - modulesInGPU.nConnectedModules[index] = connectedModules.size(); - for(uint16_t i = 0; i< modulesInGPU.nConnectedModules[index];i++) - { - modulesInGPU.moduleMap[index * 40 + i] = (*detIdToIndex)[connectedModules[i]]; - } - } -} - -void SDL::fillPixelMap(struct modules& modulesInGPU, struct pixelMap& pixelMapping,cudaStream_t stream) -{ - std::vector connectedModuleDetIds; - std::vector connectedModuleDetIds_pos; - std::vector connectedModuleDetIds_neg; - - int totalSizes = 0; - int totalSizes_pos = 0; - int totalSizes_neg = 0; - for(unsigned int isuperbin = 0; isuperbin < size_superbins; isuperbin++) - { - std::vector connectedModuleDetIds_pLStoLayer1Subdet5 = SDL::moduleConnectionMap_pLStoLayer1Subdet5.getConnectedModuleDetIds(isuperbin+size_superbins);// index adjustment to get high values - std::vector connectedModuleDetIds_pLStoLayer2Subdet5 = SDL::moduleConnectionMap_pLStoLayer2Subdet5.getConnectedModuleDetIds(isuperbin+size_superbins);// from the high pt bins - std::vector connectedModuleDetIds_pLStoLayer3Subdet5 = SDL::moduleConnectionMap_pLStoLayer3Subdet5.getConnectedModuleDetIds(isuperbin+size_superbins); - std::vector connectedModuleDetIds_pLStoLayer1Subdet4 = SDL::moduleConnectionMap_pLStoLayer1Subdet4.getConnectedModuleDetIds(isuperbin+size_superbins); - std::vector connectedModuleDetIds_pLStoLayer2Subdet4 = SDL::moduleConnectionMap_pLStoLayer2Subdet4.getConnectedModuleDetIds(isuperbin+size_superbins); - std::vector connectedModuleDetIds_pLStoLayer3Subdet4 = SDL::moduleConnectionMap_pLStoLayer3Subdet4.getConnectedModuleDetIds(isuperbin+size_superbins); - std::vector connectedModuleDetIds_pLStoLayer4Subdet4 = SDL::moduleConnectionMap_pLStoLayer4Subdet4.getConnectedModuleDetIds(isuperbin+size_superbins); - connectedModuleDetIds.insert(connectedModuleDetIds.end(),connectedModuleDetIds_pLStoLayer1Subdet5.begin(),connectedModuleDetIds_pLStoLayer1Subdet5.end()); - connectedModuleDetIds.insert(connectedModuleDetIds.end(),connectedModuleDetIds_pLStoLayer2Subdet5.begin(),connectedModuleDetIds_pLStoLayer2Subdet5.end()); - connectedModuleDetIds.insert(connectedModuleDetIds.end(),connectedModuleDetIds_pLStoLayer3Subdet5.begin(),connectedModuleDetIds_pLStoLayer3Subdet5.end()); - connectedModuleDetIds.insert(connectedModuleDetIds.end(),connectedModuleDetIds_pLStoLayer1Subdet4.begin(),connectedModuleDetIds_pLStoLayer1Subdet4.end()); - connectedModuleDetIds.insert(connectedModuleDetIds.end(),connectedModuleDetIds_pLStoLayer2Subdet4.begin(),connectedModuleDetIds_pLStoLayer2Subdet4.end()); - connectedModuleDetIds.insert(connectedModuleDetIds.end(),connectedModuleDetIds_pLStoLayer3Subdet4.begin(),connectedModuleDetIds_pLStoLayer3Subdet4.end()); - connectedModuleDetIds.insert(connectedModuleDetIds.end(),connectedModuleDetIds_pLStoLayer4Subdet4.begin(),connectedModuleDetIds_pLStoLayer4Subdet4.end()); - - int sizes = 0; - sizes += connectedModuleDetIds_pLStoLayer1Subdet5.size(); - sizes += connectedModuleDetIds_pLStoLayer2Subdet5.size(); - sizes += connectedModuleDetIds_pLStoLayer3Subdet5.size(); - sizes += connectedModuleDetIds_pLStoLayer1Subdet4.size(); - sizes += connectedModuleDetIds_pLStoLayer2Subdet4.size(); - sizes += connectedModuleDetIds_pLStoLayer3Subdet4.size(); - sizes += connectedModuleDetIds_pLStoLayer4Subdet4.size(); - pixelMapping.connectedPixelsIndex[isuperbin] = totalSizes; - pixelMapping.connectedPixelsSizes[isuperbin] = sizes; - totalSizes += sizes; - - std::vector connectedModuleDetIds_pLStoLayer1Subdet5_pos = SDL::moduleConnectionMap_pLStoLayer1Subdet5_pos.getConnectedModuleDetIds(isuperbin); - std::vector connectedModuleDetIds_pLStoLayer2Subdet5_pos = SDL::moduleConnectionMap_pLStoLayer2Subdet5_pos.getConnectedModuleDetIds(isuperbin); - std::vector connectedModuleDetIds_pLStoLayer3Subdet5_pos = SDL::moduleConnectionMap_pLStoLayer3Subdet5_pos.getConnectedModuleDetIds(isuperbin); - std::vector connectedModuleDetIds_pLStoLayer1Subdet4_pos = SDL::moduleConnectionMap_pLStoLayer1Subdet4_pos.getConnectedModuleDetIds(isuperbin); - std::vector connectedModuleDetIds_pLStoLayer2Subdet4_pos = SDL::moduleConnectionMap_pLStoLayer2Subdet4_pos.getConnectedModuleDetIds(isuperbin); - std::vector connectedModuleDetIds_pLStoLayer3Subdet4_pos = SDL::moduleConnectionMap_pLStoLayer3Subdet4_pos.getConnectedModuleDetIds(isuperbin); - std::vector connectedModuleDetIds_pLStoLayer4Subdet4_pos = SDL::moduleConnectionMap_pLStoLayer4Subdet4_pos.getConnectedModuleDetIds(isuperbin); - connectedModuleDetIds_pos.insert(connectedModuleDetIds_pos.end(),connectedModuleDetIds_pLStoLayer1Subdet5_pos.begin(),connectedModuleDetIds_pLStoLayer1Subdet5_pos.end()); - connectedModuleDetIds_pos.insert(connectedModuleDetIds_pos.end(),connectedModuleDetIds_pLStoLayer2Subdet5_pos.begin(),connectedModuleDetIds_pLStoLayer2Subdet5_pos.end()); - connectedModuleDetIds_pos.insert(connectedModuleDetIds_pos.end(),connectedModuleDetIds_pLStoLayer3Subdet5_pos.begin(),connectedModuleDetIds_pLStoLayer3Subdet5_pos.end()); - connectedModuleDetIds_pos.insert(connectedModuleDetIds_pos.end(),connectedModuleDetIds_pLStoLayer1Subdet4_pos.begin(),connectedModuleDetIds_pLStoLayer1Subdet4_pos.end()); - connectedModuleDetIds_pos.insert(connectedModuleDetIds_pos.end(),connectedModuleDetIds_pLStoLayer2Subdet4_pos.begin(),connectedModuleDetIds_pLStoLayer2Subdet4_pos.end()); - connectedModuleDetIds_pos.insert(connectedModuleDetIds_pos.end(),connectedModuleDetIds_pLStoLayer3Subdet4_pos.begin(),connectedModuleDetIds_pLStoLayer3Subdet4_pos.end()); - connectedModuleDetIds_pos.insert(connectedModuleDetIds_pos.end(),connectedModuleDetIds_pLStoLayer4Subdet4_pos.begin(),connectedModuleDetIds_pLStoLayer4Subdet4_pos.end()); - - int sizes_pos = 0; - sizes_pos += connectedModuleDetIds_pLStoLayer1Subdet5_pos.size(); - sizes_pos += connectedModuleDetIds_pLStoLayer2Subdet5_pos.size(); - sizes_pos += connectedModuleDetIds_pLStoLayer3Subdet5_pos.size(); - sizes_pos += connectedModuleDetIds_pLStoLayer1Subdet4_pos.size(); - sizes_pos += connectedModuleDetIds_pLStoLayer2Subdet4_pos.size(); - sizes_pos += connectedModuleDetIds_pLStoLayer3Subdet4_pos.size(); - sizes_pos += connectedModuleDetIds_pLStoLayer4Subdet4_pos.size(); - pixelMapping.connectedPixelsIndexPos[isuperbin] = totalSizes_pos; - pixelMapping.connectedPixelsSizesPos[isuperbin] = sizes_pos; - totalSizes_pos += sizes_pos; - - std::vector connectedModuleDetIds_pLStoLayer1Subdet5_neg = SDL::moduleConnectionMap_pLStoLayer1Subdet5_neg.getConnectedModuleDetIds(isuperbin); - std::vector connectedModuleDetIds_pLStoLayer2Subdet5_neg = SDL::moduleConnectionMap_pLStoLayer2Subdet5_neg.getConnectedModuleDetIds(isuperbin); - std::vector connectedModuleDetIds_pLStoLayer3Subdet5_neg = SDL::moduleConnectionMap_pLStoLayer3Subdet5_neg.getConnectedModuleDetIds(isuperbin); - std::vector connectedModuleDetIds_pLStoLayer1Subdet4_neg = SDL::moduleConnectionMap_pLStoLayer1Subdet4_neg.getConnectedModuleDetIds(isuperbin); - std::vector connectedModuleDetIds_pLStoLayer2Subdet4_neg = SDL::moduleConnectionMap_pLStoLayer2Subdet4_neg.getConnectedModuleDetIds(isuperbin); - std::vector connectedModuleDetIds_pLStoLayer3Subdet4_neg = SDL::moduleConnectionMap_pLStoLayer3Subdet4_neg.getConnectedModuleDetIds(isuperbin); - std::vector connectedModuleDetIds_pLStoLayer4Subdet4_neg = SDL::moduleConnectionMap_pLStoLayer4Subdet4_neg.getConnectedModuleDetIds(isuperbin); - connectedModuleDetIds_neg.insert(connectedModuleDetIds_neg.end(),connectedModuleDetIds_pLStoLayer1Subdet5_neg.begin(),connectedModuleDetIds_pLStoLayer1Subdet5_neg.end()); - connectedModuleDetIds_neg.insert(connectedModuleDetIds_neg.end(),connectedModuleDetIds_pLStoLayer2Subdet5_neg.begin(),connectedModuleDetIds_pLStoLayer2Subdet5_neg.end()); - connectedModuleDetIds_neg.insert(connectedModuleDetIds_neg.end(),connectedModuleDetIds_pLStoLayer3Subdet5_neg.begin(),connectedModuleDetIds_pLStoLayer3Subdet5_neg.end()); - connectedModuleDetIds_neg.insert(connectedModuleDetIds_neg.end(),connectedModuleDetIds_pLStoLayer1Subdet4_neg.begin(),connectedModuleDetIds_pLStoLayer1Subdet4_neg.end()); - connectedModuleDetIds_neg.insert(connectedModuleDetIds_neg.end(),connectedModuleDetIds_pLStoLayer2Subdet4_neg.begin(),connectedModuleDetIds_pLStoLayer2Subdet4_neg.end()); - connectedModuleDetIds_neg.insert(connectedModuleDetIds_neg.end(),connectedModuleDetIds_pLStoLayer3Subdet4_neg.begin(),connectedModuleDetIds_pLStoLayer3Subdet4_neg.end()); - connectedModuleDetIds_neg.insert(connectedModuleDetIds_neg.end(),connectedModuleDetIds_pLStoLayer4Subdet4_neg.begin(),connectedModuleDetIds_pLStoLayer4Subdet4_neg.end()); - - int sizes_neg = 0; - sizes_neg += connectedModuleDetIds_pLStoLayer1Subdet5_neg.size(); - sizes_neg += connectedModuleDetIds_pLStoLayer2Subdet5_neg.size(); - sizes_neg += connectedModuleDetIds_pLStoLayer3Subdet5_neg.size(); - sizes_neg += connectedModuleDetIds_pLStoLayer1Subdet4_neg.size(); - sizes_neg += connectedModuleDetIds_pLStoLayer2Subdet4_neg.size(); - sizes_neg += connectedModuleDetIds_pLStoLayer3Subdet4_neg.size(); - sizes_neg += connectedModuleDetIds_pLStoLayer4Subdet4_neg.size(); - pixelMapping.connectedPixelsIndexNeg[isuperbin] = totalSizes_neg; - pixelMapping.connectedPixelsSizesNeg[isuperbin] = sizes_neg; - totalSizes_neg += sizes_neg; - } - - unsigned int* connectedPixels; - connectedPixels = (unsigned int*)cms::cuda::allocate_host((totalSizes+totalSizes_pos+totalSizes_neg) * sizeof(unsigned int), stream); - cudaMalloc(&modulesInGPU.connectedPixels,(totalSizes+totalSizes_pos+totalSizes_neg)* sizeof(unsigned int)); - - for(int icondet = 0; icondet < totalSizes; icondet++) - { - connectedPixels[icondet] = (*detIdToIndex)[connectedModuleDetIds[icondet]]; - } - for(int icondet = 0; icondet < totalSizes_pos; icondet++) - { - connectedPixels[icondet+totalSizes] = (*detIdToIndex)[connectedModuleDetIds_pos[icondet]]; - } - for(int icondet = 0; icondet < totalSizes_neg; icondet++) - { - connectedPixels[icondet+totalSizes+totalSizes_pos] = (*detIdToIndex)[connectedModuleDetIds_neg[icondet]]; - } - cudaMemcpyAsync(modulesInGPU.connectedPixels,connectedPixels,(totalSizes+totalSizes_pos+totalSizes_neg)*sizeof(unsigned int),cudaMemcpyHostToDevice,stream); - cudaStreamSynchronize(stream); - - cms::cuda::free_host(connectedPixels); -} - -void SDL::fillConnectedModuleArrayExplicit(struct modules& modulesInGPU, unsigned int nModules,cudaStream_t stream) -{ - uint16_t* moduleMap; - uint16_t* nConnectedModules; - moduleMap = (uint16_t*)cms::cuda::allocate_host(nModules * 40 * sizeof(uint16_t), stream); - nConnectedModules = (uint16_t*)cms::cuda::allocate_host(nModules * sizeof(uint16_t), stream); - for(auto it = (*detIdToIndex).begin(); it != (*detIdToIndex).end(); ++it) - { - unsigned int detId = it->first; - uint16_t index = it->second; - auto& connectedModules = moduleConnectionMap.getConnectedModuleDetIds(detId); - nConnectedModules[index] = connectedModules.size(); - for(uint16_t i = 0; i< nConnectedModules[index];i++) - { - moduleMap[index * 40 + i] = (*detIdToIndex)[connectedModules[i]]; - } - } - cudaMemcpyAsync(modulesInGPU.moduleMap,moduleMap,nModules*40*sizeof(uint16_t),cudaMemcpyHostToDevice,stream); - cudaMemcpyAsync(modulesInGPU.nConnectedModules,nConnectedModules,nModules*sizeof(uint16_t),cudaMemcpyHostToDevice,stream); - cudaStreamSynchronize(stream); - cms::cuda::free_host(moduleMap); - cms::cuda::free_host(nConnectedModules); -} - -void SDL::fillMapArraysExplicit(struct modules& modulesInGPU, unsigned int nModules,cudaStream_t stream) -{ - uint16_t* mapIdx; - unsigned int* mapdetId; - unsigned int counter = 0; - mapIdx = (uint16_t*)cms::cuda::allocate_host(nModules * sizeof(uint16_t), stream); - mapdetId = (unsigned int*)cms::cuda::allocate_host(nModules * sizeof(unsigned int), stream); - for(auto it = (*detIdToIndex).begin(); it != (*detIdToIndex).end(); ++it) - { - unsigned int detId = it->first; - unsigned int index = it->second; - mapIdx[counter] = index; - mapdetId[counter] = detId; - counter++; - } - cudaMemcpyAsync(modulesInGPU.mapIdx,mapIdx,nModules*sizeof(uint16_t),cudaMemcpyHostToDevice,stream); - cudaMemcpyAsync(modulesInGPU.mapdetId,mapdetId,nModules*sizeof(unsigned int),cudaMemcpyHostToDevice,stream); - cudaStreamSynchronize(stream); - cms::cuda::free_host(mapIdx); - cms::cuda::free_host(mapdetId); -} - -void SDL::setDerivedQuantities(unsigned int detId, unsigned short& layer, unsigned short& ring, unsigned short& rod, unsigned short& module, unsigned short& subdet, unsigned short& side, float m_x, float m_y, float m_z, float& eta, float& r) -{ - subdet = (detId & (7 << 25)) >> 25; - side = (subdet == Endcap) ? (detId & (3 << 23)) >> 23 : (detId & (3 << 18)) >> 18; - layer = (subdet == Endcap) ? (detId & (7 << 18)) >> 18 : (detId & (7 << 20)) >> 20; - ring = (subdet == Endcap) ? (detId & (15 << 12)) >> 12 : 0; - module = (detId & (127 << 2)) >> 2; - rod = (subdet == Endcap) ? 0 : (detId & (127 << 10)) >> 10; - - r = std::sqrt(m_x * m_x + m_y * m_y + m_z * m_z); - eta = ((m_z > 0) - ( m_z < 0)) * std::acosh(r / std::sqrt(m_x * m_x + m_y * m_y)); -} - -bool SDL::modules::parseIsInverted(short subdet, short side, short module, short layer) -{ - if (subdet == Endcap) - { - if (side == NegZ) - { - return module % 2 == 1; - } - else if (side == PosZ) - { - return module % 2 == 0; - } - else - { - return 0; - } - } - else if (subdet == Barrel) - { - if (side == Center) - { - if (layer <= 3) - { - return module % 2 == 1; - } - else if (layer >= 4) - { - return module % 2 == 0; - } - else - { - return 0; - } - } - else if (side == NegZ or side == PosZ) - { - if (layer <= 2) - { - return module % 2 == 1; - } - else if (layer == 3) - { - return module % 2 == 0; - } - else - { - return 0; - } - } - else - { - return 0; - } - } - else - { - return 0; - } -} - -bool SDL::modules::parseIsLower(bool isInvertedx, unsigned int detId) -{ - return (isInvertedx) ? !(detId & 1) : (detId & 1); -} - -unsigned int SDL::modules::parsePartnerModuleId(unsigned int detId, bool isLowerx, bool isInvertedx) -{ - return isLowerx ? (isInvertedx ? detId - 1 : detId + 1) : (isInvertedx ? detId + 1 : detId - 1); -} +// https://github.com/cms-sw/cmssw/blob/5e809e8e0a625578aa265dc4b128a93830cb5429/Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h#L29 \ No newline at end of file diff --git a/SDL/Module.cuh b/SDL/Module.cuh index 3967c764..1015031c 100644 --- a/SDL/Module.cuh +++ b/SDL/Module.cuh @@ -8,7 +8,6 @@ #include "TiltedGeometry.h" #include "EndcapGeometry.cuh" #include "ModuleConnectionMap.h" -#include "allocate.h" namespace SDL { @@ -40,6 +39,12 @@ namespace SDL InnerPixelLayer }; + extern std::map * detIdToIndex; + extern std::map *module_x; + extern std::map *module_y; + extern std::map *module_z; + extern std::map *module_type; + struct objectRanges { int* hitRanges; @@ -141,48 +146,48 @@ namespace SDL Buf device_nTotalQuints_buf; template - objectRangesBuffer(unsigned int nModules, - unsigned int nLowerModules, + objectRangesBuffer(unsigned int nMod, + unsigned int nLowerMod, TDevAcc const & devAccIn, TQueue& queue) : - hitRanges_buf(allocBufWrapper(devAccIn, nModules*2)), - hitRangesLower_buf(allocBufWrapper(devAccIn, nModules)), - hitRangesUpper_buf(allocBufWrapper(devAccIn, nModules)), - hitRangesnLower_buf(allocBufWrapper(devAccIn, nModules)), - hitRangesnUpper_buf(allocBufWrapper(devAccIn, nModules)), - mdRanges_buf(allocBufWrapper(devAccIn, nModules*2)), - segmentRanges_buf(allocBufWrapper(devAccIn, nModules*2)), - trackletRanges_buf(allocBufWrapper(devAccIn, nModules*2)), - tripletRanges_buf(allocBufWrapper(devAccIn, nModules*2)), - trackCandidateRanges_buf(allocBufWrapper(devAccIn, nModules*2)), - quintupletRanges_buf(allocBufWrapper(devAccIn, nModules*2)), + hitRanges_buf(allocBufWrapper(devAccIn, nMod*2)), + hitRangesLower_buf(allocBufWrapper(devAccIn, nMod)), + hitRangesUpper_buf(allocBufWrapper(devAccIn, nMod)), + hitRangesnLower_buf(allocBufWrapper(devAccIn, nMod)), + hitRangesnUpper_buf(allocBufWrapper(devAccIn, nMod)), + mdRanges_buf(allocBufWrapper(devAccIn, nMod*2)), + segmentRanges_buf(allocBufWrapper(devAccIn, nMod*2)), + trackletRanges_buf(allocBufWrapper(devAccIn, nMod*2)), + tripletRanges_buf(allocBufWrapper(devAccIn, nMod*2)), + trackCandidateRanges_buf(allocBufWrapper(devAccIn, nMod*2)), + quintupletRanges_buf(allocBufWrapper(devAccIn, nMod*2)), nEligibleT5Modules_buf(allocBufWrapper(devAccIn, 1)), - indicesOfEligibleT5Modules_buf(allocBufWrapper(devAccIn, nLowerModules)), - quintupletModuleIndices_buf(allocBufWrapper(devAccIn, nLowerModules)), - quintupletModuleOccupancy_buf(allocBufWrapper(devAccIn, nLowerModules)), - miniDoubletModuleIndices_buf(allocBufWrapper(devAccIn, nLowerModules+1)), - miniDoubletModuleOccupancy_buf(allocBufWrapper(devAccIn, nLowerModules+1)), - segmentModuleIndices_buf(allocBufWrapper(devAccIn, nLowerModules+1)), - segmentModuleOccupancy_buf(allocBufWrapper(devAccIn, nLowerModules+1)), - tripletModuleIndices_buf(allocBufWrapper(devAccIn, nLowerModules)), - tripletModuleOccupancy_buf(allocBufWrapper(devAccIn, nLowerModules)), + indicesOfEligibleT5Modules_buf(allocBufWrapper(devAccIn, nLowerMod)), + quintupletModuleIndices_buf(allocBufWrapper(devAccIn, nLowerMod)), + quintupletModuleOccupancy_buf(allocBufWrapper(devAccIn, nLowerMod)), + miniDoubletModuleIndices_buf(allocBufWrapper(devAccIn, nLowerMod+1)), + miniDoubletModuleOccupancy_buf(allocBufWrapper(devAccIn, nLowerMod+1)), + segmentModuleIndices_buf(allocBufWrapper(devAccIn, nLowerMod+1)), + segmentModuleOccupancy_buf(allocBufWrapper(devAccIn, nLowerMod+1)), + tripletModuleIndices_buf(allocBufWrapper(devAccIn, nLowerMod)), + tripletModuleOccupancy_buf(allocBufWrapper(devAccIn, nLowerMod)), device_nTotalMDs_buf(allocBufWrapper(devAccIn, 1)), device_nTotalSegs_buf(allocBufWrapper(devAccIn, 1)), device_nTotalTrips_buf(allocBufWrapper(devAccIn, 1)), device_nTotalQuints_buf(allocBufWrapper(devAccIn, 1)) { - alpaka::memset(queue, hitRanges_buf, -1, nModules*2); - alpaka::memset(queue, hitRangesLower_buf, -1, nModules); - alpaka::memset(queue, hitRangesUpper_buf, -1, nModules); - alpaka::memset(queue, hitRangesnLower_buf, -1, nModules); - alpaka::memset(queue, hitRangesnUpper_buf, -1, nModules); - alpaka::memset(queue, mdRanges_buf, -1, nModules*2); - alpaka::memset(queue, segmentRanges_buf, -1, nModules*2); - alpaka::memset(queue, trackletRanges_buf, -1, nModules*2); - alpaka::memset(queue, tripletRanges_buf, -1, nModules*2); - alpaka::memset(queue, trackCandidateRanges_buf, -1, nModules*2); - alpaka::memset(queue, quintupletRanges_buf, -1, nModules*2); - alpaka::memset(queue, quintupletModuleIndices_buf, -1, nLowerModules); + alpaka::memset(queue, hitRanges_buf, -1, nMod*2); + alpaka::memset(queue, hitRangesLower_buf, -1, nMod); + alpaka::memset(queue, hitRangesUpper_buf, -1, nMod); + alpaka::memset(queue, hitRangesnLower_buf, -1, nMod); + alpaka::memset(queue, hitRangesnUpper_buf, -1, nMod); + alpaka::memset(queue, mdRanges_buf, -1, nMod*2); + alpaka::memset(queue, segmentRanges_buf, -1, nMod*2); + alpaka::memset(queue, trackletRanges_buf, -1, nMod*2); + alpaka::memset(queue, tripletRanges_buf, -1, nMod*2); + alpaka::memset(queue, trackCandidateRanges_buf, -1, nMod*2); + alpaka::memset(queue, quintupletRanges_buf, -1, nMod*2); + alpaka::memset(queue, quintupletModuleIndices_buf, -1, nLowerMod); alpaka::wait(queue); } }; @@ -214,20 +219,171 @@ namespace SDL ModuleType* moduleType; ModuleLayerType* moduleLayerType; - unsigned int parsePartnerModuleId(unsigned int detId, bool isLowerx, bool isInvertedx); + unsigned int* connectedPixels; - bool parseIsInverted(short subdet, short side, short module, short layer); - bool parseIsLower(bool isInvertedx,unsigned int detId); + bool parseIsInverted(short subdet, short side, short module, short layer) + { + if (subdet == Endcap) + { + if (side == NegZ) + { + return module % 2 == 1; + } + else if (side == PosZ) + { + return module % 2 == 0; + } + else + { + return 0; + } + } + else if (subdet == Barrel) + { + if (side == Center) + { + if (layer <= 3) + { + return module % 2 == 1; + } + else if (layer >= 4) + { + return module % 2 == 0; + } + else + { + return 0; + } + } + else if (side == NegZ or side == PosZ) + { + if (layer <= 2) + { + return module % 2 == 1; + } + else if (layer == 3) + { + return module % 2 == 0; + } + else + { + return 0; + } + } + else + { + return 0; + } + } + else + { + return 0; + } + }; - unsigned int* connectedPixels; - unsigned int* connectedPixelsIndex; - unsigned int* connectedPixelsSizes; - unsigned int* connectedPixelsPos; - unsigned int* connectedPixelsIndexPos; - unsigned int* connectedPixelsSizesPos; - unsigned int* connectedPixelsNeg; - unsigned int* connectedPixelsIndexNeg; - unsigned int* connectedPixelsSizesNeg; + bool parseIsLower(bool isInvertedx, unsigned int detId) + { + return (isInvertedx) ? !(detId & 1) : (detId & 1); + }; + + unsigned int parsePartnerModuleId(unsigned int detId, bool isLowerx, bool isInvertedx) + { + return isLowerx ? (isInvertedx ? detId - 1 : detId + 1) : (isInvertedx ? detId + 1 : detId - 1); + }; + + template + void setData(TBuff& modulesbuf) + { + detIds = alpaka::getPtrNative(modulesbuf.detIds_buf); + moduleMap = alpaka::getPtrNative(modulesbuf.moduleMap_buf); + mapdetId = alpaka::getPtrNative(modulesbuf.mapdetId_buf); + mapIdx = alpaka::getPtrNative(modulesbuf.mapIdx_buf); + nConnectedModules = alpaka::getPtrNative(modulesbuf.nConnectedModules_buf); + drdzs = alpaka::getPtrNative(modulesbuf.drdzs_buf); + slopes = alpaka::getPtrNative(modulesbuf.slopes_buf); + nModules = alpaka::getPtrNative(modulesbuf.nModules_buf); + nLowerModules = alpaka::getPtrNative(modulesbuf.nLowerModules_buf); + partnerModuleIndices = alpaka::getPtrNative(modulesbuf.partnerModuleIndices_buf); + + layers = alpaka::getPtrNative(modulesbuf.layers_buf); + rings = alpaka::getPtrNative(modulesbuf.rings_buf); + modules = alpaka::getPtrNative(modulesbuf.modules_buf); + rods = alpaka::getPtrNative(modulesbuf.rods_buf); + subdets = alpaka::getPtrNative(modulesbuf.subdets_buf); + sides = alpaka::getPtrNative(modulesbuf.sides_buf); + eta = alpaka::getPtrNative(modulesbuf.eta_buf); + r = alpaka::getPtrNative(modulesbuf.r_buf); + isInverted = alpaka::getPtrNative(modulesbuf.isInverted_buf); + isLower = alpaka::getPtrNative(modulesbuf.isLower_buf); + isAnchor = alpaka::getPtrNative(modulesbuf.isAnchor_buf); + moduleType = alpaka::getPtrNative(modulesbuf.moduleType_buf); + moduleLayerType = alpaka::getPtrNative(modulesbuf.moduleLayerType_buf); + + connectedPixels = alpaka::getPtrNative(modulesbuf.connectedPixels_buf); + } + }; + + template + struct modulesBuffer : modules + { + Buf detIds_buf; + Buf moduleMap_buf; + Buf mapdetId_buf; + Buf mapIdx_buf; + Buf nConnectedModules_buf; + Buf drdzs_buf; + Buf slopes_buf; + Buf nModules_buf; + Buf nLowerModules_buf; + Buf partnerModuleIndices_buf; + + Buf layers_buf; + Buf rings_buf; + Buf modules_buf; + Buf rods_buf; + Buf subdets_buf; + Buf sides_buf; + Buf eta_buf; + Buf r_buf; + Buf isInverted_buf; + Buf isLower_buf; + Buf isAnchor_buf; + Buf moduleType_buf; + Buf moduleLayerType_buf; + + Buf connectedPixels_buf; + + template + modulesBuffer(TDevAcc const & devAccIn, + unsigned int nMod = modules_size, + unsigned int nPixs = pix_tot) : + detIds_buf(allocBufWrapper(devAccIn, nMod)), + moduleMap_buf(allocBufWrapper(devAccIn, nMod * 40)), + mapdetId_buf(allocBufWrapper(devAccIn, nMod)), + mapIdx_buf(allocBufWrapper(devAccIn, nMod)), + nConnectedModules_buf(allocBufWrapper(devAccIn, nMod)), + drdzs_buf(allocBufWrapper(devAccIn, nMod)), + slopes_buf(allocBufWrapper(devAccIn, nMod)), + nModules_buf(allocBufWrapper(devAccIn, 1)), + nLowerModules_buf(allocBufWrapper(devAccIn, 1)), + partnerModuleIndices_buf(allocBufWrapper(devAccIn, nMod)), + + layers_buf(allocBufWrapper(devAccIn, nMod)), + rings_buf(allocBufWrapper(devAccIn, nMod)), + modules_buf(allocBufWrapper(devAccIn, nMod)), + rods_buf(allocBufWrapper(devAccIn, nMod)), + subdets_buf(allocBufWrapper(devAccIn, nMod)), + sides_buf(allocBufWrapper(devAccIn, nMod)), + eta_buf(allocBufWrapper(devAccIn, nMod)), + r_buf(allocBufWrapper(devAccIn, nMod)), + isInverted_buf(allocBufWrapper(devAccIn, nMod)), + isLower_buf(allocBufWrapper(devAccIn, nMod)), + isAnchor_buf(allocBufWrapper(devAccIn, nMod)), + moduleType_buf(allocBufWrapper(devAccIn, nMod)), + moduleLayerType_buf(allocBufWrapper(devAccIn, nMod)), + + connectedPixels_buf(allocBufWrapper(devAccIn, nPixs)) + {} }; // PixelMap is never allocated on the device. @@ -267,20 +423,428 @@ namespace SDL } }; - extern std::map * detIdToIndex; - extern std::map *module_x; - extern std::map *module_y; - extern std::map *module_z; - extern std::map *module_type; - - void loadModulesFromFile(struct modules& modulesInGPU, uint16_t& nModules,uint16_t& nLowerModules,struct pixelMap& pixelMapping,cudaStream_t stream, const char* moduleMetaDataFilePath="data/centroid.txt"); - void createModulesInExplicitMemory(struct modules& modulesInGPU,unsigned int nModules,cudaStream_t stream); - void freeModules(struct modules& modulesInGPU); - void fillPixelMap(struct modules& modulesInGPU,struct pixelMap& pixelMapping,cudaStream_t stream); - void fillConnectedModuleArrayExplicit(struct modules& modulesInGPU, unsigned int nModules,cudaStream_t stream); - void fillMapArraysExplicit(struct modules& modulesInGPU, unsigned int nModules,cudaStream_t stream); - void fillConnectedModuleArray(struct modules& modulesInGPU, unsigned int nModules); - void setDerivedQuantities(unsigned int detId, unsigned short& layer, unsigned short& ring, unsigned short& rod, unsigned short& module, unsigned short& subdet, unsigned short& side, float m_x, float m_y, float m_z, float& eta, float& r); - void createRangesInExplicitMemory(struct objectRanges& rangesInGPU,unsigned int nModules,cudaStream_t stream, unsigned int nLowerModules); + template + inline void fillPixelMap(struct modulesBuffer* modulesBuf, struct pixelMap& pixelMapping, TQueue queue) + { + std::vector connectedModuleDetIds; + std::vector connectedModuleDetIds_pos; + std::vector connectedModuleDetIds_neg; + + int totalSizes = 0; + int totalSizes_pos = 0; + int totalSizes_neg = 0; + for(unsigned int isuperbin = 0; isuperbin < size_superbins; isuperbin++) + { + std::vector connectedModuleDetIds_pLStoLayer1Subdet5 = SDL::moduleConnectionMap_pLStoLayer1Subdet5.getConnectedModuleDetIds(isuperbin+size_superbins);// index adjustment to get high values + std::vector connectedModuleDetIds_pLStoLayer2Subdet5 = SDL::moduleConnectionMap_pLStoLayer2Subdet5.getConnectedModuleDetIds(isuperbin+size_superbins);// from the high pt bins + std::vector connectedModuleDetIds_pLStoLayer3Subdet5 = SDL::moduleConnectionMap_pLStoLayer3Subdet5.getConnectedModuleDetIds(isuperbin+size_superbins); + std::vector connectedModuleDetIds_pLStoLayer1Subdet4 = SDL::moduleConnectionMap_pLStoLayer1Subdet4.getConnectedModuleDetIds(isuperbin+size_superbins); + std::vector connectedModuleDetIds_pLStoLayer2Subdet4 = SDL::moduleConnectionMap_pLStoLayer2Subdet4.getConnectedModuleDetIds(isuperbin+size_superbins); + std::vector connectedModuleDetIds_pLStoLayer3Subdet4 = SDL::moduleConnectionMap_pLStoLayer3Subdet4.getConnectedModuleDetIds(isuperbin+size_superbins); + std::vector connectedModuleDetIds_pLStoLayer4Subdet4 = SDL::moduleConnectionMap_pLStoLayer4Subdet4.getConnectedModuleDetIds(isuperbin+size_superbins); + connectedModuleDetIds.insert(connectedModuleDetIds.end(),connectedModuleDetIds_pLStoLayer1Subdet5.begin(),connectedModuleDetIds_pLStoLayer1Subdet5.end()); + connectedModuleDetIds.insert(connectedModuleDetIds.end(),connectedModuleDetIds_pLStoLayer2Subdet5.begin(),connectedModuleDetIds_pLStoLayer2Subdet5.end()); + connectedModuleDetIds.insert(connectedModuleDetIds.end(),connectedModuleDetIds_pLStoLayer3Subdet5.begin(),connectedModuleDetIds_pLStoLayer3Subdet5.end()); + connectedModuleDetIds.insert(connectedModuleDetIds.end(),connectedModuleDetIds_pLStoLayer1Subdet4.begin(),connectedModuleDetIds_pLStoLayer1Subdet4.end()); + connectedModuleDetIds.insert(connectedModuleDetIds.end(),connectedModuleDetIds_pLStoLayer2Subdet4.begin(),connectedModuleDetIds_pLStoLayer2Subdet4.end()); + connectedModuleDetIds.insert(connectedModuleDetIds.end(),connectedModuleDetIds_pLStoLayer3Subdet4.begin(),connectedModuleDetIds_pLStoLayer3Subdet4.end()); + connectedModuleDetIds.insert(connectedModuleDetIds.end(),connectedModuleDetIds_pLStoLayer4Subdet4.begin(),connectedModuleDetIds_pLStoLayer4Subdet4.end()); + + int sizes = 0; + sizes += connectedModuleDetIds_pLStoLayer1Subdet5.size(); + sizes += connectedModuleDetIds_pLStoLayer2Subdet5.size(); + sizes += connectedModuleDetIds_pLStoLayer3Subdet5.size(); + sizes += connectedModuleDetIds_pLStoLayer1Subdet4.size(); + sizes += connectedModuleDetIds_pLStoLayer2Subdet4.size(); + sizes += connectedModuleDetIds_pLStoLayer3Subdet4.size(); + sizes += connectedModuleDetIds_pLStoLayer4Subdet4.size(); + pixelMapping.connectedPixelsIndex[isuperbin] = totalSizes; + pixelMapping.connectedPixelsSizes[isuperbin] = sizes; + totalSizes += sizes; + + std::vector connectedModuleDetIds_pLStoLayer1Subdet5_pos = SDL::moduleConnectionMap_pLStoLayer1Subdet5_pos.getConnectedModuleDetIds(isuperbin); + std::vector connectedModuleDetIds_pLStoLayer2Subdet5_pos = SDL::moduleConnectionMap_pLStoLayer2Subdet5_pos.getConnectedModuleDetIds(isuperbin); + std::vector connectedModuleDetIds_pLStoLayer3Subdet5_pos = SDL::moduleConnectionMap_pLStoLayer3Subdet5_pos.getConnectedModuleDetIds(isuperbin); + std::vector connectedModuleDetIds_pLStoLayer1Subdet4_pos = SDL::moduleConnectionMap_pLStoLayer1Subdet4_pos.getConnectedModuleDetIds(isuperbin); + std::vector connectedModuleDetIds_pLStoLayer2Subdet4_pos = SDL::moduleConnectionMap_pLStoLayer2Subdet4_pos.getConnectedModuleDetIds(isuperbin); + std::vector connectedModuleDetIds_pLStoLayer3Subdet4_pos = SDL::moduleConnectionMap_pLStoLayer3Subdet4_pos.getConnectedModuleDetIds(isuperbin); + std::vector connectedModuleDetIds_pLStoLayer4Subdet4_pos = SDL::moduleConnectionMap_pLStoLayer4Subdet4_pos.getConnectedModuleDetIds(isuperbin); + connectedModuleDetIds_pos.insert(connectedModuleDetIds_pos.end(),connectedModuleDetIds_pLStoLayer1Subdet5_pos.begin(),connectedModuleDetIds_pLStoLayer1Subdet5_pos.end()); + connectedModuleDetIds_pos.insert(connectedModuleDetIds_pos.end(),connectedModuleDetIds_pLStoLayer2Subdet5_pos.begin(),connectedModuleDetIds_pLStoLayer2Subdet5_pos.end()); + connectedModuleDetIds_pos.insert(connectedModuleDetIds_pos.end(),connectedModuleDetIds_pLStoLayer3Subdet5_pos.begin(),connectedModuleDetIds_pLStoLayer3Subdet5_pos.end()); + connectedModuleDetIds_pos.insert(connectedModuleDetIds_pos.end(),connectedModuleDetIds_pLStoLayer1Subdet4_pos.begin(),connectedModuleDetIds_pLStoLayer1Subdet4_pos.end()); + connectedModuleDetIds_pos.insert(connectedModuleDetIds_pos.end(),connectedModuleDetIds_pLStoLayer2Subdet4_pos.begin(),connectedModuleDetIds_pLStoLayer2Subdet4_pos.end()); + connectedModuleDetIds_pos.insert(connectedModuleDetIds_pos.end(),connectedModuleDetIds_pLStoLayer3Subdet4_pos.begin(),connectedModuleDetIds_pLStoLayer3Subdet4_pos.end()); + connectedModuleDetIds_pos.insert(connectedModuleDetIds_pos.end(),connectedModuleDetIds_pLStoLayer4Subdet4_pos.begin(),connectedModuleDetIds_pLStoLayer4Subdet4_pos.end()); + + int sizes_pos = 0; + sizes_pos += connectedModuleDetIds_pLStoLayer1Subdet5_pos.size(); + sizes_pos += connectedModuleDetIds_pLStoLayer2Subdet5_pos.size(); + sizes_pos += connectedModuleDetIds_pLStoLayer3Subdet5_pos.size(); + sizes_pos += connectedModuleDetIds_pLStoLayer1Subdet4_pos.size(); + sizes_pos += connectedModuleDetIds_pLStoLayer2Subdet4_pos.size(); + sizes_pos += connectedModuleDetIds_pLStoLayer3Subdet4_pos.size(); + sizes_pos += connectedModuleDetIds_pLStoLayer4Subdet4_pos.size(); + pixelMapping.connectedPixelsIndexPos[isuperbin] = totalSizes_pos; + pixelMapping.connectedPixelsSizesPos[isuperbin] = sizes_pos; + totalSizes_pos += sizes_pos; + + std::vector connectedModuleDetIds_pLStoLayer1Subdet5_neg = SDL::moduleConnectionMap_pLStoLayer1Subdet5_neg.getConnectedModuleDetIds(isuperbin); + std::vector connectedModuleDetIds_pLStoLayer2Subdet5_neg = SDL::moduleConnectionMap_pLStoLayer2Subdet5_neg.getConnectedModuleDetIds(isuperbin); + std::vector connectedModuleDetIds_pLStoLayer3Subdet5_neg = SDL::moduleConnectionMap_pLStoLayer3Subdet5_neg.getConnectedModuleDetIds(isuperbin); + std::vector connectedModuleDetIds_pLStoLayer1Subdet4_neg = SDL::moduleConnectionMap_pLStoLayer1Subdet4_neg.getConnectedModuleDetIds(isuperbin); + std::vector connectedModuleDetIds_pLStoLayer2Subdet4_neg = SDL::moduleConnectionMap_pLStoLayer2Subdet4_neg.getConnectedModuleDetIds(isuperbin); + std::vector connectedModuleDetIds_pLStoLayer3Subdet4_neg = SDL::moduleConnectionMap_pLStoLayer3Subdet4_neg.getConnectedModuleDetIds(isuperbin); + std::vector connectedModuleDetIds_pLStoLayer4Subdet4_neg = SDL::moduleConnectionMap_pLStoLayer4Subdet4_neg.getConnectedModuleDetIds(isuperbin); + connectedModuleDetIds_neg.insert(connectedModuleDetIds_neg.end(),connectedModuleDetIds_pLStoLayer1Subdet5_neg.begin(),connectedModuleDetIds_pLStoLayer1Subdet5_neg.end()); + connectedModuleDetIds_neg.insert(connectedModuleDetIds_neg.end(),connectedModuleDetIds_pLStoLayer2Subdet5_neg.begin(),connectedModuleDetIds_pLStoLayer2Subdet5_neg.end()); + connectedModuleDetIds_neg.insert(connectedModuleDetIds_neg.end(),connectedModuleDetIds_pLStoLayer3Subdet5_neg.begin(),connectedModuleDetIds_pLStoLayer3Subdet5_neg.end()); + connectedModuleDetIds_neg.insert(connectedModuleDetIds_neg.end(),connectedModuleDetIds_pLStoLayer1Subdet4_neg.begin(),connectedModuleDetIds_pLStoLayer1Subdet4_neg.end()); + connectedModuleDetIds_neg.insert(connectedModuleDetIds_neg.end(),connectedModuleDetIds_pLStoLayer2Subdet4_neg.begin(),connectedModuleDetIds_pLStoLayer2Subdet4_neg.end()); + connectedModuleDetIds_neg.insert(connectedModuleDetIds_neg.end(),connectedModuleDetIds_pLStoLayer3Subdet4_neg.begin(),connectedModuleDetIds_pLStoLayer3Subdet4_neg.end()); + connectedModuleDetIds_neg.insert(connectedModuleDetIds_neg.end(),connectedModuleDetIds_pLStoLayer4Subdet4_neg.begin(),connectedModuleDetIds_pLStoLayer4Subdet4_neg.end()); + + int sizes_neg = 0; + sizes_neg += connectedModuleDetIds_pLStoLayer1Subdet5_neg.size(); + sizes_neg += connectedModuleDetIds_pLStoLayer2Subdet5_neg.size(); + sizes_neg += connectedModuleDetIds_pLStoLayer3Subdet5_neg.size(); + sizes_neg += connectedModuleDetIds_pLStoLayer1Subdet4_neg.size(); + sizes_neg += connectedModuleDetIds_pLStoLayer2Subdet4_neg.size(); + sizes_neg += connectedModuleDetIds_pLStoLayer3Subdet4_neg.size(); + sizes_neg += connectedModuleDetIds_pLStoLayer4Subdet4_neg.size(); + pixelMapping.connectedPixelsIndexNeg[isuperbin] = totalSizes_neg; + pixelMapping.connectedPixelsSizesNeg[isuperbin] = sizes_neg; + totalSizes_neg += sizes_neg; + } + + auto connectedPixels_buf = allocBufWrapper(devHost, totalSizes + totalSizes_pos + totalSizes_neg); + unsigned int* connectedPixels = alpaka::getPtrNative(connectedPixels_buf); + + for(int icondet = 0; icondet < totalSizes; icondet++) + { + connectedPixels[icondet] = (*detIdToIndex)[connectedModuleDetIds[icondet]]; + } + for(int icondet = 0; icondet < totalSizes_pos; icondet++) + { + connectedPixels[icondet+totalSizes] = (*detIdToIndex)[connectedModuleDetIds_pos[icondet]]; + } + for(int icondet = 0; icondet < totalSizes_neg; icondet++) + { + connectedPixels[icondet+totalSizes+totalSizes_pos] = (*detIdToIndex)[connectedModuleDetIds_neg[icondet]]; + } + + alpaka::memcpy(queue, modulesBuf->connectedPixels_buf, connectedPixels_buf, totalSizes + totalSizes_pos + totalSizes_neg); + alpaka::wait(queue); + }; + + template + inline void fillConnectedModuleArrayExplicit(struct modulesBuffer* modulesBuf, unsigned int nMod, TQueue queue) + { + auto moduleMap_buf = allocBufWrapper(devHost, nMod * 40); + uint16_t* moduleMap = alpaka::getPtrNative(moduleMap_buf); + + auto nConnectedModules_buf = allocBufWrapper(devHost, nMod); + uint16_t* nConnectedModules = alpaka::getPtrNative(nConnectedModules_buf); + + for(auto it = (*detIdToIndex).begin(); it != (*detIdToIndex).end(); ++it) + { + unsigned int detId = it->first; + uint16_t index = it->second; + auto& connectedModules = moduleConnectionMap.getConnectedModuleDetIds(detId); + nConnectedModules[index] = connectedModules.size(); + for(uint16_t i = 0; i< nConnectedModules[index];i++) + { + moduleMap[index * 40 + i] = (*detIdToIndex)[connectedModules[i]]; + } + } + + alpaka::memcpy(queue, modulesBuf->moduleMap_buf, moduleMap_buf, nMod * 40); + alpaka::memcpy(queue, modulesBuf->nConnectedModules_buf, nConnectedModules_buf, nMod); + alpaka::wait(queue); + }; + + template + inline void fillMapArraysExplicit(struct modulesBuffer* modulesBuf, unsigned int nMod, TQueue queue) + { + auto mapIdx_buf = allocBufWrapper(devHost, nMod); + uint16_t* mapIdx = alpaka::getPtrNative(mapIdx_buf); + + auto mapdetId_buf = allocBufWrapper(devHost, nMod); + unsigned int* mapdetId = alpaka::getPtrNative(mapdetId_buf); + + unsigned int counter = 0; + for(auto it = (*detIdToIndex).begin(); it != (*detIdToIndex).end(); ++it) + { + unsigned int detId = it->first; + unsigned int index = it->second; + mapIdx[counter] = index; + mapdetId[counter] = detId; + counter++; + } + + alpaka::memcpy(queue, modulesBuf->mapIdx_buf, mapIdx_buf, nMod); + alpaka::memcpy(queue, modulesBuf->mapdetId_buf, mapdetId_buf, nMod); + alpaka::wait(queue); + }; + + inline void setDerivedQuantities(unsigned int detId, unsigned short& layer, unsigned short& ring, unsigned short& rod, unsigned short& module, unsigned short& subdet, unsigned short& side, float m_x, float m_y, float m_z, float& eta, float& r) + { + subdet = (detId & (7 << 25)) >> 25; + side = (subdet == Endcap) ? (detId & (3 << 23)) >> 23 : (detId & (3 << 18)) >> 18; + layer = (subdet == Endcap) ? (detId & (7 << 18)) >> 18 : (detId & (7 << 20)) >> 20; + ring = (subdet == Endcap) ? (detId & (15 << 12)) >> 12 : 0; + module = (detId & (127 << 2)) >> 2; + rod = (subdet == Endcap) ? 0 : (detId & (127 << 10)) >> 10; + + r = std::sqrt(m_x * m_x + m_y * m_y + m_z * m_z); + eta = ((m_z > 0) - ( m_z < 0)) * std::acosh(r / std::sqrt(m_x * m_x + m_y * m_y)); + }; + + template + void loadModulesFromFile(struct modules* modulesInGPU, + struct modulesBuffer* modulesBuf, + uint16_t& nModules, + uint16_t& nLowerModules, + struct pixelMap& pixelMapping, + cudaStream_t stream, + TQueue& queue, + const char* moduleMetaDataFilePath) + { + detIdToIndex = new std::map; + module_x = new std::map; + module_y = new std::map; + module_z = new std::map; + module_type = new std::map; + + /* Load the whole text file into the map first*/ + + std::ifstream ifile; + ifile.open(moduleMetaDataFilePath); + if(!ifile.is_open()) + { + std::cout<<"ERROR! module list file not present!"<4) + break; + } + + } + (*detIdToIndex)[1] = counter; //pixel module is the last module in the module list + counter++; + nModules = counter; + + auto detIds_buf = allocBufWrapper(devHost, nModules); + auto layers_buf = allocBufWrapper(devHost, nModules); + auto rings_buf = allocBufWrapper(devHost, nModules); + auto rods_buf = allocBufWrapper(devHost, nModules); + auto modules_buf = allocBufWrapper(devHost, nModules); + auto subdets_buf = allocBufWrapper(devHost, nModules); + auto sides_buf = allocBufWrapper(devHost, nModules); + auto eta_buf = allocBufWrapper(devHost, nModules); + auto r_buf = allocBufWrapper(devHost, nModules); + auto isInverted_buf = allocBufWrapper(devHost, nModules); + auto isLower_buf = allocBufWrapper(devHost, nModules); + auto isAnchor_buf = allocBufWrapper(devHost, nModules); + auto moduleType_buf = allocBufWrapper(devHost, nModules); + auto moduleLayerType_buf = allocBufWrapper(devHost, nModules); + auto slopes_buf = allocBufWrapper(devHost, nModules); + auto drdzs_buf = allocBufWrapper(devHost, nModules); + auto partnerModuleIndices_buf = allocBufWrapper(devHost, nModules); + + // Getting the underlying data pointers + unsigned int* host_detIds = alpaka::getPtrNative(detIds_buf); + short* host_layers = alpaka::getPtrNative(layers_buf); + short* host_rings = alpaka::getPtrNative(rings_buf); + short* host_rods = alpaka::getPtrNative(rods_buf); + short* host_modules = alpaka::getPtrNative(modules_buf); + short* host_subdets = alpaka::getPtrNative(subdets_buf); + short* host_sides = alpaka::getPtrNative(sides_buf); + float* host_eta = alpaka::getPtrNative(eta_buf); + float* host_r = alpaka::getPtrNative(r_buf); + bool* host_isInverted = alpaka::getPtrNative(isInverted_buf); + bool* host_isLower = alpaka::getPtrNative(isLower_buf); + bool* host_isAnchor = alpaka::getPtrNative(isAnchor_buf); + ModuleType* host_moduleType = alpaka::getPtrNative(moduleType_buf); + ModuleLayerType* host_moduleLayerType = alpaka::getPtrNative(moduleLayerType_buf); + float* host_slopes = alpaka::getPtrNative(slopes_buf); + float* host_drdzs = alpaka::getPtrNative(drdzs_buf); + uint16_t* host_partnerModuleIndices = alpaka::getPtrNative(partnerModuleIndices_buf); + + //reassign detIdToIndex indices here + nLowerModules = (nModules - 1) / 2; + uint16_t lowerModuleCounter = 0; + uint16_t upperModuleCounter = nLowerModules + 1; + //0 to nLowerModules - 1 => only lower modules, nLowerModules - pixel module, nLowerModules + 1 to nModules => upper modules + for(auto it = (*detIdToIndex).begin(); it != (*detIdToIndex).end(); it++) + { + unsigned int detId = it->first; + float m_x = (*module_x)[detId]; + float m_y = (*module_y)[detId]; + float m_z = (*module_z)[detId]; + unsigned int m_t = (*module_type)[detId]; + + float eta,r; + + uint16_t index; + unsigned short layer,ring,rod,module,subdet,side; + bool isInverted, isLower; + if(detId == 1) + { + layer = 0; + ring = 0; + rod = 0; + module = 0; + subdet = 0; + side = 0; + isInverted = false; + isLower = false; + } + else + { + setDerivedQuantities(detId,layer,ring,rod,module,subdet,side,m_x,m_y,m_z,eta,r); + isInverted = modulesInGPU->parseIsInverted(subdet, side, module, layer); + isLower = modulesInGPU->parseIsLower(isInverted, detId); + } + if(isLower) + { + index = lowerModuleCounter; + lowerModuleCounter++; + } + else if(detId != 1) + { + index = upperModuleCounter; + upperModuleCounter++; + } + else + { + index = nLowerModules; //pixel + } + //reassigning indices! + (*detIdToIndex)[detId] = index; + host_detIds[index] = detId; + host_layers[index] = layer; + host_rings[index] = ring; + host_rods[index] = rod; + host_modules[index] = module; + host_subdets[index] = subdet; + host_sides[index] = side; + host_eta[index] = eta; + host_r[index] = r; + host_isInverted[index] = isInverted; + host_isLower[index] = isLower; + + //assigning other variables! + if(detId == 1) + { + host_moduleType[index] = PixelModule; + host_moduleLayerType[index] = SDL::InnerPixelLayer; + host_slopes[index] = 0; + host_drdzs[index] = 0; + host_isAnchor[index] = false; + } + else + { + host_moduleType[index] = ( m_t == 25 ? SDL::TwoS : SDL::PS ); + host_moduleLayerType[index] = ( m_t == 23 ? SDL::Pixel : SDL::Strip ); + + if(host_moduleType[index] == SDL::PS and host_moduleLayerType[index] == SDL::Pixel) + { + host_isAnchor[index] = true; + } + else if(host_moduleType[index] == SDL::TwoS and host_isLower[index]) + { + host_isAnchor[index] = true; + } + else + { + host_isAnchor[index] = false; + } + + host_slopes[index] = (subdet == Endcap) ? endcapGeometry.getSlopeLower(detId) : tiltedGeometry.getSlope(detId); + host_drdzs[index] = (subdet == Barrel) ? tiltedGeometry.getDrDz(detId) : 0; + } + } + + //partner module stuff, and slopes and drdz move around + for(auto it = (*detIdToIndex).begin(); it != (*detIdToIndex).end(); it++) + { + auto& detId = it->first; + auto& index = it->second; + if(detId != 1) + { + host_partnerModuleIndices[index] = (*detIdToIndex)[modulesInGPU->parsePartnerModuleId(detId, host_isLower[index], host_isInverted[index])]; + //add drdz and slope importing stuff here! + if(host_drdzs[index] == 0) + { + host_drdzs[index] = host_drdzs[host_partnerModuleIndices[index]]; + } + if(host_slopes[index] == 0) + { + host_slopes[index] = host_slopes[host_partnerModuleIndices[index]]; + } + } + } + + cudaMemcpyAsync(modulesInGPU->nModules,&nModules,sizeof(uint16_t),cudaMemcpyHostToDevice,stream); + cudaMemcpyAsync(modulesInGPU->nLowerModules,&nLowerModules,sizeof(uint16_t),cudaMemcpyHostToDevice,stream); + cudaStreamSynchronize(stream); + + cudaMemcpyAsync(modulesInGPU->moduleType,host_moduleType,sizeof(ModuleType)*nModules,cudaMemcpyHostToDevice,stream); + cudaMemcpyAsync(modulesInGPU->moduleLayerType,host_moduleLayerType,sizeof(ModuleLayerType)*nModules,cudaMemcpyHostToDevice,stream); + cudaStreamSynchronize(stream); + + //alpaka::memcpy(queue, modulesBuf->moduleType_buf, moduleType_buf, nModules); + //alpaka::memcpy(queue, modulesBuf->moduleLayerType_buf, moduleLayerType_buf, nModules); + + alpaka::memcpy(queue, modulesBuf->detIds_buf, detIds_buf, nModules); + alpaka::memcpy(queue, modulesBuf->layers_buf, layers_buf, nModules); + alpaka::memcpy(queue, modulesBuf->rings_buf, rings_buf, nModules); + alpaka::memcpy(queue, modulesBuf->rods_buf, rods_buf, nModules); + alpaka::memcpy(queue, modulesBuf->modules_buf, modules_buf, nModules); + alpaka::memcpy(queue, modulesBuf->subdets_buf, subdets_buf, nModules); + alpaka::memcpy(queue, modulesBuf->sides_buf, sides_buf, nModules); + alpaka::memcpy(queue, modulesBuf->eta_buf, eta_buf, nModules); + alpaka::memcpy(queue, modulesBuf->r_buf, r_buf, nModules); + alpaka::memcpy(queue, modulesBuf->isInverted_buf, isInverted_buf, nModules); + alpaka::memcpy(queue, modulesBuf->isLower_buf, isLower_buf, nModules); + alpaka::memcpy(queue, modulesBuf->isAnchor_buf, isAnchor_buf, nModules); + alpaka::memcpy(queue, modulesBuf->slopes_buf, slopes_buf, nModules); + alpaka::memcpy(queue, modulesBuf->drdzs_buf, drdzs_buf, nModules); + alpaka::memcpy(queue, modulesBuf->partnerModuleIndices_buf, partnerModuleIndices_buf, nModules); + alpaka::wait(queue); + + fillConnectedModuleArrayExplicit(modulesBuf, nModules, queue); + fillMapArraysExplicit(modulesBuf, nModules, queue); + fillPixelMap(modulesBuf, pixelMapping, queue); + }; } #endif diff --git a/SDL/allocate.cc b/SDL/allocate.cc deleted file mode 100644 index 1e926fb9..00000000 --- a/SDL/allocate.cc +++ /dev/null @@ -1,66 +0,0 @@ -#include - -#include "allocate.h" -#include "cudaCheck.h" - -#include "getCachingAllocator.h" - -namespace { - const size_t maxAllocationSize = - notcub::CachingDeviceAllocator::IntPow(cms::cuda::allocator::binGrowth, cms::cuda::allocator::maxBin); -} - -namespace cms::cuda { - // void *allocate_managed(unsigned int nbytes, cudaStream_t stream) { - void *allocate_managed(size_t nbytes, cudaStream_t stream) { - void *ptr = nullptr; -// if constexpr (allocator::useCaching) { - if (nbytes > maxAllocationSize) { - throw std::runtime_error("Tried to allocate " + std::to_string(nbytes) + - " bytes, but the allocator maximum is " + std::to_string(maxAllocationSize)); - } - cudaCheck(allocator::getCachingManagedAllocator().ManagedAllocate(&ptr, nbytes, stream)); -// } else { -// cudaCheck(cudaMallocManaged(&ptr, nbytes)); -// } - return ptr; - } - - void free_managed(void *ptr) { - //if constexpr (allocator::useCaching) { - cudaCheck(allocator::getCachingManagedAllocator().ManagedFree(ptr)); - //} else { - // cudaCheck(cudaFree(ptr)); - //} - } - - void *allocate_device(int dev, size_t nbytes, cudaStream_t stream) { - void *ptr = nullptr; - if (nbytes > maxAllocationSize) { - std::cout<<"at stream"< maxAllocationSize) { - throw std::runtime_error("allocate_host: Tried to allocate " + std::to_string(nbytes) + - " bytes, but the allocator maximum is " + std::to_string(maxAllocationSize)); - } - cudaCheck(allocator::getCachingHostAllocator().HostAllocate(&ptr, nbytes, stream)); - return ptr; - } - - void free_host(void *ptr) { - cudaCheck(allocator::getCachingHostAllocator().HostFree(ptr)); - } - -} // namespace cms::cuda diff --git a/SDL/allocate.h b/SDL/allocate.h deleted file mode 100644 index 2a3698de..00000000 --- a/SDL/allocate.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef HeterogeneousCore_CUDAUtilities_allocate_managed_h -#define HeterogeneousCore_CUDAUtilities_allocate_managed_h - -#include - -namespace cms { - namespace cuda { - // Allocate managed memory (to be called from unique_ptr) - //void *allocate_managed(unsigned int nbytes, cudaStream_t stream); - void *allocate_managed(size_t nbytes, cudaStream_t stream); - void *allocate_device(int dev, size_t nbytes, cudaStream_t stream); - void *allocate_host(size_t nbytes, cudaStream_t stream); - - // Free managed memory (to be called from unique_ptr) - void free_managed(void *ptr); - void free_device(int dev, void *ptr); - void free_host(void *ptr); - } // namespace cuda -} // namespace cms - -#endif diff --git a/SDL/cudaCheck.h b/SDL/cudaCheck.h deleted file mode 100644 index 821bfcff..00000000 --- a/SDL/cudaCheck.h +++ /dev/null @@ -1,61 +0,0 @@ -#ifndef HeterogeneousCore_CUDAUtilities_cudaCheck_h -#define HeterogeneousCore_CUDAUtilities_cudaCheck_h - -// C++ standard headers -#include -#include -#include - -// CUDA headers -#include -#include - -namespace cms { - namespace cuda { - - [[noreturn]] inline void abortOnCudaError(const char* file, - int line, - const char* cmd, - const char* error, - const char* message, - const char* description = nullptr) { - std::ostringstream out; - out << "\n"; - out << file << ", line " << line << ":\n"; - out << "cudaCheck(" << cmd << ");\n"; - out << error << ": " << message << "\n"; - if (description) - out << description << "\n"; - throw std::runtime_error(out.str()); - } - - inline bool cudaCheck_( - const char* file, int line, const char* cmd, CUresult result, const char* description = nullptr) { - if (result == CUDA_SUCCESS) - return true; - - const char* error; - const char* message; - cuGetErrorName(result, &error); - cuGetErrorString(result, &message); - abortOnCudaError(file, line, cmd, error, message, description); - return false; - } - - inline bool cudaCheck_( - const char* file, int line, const char* cmd, cudaError_t result, const char* description = nullptr) { - if (result == cudaSuccess) - return true; - - const char* error = cudaGetErrorName(result); - const char* message = cudaGetErrorString(result); - abortOnCudaError(file, line, cmd, error, message, description); - return false; - } - - } // namespace cuda -} // namespace cms - -#define cudaCheck(ARG, ...) (cms::cuda::cudaCheck_(__FILE__, __LINE__, #ARG, (ARG), ##__VA_ARGS__)) - -#endif // HeterogeneousCore_CUDAUtilities_cudaCheck_h diff --git a/SDL/deviceCount.h b/SDL/deviceCount.h deleted file mode 100644 index 407f6093..00000000 --- a/SDL/deviceCount.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef HeterogenousCore_CUDAUtilities_deviceCount_h -#define HeterogenousCore_CUDAUtilities_deviceCount_h - -#include "cudaCheck.h" - -#include - -namespace cms { - namespace cuda { - inline int deviceCount() { - int ndevices; - cudaCheck(cudaGetDeviceCount(&ndevices)); - return ndevices; - } - } // namespace cuda -} // namespace cms - -#endif diff --git a/SDL/getCachingAllocator.h b/SDL/getCachingAllocator.h deleted file mode 100644 index c81afecd..00000000 --- a/SDL/getCachingAllocator.h +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef HeterogeneousCore_CUDACore_src_getCachingAllocator -#define HeterogeneousCore_CUDACore_src_getCachingAllocator - -#include "cudaCheck.h" -#include "deviceCount.h" -#include "CachingDeviceAllocator.h" -#include "CachingManagedAllocator.h" -#include "CachingHostAllocator.h" - -namespace cms::cuda::allocator { - // Use caching or not - constexpr bool useCaching = true; - // Growth factor (bin_growth in cub::CachingDeviceAllocator - constexpr unsigned int binGrowth = 2;//9; - // Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CacingDeviceAllocator - constexpr unsigned int minBin = 8;//1; - // Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail. - constexpr unsigned int maxBin = 30;//10; - // Total storage for the allocator. 0 means no limit. - constexpr size_t maxCachedBytes = 0; - // Fraction of total device memory taken for the allocator. In case there are multiple devices with different amounts of memory, the smallest of them is taken. If maxCachedBytes is non-zero, the smallest of them is taken. - constexpr double maxCachedFraction = 0.8; - constexpr bool debug = false; - - inline size_t minCachedBytes() { - size_t ret = std::numeric_limits::max(); - int currentDevice; - cudaCheck(cudaGetDevice(¤tDevice)); - const int numberOfDevices = deviceCount(); - for (int i = 0; i < numberOfDevices; ++i) { - size_t freeMemory, totalMemory; - cudaCheck(cudaSetDevice(i)); - cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory)); - ret = std::min(ret, static_cast(maxCachedFraction * freeMemory)); - } - cudaCheck(cudaSetDevice(currentDevice)); - if (maxCachedBytes > 0) { - ret = std::min(ret, maxCachedBytes); - } - return ret; - } - - inline notcub::CachingDeviceAllocator& getCachingDeviceAllocator() { - // the public interface is thread safe - static notcub::CachingDeviceAllocator allocator{binGrowth, - minBin, - maxBin, - minCachedBytes(), - false, // do not skip cleanup - debug}; - return allocator; - } - - inline notcub::CachingManagedAllocator& getCachingManagedAllocator() { - static notcub::CachingManagedAllocator allocator{binGrowth, - minBin, - maxBin, - minCachedBytes(), - false, // do not skip cleanup - debug}; - return allocator; - } - - inline notcub::CachingHostAllocator& getCachingHostAllocator() { - static notcub::CachingHostAllocator allocator{binGrowth, - minBin, - maxBin, - minCachedBytes(), - false, // do not skip cleanup - debug}; - return allocator; - } -} // namespace cms::cuda::allocator - -#endif diff --git a/bin/sdl.cc b/bin/sdl.cc index beb15dc0..74ff1fd4 100644 --- a/bin/sdl.cc +++ b/bin/sdl.cc @@ -506,8 +506,6 @@ void run_sdl() printTimingInformation(timevec, full_elapsed, avg_elapsed); - SDL::cleanModules(); - if (ana.do_write_ntuple) { // Writing ttree output to file diff --git a/code/core/AccessHelper.cc b/code/core/AccessHelper.cc index f139982b..eaa33df4 100644 --- a/code/core/AccessHelper.cc +++ b/code/core/AccessHelper.cc @@ -31,7 +31,7 @@ std::vector getPixelHitsFrompLS(SDL::Event* event, unsigned int pL SDL::segmentsBuffer& segments_ = *(event->getSegments()); SDL::miniDoubletsBuffer& miniDoublets_ = *(event->getMiniDoublets()); SDL::objectRangesBuffer& rangesInGPU = (*event->getRanges()); - SDL::modules& modulesInGPU = (*event->getModules()); + SDL::modulesBuffer& modulesInGPU = (*event->getModules()); const unsigned int pLS_offset = rangesInGPU.segmentModuleIndices[*(modulesInGPU.nLowerModules)]; unsigned int MD_1 = segments_.mdIndices[2 * (pLS + pLS_offset)]; unsigned int MD_2 = segments_.mdIndices[2 * (pLS + pLS_offset) + 1]; @@ -243,7 +243,7 @@ unsigned int getPixelLSFrompT3(SDL::Event* event, unsigned int pT3) { SDL::pixelTripletsBuffer& pixelTriplets_ = *(event->getPixelTriplets()); SDL::objectRangesBuffer& rangesInGPU = (*event->getRanges()); - SDL::modules& modulesInGPU = (*event->getModules()); + SDL::modulesBuffer& modulesInGPU = (*event->getModules()); const unsigned int pLS_offset = rangesInGPU.segmentModuleIndices[*(modulesInGPU.nLowerModules)]; return pixelTriplets_.pixelSegmentIndices[pT3] - pLS_offset; } @@ -343,7 +343,7 @@ unsigned int getPixelLSFrompT5(SDL::Event* event, unsigned int pT5) { SDL::pixelQuintupletsBuffer& pixelQuintuplets_ = *(event->getPixelQuintuplets()); SDL::objectRangesBuffer& rangesInGPU = (*event->getRanges()); - SDL::modules& modulesInGPU = (*event->getModules()); + SDL::modulesBuffer& modulesInGPU = (*event->getModules()); const unsigned int pLS_offset = rangesInGPU.segmentModuleIndices[*(modulesInGPU.nLowerModules)]; return pixelQuintuplets_.pixelIndices[pT5] - pLS_offset; } diff --git a/code/core/write_sdl_ntuple.cc b/code/core/write_sdl_ntuple.cc index 33f90068..eb519b9e 100644 --- a/code/core/write_sdl_ntuple.cc +++ b/code/core/write_sdl_ntuple.cc @@ -308,7 +308,7 @@ void setPixelQuintupletOutputBranches(SDL::Event* event) SDL::pixelQuintupletsBuffer& pixelQuintupletsInGPU = (*event->getPixelQuintuplets()); SDL::quintupletsBuffer& quintupletsInGPU = (*event->getQuintuplets()); SDL::segmentsBuffer& segmentsInGPU = (*event->getSegments()); - SDL::modules& modulesInGPU = (*event->getModules()); + SDL::modulesBuffer& modulesInGPU = (*event->getModules()); int n_accepted_simtrk = ana.tx->getBranch>("sim_TC_matched").size(); const float kRinv1GeVf = (2.99792458e-3 * 3.8); @@ -393,7 +393,7 @@ void setQuintupletOutputBranches(SDL::Event* event) { SDL::quintupletsBuffer& quintupletsInGPU = (*event->getQuintuplets()); SDL::objectRangesBuffer& rangesInGPU = (*event->getRanges()); - SDL::modules& modulesInGPU = (*event->getModules()); + SDL::modulesBuffer& modulesInGPU = (*event->getModules()); const float kRinv1GeVf = (2.99792458e-3 * 3.8); int n_accepted_simtrk = ana.tx->getBranch>("sim_TC_matched").size(); @@ -475,7 +475,7 @@ void setPixelTripletOutputBranches(SDL::Event* event) { SDL::pixelTripletsBuffer& pixelTripletsInGPU = (*event->getPixelTriplets()); SDL::tripletsBuffer& tripletsInGPU = *(event->getTriplets()); - SDL::modules& modulesInGPU = *(event->getModules()); + SDL::modulesBuffer& modulesInGPU = *(event->getModules()); SDL::segmentsBuffer& segmentsInGPU = *(event->getSegments()); SDL::hitsBuffer& hitsInGPU = *(event->getHits()); int n_accepted_simtrk = ana.tx->getBranch>("sim_TC_matched").size(); @@ -562,7 +562,7 @@ void setGnnNtupleBranches(SDL::Event* event) SDL::segmentsBuffer& segmentsInGPU = (*event->getSegments()); SDL::miniDoubletsBuffer& miniDoubletsInGPU = (*event->getMiniDoublets()); SDL::hitsBuffer& hitsInGPU = (*event->getHits()); - SDL::modules& modulesInGPU = (*event->getModules()); + SDL::modulesBuffer& modulesInGPU = (*event->getModules()); SDL::objectRangesBuffer& rangesInGPU = (*event->getRanges()); SDL::trackCandidatesBuffer& trackCandidatesInGPU = (*event->getTrackCandidates()); @@ -1106,7 +1106,7 @@ float computeRadiusFromThreeAnchorHits(float x1, float y1, float x2, float y2, f //________________________________________________________________________________________________________________________________ void printHitMultiplicities(SDL::Event* event) { - SDL::modules& modulesInGPU = (*event->getModules()); + SDL::modulesBuffer& modulesInGPU = (*event->getModules()); SDL::objectRangesBuffer& rangesInGPU = (*event->getRanges()); int nHits = 0; @@ -1122,7 +1122,7 @@ void printHitMultiplicities(SDL::Event* event) void printMiniDoubletMultiplicities(SDL::Event* event) { SDL::miniDoubletsBuffer& miniDoubletsInGPU = (*event->getMiniDoublets()); - SDL::modules& modulesInGPU = (*event->getModules()); + SDL::modulesBuffer& modulesInGPU = (*event->getModules()); int nMiniDoublets = 0; int totOccupancyMiniDoublets = 0; @@ -1152,7 +1152,7 @@ void printMDs(SDL::Event* event) { SDL::miniDoubletsBuffer& miniDoubletsInGPU = (*event->getMiniDoublets()); SDL::hitsBuffer& hitsInGPU = (*event->getHits()); - SDL::modules& modulesInGPU = (*event->getModules()); + SDL::modulesBuffer& modulesInGPU = (*event->getModules()); SDL::objectRangesBuffer& rangesInGPU = (*event->getRanges()); // Then obtain the lower module index @@ -1176,7 +1176,7 @@ void printLSs(SDL::Event* event) SDL::segmentsBuffer& segmentsInGPU = (*event->getSegments()); SDL::miniDoubletsBuffer& miniDoubletsInGPU = (*event->getMiniDoublets()); SDL::hitsBuffer& hitsInGPU = (*event->getHits()); - SDL::modules& modulesInGPU = (*event->getModules()); + SDL::modulesBuffer& modulesInGPU = (*event->getModules()); SDL::objectRangesBuffer& rangesInGPU = (*event->getRanges()); int nSegments = 0; @@ -1209,7 +1209,7 @@ void printpLSs(SDL::Event* event) SDL::segmentsBuffer& segmentsInGPU = (*event->getSegments()); SDL::miniDoubletsBuffer& miniDoubletsInGPU = (*event->getMiniDoublets()); SDL::hitsBuffer& hitsInGPU = (*event->getHits()); - SDL::modules& modulesInGPU = (*event->getModules()); + SDL::modulesBuffer& modulesInGPU = (*event->getModules()); SDL::objectRangesBuffer& rangesInGPU = (*event->getRanges()); unsigned int i = *(modulesInGPU.nLowerModules); @@ -1240,7 +1240,7 @@ void printT3s(SDL::Event* event) SDL::segmentsBuffer& segmentsInGPU = (*event->getSegments()); SDL::miniDoubletsBuffer& miniDoubletsInGPU = (*event->getMiniDoublets()); SDL::hitsBuffer& hitsInGPU = (*event->getHits()); - SDL::modules& modulesInGPU = (*event->getModules()); + SDL::modulesBuffer& modulesInGPU = (*event->getModules()); int nTriplets = 0; for (unsigned int i = 0; i < *(modulesInGPU.nLowerModules); ++i) { @@ -1282,7 +1282,7 @@ void debugPrintOutlierMultiplicities(SDL::Event* event) SDL::tripletsBuffer& tripletsInGPU = (*event->getTriplets()); SDL::segmentsBuffer& segmentsInGPU = (*event->getSegments()); SDL::miniDoubletsBuffer& miniDoubletsInGPU = (*event->getMiniDoublets()); - SDL::modules& modulesInGPU = (*event->getModules()); + SDL::modulesBuffer& modulesInGPU = (*event->getModules()); SDL::objectRangesBuffer& rangesInGPU = (*event->getRanges()); //int nTrackCandidates = 0; for (unsigned int idx = 0; idx <= *(modulesInGPU.nLowerModules); ++idx) From 68b7b7b065f76982c7c01798ebd7577c2a4be00c Mon Sep 17 00:00:00 2001 From: Gavin Niendorf Date: Tue, 27 Jun 2023 13:22:48 -0700 Subject: [PATCH 30/44] first working alpaka everything --- Makefile | 16 ++++---- SDL/Constants.cuh | 29 +++++++++++--- SDL/Event.cu | 95 ++++++++++++++++++++++++++++++++------------ SDL/Event.cuh | 5 +-- SDL/LST.cc | 5 +-- SDL/LST.h | 3 +- SDL/Makefile | 27 +++++-------- SDL/Module.cuh | 16 +++++--- SDL/PixelTriplet.cuh | 12 +++--- SDL/Quintuplet.cuh | 6 +-- bin/sdl.cc | 6 +-- setup_cgpu.sh | 6 +-- 12 files changed, 139 insertions(+), 87 deletions(-) diff --git a/Makefile b/Makefile index 44c520d9..c57369d3 100644 --- a/Makefile +++ b/Makefile @@ -9,19 +9,19 @@ SOURCES=$(wildcard code/core/*.cc) #$(wildcard SDL/*.cc) OBJECTS=$(SOURCES:.cc=.o) $(wildcard ${TRACKLOOPERDIR}/SDL/libsdl.so) HEADERS=$(SOURCES:.cc=.h) -CC = nvcc -CXX = nvcc -CXXFLAGS = -g -O2 --compiler-options -Wall --compiler-options -fPIC --compiler-options -Wshadow --compiler-options -Woverloaded-virtual -G -lineinfo -fopenmp -lgomp --default-stream per-thread +CC = g++ +CXX = g++ +CXXFLAGS = -g -O2 -Wall -fPIC -std=c++17 -Wshadow -Woverloaded-virtual -lineinfo -fopenmp -lgomp --default-stream per-thread LD = g++ LDFLAGS = -g -O2 -Wall -fPIC -Wshadow -Woverloaded-virtual -I/mnt/data1/dsr/cub SOFLAGS = -g -shared CXXFLAGS = -g -O2 -Wall -fPIC -Wshadow -Woverloaded-virtual LDFLAGS = -g -O2 ROOTLIBS = $(shell root-config --libs) -ROOTCFLAGS = $(foreach option, $(shell root-config --cflags), --compiler-options $(option)) -ALPAKAINCLUDE = -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED -CFLAGS = $(ROOTCFLAGS) --compiler-options -Wall --compiler-options -Wno-unused-function --compiler-options -g --compiler-options -O2 --compiler-options -fPIC --compiler-options -fno-var-tracking -ISDL -I$(shell pwd) -Icode -Icode/core -I/mnt/data1/dsr/cub -I${CUDA_HOME}/include --compiler-options -fopenmp -EXTRACFLAGS = $(shell rooutil-config) +ROOTCFLAGS = $(foreach option, $(shell root-config --cflags), $(option)) +ALPAKAINCLUDE = -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED -DALPAKA_DEBUG=0 +CFLAGS = $(ROOTCFLAGS) -Wall -Wno-unused-function -g -O2 -fPIC -fno-var-tracking -ISDL -I$(shell pwd) -Icode -Icode/core -I/mnt/data1/dsr/cub -I${CUDA_HOME}/include -fopenmp +EXTRACFLAGS = $(shell rooutil-config) -g EXTRAFLAGS = -fPIC -ITMultiDrawTreePlayer -Wunused-variable -lTMVA -lEG -lGenVector -lXMLIO -lMLP -lTreePlayer -L${CUDA_HOME}/lib64 -lcudart -fopenmp DOQUINTUPLET = -DFP16_Base -DFP16_dPhi #-DFP16_circle -DFP16_seg -DFP16_T5 #-DDO_QUINTUPLET #-DDO_QUADRUPLET PT0P8 = @@ -53,7 +53,7 @@ bin/sdl: bin/sdl.o $(OBJECTS) $(LD) $(PT0P8) $(T3T3EXTENSION) $(LDFLAGS) $^ $(ROOTLIBS) $(EXTRACFLAGS) $(CUTVALUEFLAG) $(PRIMITIVEFLAG) $(EXTRAFLAGS) $(DOQUINTUPLET) $(ALPAKAINCLUDE) -o $@ %.o: %.cc - $(CC) $(PT0P8) $(T3T3EXTENSION) $(CFLAGS) $(EXTRACFLAGS) $(CUTVALUEFLAG) $(PRIMITIVEFLAG) $(DOQUINTUPLET) $(ALPAKAINCLUDE) $< -dc -o $@ + $(CC) $(PT0P8) $(T3T3EXTENSION) $(CFLAGS) $(EXTRACFLAGS) $(CUTVALUEFLAG) $(PRIMITIVEFLAG) $(DOQUINTUPLET) $(ALPAKAINCLUDE) $< -c -o $@ $(ROOUTIL): $(MAKE) -C code/rooutil/ diff --git a/SDL/Constants.cuh b/SDL/Constants.cuh index 49412d68..606fe7f0 100644 --- a/SDL/Constants.cuh +++ b/SDL/Constants.cuh @@ -1,14 +1,17 @@ #ifndef Constants_cuh #define Constants_cuh -#include #include // CUDA headers. Will be removed soon. +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED #include +#include #include +#endif -#ifdef FP16_Base //This changes pT5 and pT3 and T3 completely. T5 for non regression parameters +//This changes pT5 and pT3 and T3 completely. T5 for non regression parameters +#if defined(FP16_Base) && defined(ALPAKA_ACC_GPU_CUDA_ENABLED) #define __F2H __float2half #define __H2F __half2float typedef __half FPX; @@ -17,7 +20,8 @@ typedef __half FPX; #define __H2F typedef float FPX; #endif -#ifdef FP16_T5 // changes T5 regression values + +#if defined(FP16_T5) && defined(ALPAKA_ACC_GPU_CUDA_ENABLED) // changes T5 regression values #define __F2H_T5 __float2half #define __H2F_T5 __half2float typedef __half FPX_T5; @@ -26,7 +30,8 @@ typedef __half FPX_T5; #define __H2F_T5 typedef float FPX_T5; #endif -#ifdef FP16_dPhi // changes segment dPhi values + +#if defined(FP16_dPhi) && defined(ALPAKA_ACC_GPU_CUDA_ENABLED) // changes segment dPhi values #define __F2H_dPhi __float2half #define __H2F_dPhi __half2float typedef __half FPX_dPhi; @@ -35,7 +40,8 @@ typedef __half FPX_dPhi; #define __H2F_dPhi typedef float FPX_dPhi; #endif -#ifdef FP16_circle // changes segment circle values + +#if defined(FP16_circle) && defined(ALPAKA_ACC_GPU_CUDA_ENABLED) // changes segment circle values #define __F2H_circle __float2half #define __H2F_circle __half2float typedef __half FPX_circle; @@ -44,7 +50,8 @@ typedef __half FPX_circle; #define __H2F_circle typedef float FPX_circle; #endif -#ifdef FP16_seg // changes segment values + +#if defined(FP16_seg) && defined(ALPAKA_ACC_GPU_CUDA_ENABLED) // changes segment values #define __F2H_seg __float2half #define __H2F_seg __half2float typedef __half FPX_seg; @@ -78,6 +85,16 @@ Vec const elementsPerThread(Vec::all(static_cast(1))); using Acc = alpaka::AccCpuSerial; #endif +#ifndef ALPAKA_ACC_GPU_CUDA_ENABLED +struct uint4 +{ + unsigned int x; + unsigned int y; + unsigned int z; + unsigned int w; +}; +#endif + auto const devHost = alpaka::getDevByIdx(0u); auto const devAcc = alpaka::getDevByIdx(0u); using QueueAcc = alpaka::Queue; diff --git a/SDL/Event.cu b/SDL/Event.cu index eab44436..aee9e862 100644 --- a/SDL/Event.cu +++ b/SDL/Event.cu @@ -6,9 +6,8 @@ std::shared_ptr SDL::pixelMapping = std::make_shared(); uint16_t SDL::nModules; uint16_t SDL::nLowerModules; -SDL::Event::Event(cudaStream_t estream, bool verbose): queue(alpaka::getDevByIdx(0u)) +SDL::Event::Event(bool verbose): queue(alpaka::getDevByIdx(0u)) { - stream = estream; addObjects = verbose; hitsInGPU = nullptr; mdsInGPU = nullptr; @@ -152,7 +151,6 @@ void SDL::Event::resetEvent() void SDL::initModules(const char* moduleMetaDataFilePath) { - cudaStream_t default_stream = 0; QueueAcc queue(devAcc); // Set the relevant data pointers. @@ -164,7 +162,6 @@ void SDL::initModules(const char* moduleMetaDataFilePath) nModules, nLowerModules, *pixelMapping, - default_stream, queue, moduleMetaDataFilePath); } @@ -243,12 +240,20 @@ void SDL::Event::addHitToEvent(std::vector x, std::vector y, std:: void SDL::Event::addPixelSegmentToEvent(std::vector hitIndices0,std::vector hitIndices1,std::vector hitIndices2,std::vector hitIndices3, std::vector dPhiChange, std::vector ptIn, std::vector ptErr, std::vector px, std::vector py, std::vector pz, std::vector eta, std::vector etaErr, std::vector phi, std::vector charge, std::vector seedIdx, std::vector superbin, std::vector pixelType, std::vector isQuad) { const int size = ptIn.size(); - unsigned int mdSize = 2 * size; + int mdSize = 2 * size; uint16_t pixelModuleIndex = (*detIdToIndex)[1]; if(mdsInGPU == nullptr) { - cudaMemsetAsync(&rangesInGPU->miniDoubletModuleOccupancy[nLowerModules],N_MAX_PIXEL_MD_PER_MODULES, sizeof(unsigned int),stream); + // Create a view for the element nLowerModules inside rangesBuffers->miniDoubletModuleOccupancy + auto dst_view_miniDoubletModuleOccupancy = alpaka::createSubView(rangesBuffers->miniDoubletModuleOccupancy_buf, (Idx) 1u, (Idx) nLowerModules); + + // Create a source view for the value to be set + int value = N_MAX_PIXEL_MD_PER_MODULES; + auto src_view_value = alpaka::createView(devHost, &value, (Idx) 1u); + + alpaka::memcpy(queue, dst_view_miniDoubletModuleOccupancy, src_view_value); + alpaka::wait(queue); Vec const threadsPerBlockCreateMD(static_cast(1), static_cast(1), static_cast(1024)); Vec const blocksPerGridCreateMD(static_cast(1), static_cast(1), static_cast(1)); @@ -265,16 +270,19 @@ void SDL::Event::addPixelSegmentToEvent(std::vector hitIndices0,st alpaka::wait(queue); unsigned int nTotalMDs; - cudaMemcpyAsync(&nTotalMDs,rangesInGPU->device_nTotalMDs,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); + auto nTotalMDs_view = alpaka::createView(devHost, &nTotalMDs, (Idx) 1u); + + alpaka::memcpy(queue, nTotalMDs_view, rangesBuffers->device_nTotalMDs_buf); + alpaka::wait(queue); + nTotalMDs += N_MAX_PIXEL_MD_PER_MODULES; mdsInGPU = new SDL::miniDoublets(); miniDoubletsBuffers = new SDL::miniDoubletsBuffer(nTotalMDs, nLowerModules, devAcc, queue); mdsInGPU->setData(*miniDoubletsBuffers); - cudaMemcpyAsync(mdsInGPU->nMemoryLocations, &nTotalMDs, sizeof(unsigned int), cudaMemcpyHostToDevice, stream); - cudaStreamSynchronize(stream); + alpaka::memcpy(queue, miniDoubletsBuffers->nMemoryLocations_buf, nTotalMDs_view); + alpaka::wait(queue); } if(segmentsInGPU == nullptr) { @@ -296,16 +304,19 @@ void SDL::Event::addPixelSegmentToEvent(std::vector hitIndices0,st alpaka::enqueue(queue, createSegmentArrayRangesTask); alpaka::wait(queue); - cudaMemcpyAsync(&nTotalSegments,rangesInGPU->device_nTotalSegs,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream); - cudaStreamSynchronize(stream); + auto nTotalSegments_view = alpaka::createView(devHost, &nTotalSegments, (Idx) 1u); + + alpaka::memcpy(queue, nTotalSegments_view, rangesBuffers->device_nTotalSegs_buf); + alpaka::wait(queue); + nTotalSegments += N_MAX_PIXEL_SEGMENTS_PER_MODULE; segmentsInGPU = new SDL::segments(); segmentsBuffers = new SDL::segmentsBuffer(nTotalSegments, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE, devAcc, queue); segmentsInGPU->setData(*segmentsBuffers); - cudaMemcpyAsync(segmentsInGPU->nMemoryLocations, &nTotalSegments, sizeof(unsigned int), cudaMemcpyHostToDevice, stream);; - cudaStreamSynchronize(stream); + alpaka::memcpy(queue, segmentsBuffers->nMemoryLocations_buf, nTotalSegments_view); + alpaka::wait(queue); } auto hitIndices0_dev = allocBufWrapper(devAcc, size); @@ -334,11 +345,22 @@ void SDL::Event::addPixelSegmentToEvent(std::vector hitIndices0,st alpaka::memcpy(queue, segmentsBuffers->superbin_buf, superbin, size); alpaka::memcpy(queue, segmentsBuffers->pixelType_buf, pixelType, size); - cudaMemcpyAsync(&(segmentsInGPU->nSegments)[pixelModuleIndex], &size, sizeof(int), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(&(segmentsInGPU->totOccupancySegments)[pixelModuleIndex], &size, sizeof(int), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(&(mdsInGPU->nMDs)[pixelModuleIndex], &mdSize, sizeof(unsigned int), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(&(mdsInGPU->totOccupancyMDs)[pixelModuleIndex], &mdSize, sizeof(unsigned int), cudaMemcpyHostToDevice, stream); - cudaStreamSynchronize(stream); + // Create source views for size and mdSize + auto src_view_size = alpaka::createView(devHost, &size, (Idx) 1u); + auto src_view_mdSize = alpaka::createView(devHost, &mdSize, (Idx) 1u); + + auto dst_view_segments = alpaka::createSubView(segmentsBuffers->nSegments_buf, (Idx) 1u, (Idx) pixelModuleIndex); + alpaka::memcpy(queue, dst_view_segments, src_view_size); + + auto dst_view_totOccupancySegments = alpaka::createSubView(segmentsBuffers->totOccupancySegments_buf, (Idx) 1u, (Idx) pixelModuleIndex); + alpaka::memcpy(queue, dst_view_totOccupancySegments, src_view_size); + + auto dst_view_nMDs = alpaka::createSubView(miniDoubletsBuffers->nMDs_buf, (Idx) 1u, (Idx) pixelModuleIndex); + alpaka::memcpy(queue, dst_view_nMDs, src_view_mdSize); + + auto dst_view_totOccupancyMDs = alpaka::createSubView(miniDoubletsBuffers->totOccupancyMDs_buf, (Idx) 1u, (Idx) pixelModuleIndex); + alpaka::memcpy(queue, dst_view_totOccupancyMDs, src_view_mdSize); + alpaka::wait(queue); Vec const threadsPerBlock(static_cast(1), static_cast(1), static_cast(256)); @@ -381,6 +403,7 @@ void SDL::Event::addMiniDoubletsToEventExplicit() alpaka::memcpy(queue, module_hitRanges_buf, hitsBuffers->hitRanges_buf, nLowerModules*2); alpaka::wait(queue); + int* nMDsCPU = alpaka::getPtrNative(nMDsCPU_buf); short* module_subdets = alpaka::getPtrNative(module_subdets_buf); short* module_layers = alpaka::getPtrNative(module_layers_buf); @@ -398,7 +421,6 @@ void SDL::Event::addMiniDoubletsToEventExplicit() { n_minidoublets_by_layer_endcap_[module_layers[i] - 1] += nMDsCPU[i]; } - } } } @@ -415,6 +437,7 @@ void SDL::Event::addSegmentsToEventExplicit() alpaka::memcpy(queue, module_layers_buf, modulesBuffers->layers_buf, nLowerModules); alpaka::wait(queue); + int* nSegmentsCPU = alpaka::getPtrNative(nSegmentsCPU_buf); short* module_subdets = alpaka::getPtrNative(module_subdets_buf); short* module_layers = alpaka::getPtrNative(module_layers_buf); @@ -437,7 +460,15 @@ void SDL::Event::addSegmentsToEventExplicit() void SDL::Event::createMiniDoublets() { - cudaMemsetAsync(&rangesInGPU->miniDoubletModuleOccupancy[nLowerModules],N_MAX_PIXEL_MD_PER_MODULES, sizeof(unsigned int),stream); + // Create a view for the element nLowerModules inside rangesBuffers->miniDoubletModuleOccupancy + auto dst_view_miniDoubletModuleOccupancy = alpaka::createSubView(rangesBuffers->miniDoubletModuleOccupancy_buf, (Idx) 1u, (Idx) nLowerModules); + + // Create a source view for the value to be set + int value = N_MAX_PIXEL_MD_PER_MODULES; + auto src_view_value = alpaka::createView(devHost, &value, (Idx) 1u); + + alpaka::memcpy(queue, dst_view_miniDoubletModuleOccupancy, src_view_value); + alpaka::wait(queue); Vec const threadsPerBlockCreateMD(static_cast(1), static_cast(1), static_cast(1024)); Vec const blocksPerGridCreateMD(static_cast(1), static_cast(1), static_cast(1)); @@ -679,7 +710,7 @@ void SDL::Event::createTrackCandidates() Vec const threadsPerBlock_crossCleanpT3(static_cast(1), static_cast(16), static_cast(64)); Vec const blocksPerGrid_crossCleanpT3(static_cast(1), static_cast(4), static_cast(20)); - WorkDiv const crossCleanpT3_workDiv(blocksPerGrid_crossCleanpT3, blocksPerGrid_crossCleanpT3, elementsPerThread); + WorkDiv const crossCleanpT3_workDiv(blocksPerGrid_crossCleanpT3, threadsPerBlock_crossCleanpT3, elementsPerThread); SDL::crossCleanpT3 crossCleanpT3_kernel; auto const crossCleanpT3Task(alpaka::createTaskKernel( @@ -811,8 +842,13 @@ void SDL::Event::createPixelTriplets() pixelTripletsInGPU->setData(*pixelTripletsBuffers); } - unsigned int nInnerSegments; - cudaMemcpyAsync(&nInnerSegments, &(segmentsInGPU->nSegments[nLowerModules]), sizeof(int), cudaMemcpyDeviceToHost,stream); + int nInnerSegments; + auto nInnerSegments_src_view = alpaka::createView(devHost, &nInnerSegments, (size_t) 1u); + + auto dev_view_nSegments = alpaka::createSubView(segmentsBuffers->nSegments_buf, (Idx) 1u, (Idx) nLowerModules); + + alpaka::memcpy(queue, nInnerSegments_src_view, dev_view_nSegments); + alpaka::wait(queue); auto superbins_buf = allocBufWrapper(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE); auto pixelTypes_buf = allocBufWrapper(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE); @@ -1043,8 +1079,14 @@ void SDL::Event::createPixelQuintuplets() trackCandidatesInGPU->setData(*trackCandidatesBuffers); } - unsigned int nInnerSegments; - cudaMemcpyAsync(&nInnerSegments, &(segmentsInGPU->nSegments[nLowerModules]), sizeof(unsigned int), cudaMemcpyDeviceToHost,stream); + int nInnerSegments; + auto nInnerSegments_src_view = alpaka::createView(devHost, &nInnerSegments, (size_t) 1u); + + // Create a sub-view for the device buffer + auto dev_view_nSegments = alpaka::createSubView(segmentsBuffers->nSegments_buf, (Idx) 1u, (Idx) nLowerModules); + + alpaka::memcpy(queue, nInnerSegments_src_view, dev_view_nSegments); + alpaka::wait(queue); auto superbins_buf = allocBufWrapper(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE); auto pixelTypes_buf = allocBufWrapper(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE); @@ -1179,6 +1221,7 @@ void SDL::Event::addQuintupletsToEventExplicit() alpaka::memcpy(queue, module_quintupletModuleIndices_buf, rangesBuffers->quintupletModuleIndices_buf, nLowerModules); alpaka::wait(queue); + int* nQuintupletsCPU = alpaka::getPtrNative(nQuintupletsCPU_buf); short* module_subdets = alpaka::getPtrNative(module_subdets_buf); short* module_layers = alpaka::getPtrNative(module_layers_buf); diff --git a/SDL/Event.cuh b/SDL/Event.cuh index 9b70014f..948f2207 100644 --- a/SDL/Event.cuh +++ b/SDL/Event.cuh @@ -18,7 +18,6 @@ namespace SDL { private: QueueAcc queue; - cudaStream_t stream; bool addObjects; std::array n_hits_by_layer_barrel_; @@ -35,7 +34,7 @@ namespace SDL std::array n_quintuplets_by_layer_endcap_; //Device stuff - int nTotalSegments; + unsigned int nTotalSegments; struct objectRanges* rangesInGPU; struct objectRangesBuffer* rangesBuffers; struct hits* hitsInGPU; @@ -71,7 +70,7 @@ namespace SDL int* superbinCPU; int8_t* pixelTypeCPU; public: - Event(cudaStream_t estream,bool verbose); + Event(bool verbose); void resetEvent(); void addHitToEvent(std::vector x, std::vector y, std::vector z, std::vector detId, std::vector idxInNtuple); //call the appropriate hit function, then increment the counter here diff --git a/SDL/LST.cc b/SDL/LST.cc index 9f9930c3..977cb642 100644 --- a/SDL/LST.cc +++ b/SDL/LST.cc @@ -10,8 +10,7 @@ void SDL::LST::eventSetup() { SDL::initModules(path); } -void SDL::LST::run(cudaStream_t stream, - bool verbose, +void SDL::LST::run(bool verbose, const std::vector see_px, const std::vector see_py, const std::vector see_pz, @@ -31,7 +30,7 @@ void SDL::LST::run(cudaStream_t stream, const std::vector ph2_x, const std::vector ph2_y, const std::vector ph2_z) { - auto event = SDL::Event(stream, verbose); + auto event = SDL::Event(verbose); prepareInput(see_px, see_py, see_pz, diff --git a/SDL/LST.h b/SDL/LST.h index 1315a3ae..3225194a 100644 --- a/SDL/LST.h +++ b/SDL/LST.h @@ -17,8 +17,7 @@ namespace SDL { LST(); void eventSetup(); - void run(cudaStream_t stream, - bool verbose, + void run(bool verbose, const std::vector see_px, const std::vector see_py, const std::vector see_pz, diff --git a/SDL/Makefile b/SDL/Makefile index abc9a160..6ac11fb1 100644 --- a/SDL/Makefile +++ b/SDL/Makefile @@ -16,36 +16,29 @@ LIB=libsdl.so # flags to keep track of # -# AMD Opteron and Intel EM64T (64 bit mode) Linux with gcc 3.x -CXX = nvcc -CXXFLAGS = -g --compiler-options -Wall --compiler-options -Wshadow --compiler-options -Woverloaded-virtual --compiler-options -fPIC --compiler-options -fopenmp -dc -lineinfo --ptxas-options=-v --cudart shared -arch=compute_70 -I/mnt/data1/dsr/cub --use_fast_math --default-stream per-thread -I.. -ROOTCFLAGS = --compiler-options -pthread --compiler-options -std=c++17 -m64 -I/cvmfs/cms.cern.ch/slc7_amd64_gcc900/cms/cmssw/CMSSW_11_2_0_pre5/external/slc7_amd64_gcc900/bin/../../../../../../../slc7_amd64_gcc900/lcg/root/6.20.06-ghbfee3/include -ALPAKAINCLUDE = -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED -ALPAKAFLAGS = -DALPAKA_ACC_GPU_CUDA_ENABLED -DALPAKA_ACC_GPU_CUDA_ONLY --expt-relaxed-constexpr -DALPAKA_DEBUG=0 +CXX = g++ +CXXFLAGS = -g -Wall -Wshadow -std=c++17 -Woverloaded-virtual -fPIC -fopenmp -I.. +CXXFLAGS_CUDA = -g --compiler-options -Wall --compiler-options -Wshadow --compiler-options -Woverloaded-virtual --compiler-options -fPIC --compiler-options -fopenmp -dc -lineinfo --ptxas-options=-v --cudart shared -arch=compute_70 -I/mnt/data1/dsr/cub --use_fast_math --default-stream per-thread -I.. +ROOTCFLAGS = -pthread -std=c++17 -m64 -I/cvmfs/cms.cern.ch/slc7_amd64_gcc900/cms/cmssw/CMSSW_11_2_0_pre5/external/slc7_amd64_gcc900/bin/../../../../../../../slc7_amd64_gcc900/lcg/root/6.20.06-ghbfee3/include +ALPAKAINCLUDE = -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED -std=c++17 +ALPAKAFLAGS = -DALPAKA_ACC_GPU_CUDA_ENABLED -DALPAKA_ACC_GPU_CUDA_ONLY --expt-relaxed-constexpr LD = nvcc SOFLAGS = -g -shared --compiler-options -fPIC --cudart shared -arch=compute_70 -code=sm_72 PRINTFLAG = -DT4FromT3 #-DWarnings DUPLICATES = -DDUP_pLS -DDUP_T5 -DDUP_pT5 -DDUP_pT3 -DCrossclean_T5 -DCrossclean_pT3 -DFP16_Base -DFP16_dPhi MEMFLAG = CACHEFLAG = -CUDALAUNCHFLAG = MEMFLAG_FLAGS = CACHEFLAG_FLAGS = -DCACHE_ALLOC -CUDALAUNCHFLAG_FLAGS = -PT0P8 = -PRELOAD = -CMSSW12GEOM = -T3T3EXTENSION= -# -# how to make it -# + CUTVALUEFLAG = CUTVALUEFLAG_FLAGS = -DCUT_VALUE_DEBUG + %_cuda.o : %.cu %.cuh - $(LD) -x cu $(PT0P8) $(PRELOAD) $(T3T3EXTENSION) $(CXXFLAGS) $(LDFLAGS) $(ROOTLIBS) $(MEMFLAG) $(PRINTFLAG) $(CACHEFLAG) $(CUDALAUNCHFLAG) $(CUTVALUEFLAG) $(DUPLICATES) $(ALPAKAINCLUDE) $(ALPAKAFLAGS) $< -o $@ + $(LD) -x cu $(CXXFLAGS_CUDA) $(LDFLAGS) $(ROOTLIBS) $(MEMFLAG) $(PRINTFLAG) $(CACHEFLAG) $(CUTVALUEFLAG) $(DUPLICATES) $(ALPAKAINCLUDE) $(ALPAKAFLAGS) $< -o $@ %_cpu.o : %.cc %.h - $(LD) -O2 $(PT0P8) $(PRELOAD) $(T3T3EXTENSION) $(CXXFLAGS) $(LDFLAGS) $(ROOTLIBS) $(MEMFLAG) $(PRINTFLAG) $(CACHEFLAG) $(CUDALAUNCHFLAG) $(DUPLICATES) $(ROOTCFLAGS) $(ALPAKAINCLUDE) $< -o $@ + $(CXX) -c -O2 $(CXXFLAGS) $(LDFLAGS) $(ROOTLIBS) $(MEMFLAG) $(PRINTFLAG) $(CACHEFLAG) $(DUPLICATES) $(ROOTCFLAGS) $(ALPAKAINCLUDE) $< -o $@ $(LIB):$(CCOBJECTS) $(CUOBJECTS) #$(LIB):$(CUOBJECTS) diff --git a/SDL/Module.cuh b/SDL/Module.cuh index 1015031c..225b147d 100644 --- a/SDL/Module.cuh +++ b/SDL/Module.cuh @@ -606,7 +606,6 @@ namespace SDL uint16_t& nModules, uint16_t& nLowerModules, struct pixelMap& pixelMapping, - cudaStream_t stream, TQueue& queue, const char* moduleMetaDataFilePath) { @@ -656,8 +655,8 @@ namespace SDL if(count_number>4) break; } - } + (*detIdToIndex)[1] = counter; //pixel module is the last module in the module list counter++; nModules = counter; @@ -814,13 +813,20 @@ namespace SDL } } - cudaMemcpyAsync(modulesInGPU->nModules,&nModules,sizeof(uint16_t),cudaMemcpyHostToDevice,stream); - cudaMemcpyAsync(modulesInGPU->nLowerModules,&nLowerModules,sizeof(uint16_t),cudaMemcpyHostToDevice,stream); - cudaStreamSynchronize(stream); + auto src_view_nModules = alpaka::createView(devHost, &nModules, (Idx) 1u); + alpaka::memcpy(queue, modulesBuf->nModules_buf, src_view_nModules); + + auto src_view_nLowerModules = alpaka::createView(devHost, &nLowerModules, (Idx) 1u); + alpaka::memcpy(queue, modulesBuf->nLowerModules_buf, src_view_nLowerModules); + alpaka::wait(queue); + +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + cudaStream_t stream = 0; cudaMemcpyAsync(modulesInGPU->moduleType,host_moduleType,sizeof(ModuleType)*nModules,cudaMemcpyHostToDevice,stream); cudaMemcpyAsync(modulesInGPU->moduleLayerType,host_moduleLayerType,sizeof(ModuleLayerType)*nModules,cudaMemcpyHostToDevice,stream); cudaStreamSynchronize(stream); +#endif //alpaka::memcpy(queue, modulesBuf->moduleType_buf, moduleType_buf, nModules); //alpaka::memcpy(queue, modulesBuf->moduleLayerType_buf, moduleLayerType_buf, nModules); diff --git a/SDL/PixelTriplet.cuh b/SDL/PixelTriplet.cuh index 033dea78..0cf22db4 100644 --- a/SDL/PixelTriplet.cuh +++ b/SDL/PixelTriplet.cuh @@ -568,7 +568,7 @@ namespace SDL } float tripletRadiusInvMax = (1 + tripletInvRadiusErrorBound)/tripletRadius; - float tripletRadiusInvMin = alpaka::math::max(acc, (1 - tripletInvRadiusErrorBound)/tripletRadius, 0); + float tripletRadiusInvMin = alpaka::math::max(acc, (1 - tripletInvRadiusErrorBound)/tripletRadius, 0.0f); float pixelRadiusInvMax = alpaka::math::max(acc, (1 + pixelInvRadiusErrorBound)/pixelRadius, 1.f/(pixelRadius - pixelRadiusError)); float pixelRadiusInvMin = alpaka::math::min(acc, (1 - pixelInvRadiusErrorBound)/pixelRadius, 1.f/(pixelRadius + pixelRadiusError)); @@ -589,7 +589,7 @@ namespace SDL } float tripletRadiusInvMax = (1 + tripletInvRadiusErrorBound)/tripletRadius; - float tripletRadiusInvMin = alpaka::math::max(acc, (1 - tripletInvRadiusErrorBound)/tripletRadius, 0); + float tripletRadiusInvMin = alpaka::math::max(acc, (1 - tripletInvRadiusErrorBound)/tripletRadius, 0.0f); float pixelRadiusInvMax = alpaka::math::max(acc, (1 + pixelInvRadiusErrorBound)/pixelRadius, 1.f/(pixelRadius - pixelRadiusError)); float pixelRadiusInvMin = alpaka::math::min(acc, (1 - pixelInvRadiusErrorBound)/pixelRadius, 1.f/(pixelRadius + pixelRadiusError)); @@ -610,11 +610,11 @@ namespace SDL } float tripletRadiusInvMax = (1 + tripletInvRadiusErrorBound)/tripletRadius; - float tripletRadiusInvMin = alpaka::math::max(acc, (1 - tripletInvRadiusErrorBound)/tripletRadius, 0); + float tripletRadiusInvMin = alpaka::math::max(acc, (1 - tripletInvRadiusErrorBound)/tripletRadius, 0.0f); float pixelRadiusInvMax = alpaka::math::max(acc, (1 + pixelInvRadiusErrorBound)/pixelRadius, 1.f/(pixelRadius - pixelRadiusError)); float pixelRadiusInvMin = alpaka::math::min(acc, (1 - pixelInvRadiusErrorBound)/pixelRadius, 1.f/(pixelRadius + pixelRadiusError)); - pixelRadiusInvMin = alpaka::math::max(acc, pixelRadiusInvMin, 0); + pixelRadiusInvMin = alpaka::math::max(acc, pixelRadiusInvMin, 0.0f); return checkIntervalOverlappT3(tripletRadiusInvMin, tripletRadiusInvMax, pixelRadiusInvMin, pixelRadiusInvMax); }; @@ -632,11 +632,11 @@ namespace SDL } float tripletRadiusInvMax = (1 + tripletInvRadiusErrorBound)/tripletRadius; - float tripletRadiusInvMin = alpaka::math::max(acc, (1 - tripletInvRadiusErrorBound)/tripletRadius, 0); + float tripletRadiusInvMin = alpaka::math::max(acc, (1 - tripletInvRadiusErrorBound)/tripletRadius, 0.0f); float pixelRadiusInvMax = alpaka::math::max(acc, (1 + pixelInvRadiusErrorBound)/pixelRadius, 1.f/(pixelRadius - pixelRadiusError)); float pixelRadiusInvMin = alpaka::math::min(acc, (1 - pixelInvRadiusErrorBound)/pixelRadius, 1.f/(pixelRadius + pixelRadiusError)); - pixelRadiusInvMin = alpaka::math::max(acc, 0, pixelRadiusInvMin); + pixelRadiusInvMin = alpaka::math::max(acc, 0.0f, pixelRadiusInvMin); return checkIntervalOverlappT3(tripletRadiusInvMin, tripletRadiusInvMax, pixelRadiusInvMin, pixelRadiusInvMax); }; diff --git a/SDL/Quintuplet.cuh b/SDL/Quintuplet.cuh index 6bf87ca0..25588ed2 100644 --- a/SDL/Quintuplet.cuh +++ b/SDL/Quintuplet.cuh @@ -538,8 +538,8 @@ namespace SDL float diffz1 = (solz1-zsi)*100; float diffz2 = (solz2-zsi)*100; // Alpaka : Needs to be moved over - if (isnan(diffz1)) diffz = diffz2; - else if (isnan(diffz2)) diffz = diffz1; + if (alpaka::math::isnan(acc, diffz1)) diffz = diffz2; + else if (alpaka::math::isnan(acc, diffz2)) diffz = diffz1; else {diffz = (alpaka::math::abs(acc, diffz1)6) ? diffr : diffz ; @@ -584,7 +584,7 @@ namespace SDL // for set rzchi2 cut // if the 5 points are linear, helix calculation gives nan // Alpaka : Needs to be moved over - if (inner_pt > 100 || isnan(rzChiSquared)) + if (inner_pt > 100 || alpaka::math::isnan(acc, rzChiSquared)) { float slope; if(moduleType1 == 0 and moduleType2 == 0 and moduleType3 == 1) //PSPS2S diff --git a/bin/sdl.cc b/bin/sdl.cc index 74ff1fd4..eeb82242 100644 --- a/bin/sdl.cc +++ b/bin/sdl.cc @@ -382,13 +382,10 @@ void run_sdl() file_name.push_back(ana.looper.getCurrentFileName()); } - cudaStream_t streams[ana.streams]; std::vector events; for (int s = 0; s < ana.streams; s++) { - - cudaStreamCreateWithFlags(&streams[s], cudaStreamNonBlocking); - SDL::Event *event = new SDL::Event(streams[s],ana.verbose>=2); + SDL::Event *event = new SDL::Event(ana.verbose>=2); events.push_back(event); } @@ -517,7 +514,6 @@ void run_sdl() for (int s = 0; s < ana.streams; s++) { delete events.at(s); - cudaStreamDestroy(streams[s]); } delete ana.output_tfile; diff --git a/setup_cgpu.sh b/setup_cgpu.sh index 2a5392dd..a30c0bf8 100644 --- a/setup_cgpu.sh +++ b/setup_cgpu.sh @@ -36,7 +36,7 @@ export LSTPERFORMANCEWEBDIR="/home/users/phchang/public_html/LSTPerformanceWeb" export LATEST_CPU_BENCHMARK_EFF_MUONGUN="/data2/segmentlinking/muonGun_cpu_efficiencies.root" export LATEST_CPU_BENCHMARK_EFF_PU200="/data2/segmentlinking/pu200_cpu_efficiencies.root" -source /cvmfs/cms.cern.ch/slc7_amd64_gcc900/external/alpaka/0.5.0/etc/profile.d/init.sh -export BOOST_ROOT="/cvmfs/cms.cern.ch/slc7_amd64_gcc900/external/boost/1.72.0-ghbfee3" -export ALPAKA_ROOT="/cvmfs/cms.cern.ch/slc7_amd64_gcc900/external/alpaka/0.7.0-09bef105568314b218f2a8410a876785" +source /cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220621-4e96939afa0cdb62448c73ead2bb07e0/etc/profile.d/init.sh +export BOOST_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/boost/1.78.0-12075919175e8d078539685f9234134a" +export ALPAKA_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220621-4e96939afa0cdb62448c73ead2bb07e0" #eof From 62e3dc329bd0d05c6f4e02184bc01a35fd8c333f Mon Sep 17 00:00:00 2001 From: Gavin Niendorf Date: Tue, 27 Jun 2023 13:54:06 -0700 Subject: [PATCH 31/44] move to more recent alpaka version --- setup_cgpu.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup_cgpu.sh b/setup_cgpu.sh index a30c0bf8..fbff025e 100644 --- a/setup_cgpu.sh +++ b/setup_cgpu.sh @@ -36,7 +36,7 @@ export LSTPERFORMANCEWEBDIR="/home/users/phchang/public_html/LSTPerformanceWeb" export LATEST_CPU_BENCHMARK_EFF_MUONGUN="/data2/segmentlinking/muonGun_cpu_efficiencies.root" export LATEST_CPU_BENCHMARK_EFF_PU200="/data2/segmentlinking/pu200_cpu_efficiencies.root" -source /cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220621-4e96939afa0cdb62448c73ead2bb07e0/etc/profile.d/init.sh +source /cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220902-e80d13b043e1608b43d2007d06ad7e2f/etc/profile.d/init.sh export BOOST_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/boost/1.78.0-12075919175e8d078539685f9234134a" -export ALPAKA_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220621-4e96939afa0cdb62448c73ead2bb07e0" +export ALPAKA_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220902-e80d13b043e1608b43d2007d06ad7e2f" #eof From 386d40a6d621c3504387ca4d9b36d2542f174b0d Mon Sep 17 00:00:00 2001 From: Gavin Niendorf Date: Tue, 27 Jun 2023 14:09:50 -0700 Subject: [PATCH 32/44] remove last cuda --- SDL/Module.cuh | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/SDL/Module.cuh b/SDL/Module.cuh index 225b147d..1855aee0 100644 --- a/SDL/Module.cuh +++ b/SDL/Module.cuh @@ -819,17 +819,8 @@ namespace SDL auto src_view_nLowerModules = alpaka::createView(devHost, &nLowerModules, (Idx) 1u); alpaka::memcpy(queue, modulesBuf->nLowerModules_buf, src_view_nLowerModules); - alpaka::wait(queue); - -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - cudaStream_t stream = 0; - cudaMemcpyAsync(modulesInGPU->moduleType,host_moduleType,sizeof(ModuleType)*nModules,cudaMemcpyHostToDevice,stream); - cudaMemcpyAsync(modulesInGPU->moduleLayerType,host_moduleLayerType,sizeof(ModuleLayerType)*nModules,cudaMemcpyHostToDevice,stream); - cudaStreamSynchronize(stream); -#endif - - //alpaka::memcpy(queue, modulesBuf->moduleType_buf, moduleType_buf, nModules); - //alpaka::memcpy(queue, modulesBuf->moduleLayerType_buf, moduleLayerType_buf, nModules); + alpaka::memcpy(queue, modulesBuf->moduleType_buf, moduleType_buf); + alpaka::memcpy(queue, modulesBuf->moduleLayerType_buf, moduleLayerType_buf); alpaka::memcpy(queue, modulesBuf->detIds_buf, detIds_buf, nModules); alpaka::memcpy(queue, modulesBuf->layers_buf, layers_buf, nModules); From 03948d77ae9958dcf94dea79165e11cc27fa31d4 Mon Sep 17 00:00:00 2001 From: Gavin Niendorf Date: Tue, 27 Jun 2023 18:56:41 -0700 Subject: [PATCH 33/44] beginning integration of cmssw alpaka interface/caching allocator --- SDL/Constants.cuh | 7 + SDL/Event.cu | 26 +- SDL/Makefile | 9 +- code/alpaka_interface/AllocatorConfig.h | 31 ++ code/alpaka_interface/AllocatorPolicy.h | 53 +++ code/alpaka_interface/AlpakaServiceFwd.h | 33 ++ code/alpaka_interface/CachedBufAlloc.h | 207 +++++++++ code/alpaka_interface/CachingAllocator.h | 436 ++++++++++++++++++ code/alpaka_interface/HostOnlyTask.h | 71 +++ code/alpaka_interface/ScopedContextFwd.h | 35 ++ code/alpaka_interface/config.h | 164 +++++++ code/alpaka_interface/devices.h | 43 ++ .../getDeviceCachingAllocator.h | 88 ++++ .../getHostCachingAllocator.h | 32 ++ code/alpaka_interface/host.h | 29 ++ code/alpaka_interface/memory.h | 247 ++++++++++ code/alpaka_interface/stringize.h | 8 + code/alpaka_interface/thread_safety_macros.h | 12 + code/alpaka_interface/traits.h | 69 +++ 19 files changed, 1583 insertions(+), 17 deletions(-) create mode 100644 code/alpaka_interface/AllocatorConfig.h create mode 100644 code/alpaka_interface/AllocatorPolicy.h create mode 100644 code/alpaka_interface/AlpakaServiceFwd.h create mode 100644 code/alpaka_interface/CachedBufAlloc.h create mode 100644 code/alpaka_interface/CachingAllocator.h create mode 100644 code/alpaka_interface/HostOnlyTask.h create mode 100644 code/alpaka_interface/ScopedContextFwd.h create mode 100644 code/alpaka_interface/config.h create mode 100644 code/alpaka_interface/devices.h create mode 100644 code/alpaka_interface/getDeviceCachingAllocator.h create mode 100644 code/alpaka_interface/getHostCachingAllocator.h create mode 100644 code/alpaka_interface/host.h create mode 100644 code/alpaka_interface/memory.h create mode 100644 code/alpaka_interface/stringize.h create mode 100644 code/alpaka_interface/thread_safety_macros.h create mode 100644 code/alpaka_interface/traits.h diff --git a/SDL/Constants.cuh b/SDL/Constants.cuh index 606fe7f0..68981441 100644 --- a/SDL/Constants.cuh +++ b/SDL/Constants.cuh @@ -2,6 +2,7 @@ #define Constants_cuh #include +#include "../code/alpaka_interface/CachedBufAlloc.h" // CUDA headers. Will be removed soon. #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED @@ -103,11 +104,17 @@ using QueueAcc = alpaka::Queue; template using Buf = alpaka::Buf; +template +ALPAKA_FN_HOST ALPAKA_FN_INLINE Buf allocBufWrapper(TAcc const & devAccIn, TSize nElements, TQueue queue) { + return cms::alpakatools::allocCachedBuf(devAccIn, queue, Vec1d(static_cast(nElements))); +} + template ALPAKA_FN_HOST ALPAKA_FN_INLINE Buf allocBufWrapper(TAcc const & devAccIn, TSize nElements) { return alpaka::allocBuf(devAccIn, Vec1d(static_cast(nElements))); } + const unsigned int MAX_BLOCKS = 80; const unsigned int MAX_CONNECTED_MODULES = 40; const unsigned int N_MAX_PIXEL_MD_PER_MODULES = 100000; diff --git a/SDL/Event.cu b/SDL/Event.cu index aee9e862..e20c9a01 100644 --- a/SDL/Event.cu +++ b/SDL/Event.cu @@ -319,11 +319,11 @@ void SDL::Event::addPixelSegmentToEvent(std::vector hitIndices0,st alpaka::wait(queue); } - auto hitIndices0_dev = allocBufWrapper(devAcc, size); - auto hitIndices1_dev = allocBufWrapper(devAcc, size); - auto hitIndices2_dev = allocBufWrapper(devAcc, size); - auto hitIndices3_dev = allocBufWrapper(devAcc, size); - auto dPhiChange_dev = allocBufWrapper(devAcc, size); + auto hitIndices0_dev = allocBufWrapper(devAcc, size, queue); + auto hitIndices1_dev = allocBufWrapper(devAcc, size, queue); + auto hitIndices2_dev = allocBufWrapper(devAcc, size, queue); + auto hitIndices3_dev = allocBufWrapper(devAcc, size, queue); + auto dPhiChange_dev = allocBufWrapper(devAcc, size, queue); alpaka::memcpy(queue, hitIndices0_dev, hitIndices0, size); alpaka::memcpy(queue, hitIndices1_dev, hitIndices1, size); @@ -623,7 +623,7 @@ void SDL::Event::createTriplets() uint16_t *index = alpaka::getPtrNative(index_buf); // Allocate device index - auto index_gpu_buf = allocBufWrapper(devAcc, nLowerModules); + auto index_gpu_buf = allocBufWrapper(devAcc, nLowerModules, queue); // Allocate and copy nSegments from device to host auto nSegments_buf = allocBufWrapper(devHost, nLowerModules); @@ -648,7 +648,7 @@ void SDL::Event::createTriplets() index[nonZeroModules] = innerLowerModuleIndex; nonZeroModules++; } - max_InnerSeg = max(max_InnerSeg, nInnerSegments); + max_InnerSeg = std::max(max_InnerSeg, nInnerSegments); } // Copy index from host to device @@ -741,7 +741,7 @@ void SDL::Event::createTrackCandidates() alpaka::enqueue(queue, addpT3asTrackCandidatesInGPUTask); Vec const threadsPerBlockRemoveDupQuints(static_cast(1), static_cast(16), static_cast(32)); - Vec const blocksPerGridRemoveDupQuints(static_cast(1), static_cast(max(nEligibleModules/16,1)), static_cast(max(nEligibleModules/32,1))); + Vec const blocksPerGridRemoveDupQuints(static_cast(1), static_cast(std::max(nEligibleModules/16,1)), static_cast(std::max(nEligibleModules/32,1))); WorkDiv const removeDupQuintupletsInGPUBeforeTC_workDiv(blocksPerGridRemoveDupQuints, threadsPerBlockRemoveDupQuints, elementsPerThread); SDL::removeDupQuintupletsInGPUBeforeTC removeDupQuintupletsInGPUBeforeTC_kernel; @@ -859,8 +859,8 @@ void SDL::Event::createPixelTriplets() auto connectedPixelSize_host_buf = allocBufWrapper(devHost, nInnerSegments); auto connectedPixelIndex_host_buf = allocBufWrapper(devHost, nInnerSegments); - auto connectedPixelSize_dev_buf = allocBufWrapper(devAcc, nInnerSegments); - auto connectedPixelIndex_dev_buf = allocBufWrapper(devAcc, nInnerSegments); + auto connectedPixelSize_dev_buf = allocBufWrapper(devAcc, nInnerSegments, queue); + auto connectedPixelIndex_dev_buf = allocBufWrapper(devAcc, nInnerSegments, queue); int* superbins = alpaka::getPtrNative(superbins_buf); int8_t* pixelTypes = alpaka::getPtrNative(pixelTypes_buf); @@ -994,7 +994,7 @@ void SDL::Event::createQuintuplets() } Vec const threadsPerBlockQuints(static_cast(1), static_cast(8), static_cast(32)); - Vec const blocksPerGridQuints(static_cast(max(nEligibleT5Modules,1)), static_cast(1), static_cast(1)); + Vec const blocksPerGridQuints(static_cast(std::max((int) nEligibleT5Modules, 1)), static_cast(1), static_cast(1)); WorkDiv const createQuintupletsInGPUv2_workDiv(blocksPerGridQuints, threadsPerBlockQuints, elementsPerThread); SDL::createQuintupletsInGPUv2 createQuintupletsInGPUv2_kernel; @@ -1097,8 +1097,8 @@ void SDL::Event::createPixelQuintuplets() auto connectedPixelSize_host_buf = allocBufWrapper(devHost, nInnerSegments); auto connectedPixelIndex_host_buf = allocBufWrapper(devHost, nInnerSegments); - auto connectedPixelSize_dev_buf = allocBufWrapper(devAcc, nInnerSegments); - auto connectedPixelIndex_dev_buf = allocBufWrapper(devAcc, nInnerSegments); + auto connectedPixelSize_dev_buf = allocBufWrapper(devAcc, nInnerSegments, queue); + auto connectedPixelIndex_dev_buf = allocBufWrapper(devAcc, nInnerSegments, queue); int* superbins = alpaka::getPtrNative(superbins_buf); int8_t* pixelTypes = alpaka::getPtrNative(pixelTypes_buf); diff --git a/SDL/Makefile b/SDL/Makefile index 6ac11fb1..0c1a1cdf 100644 --- a/SDL/Makefile +++ b/SDL/Makefile @@ -20,8 +20,9 @@ CXX = g++ CXXFLAGS = -g -Wall -Wshadow -std=c++17 -Woverloaded-virtual -fPIC -fopenmp -I.. CXXFLAGS_CUDA = -g --compiler-options -Wall --compiler-options -Wshadow --compiler-options -Woverloaded-virtual --compiler-options -fPIC --compiler-options -fopenmp -dc -lineinfo --ptxas-options=-v --cudart shared -arch=compute_70 -I/mnt/data1/dsr/cub --use_fast_math --default-stream per-thread -I.. ROOTCFLAGS = -pthread -std=c++17 -m64 -I/cvmfs/cms.cern.ch/slc7_amd64_gcc900/cms/cmssw/CMSSW_11_2_0_pre5/external/slc7_amd64_gcc900/bin/../../../../../../../slc7_amd64_gcc900/lcg/root/6.20.06-ghbfee3/include -ALPAKAINCLUDE = -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED -std=c++17 -ALPAKAFLAGS = -DALPAKA_ACC_GPU_CUDA_ENABLED -DALPAKA_ACC_GPU_CUDA_ONLY --expt-relaxed-constexpr +ALPAKAINCLUDE = -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include --std=c++17 +ALPAKASERIAL = -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED +ALPAKACUDA = -DALPAKA_ACC_GPU_CUDA_ENABLED -DALPAKA_ACC_GPU_CUDA_ONLY --expt-relaxed-constexpr LD = nvcc SOFLAGS = -g -shared --compiler-options -fPIC --cudart shared -arch=compute_70 -code=sm_72 PRINTFLAG = -DT4FromT3 #-DWarnings @@ -35,10 +36,10 @@ CUTVALUEFLAG = CUTVALUEFLAG_FLAGS = -DCUT_VALUE_DEBUG %_cuda.o : %.cu %.cuh - $(LD) -x cu $(CXXFLAGS_CUDA) $(LDFLAGS) $(ROOTLIBS) $(MEMFLAG) $(PRINTFLAG) $(CACHEFLAG) $(CUTVALUEFLAG) $(DUPLICATES) $(ALPAKAINCLUDE) $(ALPAKAFLAGS) $< -o $@ + $(LD) -x cu $(CXXFLAGS_CUDA) $(LDFLAGS) $(ROOTLIBS) $(MEMFLAG) $(PRINTFLAG) $(CACHEFLAG) $(CUTVALUEFLAG) $(DUPLICATES) $(ALPAKAINCLUDE) $(ALPAKACUDA) $< -o $@ %_cpu.o : %.cc %.h - $(CXX) -c -O2 $(CXXFLAGS) $(LDFLAGS) $(ROOTLIBS) $(MEMFLAG) $(PRINTFLAG) $(CACHEFLAG) $(DUPLICATES) $(ROOTCFLAGS) $(ALPAKAINCLUDE) $< -o $@ + $(CXX) -c -O2 $(CXXFLAGS) $(LDFLAGS) $(ROOTLIBS) $(MEMFLAG) $(PRINTFLAG) $(CACHEFLAG) $(DUPLICATES) $(ROOTCFLAGS) $(ALPAKAINCLUDE) $(ALPAKASERIAL) $< -o $@ $(LIB):$(CCOBJECTS) $(CUOBJECTS) #$(LIB):$(CUOBJECTS) diff --git a/code/alpaka_interface/AllocatorConfig.h b/code/alpaka_interface/AllocatorConfig.h new file mode 100644 index 00000000..83b5214a --- /dev/null +++ b/code/alpaka_interface/AllocatorConfig.h @@ -0,0 +1,31 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_AllocatorConfig_h +#define HeterogeneousCore_AlpakaInterface_interface_AllocatorConfig_h + +#include + +namespace cms::alpakatools { + + namespace config { + + // bin growth factor (bin_growth in cub::CachingDeviceAllocator) + constexpr unsigned int binGrowth = 2; + + // smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CachingDeviceAllocator + constexpr unsigned int minBin = 8; // 256 bytes + + // largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail. + constexpr unsigned int maxBin = 30; // 1 GB + + // total storage for the allocator; 0 means no limit. + constexpr size_t maxCachedBytes = 0; + + // fraction of total device memory taken for the allocator; 0 means no limit. + constexpr double maxCachedFraction = 0.8; + + // if both maxCachedBytes and maxCachedFraction are non-zero, the smallest resulting value is used. + + } // namespace config + +} // namespace cms::alpakatools + +#endif // HeterogeneousCore_AlpakaInterface_interface_AllocatorConfig_h diff --git a/code/alpaka_interface/AllocatorPolicy.h b/code/alpaka_interface/AllocatorPolicy.h new file mode 100644 index 00000000..16bf3652 --- /dev/null +++ b/code/alpaka_interface/AllocatorPolicy.h @@ -0,0 +1,53 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_AllocatorPolicy_h +#define HeterogeneousCore_AlpakaInterface_interface_AllocatorPolicy_h + +#include + +#include "traits.h" + +namespace cms::alpakatools { + + // Which memory allocator to use + // - Synchronous: (device and host) cudaMalloc/hipMalloc and cudaMallocHost/hipMallocHost + // - Asynchronous: (device only) cudaMallocAsync (requires CUDA >= 11.2) + // - Caching: (device and host) caching allocator + enum class AllocatorPolicy { Synchronous = 0, Asynchronous = 1, Caching = 2 }; + + template >> + constexpr inline AllocatorPolicy allocator_policy = AllocatorPolicy::Synchronous; + +#if defined ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED || defined ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED + template <> + constexpr inline AllocatorPolicy allocator_policy = +#if !defined ALPAKA_DISABLE_CACHING_ALLOCATOR + AllocatorPolicy::Caching; +#else + AllocatorPolicy::Synchronous; +#endif +#endif // defined ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED || defined ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED + +#if defined ALPAKA_ACC_GPU_CUDA_ENABLED + template <> + constexpr inline AllocatorPolicy allocator_policy = +#if !defined ALPAKA_DISABLE_CACHING_ALLOCATOR + AllocatorPolicy::Caching; +#elif CUDA_VERSION >= 11020 && !defined ALPAKA_DISABLE_ASYNC_ALLOCATOR + AllocatorPolicy::Asynchronous; +#else + AllocatorPolicy::Synchronous; +#endif +#endif // ALPAKA_ACC_GPU_CUDA_ENABLED + +#if defined ALPAKA_ACC_GPU_HIP_ENABLED + template <> + constexpr inline AllocatorPolicy allocator_policy = +#if !defined ALPAKA_DISABLE_CACHING_ALLOCATOR + AllocatorPolicy::Caching; +#else + AllocatorPolicy::Synchronous; +#endif +#endif // ALPAKA_ACC_GPU_HIP_ENABLED + +} // namespace cms::alpakatools + +#endif // HeterogeneousCore_AlpakaInterface_interface_AllocatorPolicy_h diff --git a/code/alpaka_interface/AlpakaServiceFwd.h b/code/alpaka_interface/AlpakaServiceFwd.h new file mode 100644 index 00000000..4345f3f3 --- /dev/null +++ b/code/alpaka_interface/AlpakaServiceFwd.h @@ -0,0 +1,33 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_AlpakaServiceFwd_h +#define HeterogeneousCore_AlpakaInterface_interface_AlpakaServiceFwd_h + +// Forward declaration of the alpaka accelerator namespaces and of the AlpakaService for each of them. +// +// This file is under HeterogeneousCore/AlpakaInterface to avoid introducing a dependency on +// HeterogeneousCore/AlpakaServices and HeterogeneousCore/AlpakaCore. + +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED +namespace alpaka_cuda_async { + class AlpakaService; +} // namespace alpaka_cuda_async +#endif // ALPAKA_ACC_GPU_CUDA_ENABLED + +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED +namespace alpaka_hip_async { + class AlpakaService; +} // namespace alpaka_hip_async +#endif // ALPAKA_ACC_GPU_HIP_ENABLED + +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED +namespace alpaka_serial_sync { + class AlpakaService; +} // namespace alpaka_serial_sync +#endif // ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED + +#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED +namespace alpaka_tbb_async { + class AlpakaService; +} // namespace alpaka_tbb_async +#endif // ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED + +#endif // HeterogeneousCore_AlpakaInterface_interface_AlpakaServiceFwd_h diff --git a/code/alpaka_interface/CachedBufAlloc.h b/code/alpaka_interface/CachedBufAlloc.h new file mode 100644 index 00000000..c5d7eec3 --- /dev/null +++ b/code/alpaka_interface/CachedBufAlloc.h @@ -0,0 +1,207 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_CachedBufAlloc_h +#define HeterogeneousCore_AlpakaInterface_interface_CachedBufAlloc_h + +#include + +#include "getDeviceCachingAllocator.h" +#include "getHostCachingAllocator.h" +#include "traits.h" + +namespace cms::alpakatools { + + namespace traits { + + //! The caching memory allocator trait. + template and cms::alpakatools::is_queue_v>> + struct CachedBufAlloc { + static_assert(alpaka::meta::DependentFalseType::value, "This device does not support a caching allocator"); + }; + + //! The caching memory allocator implementation for the CPU device + template + struct CachedBufAlloc { + template + ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev, TQueue queue, TExtent const& extent) + -> alpaka::BufCpu { + // non-cached, queue-ordered asynchronous host-only memory + return alpaka::allocAsyncBuf(queue, extent); + } + }; + +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + + //! The caching memory allocator implementation for the pinned host memory, with a blocking queue + template + struct CachedBufAlloc { + template + ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev, + alpaka::QueueCudaRtBlocking queue, + TExtent const& extent) -> alpaka::BufCpu { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + + auto& allocator = getHostCachingAllocator(); + + // FIXME the BufCpu does not support a pitch ? + size_t size = alpaka::getExtentProduct(extent); + size_t sizeBytes = size * sizeof(TElem); + void* memPtr = allocator.allocate(sizeBytes, queue); + + // use a custom deleter to return the buffer to the CachingAllocator + auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); }; + + return alpaka::BufCpu(dev, reinterpret_cast(memPtr), std::move(deleter), extent); + } + }; + + //! The caching memory allocator implementation for the pinned host memory, with a non-blocking queue + template + struct CachedBufAlloc { + template + ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev, + alpaka::QueueCudaRtNonBlocking queue, + TExtent const& extent) -> alpaka::BufCpu { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + + auto& allocator = getHostCachingAllocator(); + + // FIXME the BufCpu does not support a pitch ? + size_t size = alpaka::getExtentProduct(extent); + size_t sizeBytes = size * sizeof(TElem); + void* memPtr = allocator.allocate(sizeBytes, queue); + + // use a custom deleter to return the buffer to the CachingAllocator + auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); }; + + return alpaka::BufCpu(dev, reinterpret_cast(memPtr), std::move(deleter), extent); + } + }; + + //! The caching memory allocator implementation for the CUDA device + template + struct CachedBufAlloc { + template + ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCudaRt const& dev, TQueue queue, TExtent const& extent) + -> alpaka::BufCudaRt { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + + auto& allocator = getDeviceCachingAllocator(dev); + + size_t width = alpaka::getWidth(extent); + size_t widthBytes = width * static_cast(sizeof(TElem)); + // TODO implement pitch for TDim > 1 + size_t pitchBytes = widthBytes; + size_t size = alpaka::getExtentProduct(extent); + size_t sizeBytes = size * sizeof(TElem); + void* memPtr = allocator.allocate(sizeBytes, queue); + + // use a custom deleter to return the buffer to the CachingAllocator + auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); }; + + return alpaka::BufCudaRt( + dev, reinterpret_cast(memPtr), std::move(deleter), pitchBytes, extent); + } + }; + +#endif // ALPAKA_ACC_GPU_CUDA_ENABLED + +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + + //! The caching memory allocator implementation for the pinned host memory, with a blocking queue + template + struct CachedBufAlloc { + template + ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev, + alpaka::QueueHipRtBlocking queue, + TExtent const& extent) -> alpaka::BufCpu { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + + auto& allocator = getHostCachingAllocator(); + + // FIXME the BufCpu does not support a pitch ? + size_t size = alpaka::getExtentProduct(extent); + size_t sizeBytes = size * sizeof(TElem); + void* memPtr = allocator.allocate(sizeBytes, queue); + + // use a custom deleter to return the buffer to the CachingAllocator + auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); }; + + return alpaka::BufCpu(dev, reinterpret_cast(memPtr), std::move(deleter), extent); + } + }; + + //! The caching memory allocator implementation for the pinned host memory, with a non-blocking queue + template + struct CachedBufAlloc { + template + ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev, + alpaka::QueueHipRtNonBlocking queue, + TExtent const& extent) -> alpaka::BufCpu { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + + auto& allocator = getHostCachingAllocator(); + + // FIXME the BufCpu does not support a pitch ? + size_t size = alpaka::getExtentProduct(extent); + size_t sizeBytes = size * sizeof(TElem); + void* memPtr = allocator.allocate(sizeBytes, queue); + + // use a custom deleter to return the buffer to the CachingAllocator + auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); }; + + return alpaka::BufCpu(dev, reinterpret_cast(memPtr), std::move(deleter), extent); + } + }; + + //! The caching memory allocator implementation for the ROCm/HIP device + template >> + struct CachedBufAlloc { + template + ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevHipRt const& dev, TQueue queue, TExtent const& extent) + -> alpaka::BufHipRt { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + + auto& allocator = getDeviceCachingAllocator(dev); + + size_t width = alpaka::getWidth(extent); + size_t widthBytes = width * static_cast(sizeof(TElem)); + // TODO implement pitch for TDim > 1 + size_t pitchBytes = widthBytes; + size_t size = alpaka::getExtentProduct(extent); + size_t sizeBytes = size * sizeof(TElem); + void* memPtr = allocator.allocate(sizeBytes, queue); + + // use a custom deleter to return the buffer to the CachingAllocator + auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); }; + + return alpaka::BufHipRt( + dev, reinterpret_cast(memPtr), std::move(deleter), pitchBytes, extent); + } + }; + +#endif // ALPAKA_ACC_GPU_HIP_ENABLED + + } // namespace traits + + template and cms::alpakatools::is_queue_v>> + ALPAKA_FN_HOST auto allocCachedBuf(TDev const& dev, TQueue queue, TExtent const& extent = TExtent()) { + return traits::CachedBufAlloc, TIdx, TDev, TQueue>::allocCachedBuf(dev, queue, extent); + } + +} // namespace cms::alpakatools + +#endif // HeterogeneousCore_AlpakaInterface_interface_CachedBufAlloc_h diff --git a/code/alpaka_interface/CachingAllocator.h b/code/alpaka_interface/CachingAllocator.h new file mode 100644 index 00000000..72a52694 --- /dev/null +++ b/code/alpaka_interface/CachingAllocator.h @@ -0,0 +1,436 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_CachingAllocator_h +#define HeterogeneousCore_AlpakaInterface_interface_CachingAllocator_h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "traits.h" +#include "AlpakaServiceFwd.h" + +// Inspired by cub::CachingDeviceAllocator + +namespace cms::alpakatools { + + namespace detail { + + inline constexpr unsigned int power(unsigned int base, unsigned int exponent) { + unsigned int power = 1; + while (exponent > 0) { + if (exponent & 1) { + power = power * base; + } + base = base * base; + exponent = exponent >> 1; + } + return power; + } + + // format a memory size in B/kB/MB/GB + inline std::string as_bytes(size_t value) { + if (value == std::numeric_limits::max()) { + return "unlimited"; + } else if (value >= (1 << 30) and value % (1 << 30) == 0) { + return std::to_string(value >> 30) + " GB"; + } else if (value >= (1 << 20) and value % (1 << 20) == 0) { + return std::to_string(value >> 20) + " MB"; + } else if (value >= (1 << 10) and value % (1 << 10) == 0) { + return std::to_string(value >> 10) + " kB"; + } else { + return std::to_string(value) + " B"; + } + } + + } // namespace detail + + /* + * The "memory device" identifies the memory space, i.e. the device where the memory is allocated. + * A caching allocator object is associated to a single memory `Device`, set at construction time, and unchanged for + * the lifetime of the allocator. + * + * Each allocation is associated to an event on a queue, that identifies the "synchronisation device" according to + * which the synchronisation occurs. + * The `Event` type depends only on the synchronisation `Device` type. + * The `Queue` type depends on the synchronisation `Device` type and the queue properties, either `Sync` or `Async`. + * + * **Note**: how to handle different queue and event types in a single allocator ? store and access type-punned + * queues and events ? or template the internal structures on them, but with a common base class ? + * alpaka does rely on the compile-time type for dispatch. + * + * Common use case #1: accelerator's memory allocations + * - the "memory device" is the accelerator device (e.g. a GPU); + * - the "synchronisation device" is the same accelerator device; + * - the `Queue` type is usually always the same (either `Sync` or `Async`). + * + * Common use case #2: pinned host memory allocations + * - the "memory device" is the host device (e.g. system memory); + * - the "synchronisation device" is the accelerator device (e.g. a GPU) whose work queue will access the host; + * memory (direct memory access from the accelerator, or scheduling `alpaka::memcpy`/`alpaka::memset`), and can + * be different for each allocation; + * - the synchronisation `Device` _type_ could potentially be different, but memory pinning is currently tied to + * the accelerator's platform (CUDA, HIP, etc.), so the device type needs to be fixed to benefit from caching; + * - the `Queue` type can be either `Sync` _or_ `Async` on any allocation. + */ + + template and cms::alpakatools::is_queue_v>> + class CachingAllocator { + public: +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + friend class alpaka_cuda_async::AlpakaService; +#endif +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + friend class alpaka_hip_async::AlpakaService; +#endif +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED + friend class alpaka_serial_sync::AlpakaService; +#endif +#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED + friend class alpaka_tbb_async::AlpakaService; +#endif + + using Device = TDev; // the "memory device", where the memory will be allocated + using Queue = TQueue; // the queue used to submit the memory operations + using Event = alpaka::Event; // the events used to synchronise the operations + using Buffer = alpaka::Buf, size_t>; + + // The "memory device" type can either be the same as the "synchronisation device" type, or be the host CPU. + static_assert(std::is_same_v> or std::is_same_v, + "The \"memory device\" type can either be the same as the \"synchronisation device\" type, or be the " + "host CPU."); + + struct CachedBytes { + size_t free = 0; // total bytes freed and cached on this device + size_t live = 0; // total bytes currently in use oin this device + size_t requested = 0; // total bytes requested and currently in use on this device + }; + + explicit CachingAllocator( + Device const& device, + unsigned int binGrowth, // bin growth factor; + unsigned int minBin, // smallest bin, corresponds to binGrowth^minBin bytes; + // smaller allocations are rounded to this value; + unsigned int maxBin, // largest bin, corresponds to binGrowth^maxBin bytes; + // larger allocations will fail; + size_t maxCachedBytes, // total storage for the allocator (0 means no limit); + double maxCachedFraction, // fraction of total device memory taken for the allocator (0 means no limit); + // if both maxCachedBytes and maxCachedFraction are non-zero, + // the smallest resulting value is used. + bool reuseSameQueueAllocations, // reuse non-ready allocations if they are in the same queue as the new one; + // this is safe only if all memory operations are scheduled in the same queue + bool debug) + : device_(device), + binGrowth_(binGrowth), + minBin_(minBin), + maxBin_(maxBin), + minBinBytes_(detail::power(binGrowth, minBin)), + maxBinBytes_(detail::power(binGrowth, maxBin)), + maxCachedBytes_(cacheSize(maxCachedBytes, maxCachedFraction)), + reuseSameQueueAllocations_(reuseSameQueueAllocations), + debug_(debug) { + if (debug_) { + std::ostringstream out; + out << "CachingAllocator settings\n" + << " bin growth " << binGrowth_ << "\n" + << " min bin " << minBin_ << "\n" + << " max bin " << maxBin_ << "\n" + << " resulting bins:\n"; + for (auto bin = minBin_; bin <= maxBin_; ++bin) { + auto binSize = detail::power(binGrowth, bin); + out << " " << std::right << std::setw(12) << detail::as_bytes(binSize) << '\n'; + } + out << " maximum amount of cached memory: " << detail::as_bytes(maxCachedBytes_); + std::cout << out.str() << std::endl; + } + } + + ~CachingAllocator() { + { + // this should never be called while some memory blocks are still live + std::scoped_lock lock(mutex_); + assert(liveBlocks_.empty()); + assert(cachedBytes_.live == 0); + } + + freeAllCached(); + } + + // return a copy of the cache allocation status, for monitoring purposes + CachedBytes cacheStatus() const { + std::scoped_lock lock(mutex_); + return cachedBytes_; + } + + // Allocate given number of bytes on the current device associated to given queue + void* allocate(size_t bytes, Queue queue) { + // create a block descriptor for the requested allocation + BlockDescriptor block; + block.queue = std::move(queue); + block.requested = bytes; + std::tie(block.bin, block.bytes) = findBin(bytes); + + // try to re-use a cached block, or allocate a new buffer + if (not tryReuseCachedBlock(block)) { + allocateNewBlock(block); + } + + return block.buffer->data(); + } + + // frees an allocation + void free(void* ptr) { + std::scoped_lock lock(mutex_); + + auto iBlock = liveBlocks_.find(ptr); + if (iBlock == liveBlocks_.end()) { + std::stringstream ss; + ss << "Trying to free a non-live block at " << ptr; + throw std::runtime_error(ss.str()); + } + // remove the block from the list of live blocks + BlockDescriptor block = std::move(iBlock->second); + liveBlocks_.erase(iBlock); + cachedBytes_.live -= block.bytes; + cachedBytes_.requested -= block.requested; + + bool recache = (cachedBytes_.free + block.bytes <= maxCachedBytes_); + if (recache) { + alpaka::enqueue(*(block.queue), *(block.event)); + cachedBytes_.free += block.bytes; + // after the call to insert(), cachedBlocks_ shares ownership of the buffer + // TODO use std::move ? + cachedBlocks_.insert(std::make_pair(block.bin, block)); + + if (debug_) { + std::ostringstream out; + out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " returned " << block.bytes << " bytes at " + << ptr << " from associated queue " << block.queue->m_spQueueImpl.get() << " , event " + << block.event->m_spEventImpl.get() << " .\n\t\t " << cachedBlocks_.size() << " available blocks cached (" + << cachedBytes_.free << " bytes), " << liveBlocks_.size() << " live blocks (" << cachedBytes_.live + << " bytes) outstanding." << std::endl; + std::cout << out.str() << std::endl; + } + } else { + // if the buffer is not recached, it is automatically freed when block goes out of scope + if (debug_) { + std::ostringstream out; + out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " freed " << block.bytes << " bytes at " + << ptr << " from associated queue " << block.queue->m_spQueueImpl.get() << ", event " + << block.event->m_spEventImpl.get() << " .\n\t\t " << cachedBlocks_.size() << " available blocks cached (" + << cachedBytes_.free << " bytes), " << liveBlocks_.size() << " live blocks (" << cachedBytes_.live + << " bytes) outstanding." << std::endl; + std::cout << out.str() << std::endl; + } + } + } + + private: + struct BlockDescriptor { + std::optional buffer; + std::optional queue; + std::optional event; + size_t bytes = 0; + size_t requested = 0; // for monitoring only + unsigned int bin = 0; + + // the "synchronisation device" for this block + auto device() { return alpaka::getDev(*queue); } + }; + + private: + // return the maximum amount of memory that should be cached on this device + size_t cacheSize(size_t maxCachedBytes, double maxCachedFraction) const { + // note that getMemBytes() returns 0 if the platform does not support querying the device memory + size_t totalMemory = alpaka::getMemBytes(device_); + size_t memoryFraction = static_cast(maxCachedFraction * totalMemory); + size_t size = std::numeric_limits::max(); + if (maxCachedBytes > 0 and maxCachedBytes < size) { + size = maxCachedBytes; + } + if (memoryFraction > 0 and memoryFraction < size) { + size = memoryFraction; + } + return size; + } + + // return (bin, bin size) + std::tuple findBin(size_t bytes) const { + if (bytes < minBinBytes_) { + return std::make_tuple(minBin_, minBinBytes_); + } + if (bytes > maxBinBytes_) { + throw std::runtime_error("Requested allocation size " + std::to_string(bytes) + + " bytes is too large for the caching detail with maximum bin " + + std::to_string(maxBinBytes_) + + " bytes. You might want to increase the maximum bin size"); + } + unsigned int bin = minBin_; + size_t binBytes = minBinBytes_; + while (binBytes < bytes) { + ++bin; + binBytes *= binGrowth_; + } + return std::make_tuple(bin, binBytes); + } + + bool tryReuseCachedBlock(BlockDescriptor& block) { + std::scoped_lock lock(mutex_); + + // iterate through the range of cached blocks in the same bin + const auto [begin, end] = cachedBlocks_.equal_range(block.bin); + for (auto iBlock = begin; iBlock != end; ++iBlock) { + if ((reuseSameQueueAllocations_ and (*block.queue == *(iBlock->second.queue))) or + alpaka::isComplete(*(iBlock->second.event))) { + // associate the cached buffer to the new queue + auto queue = std::move(*(block.queue)); + // TODO cache (or remove) the debug information and use std::move() + block = iBlock->second; + block.queue = std::move(queue); + + // if the new queue is on different device than the old event, create a new event + if (block.device() != alpaka::getDev(*(block.event))) { + block.event = Event{block.device()}; + } + + // insert the cached block into the live blocks + // TODO cache (or remove) the debug information and use std::move() + liveBlocks_[block.buffer->data()] = block; + + // update the accounting information + cachedBytes_.free -= block.bytes; + cachedBytes_.live += block.bytes; + cachedBytes_.requested += block.requested; + + if (debug_) { + std::ostringstream out; + out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " reused cached block at " + << block.buffer->data() << " (" << block.bytes << " bytes) for queue " + << block.queue->m_spQueueImpl.get() << ", event " << block.event->m_spEventImpl.get() + << " (previously associated with queue " << iBlock->second.queue->m_spQueueImpl.get() << " , event " + << iBlock->second.event->m_spEventImpl.get() << ")." << std::endl; + std::cout << out.str() << std::endl; + } + + // remove the reused block from the list of cached blocks + cachedBlocks_.erase(iBlock); + return true; + } + } + + return false; + } + + Buffer allocateBuffer(size_t bytes, Queue const& queue) { + if constexpr (std::is_same_v>) { + // allocate device memory + return alpaka::allocBuf(device_, bytes); + } else if constexpr (std::is_same_v) { + // allocate pinned host memory + return alpaka::allocMappedBuf(device_, alpaka::getDev(queue), bytes); + } else { + // unsupported combination + static_assert(std::is_same_v> or std::is_same_v, + "The \"memory device\" type can either be the same as the \"synchronisation device\" type, or be " + "the host CPU."); + } + } + + void allocateNewBlock(BlockDescriptor& block) { + try { + block.buffer = allocateBuffer(block.bytes, *block.queue); + } catch (std::runtime_error const& e) { + // the allocation attempt failed: free all cached blocks on the device and retry + if (debug_) { + std::ostringstream out; + out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " failed to allocate " << block.bytes + << " bytes for queue " << block.queue->m_spQueueImpl.get() + << ", retrying after freeing cached allocations" << std::endl; + std::cout << out.str() << std::endl; + } + // TODO implement a method that frees only up to block.bytes bytes + freeAllCached(); + + // throw an exception if it fails again + block.buffer = allocateBuffer(block.bytes, *block.queue); + } + + // create a new event associated to the "synchronisation device" + block.event = Event{block.device()}; + + { + std::scoped_lock lock(mutex_); + cachedBytes_.live += block.bytes; + cachedBytes_.requested += block.requested; + // TODO use std::move() ? + liveBlocks_[block.buffer->data()] = block; + } + + if (debug_) { + std::ostringstream out; + out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " allocated new block at " + << block.buffer->data() << " (" << block.bytes << " bytes associated with queue " + << block.queue->m_spQueueImpl.get() << ", event " << block.event->m_spEventImpl.get() << "." << std::endl; + std::cout << out.str() << std::endl; + } + } + + void freeAllCached() { + std::scoped_lock lock(mutex_); + + while (not cachedBlocks_.empty()) { + auto iBlock = cachedBlocks_.begin(); + cachedBytes_.free -= iBlock->second.bytes; + + if (debug_) { + std::ostringstream out; + out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " freed " << iBlock->second.bytes + << " bytes.\n\t\t " << (cachedBlocks_.size() - 1) << " available blocks cached (" << cachedBytes_.free + << " bytes), " << liveBlocks_.size() << " live blocks (" << cachedBytes_.live << " bytes) outstanding." + << std::endl; + std::cout << out.str() << std::endl; + } + + cachedBlocks_.erase(iBlock); + } + } + + // TODO replace with a tbb::concurrent_multimap ? + using CachedBlocks = std::multimap; // ordered by the allocation bin + // TODO replace with a tbb::concurrent_map ? + using BusyBlocks = std::map; // ordered by the address of the allocated memory + + inline static const std::string deviceType_ = alpaka::core::demangled; + + mutable std::mutex mutex_; + Device device_; // the device where the memory is allocated + + CachedBytes cachedBytes_; + CachedBlocks cachedBlocks_; // Set of cached device allocations available for reuse + BusyBlocks liveBlocks_; // map of pointers to the live device allocations currently in use + + const unsigned int binGrowth_; // Geometric growth factor for bin-sizes + const unsigned int minBin_; + const unsigned int maxBin_; + + const size_t minBinBytes_; + const size_t maxBinBytes_; + const size_t maxCachedBytes_; // Maximum aggregate cached bytes per device + + const bool reuseSameQueueAllocations_; + const bool debug_; + }; + +} // namespace cms::alpakatools + +#endif // HeterogeneousCore_AlpakaInterface_interface_CachingAllocator_h diff --git a/code/alpaka_interface/HostOnlyTask.h b/code/alpaka_interface/HostOnlyTask.h new file mode 100644 index 00000000..fc07921e --- /dev/null +++ b/code/alpaka_interface/HostOnlyTask.h @@ -0,0 +1,71 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_HostOnlyTask_h +#define HeterogeneousCore_AlpakaInterface_interface_HostOnlyTask_h + +#include +#include + +#include + +namespace alpaka { + + //! A task that is guaranted not to call any GPU-ralated APIs + //! + //! These tasks can be enqueued directly to the native GPU queues, without the use of a + //! dedicated host-side worker thread. + class HostOnlyTask { + public: + HostOnlyTask(std::function task) : task_(std::move(task)) {} + + void operator()() const { task_(); } + + private: + std::function task_; + }; + + namespace trait { + +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + //! The CUDA async queue enqueue trait specialization for "safe tasks" + template <> + struct Enqueue { + using TApi = ApiCudaRt; + + static void CUDART_CB callback(cudaStream_t /*queue*/, cudaError_t /*status*/, void* arg) { + //ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status); + std::unique_ptr pTask(static_cast(arg)); + (*pTask)(); + } + + ALPAKA_FN_HOST static auto enqueue(QueueCudaRtNonBlocking& queue, HostOnlyTask task) -> void { + auto pTask = std::make_unique(std::move(task)); + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + cudaStreamAddCallback(alpaka::getNativeHandle(queue), callback, static_cast(pTask.release()), 0u)); + } + }; +#endif // ALPAKA_ACC_GPU_CUDA_ENABLED + +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + //! The HIP async queue enqueue trait specialization for "safe tasks" + template <> + struct Enqueue { + using TApi = ApiHipRt; + + static void callback(hipStream_t /*queue*/, hipError_t /*status*/, void* arg) { + //ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status); + std::unique_ptr pTask(static_cast(arg)); + (*pTask)(); + } + + ALPAKA_FN_HOST static auto enqueue(QueueHipRtNonBlocking& queue, HostOnlyTask task) -> void { + auto pTask = std::make_unique(std::move(task)); + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + hipStreamAddCallback(alpaka::getNativeHandle(queue), callback, static_cast(pTask.release()), 0u)); + } + }; +#endif // ALPAKA_ACC_GPU_HIP_ENABLED + + } // namespace trait + +} // namespace alpaka + +#endif // HeterogeneousCore_AlpakaInterface_interface_HostOnlyTask_h diff --git a/code/alpaka_interface/ScopedContextFwd.h b/code/alpaka_interface/ScopedContextFwd.h new file mode 100644 index 00000000..206824aa --- /dev/null +++ b/code/alpaka_interface/ScopedContextFwd.h @@ -0,0 +1,35 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_ScopedContextFwd_h +#define HeterogeneousCore_AlpakaInterface_interface_ScopedContextFwd_h + +#include "traits.h" + +// Forward declaration of the alpaka framework Context classes +// +// This file is under HeterogeneousCore/AlpakaInterface to avoid introducing a dependency on +// HeterogeneousCore/AlpakaCore. + +namespace cms::alpakatools { + + namespace impl { + template >> + class ScopedContextBase; + + template >> + class ScopedContextGetterBase; + } // namespace impl + + template >> + class ScopedContextAcquire; + + template >> + class ScopedContextProduce; + + template >> + class ScopedContextTask; + + template >> + class ScopedContextAnalyze; + +} // namespace cms::alpakatools + +#endif // HeterogeneousCore_AlpakaInterface_interface_ScopedContextFwd_h diff --git a/code/alpaka_interface/config.h b/code/alpaka_interface/config.h new file mode 100644 index 00000000..354a93b9 --- /dev/null +++ b/code/alpaka_interface/config.h @@ -0,0 +1,164 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_config_h +#define HeterogeneousCore_AlpakaInterface_interface_config_h + +#include + +#include + +#include "stringize.h" + +namespace alpaka_common { + + // common types and dimensions + using Idx = uint32_t; + using Extent = uint32_t; + using Offsets = Extent; + + using Dim0D = alpaka::DimInt<0u>; + using Dim1D = alpaka::DimInt<1u>; + using Dim2D = alpaka::DimInt<2u>; + using Dim3D = alpaka::DimInt<3u>; + + template + using Vec = alpaka::Vec; + using Vec1D = Vec; + using Vec2D = Vec; + using Vec3D = Vec; + using Scalar = Vec; + + template + using WorkDiv = alpaka::WorkDivMembers; + using WorkDiv1D = WorkDiv; + using WorkDiv2D = WorkDiv; + using WorkDiv3D = WorkDiv; + + // host types + using DevHost = alpaka::DevCpu; + using PltfHost = alpaka::Pltf; + +} // namespace alpaka_common + +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED +namespace alpaka_cuda_async { + using namespace alpaka_common; + + using Platform = alpaka::PltfCudaRt; + using Device = alpaka::DevCudaRt; + using Queue = alpaka::QueueCudaRtNonBlocking; + using Event = alpaka::EventCudaRt; + + template + using Acc = alpaka::AccGpuCudaRt; + using Acc1D = Acc; + using Acc2D = Acc; + using Acc3D = Acc; + +} // namespace alpaka_cuda_async + +#ifdef ALPAKA_ACCELERATOR_NAMESPACE +#define ALPAKA_DUPLICATE_NAMESPACE +#else +#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_cuda_async +#define ALPAKA_TYPE_SUFFIX CudaAsync +#endif + +#endif // ALPAKA_ACC_GPU_CUDA_ENABLED + +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED +namespace alpaka_hip_async { + using namespace alpaka_common; + + using Platform = alpaka::PltfHipRt; + using Device = alpaka::DevHipRt; + using Queue = alpaka::QueueHipRtNonBlocking; + using Event = alpaka::EventHipRt; + + template + using Acc = alpaka::AccGpuHipRt; + using Acc1D = Acc; + using Acc2D = Acc; + using Acc3D = Acc; + +} // namespace alpaka_hip_async + +#ifdef ALPAKA_ACCELERATOR_NAMESPACE +#define ALPAKA_DUPLICATE_NAMESPACE +#else +#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_hip_async +#define ALPAKA_TYPE_SUFFIX HipAsync +#endif + +#endif // ALPAKA_ACC_GPU_HIP_ENABLED + +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED +namespace alpaka_serial_sync { + using namespace alpaka_common; + + using Platform = alpaka::PltfCpu; + using Device = alpaka::DevCpu; + using Queue = alpaka::QueueCpuBlocking; + using Event = alpaka::EventCpu; + + template + using Acc = alpaka::AccCpuSerial; + using Acc1D = Acc; + using Acc2D = Acc; + using Acc3D = Acc; + +} // namespace alpaka_serial_sync + +#ifdef ALPAKA_ACCELERATOR_NAMESPACE +#define ALPAKA_DUPLICATE_NAMESPACE +#else +#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_serial_sync +#define ALPAKA_TYPE_SUFFIX SerialSync +#endif + +#endif // ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED + +#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED +namespace alpaka_tbb_async { + using namespace alpaka_common; + + using Platform = alpaka::PltfCpu; + using Device = alpaka::DevCpu; + using Queue = alpaka::QueueCpuNonBlocking; + using Event = alpaka::EventCpu; + + template + using Acc = alpaka::AccCpuTbbBlocks; + using Acc1D = Acc; + using Acc2D = Acc; + using Acc3D = Acc; + +} // namespace alpaka_tbb_async + +#ifdef ALPAKA_ACCELERATOR_NAMESPACE +#define ALPAKA_DUPLICATE_NAMESPACE +#else +#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_tbb_async +#define ALPAKA_TYPE_SUFFIX TbbAsync +#endif + +#endif // ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED + +#if defined ALPAKA_DUPLICATE_NAMESPACE +#error Only one alpaka backend symbol can be defined at the same time: ALPAKA_ACC_GPU_CUDA_ENABLED, ALPAKA_ACC_GPU_HIP_ENABLED, ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED. +#endif + +#if defined ALPAKA_ACCELERATOR_NAMESPACE + +// create a new backend-specific identifier based on the original type name and a backend-specific suffix +#define ALPAKA_TYPE_ALIAS__(TYPE, SUFFIX) TYPE##SUFFIX +#define ALPAKA_TYPE_ALIAS_(TYPE, SUFFIX) ALPAKA_TYPE_ALIAS__(TYPE, SUFFIX) +#define ALPAKA_TYPE_ALIAS(TYPE) ALPAKA_TYPE_ALIAS_(TYPE, ALPAKA_TYPE_SUFFIX) + +// declare the backend-specific identifier as an alias for the namespace-based type name +#define DECLARE_ALPAKA_TYPE_ALIAS(TYPE) using ALPAKA_TYPE_ALIAS(TYPE) = ALPAKA_ACCELERATOR_NAMESPACE::TYPE + +// define a null-terminated string containing the backend-specific identifier +#define ALPAKA_TYPE_ALIAS_NAME(TYPE) EDM_STRINGIZE(ALPAKA_TYPE_ALIAS(TYPE)) + +#endif // ALPAKA_ACCELERATOR_NAMESPACE + +#endif // HeterogeneousCore_AlpakaInterface_interface_config_h diff --git a/code/alpaka_interface/devices.h b/code/alpaka_interface/devices.h new file mode 100644 index 00000000..24630ece --- /dev/null +++ b/code/alpaka_interface/devices.h @@ -0,0 +1,43 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_devices_h +#define HeterogeneousCore_AlpakaInterface_interface_devices_h + +#include +#include + +#include + +#include "config.h" +#include "traits.h" + +namespace cms::alpakatools { + + namespace detail { + + template >> + inline std::vector> enumerate_devices() { + using Platform = TPlatform; + using Device = alpaka::Dev; + + std::vector devices; + uint32_t n = alpaka::getDevCount(); + devices.reserve(n); + for (uint32_t i = 0; i < n; ++i) { + devices.push_back(alpaka::getDevByIdx(i)); + assert(alpaka::getNativeHandle(devices.back()) == static_cast(i)); + } + + return devices; + } + + } // namespace detail + + // return the alpaka accelerator devices for the given platform + template >> + inline std::vector> const& devices() { + static const auto devices = detail::enumerate_devices(); + return devices; + } + +} // namespace cms::alpakatools + +#endif // HeterogeneousCore_AlpakaInterface_interface_devices_h diff --git a/code/alpaka_interface/getDeviceCachingAllocator.h b/code/alpaka_interface/getDeviceCachingAllocator.h new file mode 100644 index 00000000..94e0e7cc --- /dev/null +++ b/code/alpaka_interface/getDeviceCachingAllocator.h @@ -0,0 +1,88 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_getDeviceCachingAllocator_h +#define HeterogeneousCore_AlpakaInterface_interface_getDeviceCachingAllocator_h + +#include + +#include "thread_safety_macros.h" +#include "AllocatorConfig.h" +#include "CachingAllocator.h" +#include "devices.h" +#include "traits.h" + +namespace cms::alpakatools { + + namespace detail { + + template and cms::alpakatools::is_queue_v>> + auto allocate_device_allocators() { + using Allocator = CachingAllocator; + auto const& devices = cms::alpakatools::devices>(); + ssize_t const size = devices.size(); + + // allocate the storage for the objects + auto ptr = std::allocator().allocate(size); + + // construct the objects in the storage + ptrdiff_t index = 0; + try { + for (; index < size; ++index) { +#if __cplusplus >= 202002L + std::construct_at( +#else + std::allocator().construct( +#endif + ptr + index, + devices[index], + config::binGrowth, + config::minBin, + config::maxBin, + config::maxCachedBytes, + config::maxCachedFraction, + true, // reuseSameQueueAllocations + false); // debug + } + } catch (...) { + --index; + // destroy any object that had been succesfully constructed + while (index >= 0) { + std::destroy_at(ptr + index); + --index; + } + // deallocate the storage + std::allocator().deallocate(ptr, size); + // rethrow the exception + throw; + } + + // use a custom deleter to destroy all objects and deallocate the memory + auto deleter = [size](Allocator* ptr) { + for (size_t i = size; i > 0; --i) { + std::destroy_at(ptr + i - 1); + } + std::allocator().deallocate(ptr, size); + }; + + return std::unique_ptr(ptr, deleter); + } + + } // namespace detail + + template and cms::alpakatools::is_queue_v>> + inline CachingAllocator& getDeviceCachingAllocator(TDev const& device) { + // initialise all allocators, one per device + CMS_THREAD_SAFE static auto allocators = detail::allocate_device_allocators(); + + size_t const index = alpaka::getNativeHandle(device); + assert(index < cms::alpakatools::devices>().size()); + + // the public interface is thread safe + return allocators[index]; + } + +} // namespace cms::alpakatools + +#endif // HeterogeneousCore_AlpakaInterface_interface_getDeviceCachingAllocator_h diff --git a/code/alpaka_interface/getHostCachingAllocator.h b/code/alpaka_interface/getHostCachingAllocator.h new file mode 100644 index 00000000..2ffa1871 --- /dev/null +++ b/code/alpaka_interface/getHostCachingAllocator.h @@ -0,0 +1,32 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_getHostCachingAllocator_h +#define HeterogeneousCore_AlpakaInterface_interface_getHostCachingAllocator_h + +#include "thread_safety_macros.h" +#include "AllocatorConfig.h" +#include "CachingAllocator.h" +#include "config.h" +#include "host.h" +#include "traits.h" + +namespace cms::alpakatools { + + template >> + inline CachingAllocator& getHostCachingAllocator() { + // thread safe initialisation of the host allocator + CMS_THREAD_SAFE static CachingAllocator allocator( + host(), + config::binGrowth, + config::minBin, + config::maxBin, + config::maxCachedBytes, + config::maxCachedFraction, + false, // reuseSameQueueAllocations + false); // debug + + // the public interface is thread safe + return allocator; + } + +} // namespace cms::alpakatools + +#endif // HeterogeneousCore_AlpakaInterface_interface_getHostCachingAllocator_h diff --git a/code/alpaka_interface/host.h b/code/alpaka_interface/host.h new file mode 100644 index 00000000..0303313d --- /dev/null +++ b/code/alpaka_interface/host.h @@ -0,0 +1,29 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_host_h +#define HeterogeneousCore_AlpakaInterface_interface_host_h + +namespace cms::alpakatools { + + namespace detail { + + inline alpaka::DevCpu enumerate_host() { + using Platform = alpaka::PltfCpu; + using Host = alpaka::DevCpu; + + assert(alpaka::getDevCount() == 1); + Host host = alpaka::getDevByIdx(0); + assert(alpaka::getNativeHandle(host) == 0); + + return host; + } + + } // namespace detail + + // returns the alpaka host device + static inline alpaka::DevCpu const& host() { + static const auto host = detail::enumerate_host(); + return host; + } + +} // namespace cms::alpakatools + +#endif // HeterogeneousCore_AlpakaInterface_interface_host_h diff --git a/code/alpaka_interface/memory.h b/code/alpaka_interface/memory.h new file mode 100644 index 00000000..cbdc6fc0 --- /dev/null +++ b/code/alpaka_interface/memory.h @@ -0,0 +1,247 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_memory_h +#define HeterogeneousCore_AlpakaInterface_interface_memory_h + +#include + +#include + +#include "AllocatorPolicy.h" +#include "CachedBufAlloc.h" +#include "config.h" +#include "traits.h" + +namespace cms::alpakatools { + + // for Extent, Dim1D, Idx + using namespace alpaka_common; + + // type deduction helpers + namespace detail { + + template >> + struct buffer_type { + using type = alpaka::Buf; + }; + + template + struct buffer_type { + using type = alpaka::Buf; + }; + + template + struct buffer_type { + using type = alpaka::Buf; + }; + + template >> + struct view_type { + using type = alpaka::ViewPlainPtr; + }; + + template + struct view_type { + using type = alpaka::ViewPlainPtr; + }; + + template + struct view_type { + using type = alpaka::ViewPlainPtr; + }; + + } // namespace detail + + // scalar and 1-dimensional host buffers + + template + using host_buffer = typename detail::buffer_type::type; + + template + using const_host_buffer = alpaka::ViewConst>; + + // non-cached, non-pinned, scalar and 1-dimensional host buffers + + template + std::enable_if_t, host_buffer> make_host_buffer() { + return alpaka::allocBuf(host(), Scalar{}); + } + + template + std::enable_if_t and not std::is_array_v>, host_buffer> + make_host_buffer(Extent extent) { + return alpaka::allocBuf, Idx>(host(), Vec1D{extent}); + } + + template + std::enable_if_t and not std::is_array_v>, host_buffer> + make_host_buffer() { + return alpaka::allocBuf, Idx>(host(), Vec1D{std::extent_v}); + } + + // potentially cached, pinned, scalar and 1-dimensional host buffers, associated to a work queue + // the memory is pinned according to the device associated to the queue + + template + std::enable_if_t and not std::is_array_v, host_buffer> make_host_buffer(TQueue const& queue) { + if constexpr (allocator_policy> == AllocatorPolicy::Caching) { + return allocCachedBuf(host(), queue, Scalar{}); + } else { + return alpaka::allocMappedBuf(host(), alpaka::getDev(queue), Scalar{}); + } + } + + template + std::enable_if_t and cms::is_unbounded_array_v and not std::is_array_v>, + host_buffer> + make_host_buffer(TQueue const& queue, Extent extent) { + if constexpr (allocator_policy> == AllocatorPolicy::Caching) { + return allocCachedBuf, Idx>(host(), queue, Vec1D{extent}); + } else { + return alpaka::allocMappedBuf, Idx>(host(), alpaka::getDev(queue), Vec1D{extent}); + } + } + + template + std::enable_if_t and cms::is_bounded_array_v and not std::is_array_v>, + host_buffer> + make_host_buffer(TQueue const& queue) { + if constexpr (allocator_policy> == AllocatorPolicy::Caching) { + return allocCachedBuf, Idx>(host(), queue, Vec1D{std::extent_v}); + } else { + return alpaka::allocMappedBuf, Idx>( + host(), alpaka::getDev(queue), Vec1D{std::extent_v}); + } + } + + // scalar and 1-dimensional host views + + template + using host_view = typename detail::view_type::type; + + template + std::enable_if_t, host_view> make_host_view(T& data) { + return alpaka::ViewPlainPtr(&data, host(), Scalar{}); + } + + template + host_view make_host_view(T* data, Extent extent) { + return alpaka::ViewPlainPtr(data, host(), Vec1D{extent}); + } + + template + std::enable_if_t and not std::is_array_v>, host_view> + make_host_view(T& data, Extent extent) { + return alpaka::ViewPlainPtr, Dim1D, Idx>(data, host(), Vec1D{extent}); + } + + template + std::enable_if_t and not std::is_array_v>, host_view> + make_host_view(T& data) { + return alpaka::ViewPlainPtr, Dim1D, Idx>(data, host(), Vec1D{std::extent_v}); + } + + // scalar and 1-dimensional device buffers + + template >> + using device_buffer = typename detail::buffer_type::type; + + template >> + using const_device_buffer = alpaka::ViewConst>; + + // non-cached, scalar and 1-dimensional device buffers + + template + std::enable_if_t and not std::is_array_v, device_buffer> make_device_buffer( + TDev const& device) { + return alpaka::allocBuf(device, Scalar{}); + } + + template + std::enable_if_t and cms::is_unbounded_array_v and not std::is_array_v>, + device_buffer> + make_device_buffer(TDev const& device, Extent extent) { + return alpaka::allocBuf, Idx>(device, Vec1D{extent}); + } + + template + std::enable_if_t and cms::is_bounded_array_v and not std::is_array_v>, + device_buffer> + make_device_buffer(TDev const& device) { + return alpaka::allocBuf, Idx>(device, Vec1D{std::extent_v}); + } + + // potentially-cached, scalar and 1-dimensional device buffers with queue-ordered semantic + + template + std::enable_if_t and not std::is_array_v, device_buffer, T>> + make_device_buffer(TQueue const& queue) { + if constexpr (allocator_policy> == AllocatorPolicy::Caching) { + return allocCachedBuf(alpaka::getDev(queue), queue, Scalar{}); + } + if constexpr (allocator_policy> == AllocatorPolicy::Asynchronous) { + return alpaka::allocAsyncBuf(queue, Scalar{}); + } + if constexpr (allocator_policy> == AllocatorPolicy::Synchronous) { + return alpaka::allocBuf(alpaka::getDev(queue), Scalar{}); + } + } + + template + std::enable_if_t and cms::is_unbounded_array_v and not std::is_array_v>, + device_buffer, T>> + make_device_buffer(TQueue const& queue, Extent extent) { + if constexpr (allocator_policy> == AllocatorPolicy::Caching) { + return allocCachedBuf, Idx>(alpaka::getDev(queue), queue, Vec1D{extent}); + } + if constexpr (allocator_policy> == AllocatorPolicy::Asynchronous) { + return alpaka::allocAsyncBuf, Idx>(queue, Vec1D{extent}); + } + if constexpr (allocator_policy> == AllocatorPolicy::Synchronous) { + return alpaka::allocBuf, Idx>(alpaka::getDev(queue), Vec1D{extent}); + } + } + + template + std::enable_if_t and cms::is_bounded_array_v and not std::is_array_v>, + device_buffer, T>> + make_device_buffer(TQueue const& queue) { + if constexpr (allocator_policy> == AllocatorPolicy::Caching) { + return allocCachedBuf, Idx>(alpaka::getDev(queue), queue, Vec1D{std::extent_v}); + } + if constexpr (allocator_policy> == AllocatorPolicy::Asynchronous) { + return alpaka::allocAsyncBuf, Idx>(queue, Vec1D{std::extent_v}); + } + if constexpr (allocator_policy> == AllocatorPolicy::Synchronous) { + return alpaka::allocBuf, Idx>(alpaka::getDev(queue), Vec1D{std::extent_v}); + } + } + + // scalar and 1-dimensional device views + + template >> + using device_view = typename detail::view_type::type; + + template + std::enable_if_t, device_view> make_device_view(TDev const& device, T& data) { + return alpaka::ViewPlainPtr(&data, device, Scalar{}); + } + + template + device_view make_device_view(TDev const& device, T* data, Extent extent) { + return alpaka::ViewPlainPtr(data, device, Vec1D{extent}); + } + + template + std::enable_if_t and not std::is_array_v>, device_view> + make_device_view(TDev const& device, T& data, Extent extent) { + return alpaka::ViewPlainPtr, Dim1D, Idx>(data, device, Vec1D{extent}); + } + + template + std::enable_if_t and not std::is_array_v>, device_view> + make_device_view(TDev const& device, T& data) { + return alpaka::ViewPlainPtr, Dim1D, Idx>(data, device, Vec1D{std::extent_v}); + } + +} // namespace cms::alpakatools + +#endif // HeterogeneousCore_AlpakaInterface_interface_memory_h diff --git a/code/alpaka_interface/stringize.h b/code/alpaka_interface/stringize.h new file mode 100644 index 00000000..549d5cbc --- /dev/null +++ b/code/alpaka_interface/stringize.h @@ -0,0 +1,8 @@ +#ifndef FWCore_Utilities_interface_stringize_h +#define FWCore_Utilities_interface_stringize_h + +// convert the macro argument to a null-terminated quoted string +#define EDM_STRINGIZE_(token) #token +#define EDM_STRINGIZE(token) EDM_STRINGIZE_(token) + +#endif // FWCore_Utilities_interface_stringize_h diff --git a/code/alpaka_interface/thread_safety_macros.h b/code/alpaka_interface/thread_safety_macros.h new file mode 100644 index 00000000..3abbe0b9 --- /dev/null +++ b/code/alpaka_interface/thread_safety_macros.h @@ -0,0 +1,12 @@ +#ifndef FWCore_Utilites_thread_safe_macros_h +#define FWCore_Utilites_thread_safe_macros_h +#if !defined __CLING__ && !defined __INTEL_COMPILER && !defined __NVCC__ +#define CMS_THREAD_SAFE [[cms::thread_safe]] +#define CMS_SA_ALLOW [[cms::sa_allow]] +#define CMS_THREAD_GUARD(_var_) [[cms::thread_guard(#_var_)]] +#else +#define CMS_THREAD_SAFE +#define CMS_SA_ALLOW +#define CMS_THREAD_GUARD(_var_) +#endif +#endif diff --git a/code/alpaka_interface/traits.h b/code/alpaka_interface/traits.h new file mode 100644 index 00000000..8235a416 --- /dev/null +++ b/code/alpaka_interface/traits.h @@ -0,0 +1,69 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_traits_h +#define HeterogeneousCore_AlpakaInterface_interface_traits_h + +#include + +#if __cplusplus >= 202002L +namespace cms { + using std::is_bounded_array; + using std::is_unbounded_array; +} // namespace cms +#else +#include +#include +namespace cms { + using boost::is_bounded_array; + using boost::is_unbounded_array; +} // namespace cms +#endif + +namespace cms { + template + inline constexpr bool is_bounded_array_v = is_bounded_array::value; + + template + inline constexpr bool is_unbounded_array_v = is_unbounded_array::value; +} // namespace cms + +#include + +namespace cms::alpakatools { + + // is_platform + + template + struct is_platform + : std::integral_constant::value> {}; + + template + constexpr bool is_platform_v = is_platform::value; + + // is_device + + template + struct is_device : std::integral_constant::value> {}; + + template + constexpr bool is_device_v = is_device::value; + + // is_accelerator + + template + struct is_accelerator + : std::integral_constant::value> {}; + + template + constexpr bool is_accelerator_v = is_accelerator::value; + + // is_queue + + template + struct is_queue : std::integral_constant::value> { + }; + + template + constexpr bool is_queue_v = is_queue::value; + +} // namespace cms::alpakatools + +#endif // HeterogeneousCore_AlpakaInterface_interface_traits_h From 8105c8dd3f1f7dbdfa8bf9ec449d96c3668df84f Mon Sep 17 00:00:00 2001 From: Gavin Niendorf Date: Tue, 27 Jun 2023 19:12:10 -0700 Subject: [PATCH 34/44] bring back caching allocator toggle --- SDL/Constants.cuh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/SDL/Constants.cuh b/SDL/Constants.cuh index 68981441..d3b698cd 100644 --- a/SDL/Constants.cuh +++ b/SDL/Constants.cuh @@ -106,7 +106,11 @@ using Buf = alpaka::Buf; template ALPAKA_FN_HOST ALPAKA_FN_INLINE Buf allocBufWrapper(TAcc const & devAccIn, TSize nElements, TQueue queue) { +#ifdef CACHE_ALLOC return cms::alpakatools::allocCachedBuf(devAccIn, queue, Vec1d(static_cast(nElements))); +#else + return alpaka::allocBuf(devAccIn, Vec1d(static_cast(nElements))); +#endif } template From ad2048bca755a528ba9fe36126a062f15a79eb2c Mon Sep 17 00:00:00 2001 From: Gavin Niendorf Date: Tue, 27 Jun 2023 20:17:57 -0700 Subject: [PATCH 35/44] full alpaka caching allocator --- SDL/Event.cu | 104 ++++++++++++++++++++--------------------- SDL/Hit.cuh | 38 +++++++-------- SDL/MiniDoublet.cuh | 70 +++++++++++++-------------- SDL/Module.cuh | 50 ++++++++++---------- SDL/PixelTriplet.cuh | 82 ++++++++++++++++---------------- SDL/Quintuplet.cuh | 48 +++++++++---------- SDL/Segment.cuh | 68 +++++++++++++-------------- SDL/TrackCandidate.cuh | 28 +++++------ SDL/Triplet.cuh | 56 +++++++++++----------- setup_cgpu.sh | 4 +- 10 files changed, 274 insertions(+), 274 deletions(-) diff --git a/SDL/Event.cu b/SDL/Event.cu index e20c9a01..36e09d15 100644 --- a/SDL/Event.cu +++ b/SDL/Event.cu @@ -172,7 +172,7 @@ void SDL::Event::addHitToEvent(std::vector x, std::vector y, std:: const int nHits = x.size(); // Needed for the memcpy to hitsInGPU below. Will be replaced with a View. - auto nHits_buf = allocBufWrapper(devHost, 1); + auto nHits_buf = allocBufWrapper(devHost, 1, queue); *alpaka::getPtrNative(nHits_buf) = nHits; // Initialize space on device/host for next event. @@ -390,16 +390,16 @@ void SDL::Event::addPixelSegmentToEvent(std::vector hitIndices0,st void SDL::Event::addMiniDoubletsToEventExplicit() { - auto nMDsCPU_buf = allocBufWrapper(devHost, nLowerModules); + auto nMDsCPU_buf = allocBufWrapper(devHost, nLowerModules, queue); alpaka::memcpy(queue, nMDsCPU_buf, miniDoubletsBuffers->nMDs_buf, nLowerModules); - auto module_subdets_buf = allocBufWrapper(devHost, nLowerModules); + auto module_subdets_buf = allocBufWrapper(devHost, nLowerModules, queue); alpaka::memcpy(queue, module_subdets_buf, modulesBuffers->subdets_buf, nLowerModules); - auto module_layers_buf = allocBufWrapper(devHost, nLowerModules); + auto module_layers_buf = allocBufWrapper(devHost, nLowerModules, queue); alpaka::memcpy(queue, module_layers_buf, modulesBuffers->layers_buf, nLowerModules); - auto module_hitRanges_buf = allocBufWrapper(devHost, nLowerModules*2); + auto module_hitRanges_buf = allocBufWrapper(devHost, nLowerModules*2, queue); alpaka::memcpy(queue, module_hitRanges_buf, hitsBuffers->hitRanges_buf, nLowerModules*2); alpaka::wait(queue); @@ -427,13 +427,13 @@ void SDL::Event::addMiniDoubletsToEventExplicit() void SDL::Event::addSegmentsToEventExplicit() { - auto nSegmentsCPU_buf = allocBufWrapper(devHost, nLowerModules); + auto nSegmentsCPU_buf = allocBufWrapper(devHost, nLowerModules, queue); alpaka::memcpy(queue, nSegmentsCPU_buf, segmentsBuffers->nSegments_buf, nLowerModules); - auto module_subdets_buf = allocBufWrapper(devHost, nLowerModules); + auto module_subdets_buf = allocBufWrapper(devHost, nLowerModules, queue); alpaka::memcpy(queue, module_subdets_buf, modulesBuffers->subdets_buf, nLowerModules); - auto module_layers_buf = allocBufWrapper(devHost, nLowerModules); + auto module_layers_buf = allocBufWrapper(devHost, nLowerModules, queue); alpaka::memcpy(queue, module_layers_buf, modulesBuffers->layers_buf, nLowerModules); alpaka::wait(queue); @@ -484,7 +484,7 @@ void SDL::Event::createMiniDoublets() alpaka::enqueue(queue, createMDArrayRangesGPUTask); alpaka::wait(queue); - auto nTotalMDs_buf = allocBufWrapper(devHost, 1); + auto nTotalMDs_buf = allocBufWrapper(devHost, 1, queue); alpaka::memcpy(queue, nTotalMDs_buf, rangesBuffers->device_nTotalMDs_buf, 1); alpaka::wait(queue); @@ -602,7 +602,7 @@ void SDL::Event::createTriplets() alpaka::wait(queue); // TODO: Why are we pulling this back down only to put it back on the device in a new struct? - auto maxTriplets_buf = allocBufWrapper(devHost, 1); + auto maxTriplets_buf = allocBufWrapper(devHost, 1, queue); alpaka::memcpy(queue, maxTriplets_buf, rangesBuffers->device_nTotalTrips_buf, 1); alpaka::wait(queue); @@ -619,21 +619,21 @@ void SDL::Event::createTriplets() unsigned int max_InnerSeg = 0; // Allocate host index - auto index_buf = allocBufWrapper(devHost, nLowerModules); + auto index_buf = allocBufWrapper(devHost, nLowerModules, queue); uint16_t *index = alpaka::getPtrNative(index_buf); // Allocate device index auto index_gpu_buf = allocBufWrapper(devAcc, nLowerModules, queue); // Allocate and copy nSegments from device to host - auto nSegments_buf = allocBufWrapper(devHost, nLowerModules); + auto nSegments_buf = allocBufWrapper(devHost, nLowerModules, queue); alpaka::memcpy(queue, nSegments_buf, segmentsBuffers->nSegments_buf, nLowerModules); alpaka::wait(queue); int *nSegments = alpaka::getPtrNative(nSegments_buf); // Allocate and copy module_nConnectedModules from device to host - auto module_nConnectedModules_buf = allocBufWrapper(devHost, nLowerModules); + auto module_nConnectedModules_buf = allocBufWrapper(devHost, nLowerModules, queue); alpaka::memcpy(queue, module_nConnectedModules_buf, modulesBuffers->nConnectedModules_buf, nLowerModules); alpaka::wait(queue); @@ -704,7 +704,7 @@ void SDL::Event::createTrackCandidates() } // Pull nEligibleT5Modules from the device. - auto nEligibleModules_buf = allocBufWrapper(devHost, 1); + auto nEligibleModules_buf = allocBufWrapper(devHost, 1, queue); alpaka::memcpy(queue, nEligibleModules_buf, rangesBuffers->nEligibleT5Modules_buf, 1); uint16_t nEligibleModules = *alpaka::getPtrNative(nEligibleModules_buf); @@ -850,15 +850,15 @@ void SDL::Event::createPixelTriplets() alpaka::memcpy(queue, nInnerSegments_src_view, dev_view_nSegments); alpaka::wait(queue); - auto superbins_buf = allocBufWrapper(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE); - auto pixelTypes_buf = allocBufWrapper(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE); + auto superbins_buf = allocBufWrapper(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE, queue); + auto pixelTypes_buf = allocBufWrapper(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE, queue); alpaka::memcpy(queue, superbins_buf, segmentsBuffers->superbin_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE); alpaka::memcpy(queue, pixelTypes_buf, segmentsBuffers->pixelType_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE); alpaka::wait(queue); - auto connectedPixelSize_host_buf = allocBufWrapper(devHost, nInnerSegments); - auto connectedPixelIndex_host_buf = allocBufWrapper(devHost, nInnerSegments); + auto connectedPixelSize_host_buf = allocBufWrapper(devHost, nInnerSegments, queue); + auto connectedPixelIndex_host_buf = allocBufWrapper(devHost, nInnerSegments, queue); auto connectedPixelSize_dev_buf = allocBufWrapper(devAcc, nInnerSegments, queue); auto connectedPixelIndex_dev_buf = allocBufWrapper(devAcc, nInnerSegments, queue); @@ -931,7 +931,7 @@ void SDL::Event::createPixelTriplets() alpaka::wait(queue); #ifdef Warnings - auto nPixelTriplets_buf = allocBufWrapper(devHost, 1); + auto nPixelTriplets_buf = allocBufWrapper(devHost, 1, queue); alpaka::memcpy(queue, nPixelTriplets_buf, pixelTripletsBuffers->nPixelTriplets_buf, 1); alpaka::wait(queue); @@ -973,8 +973,8 @@ void SDL::Event::createQuintuplets() alpaka::enqueue(queue, createEligibleModulesListForQuintupletsGPUTask); alpaka::wait(queue); - auto nEligibleT5Modules_buf = allocBufWrapper(devHost, 1); - auto nTotalQuintuplets_buf = allocBufWrapper(devHost, 1); + auto nEligibleT5Modules_buf = allocBufWrapper(devHost, 1, queue); + auto nTotalQuintuplets_buf = allocBufWrapper(devHost, 1, queue); alpaka::memcpy(queue, nEligibleT5Modules_buf, rangesBuffers->nEligibleT5Modules_buf, 1); alpaka::memcpy(queue, nTotalQuintuplets_buf, rangesBuffers->device_nTotalQuints_buf, 1); @@ -1088,15 +1088,15 @@ void SDL::Event::createPixelQuintuplets() alpaka::memcpy(queue, nInnerSegments_src_view, dev_view_nSegments); alpaka::wait(queue); - auto superbins_buf = allocBufWrapper(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE); - auto pixelTypes_buf = allocBufWrapper(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE); + auto superbins_buf = allocBufWrapper(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE, queue); + auto pixelTypes_buf = allocBufWrapper(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE, queue); alpaka::memcpy(queue, superbins_buf, segmentsBuffers->superbin_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE); alpaka::memcpy(queue, pixelTypes_buf, segmentsBuffers->pixelType_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE); alpaka::wait(queue); - auto connectedPixelSize_host_buf = allocBufWrapper(devHost, nInnerSegments); - auto connectedPixelIndex_host_buf = allocBufWrapper(devHost, nInnerSegments); + auto connectedPixelSize_host_buf = allocBufWrapper(devHost, nInnerSegments, queue); + auto connectedPixelIndex_host_buf = allocBufWrapper(devHost, nInnerSegments, queue); auto connectedPixelSize_dev_buf = allocBufWrapper(devAcc, nInnerSegments, queue); auto connectedPixelIndex_dev_buf = allocBufWrapper(devAcc, nInnerSegments, queue); @@ -1197,7 +1197,7 @@ void SDL::Event::createPixelQuintuplets() alpaka::wait(queue); #ifdef Warnings - auto nPixelQuintuplets_buf = allocBufWrapper(devHost, 1); + auto nPixelQuintuplets_buf = allocBufWrapper(devHost, 1, queue); alpaka::memcpy(queue, nPixelQuintuplets_buf, pixelQuintupletsBuffers->nPixelQuintuplets_buf, 1); alpaka::wait(queue); @@ -1208,16 +1208,16 @@ void SDL::Event::createPixelQuintuplets() void SDL::Event::addQuintupletsToEventExplicit() { - auto nQuintupletsCPU_buf = allocBufWrapper(devHost, nLowerModules); + auto nQuintupletsCPU_buf = allocBufWrapper(devHost, nLowerModules, queue); alpaka::memcpy(queue, nQuintupletsCPU_buf, quintupletsBuffers->nQuintuplets_buf, nLowerModules); - auto module_subdets_buf = allocBufWrapper(devHost, nModules); + auto module_subdets_buf = allocBufWrapper(devHost, nModules, queue); alpaka::memcpy(queue, module_subdets_buf, modulesBuffers->subdets_buf, nModules); - auto module_layers_buf = allocBufWrapper(devHost, nLowerModules); + auto module_layers_buf = allocBufWrapper(devHost, nLowerModules, queue); alpaka::memcpy(queue, module_layers_buf, modulesBuffers->layers_buf, nLowerModules); - auto module_quintupletModuleIndices_buf = allocBufWrapper(devHost, nLowerModules); + auto module_quintupletModuleIndices_buf = allocBufWrapper(devHost, nLowerModules, queue); alpaka::memcpy(queue, module_quintupletModuleIndices_buf, rangesBuffers->quintupletModuleIndices_buf, nLowerModules); alpaka::wait(queue); @@ -1245,13 +1245,13 @@ void SDL::Event::addQuintupletsToEventExplicit() void SDL::Event::addTripletsToEventExplicit() { - auto nTripletsCPU_buf = allocBufWrapper(devHost, nLowerModules); + auto nTripletsCPU_buf = allocBufWrapper(devHost, nLowerModules, queue); alpaka::memcpy(queue, nTripletsCPU_buf, tripletsBuffers->nTriplets_buf, nLowerModules); - auto module_subdets_buf = allocBufWrapper(devHost, nLowerModules); + auto module_subdets_buf = allocBufWrapper(devHost, nLowerModules, queue); alpaka::memcpy(queue, module_subdets_buf, modulesBuffers->subdets_buf, nLowerModules); - auto module_layers_buf = allocBufWrapper(devHost, nLowerModules); + auto module_layers_buf = allocBufWrapper(devHost, nLowerModules, queue); alpaka::memcpy(queue, module_layers_buf, modulesBuffers->layers_buf, nLowerModules); alpaka::wait(queue); @@ -1409,7 +1409,7 @@ unsigned int SDL::Event::getNumberOfTripletsByLayerEndcap(unsigned int layer) int SDL::Event::getNumberOfPixelTriplets() { - auto nPixelTriplets_buf = allocBufWrapper(devHost, 1); + auto nPixelTriplets_buf = allocBufWrapper(devHost, 1, queue); alpaka::memcpy(queue, nPixelTriplets_buf, pixelTripletsBuffers->nPixelTriplets_buf, 1); alpaka::wait(queue); @@ -1421,7 +1421,7 @@ int SDL::Event::getNumberOfPixelTriplets() int SDL::Event::getNumberOfPixelQuintuplets() { - auto nPixelQuintuplets_buf = allocBufWrapper(devHost, 1); + auto nPixelQuintuplets_buf = allocBufWrapper(devHost, 1, queue); alpaka::memcpy(queue, nPixelQuintuplets_buf, pixelQuintupletsBuffers->nPixelQuintuplets_buf, 1); alpaka::wait(queue); @@ -1466,7 +1466,7 @@ unsigned int SDL::Event::getNumberOfQuintupletsByLayerEndcap(unsigned int layer) int SDL::Event::getNumberOfTrackCandidates() { - auto nTrackCandidates_buf = allocBufWrapper(devHost, 1); + auto nTrackCandidates_buf = allocBufWrapper(devHost, 1, queue); alpaka::memcpy(queue, nTrackCandidates_buf, trackCandidatesBuffers->nTrackCandidates_buf, 1); alpaka::wait(queue); @@ -1478,7 +1478,7 @@ int SDL::Event::getNumberOfTrackCandidates() int SDL::Event::getNumberOfPT5TrackCandidates() { - auto nTrackCandidatesPT5_buf = allocBufWrapper(devHost, 1); + auto nTrackCandidatesPT5_buf = allocBufWrapper(devHost, 1, queue); alpaka::memcpy(queue, nTrackCandidatesPT5_buf, trackCandidatesBuffers->nTrackCandidatespT5_buf, 1); alpaka::wait(queue); @@ -1490,7 +1490,7 @@ int SDL::Event::getNumberOfPT5TrackCandidates() int SDL::Event::getNumberOfPT3TrackCandidates() { - auto nTrackCandidatesPT3_buf = allocBufWrapper(devHost, 1); + auto nTrackCandidatesPT3_buf = allocBufWrapper(devHost, 1, queue); alpaka::memcpy(queue, nTrackCandidatesPT3_buf, trackCandidatesBuffers->nTrackCandidatespT3_buf, 1); alpaka::wait(queue); @@ -1502,7 +1502,7 @@ int SDL::Event::getNumberOfPT3TrackCandidates() int SDL::Event::getNumberOfPLSTrackCandidates() { - auto nTrackCandidatesPLS_buf = allocBufWrapper(devHost, 1); + auto nTrackCandidatesPLS_buf = allocBufWrapper(devHost, 1, queue); alpaka::memcpy(queue, nTrackCandidatesPLS_buf, trackCandidatesBuffers->nTrackCandidatespLS_buf, 1); alpaka::wait(queue); @@ -1514,8 +1514,8 @@ int SDL::Event::getNumberOfPLSTrackCandidates() int SDL::Event::getNumberOfPixelTrackCandidates() { - auto nTrackCandidates_buf = allocBufWrapper(devHost, 1); - auto nTrackCandidatesT5_buf = allocBufWrapper(devHost, 1); + auto nTrackCandidates_buf = allocBufWrapper(devHost, 1, queue); + auto nTrackCandidatesT5_buf = allocBufWrapper(devHost, 1, queue); alpaka::memcpy(queue, nTrackCandidates_buf, trackCandidatesBuffers->nTrackCandidates_buf, 1); alpaka::memcpy(queue, nTrackCandidatesT5_buf, trackCandidatesBuffers->nTrackCandidatesT5_buf, 1); @@ -1529,7 +1529,7 @@ int SDL::Event::getNumberOfPixelTrackCandidates() int SDL::Event::getNumberOfT5TrackCandidates() { - auto nTrackCandidatesT5_buf = allocBufWrapper(devHost, 1); + auto nTrackCandidatesT5_buf = allocBufWrapper(devHost, 1, queue); alpaka::memcpy(queue, nTrackCandidatesT5_buf, trackCandidatesBuffers->nTrackCandidatesT5_buf, 1); alpaka::wait(queue); @@ -1543,7 +1543,7 @@ SDL::hitsBuffer* SDL::Event::getHits() //std::shared_ptr should { if(hitsInCPU == nullptr) { - auto nHits_buf = allocBufWrapper(devHost, 1); + auto nHits_buf = allocBufWrapper(devHost, 1, queue); alpaka::memcpy(queue, nHits_buf, hitsBuffers->nHits_buf, 1); alpaka::wait(queue); @@ -1567,7 +1567,7 @@ SDL::hitsBuffer* SDL::Event::getHitsInCMSSW() { if(hitsInCPU == nullptr) { - auto nHits_buf = allocBufWrapper(devHost, 1); + auto nHits_buf = allocBufWrapper(devHost, 1, queue); alpaka::memcpy(queue, nHits_buf, hitsBuffers->nHits_buf, 1); alpaka::wait(queue); @@ -1604,7 +1604,7 @@ SDL::miniDoubletsBuffer* SDL::Event::getMiniDoublets() if(mdsInCPU == nullptr) { // Get nMemoryLocations parameter to initialize host based mdsInCPU - auto nMemLocal_buf = allocBufWrapper(devHost, 1); + auto nMemLocal_buf = allocBufWrapper(devHost, 1, queue); alpaka::memcpy(queue, nMemLocal_buf, miniDoubletsBuffers->nMemoryLocations_buf, 1); alpaka::wait(queue); @@ -1628,7 +1628,7 @@ SDL::segmentsBuffer* SDL::Event::getSegments() if(segmentsInCPU == nullptr) { // Get nMemoryLocations parameter to initilize host based segmentsInCPU - auto nMemLocal_buf = allocBufWrapper(devHost, 1); + auto nMemLocal_buf = allocBufWrapper(devHost, 1, queue); alpaka::memcpy(queue, nMemLocal_buf, segmentsBuffers->nMemoryLocations_buf, 1); alpaka::wait(queue); @@ -1659,7 +1659,7 @@ SDL::tripletsBuffer* SDL::Event::getTriplets() if(tripletsInCPU == nullptr) { // Get nMemoryLocations parameter to initilize host based tripletsInCPU - auto nMemLocal_buf = allocBufWrapper(devHost, 1); + auto nMemLocal_buf = allocBufWrapper(devHost, 1, queue); alpaka::memcpy(queue, nMemLocal_buf, tripletsBuffers->nMemoryLocations_buf, 1); alpaka::wait(queue); @@ -1700,7 +1700,7 @@ SDL::quintupletsBuffer* SDL::Event::getQuintuplets() if(quintupletsInCPU == nullptr) { // Get nMemoryLocations parameter to initilize host based quintupletsInCPU - auto nMemLocal_buf = allocBufWrapper(devHost, 1); + auto nMemLocal_buf = allocBufWrapper(devHost, 1, queue); alpaka::memcpy(queue, nMemLocal_buf, quintupletsBuffers->nMemoryLocations_buf, 1); alpaka::wait(queue); @@ -1733,7 +1733,7 @@ SDL::pixelTripletsBuffer* SDL::Event::getPixelTriplets() if(pixelTripletsInCPU == nullptr) { // Get nMemoryLocations parameter to initilize host based quintupletsInCPU - auto nPixelTriplets_buf = allocBufWrapper(devHost, 1); + auto nPixelTriplets_buf = allocBufWrapper(devHost, 1, queue); alpaka::memcpy(queue, nPixelTriplets_buf, pixelTripletsBuffers->nPixelTriplets_buf, 1); alpaka::wait(queue); @@ -1764,7 +1764,7 @@ SDL::pixelQuintupletsBuffer* SDL::Event::getPixelQuintuplets() if(pixelQuintupletsInCPU == nullptr) { // Get nMemoryLocations parameter to initilize host based quintupletsInCPU - auto nPixelQuintuplets_buf = allocBufWrapper(devHost, 1); + auto nPixelQuintuplets_buf = allocBufWrapper(devHost, 1, queue); alpaka::memcpy(queue, nPixelQuintuplets_buf, pixelQuintupletsBuffers->nPixelQuintuplets_buf, 1); alpaka::wait(queue); @@ -1791,7 +1791,7 @@ SDL::trackCandidatesBuffer* SDL::Event::getTrackCandidates() if(trackCandidatesInCPU == nullptr) { // Get nTrackLocal parameter to initialize host based trackCandidatesInCPU - auto nTrackLocal_buf = allocBufWrapper(devHost, 1); + auto nTrackLocal_buf = allocBufWrapper(devHost, 1, queue); alpaka::memcpy(queue, nTrackLocal_buf, trackCandidatesBuffers->nTrackCandidates_buf, 1); alpaka::wait(queue); @@ -1816,7 +1816,7 @@ SDL::trackCandidatesBuffer* SDL::Event::getTrackCandidatesInCMSS if(trackCandidatesInCPU == nullptr) { // Get nTrackLocal parameter to initialize host based trackCandidatesInCPU - auto nTrackLocal_buf = allocBufWrapper(devHost, 1); + auto nTrackLocal_buf = allocBufWrapper(devHost, 1, queue); alpaka::memcpy(queue, nTrackLocal_buf, trackCandidatesBuffers->nTrackCandidates_buf, 1); alpaka::wait(queue); diff --git a/SDL/Hit.cuh b/SDL/Hit.cuh index 8e69bc96..df348127 100644 --- a/SDL/Hit.cuh +++ b/SDL/Hit.cuh @@ -81,25 +81,25 @@ namespace SDL unsigned int nMaxHits, TDevAcc const & devAccIn, TQueue& queue) : - nHits_buf(allocBufWrapper(devAccIn, 1)), - xs_buf(allocBufWrapper(devAccIn, nMaxHits)), - ys_buf(allocBufWrapper(devAccIn, nMaxHits)), - zs_buf(allocBufWrapper(devAccIn, nMaxHits)), - moduleIndices_buf(allocBufWrapper(devAccIn, nMaxHits)), - idxs_buf(allocBufWrapper(devAccIn, nMaxHits)), - detid_buf(allocBufWrapper(devAccIn, nMaxHits)), - rts_buf(allocBufWrapper(devAccIn, nMaxHits)), - phis_buf(allocBufWrapper(devAccIn, nMaxHits)), - etas_buf(allocBufWrapper(devAccIn, nMaxHits)), - highEdgeXs_buf(allocBufWrapper(devAccIn, nMaxHits)), - highEdgeYs_buf(allocBufWrapper(devAccIn, nMaxHits)), - lowEdgeXs_buf(allocBufWrapper(devAccIn, nMaxHits)), - lowEdgeYs_buf(allocBufWrapper(devAccIn, nMaxHits)), - hitRanges_buf(allocBufWrapper(devAccIn, nModules*2)), - hitRangesLower_buf(allocBufWrapper(devAccIn, nModules)), - hitRangesUpper_buf(allocBufWrapper(devAccIn, nModules)), - hitRangesnLower_buf(allocBufWrapper(devAccIn, nModules)), - hitRangesnUpper_buf(allocBufWrapper(devAccIn, nModules)) + nHits_buf(allocBufWrapper(devAccIn, 1u, queue)), + xs_buf(allocBufWrapper(devAccIn, nMaxHits, queue)), + ys_buf(allocBufWrapper(devAccIn, nMaxHits, queue)), + zs_buf(allocBufWrapper(devAccIn, nMaxHits, queue)), + moduleIndices_buf(allocBufWrapper(devAccIn, nMaxHits, queue)), + idxs_buf(allocBufWrapper(devAccIn, nMaxHits, queue)), + detid_buf(allocBufWrapper(devAccIn, nMaxHits, queue)), + rts_buf(allocBufWrapper(devAccIn, nMaxHits, queue)), + phis_buf(allocBufWrapper(devAccIn, nMaxHits, queue)), + etas_buf(allocBufWrapper(devAccIn, nMaxHits, queue)), + highEdgeXs_buf(allocBufWrapper(devAccIn, nMaxHits, queue)), + highEdgeYs_buf(allocBufWrapper(devAccIn, nMaxHits, queue)), + lowEdgeXs_buf(allocBufWrapper(devAccIn, nMaxHits, queue)), + lowEdgeYs_buf(allocBufWrapper(devAccIn, nMaxHits, queue)), + hitRanges_buf(allocBufWrapper(devAccIn, nModules*2, queue)), + hitRangesLower_buf(allocBufWrapper(devAccIn, nModules, queue)), + hitRangesUpper_buf(allocBufWrapper(devAccIn, nModules, queue)), + hitRangesnLower_buf(allocBufWrapper(devAccIn, nModules, queue)), + hitRangesnUpper_buf(allocBufWrapper(devAccIn, nModules, queue)) { alpaka::memset(queue, hitRanges_buf, -1, nModules*2); alpaka::memset(queue, hitRangesLower_buf, -1, nModules); diff --git a/SDL/MiniDoublet.cuh b/SDL/MiniDoublet.cuh index 74897eb3..1574a662 100644 --- a/SDL/MiniDoublet.cuh +++ b/SDL/MiniDoublet.cuh @@ -141,41 +141,41 @@ namespace SDL uint16_t nLowerModules, TDevAcc const & devAccIn, TQueue& queue) : - nMemoryLocations_buf(allocBufWrapper(devAccIn, 1)), - anchorHitIndices_buf(allocBufWrapper(devAccIn, nMemoryLoc)), - outerHitIndices_buf(allocBufWrapper(devAccIn, nMemoryLoc)), - moduleIndices_buf(allocBufWrapper(devAccIn, nMemoryLoc)), - nMDs_buf(allocBufWrapper(devAccIn, nLowerModules+1)), - totOccupancyMDs_buf(allocBufWrapper(devAccIn, nLowerModules+1)), - dphichanges_buf(allocBufWrapper(devAccIn, nMemoryLoc)), - dzs_buf(allocBufWrapper(devAccIn, nMemoryLoc)), - dphis_buf(allocBufWrapper(devAccIn, nMemoryLoc)), - shiftedXs_buf(allocBufWrapper(devAccIn, nMemoryLoc)), - shiftedYs_buf(allocBufWrapper(devAccIn, nMemoryLoc)), - shiftedZs_buf(allocBufWrapper(devAccIn, nMemoryLoc)), - noShiftedDzs_buf(allocBufWrapper(devAccIn, nMemoryLoc)), - noShiftedDphis_buf(allocBufWrapper(devAccIn, nMemoryLoc)), - noShiftedDphiChanges_buf(allocBufWrapper(devAccIn, nMemoryLoc)), - anchorX_buf(allocBufWrapper(devAccIn, nMemoryLoc)), - anchorY_buf(allocBufWrapper(devAccIn, nMemoryLoc)), - anchorZ_buf(allocBufWrapper(devAccIn, nMemoryLoc)), - anchorRt_buf(allocBufWrapper(devAccIn, nMemoryLoc)), - anchorPhi_buf(allocBufWrapper(devAccIn, nMemoryLoc)), - anchorEta_buf(allocBufWrapper(devAccIn, nMemoryLoc)), - anchorHighEdgeX_buf(allocBufWrapper(devAccIn, nMemoryLoc)), - anchorHighEdgeY_buf(allocBufWrapper(devAccIn, nMemoryLoc)), - anchorLowEdgeX_buf(allocBufWrapper(devAccIn, nMemoryLoc)), - anchorLowEdgeY_buf(allocBufWrapper(devAccIn, nMemoryLoc)), - outerX_buf(allocBufWrapper(devAccIn, nMemoryLoc)), - outerY_buf(allocBufWrapper(devAccIn, nMemoryLoc)), - outerZ_buf(allocBufWrapper(devAccIn, nMemoryLoc)), - outerRt_buf(allocBufWrapper(devAccIn, nMemoryLoc)), - outerPhi_buf(allocBufWrapper(devAccIn, nMemoryLoc)), - outerEta_buf(allocBufWrapper(devAccIn, nMemoryLoc)), - outerHighEdgeX_buf(allocBufWrapper(devAccIn, nMemoryLoc)), - outerHighEdgeY_buf(allocBufWrapper(devAccIn, nMemoryLoc)), - outerLowEdgeX_buf(allocBufWrapper(devAccIn, nMemoryLoc)), - outerLowEdgeY_buf(allocBufWrapper(devAccIn, nMemoryLoc)) + nMemoryLocations_buf(allocBufWrapper(devAccIn, 1, queue)), + anchorHitIndices_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + outerHitIndices_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + moduleIndices_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + nMDs_buf(allocBufWrapper(devAccIn, nLowerModules+1, queue)), + totOccupancyMDs_buf(allocBufWrapper(devAccIn, nLowerModules+1, queue)), + dphichanges_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + dzs_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + dphis_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + shiftedXs_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + shiftedYs_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + shiftedZs_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + noShiftedDzs_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + noShiftedDphis_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + noShiftedDphiChanges_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + anchorX_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + anchorY_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + anchorZ_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + anchorRt_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + anchorPhi_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + anchorEta_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + anchorHighEdgeX_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + anchorHighEdgeY_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + anchorLowEdgeX_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + anchorLowEdgeY_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + outerX_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + outerY_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + outerZ_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + outerRt_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + outerPhi_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + outerEta_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + outerHighEdgeX_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + outerHighEdgeY_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + outerLowEdgeX_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + outerLowEdgeY_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)) { alpaka::memset(queue, nMDs_buf, 0, nLowerModules+1); alpaka::memset(queue, totOccupancyMDs_buf, 0, nLowerModules+1); diff --git a/SDL/Module.cuh b/SDL/Module.cuh index 1855aee0..d4d38af7 100644 --- a/SDL/Module.cuh +++ b/SDL/Module.cuh @@ -150,31 +150,31 @@ namespace SDL unsigned int nLowerMod, TDevAcc const & devAccIn, TQueue& queue) : - hitRanges_buf(allocBufWrapper(devAccIn, nMod*2)), - hitRangesLower_buf(allocBufWrapper(devAccIn, nMod)), - hitRangesUpper_buf(allocBufWrapper(devAccIn, nMod)), - hitRangesnLower_buf(allocBufWrapper(devAccIn, nMod)), - hitRangesnUpper_buf(allocBufWrapper(devAccIn, nMod)), - mdRanges_buf(allocBufWrapper(devAccIn, nMod*2)), - segmentRanges_buf(allocBufWrapper(devAccIn, nMod*2)), - trackletRanges_buf(allocBufWrapper(devAccIn, nMod*2)), - tripletRanges_buf(allocBufWrapper(devAccIn, nMod*2)), - trackCandidateRanges_buf(allocBufWrapper(devAccIn, nMod*2)), - quintupletRanges_buf(allocBufWrapper(devAccIn, nMod*2)), - nEligibleT5Modules_buf(allocBufWrapper(devAccIn, 1)), - indicesOfEligibleT5Modules_buf(allocBufWrapper(devAccIn, nLowerMod)), - quintupletModuleIndices_buf(allocBufWrapper(devAccIn, nLowerMod)), - quintupletModuleOccupancy_buf(allocBufWrapper(devAccIn, nLowerMod)), - miniDoubletModuleIndices_buf(allocBufWrapper(devAccIn, nLowerMod+1)), - miniDoubletModuleOccupancy_buf(allocBufWrapper(devAccIn, nLowerMod+1)), - segmentModuleIndices_buf(allocBufWrapper(devAccIn, nLowerMod+1)), - segmentModuleOccupancy_buf(allocBufWrapper(devAccIn, nLowerMod+1)), - tripletModuleIndices_buf(allocBufWrapper(devAccIn, nLowerMod)), - tripletModuleOccupancy_buf(allocBufWrapper(devAccIn, nLowerMod)), - device_nTotalMDs_buf(allocBufWrapper(devAccIn, 1)), - device_nTotalSegs_buf(allocBufWrapper(devAccIn, 1)), - device_nTotalTrips_buf(allocBufWrapper(devAccIn, 1)), - device_nTotalQuints_buf(allocBufWrapper(devAccIn, 1)) + hitRanges_buf(allocBufWrapper(devAccIn, nMod*2, queue)), + hitRangesLower_buf(allocBufWrapper(devAccIn, nMod, queue)), + hitRangesUpper_buf(allocBufWrapper(devAccIn, nMod, queue)), + hitRangesnLower_buf(allocBufWrapper(devAccIn, nMod, queue)), + hitRangesnUpper_buf(allocBufWrapper(devAccIn, nMod, queue)), + mdRanges_buf(allocBufWrapper(devAccIn, nMod*2, queue)), + segmentRanges_buf(allocBufWrapper(devAccIn, nMod*2, queue)), + trackletRanges_buf(allocBufWrapper(devAccIn, nMod*2, queue)), + tripletRanges_buf(allocBufWrapper(devAccIn, nMod*2, queue)), + trackCandidateRanges_buf(allocBufWrapper(devAccIn, nMod*2, queue)), + quintupletRanges_buf(allocBufWrapper(devAccIn, nMod*2, queue)), + nEligibleT5Modules_buf(allocBufWrapper(devAccIn, 1, queue)), + indicesOfEligibleT5Modules_buf(allocBufWrapper(devAccIn, nLowerMod, queue)), + quintupletModuleIndices_buf(allocBufWrapper(devAccIn, nLowerMod, queue)), + quintupletModuleOccupancy_buf(allocBufWrapper(devAccIn, nLowerMod, queue)), + miniDoubletModuleIndices_buf(allocBufWrapper(devAccIn, nLowerMod+1, queue)), + miniDoubletModuleOccupancy_buf(allocBufWrapper(devAccIn, nLowerMod+1, queue)), + segmentModuleIndices_buf(allocBufWrapper(devAccIn, nLowerMod+1, queue)), + segmentModuleOccupancy_buf(allocBufWrapper(devAccIn, nLowerMod+1, queue)), + tripletModuleIndices_buf(allocBufWrapper(devAccIn, nLowerMod, queue)), + tripletModuleOccupancy_buf(allocBufWrapper(devAccIn, nLowerMod, queue)), + device_nTotalMDs_buf(allocBufWrapper(devAccIn, 1, queue)), + device_nTotalSegs_buf(allocBufWrapper(devAccIn, 1, queue)), + device_nTotalTrips_buf(allocBufWrapper(devAccIn, 1, queue)), + device_nTotalQuints_buf(allocBufWrapper(devAccIn, 1, queue)) { alpaka::memset(queue, hitRanges_buf, -1, nMod*2); alpaka::memset(queue, hitRangesLower_buf, -1, nMod); diff --git a/SDL/PixelTriplet.cuh b/SDL/PixelTriplet.cuh index 0cf22db4..a8e76c2e 100644 --- a/SDL/PixelTriplet.cuh +++ b/SDL/PixelTriplet.cuh @@ -101,29 +101,29 @@ namespace SDL pixelTripletsBuffer(unsigned int maxPixelTriplets, TDevAcc const & devAccIn, TQueue& queue) : - pixelSegmentIndices_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), - tripletIndices_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), - nPixelTriplets_buf(allocBufWrapper(devAccIn, 1)), - totOccupancyPixelTriplets_buf(allocBufWrapper(devAccIn, 1)), - pixelRadius_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), - tripletRadius_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), - pt_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), - eta_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), - phi_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), - eta_pix_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), - phi_pix_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), - score_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), - isDup_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), - partOfPT5_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), - logicalLayers_buf(allocBufWrapper(devAccIn, maxPixelTriplets*5)), - hitIndices_buf(allocBufWrapper(devAccIn, maxPixelTriplets*10)), - lowerModuleIndices_buf(allocBufWrapper(devAccIn, maxPixelTriplets*5)), - centerX_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), - centerY_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), - pixelRadiusError_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), - rPhiChiSquared_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), - rPhiChiSquaredInwards_buf(allocBufWrapper(devAccIn, maxPixelTriplets)), - rzChiSquared_buf(allocBufWrapper(devAccIn, maxPixelTriplets)) + pixelSegmentIndices_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + tripletIndices_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + nPixelTriplets_buf(allocBufWrapper(devAccIn, 1, queue)), + totOccupancyPixelTriplets_buf(allocBufWrapper(devAccIn, 1, queue)), + pixelRadius_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + tripletRadius_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + pt_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + eta_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + phi_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + eta_pix_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + phi_pix_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + score_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + isDup_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + partOfPT5_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + logicalLayers_buf(allocBufWrapper(devAccIn, maxPixelTriplets*5, queue)), + hitIndices_buf(allocBufWrapper(devAccIn, maxPixelTriplets*10, queue)), + lowerModuleIndices_buf(allocBufWrapper(devAccIn, maxPixelTriplets*5, queue)), + centerX_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + centerY_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + pixelRadiusError_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + rPhiChiSquared_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + rPhiChiSquaredInwards_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + rzChiSquared_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)) { alpaka::memset(queue, nPixelTriplets_buf, 0, 1); alpaka::memset(queue, totOccupancyPixelTriplets_buf, 0, 1); @@ -1511,24 +1511,24 @@ namespace SDL pixelQuintupletsBuffer(unsigned int maxPixelQuintuplets, TDevAcc const & devAccIn, TQueue& queue) : - pixelIndices_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets)), - T5Indices_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets)), - nPixelQuintuplets_buf(allocBufWrapper(devAccIn, 1)), - totOccupancyPixelQuintuplets_buf(allocBufWrapper(devAccIn, 1)), - isDup_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets)), - score_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets)), - eta_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets)), - phi_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets)), - logicalLayers_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets*7)), - hitIndices_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets*14)), - lowerModuleIndices_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets*7)), - pixelRadius_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets)), - quintupletRadius_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets)), - centerX_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets)), - centerY_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets)), - rzChiSquared_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets)), - rPhiChiSquared_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets)), - rPhiChiSquaredInwards_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets)) + pixelIndices_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets, queue)), + T5Indices_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets, queue)), + nPixelQuintuplets_buf(allocBufWrapper(devAccIn, 1, queue)), + totOccupancyPixelQuintuplets_buf(allocBufWrapper(devAccIn, 1, queue)), + isDup_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets, queue)), + score_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets, queue)), + eta_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets, queue)), + phi_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets, queue)), + logicalLayers_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets*7, queue)), + hitIndices_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets*14, queue)), + lowerModuleIndices_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets*7, queue)), + pixelRadius_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets, queue)), + quintupletRadius_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets, queue)), + centerX_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets, queue)), + centerY_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets, queue)), + rzChiSquared_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets, queue)), + rPhiChiSquared_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets, queue)), + rPhiChiSquaredInwards_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets, queue)) { alpaka::memset(queue, nPixelQuintuplets_buf, 0, 1); alpaka::memset(queue, totOccupancyPixelQuintuplets_buf, 0, 1); diff --git a/SDL/Quintuplet.cuh b/SDL/Quintuplet.cuh index 25588ed2..e29d8bde 100644 --- a/SDL/Quintuplet.cuh +++ b/SDL/Quintuplet.cuh @@ -107,30 +107,30 @@ namespace SDL unsigned int nLowerModules, TDevAcc const & devAccIn, TQueue& queue) : - tripletIndices_buf(allocBufWrapper(devAccIn, 2 * nTotalQuintuplets)), - lowerModuleIndices_buf(allocBufWrapper(devAccIn, 5 * nTotalQuintuplets)), - nQuintuplets_buf(allocBufWrapper(devAccIn, nLowerModules)), - totOccupancyQuintuplets_buf(allocBufWrapper(devAccIn, nLowerModules)), - nMemoryLocations_buf(allocBufWrapper(devAccIn, 1)), - innerRadius_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)), - bridgeRadius_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)), - outerRadius_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)), - pt_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)), - eta_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)), - phi_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)), - score_rphisum_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)), - layer_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)), - isDup_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)), - TightCutFlag_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)), - partOfPT5_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)), - regressionRadius_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)), - regressionG_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)), - regressionF_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)), - logicalLayers_buf(allocBufWrapper(devAccIn, 5 * nTotalQuintuplets)), - hitIndices_buf(allocBufWrapper(devAccIn, 10 * nTotalQuintuplets)), - rzChiSquared_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)), - chiSquared_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)), - nonAnchorChiSquared_buf(allocBufWrapper(devAccIn, nTotalQuintuplets)) + tripletIndices_buf(allocBufWrapper(devAccIn, 2 * nTotalQuintuplets, queue)), + lowerModuleIndices_buf(allocBufWrapper(devAccIn, 5 * nTotalQuintuplets, queue)), + nQuintuplets_buf(allocBufWrapper(devAccIn, nLowerModules, queue)), + totOccupancyQuintuplets_buf(allocBufWrapper(devAccIn, nLowerModules, queue)), + nMemoryLocations_buf(allocBufWrapper(devAccIn, 1, queue)), + innerRadius_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), + bridgeRadius_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), + outerRadius_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), + pt_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), + eta_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), + phi_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), + score_rphisum_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), + layer_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), + isDup_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), + TightCutFlag_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), + partOfPT5_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), + regressionRadius_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), + regressionG_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), + regressionF_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), + logicalLayers_buf(allocBufWrapper(devAccIn, 5 * nTotalQuintuplets, queue)), + hitIndices_buf(allocBufWrapper(devAccIn, 10 * nTotalQuintuplets, queue)), + rzChiSquared_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), + chiSquared_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), + nonAnchorChiSquared_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)) { alpaka::memset(queue, nQuintuplets_buf, 0, nLowerModules); alpaka::memset(queue, totOccupancyQuintuplets_buf, 0, nLowerModules); diff --git a/SDL/Segment.cuh b/SDL/Segment.cuh index c2f9aef5..ab8dd13d 100644 --- a/SDL/Segment.cuh +++ b/SDL/Segment.cuh @@ -130,40 +130,40 @@ namespace SDL unsigned int maxPixelSegments, TDevAcc const & devAccIn, TQueue& queue) : - dPhis_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), - dPhiMins_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), - dPhiMaxs_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), - dPhiChanges_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), - dPhiChangeMins_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), - dPhiChangeMaxs_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), - innerLowerModuleIndices_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), - outerLowerModuleIndices_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), - seedIdx_buf(allocBufWrapper(devAccIn, maxPixelSegments)), - mdIndices_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn*2)), - innerMiniDoubletAnchorHitIndices_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), - outerMiniDoubletAnchorHitIndices_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn)), - nMemoryLocations_buf(allocBufWrapper(devAccIn, 1)), - nSegments_buf(allocBufWrapper(devAccIn, nLowerModules + 1)), - totOccupancySegments_buf(allocBufWrapper(devAccIn, nLowerModules + 1)), - charge_buf(allocBufWrapper(devAccIn, maxPixelSegments)), - superbin_buf(allocBufWrapper(devAccIn, maxPixelSegments)), - pLSHitsIdxs_buf(allocBufWrapper(devAccIn, maxPixelSegments)), - pixelType_buf(allocBufWrapper(devAccIn, maxPixelSegments)), - isQuad_buf(allocBufWrapper(devAccIn, maxPixelSegments)), - isDup_buf(allocBufWrapper(devAccIn, maxPixelSegments)), - partOfPT5_buf(allocBufWrapper(devAccIn, maxPixelSegments)), - ptIn_buf(allocBufWrapper(devAccIn, maxPixelSegments)), - ptErr_buf(allocBufWrapper(devAccIn, maxPixelSegments)), - px_buf(allocBufWrapper(devAccIn, maxPixelSegments)), - py_buf(allocBufWrapper(devAccIn, maxPixelSegments)), - pz_buf(allocBufWrapper(devAccIn, maxPixelSegments)), - etaErr_buf(allocBufWrapper(devAccIn, maxPixelSegments)), - eta_buf(allocBufWrapper(devAccIn, maxPixelSegments)), - phi_buf(allocBufWrapper(devAccIn, maxPixelSegments)), - score_buf(allocBufWrapper(devAccIn, maxPixelSegments)), - circleCenterX_buf(allocBufWrapper(devAccIn, maxPixelSegments)), - circleCenterY_buf(allocBufWrapper(devAccIn, maxPixelSegments)), - circleRadius_buf(allocBufWrapper(devAccIn, maxPixelSegments)) + dPhis_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn, queue)), + dPhiMins_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn, queue)), + dPhiMaxs_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn, queue)), + dPhiChanges_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn, queue)), + dPhiChangeMins_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn, queue)), + dPhiChangeMaxs_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn, queue)), + innerLowerModuleIndices_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn, queue)), + outerLowerModuleIndices_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn, queue)), + seedIdx_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + mdIndices_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn*2, queue)), + innerMiniDoubletAnchorHitIndices_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn, queue)), + outerMiniDoubletAnchorHitIndices_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn, queue)), + nMemoryLocations_buf(allocBufWrapper(devAccIn, 1, queue)), + nSegments_buf(allocBufWrapper(devAccIn, nLowerModules + 1, queue)), + totOccupancySegments_buf(allocBufWrapper(devAccIn, nLowerModules + 1, queue)), + charge_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + superbin_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + pLSHitsIdxs_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + pixelType_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + isQuad_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + isDup_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + partOfPT5_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + ptIn_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + ptErr_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + px_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + py_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + pz_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + etaErr_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + eta_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + phi_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + score_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + circleCenterX_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + circleCenterY_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + circleRadius_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)) { alpaka::memset(queue, nSegments_buf, 0u, nLowerModules + 1); alpaka::memset(queue, totOccupancySegments_buf, 0u, nLowerModules + 1); diff --git a/SDL/TrackCandidate.cuh b/SDL/TrackCandidate.cuh index 9abaa754..9bdfd799 100644 --- a/SDL/TrackCandidate.cuh +++ b/SDL/TrackCandidate.cuh @@ -80,20 +80,20 @@ namespace SDL trackCandidatesBuffer(unsigned int maxTrackCandidates, TDevAcc const & devAccIn, TQueue& queue) : - trackCandidateType_buf(allocBufWrapper(devAccIn, maxTrackCandidates)), - directObjectIndices_buf(allocBufWrapper(devAccIn, maxTrackCandidates)), - objectIndices_buf(allocBufWrapper(devAccIn, 2 * maxTrackCandidates)), - nTrackCandidates_buf(allocBufWrapper(devAccIn, 1)), - nTrackCandidatespT3_buf(allocBufWrapper(devAccIn, 1)), - nTrackCandidatespT5_buf(allocBufWrapper(devAccIn, 1)), - nTrackCandidatespLS_buf(allocBufWrapper(devAccIn, 1)), - nTrackCandidatesT5_buf(allocBufWrapper(devAccIn, 1)), - logicalLayers_buf(allocBufWrapper(devAccIn, 7 * maxTrackCandidates)), - hitIndices_buf(allocBufWrapper(devAccIn, 14 * maxTrackCandidates)), - pixelSeedIndex_buf(allocBufWrapper(devAccIn, maxTrackCandidates)), - lowerModuleIndices_buf(allocBufWrapper(devAccIn, 7 * maxTrackCandidates)), - centerX_buf(allocBufWrapper(devAccIn, maxTrackCandidates)), - centerY_buf(allocBufWrapper(devAccIn, maxTrackCandidates)), + trackCandidateType_buf(allocBufWrapper(devAccIn, maxTrackCandidates, queue)), + directObjectIndices_buf(allocBufWrapper(devAccIn, maxTrackCandidates, queue)), + objectIndices_buf(allocBufWrapper(devAccIn, 2 * maxTrackCandidates, queue)), + nTrackCandidates_buf(allocBufWrapper(devAccIn, 1, queue)), + nTrackCandidatespT3_buf(allocBufWrapper(devAccIn, 1, queue)), + nTrackCandidatespT5_buf(allocBufWrapper(devAccIn, 1, queue)), + nTrackCandidatespLS_buf(allocBufWrapper(devAccIn, 1, queue)), + nTrackCandidatesT5_buf(allocBufWrapper(devAccIn, 1, queue)), + logicalLayers_buf(allocBufWrapper(devAccIn, 7 * maxTrackCandidates, queue)), + hitIndices_buf(allocBufWrapper(devAccIn, 14 * maxTrackCandidates, queue)), + pixelSeedIndex_buf(allocBufWrapper(devAccIn, maxTrackCandidates, queue)), + lowerModuleIndices_buf(allocBufWrapper(devAccIn, 7 * maxTrackCandidates, queue)), + centerX_buf(allocBufWrapper(devAccIn, maxTrackCandidates, queue)), + centerY_buf(allocBufWrapper(devAccIn, maxTrackCandidates, queue)), radius_buf(allocBufWrapper(devAccIn, maxTrackCandidates)) { alpaka::memset(queue, nTrackCandidates_buf, 0, 1); diff --git a/SDL/Triplet.cuh b/SDL/Triplet.cuh index 045e2ee2..08c59d34 100644 --- a/SDL/Triplet.cuh +++ b/SDL/Triplet.cuh @@ -120,35 +120,35 @@ namespace SDL unsigned int nLowerModules, TDevAcc const & devAccIn, TQueue& queue) : - segmentIndices_buf(allocBufWrapper(devAccIn, 2 * maxTriplets)), - lowerModuleIndices_buf(allocBufWrapper(devAccIn, 3 * maxTriplets)), - nTriplets_buf(allocBufWrapper(devAccIn, nLowerModules)), - totOccupancyTriplets_buf(allocBufWrapper(devAccIn, nLowerModules)), - nMemoryLocations_buf(allocBufWrapper(devAccIn, 1)), - logicalLayers_buf(allocBufWrapper(devAccIn, maxTriplets * 3)), - hitIndices_buf(allocBufWrapper(devAccIn, maxTriplets * 6)), - betaIn_buf(allocBufWrapper(devAccIn, maxTriplets)), - betaOut_buf(allocBufWrapper(devAccIn, maxTriplets)), - pt_beta_buf(allocBufWrapper(devAccIn, maxTriplets)), - partOfPT5_buf(allocBufWrapper(devAccIn, maxTriplets)), - partOfT5_buf(allocBufWrapper(devAccIn, maxTriplets)), - partOfPT3_buf(allocBufWrapper(devAccIn, maxTriplets)) + segmentIndices_buf(allocBufWrapper(devAccIn, 2 * maxTriplets, queue)), + lowerModuleIndices_buf(allocBufWrapper(devAccIn, 3 * maxTriplets, queue)), + nTriplets_buf(allocBufWrapper(devAccIn, nLowerModules, queue)), + totOccupancyTriplets_buf(allocBufWrapper(devAccIn, nLowerModules, queue)), + nMemoryLocations_buf(allocBufWrapper(devAccIn, 1, queue)), + logicalLayers_buf(allocBufWrapper(devAccIn, maxTriplets * 3, queue)), + hitIndices_buf(allocBufWrapper(devAccIn, maxTriplets * 6, queue)), + betaIn_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + betaOut_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + pt_beta_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + partOfPT5_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + partOfT5_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + partOfPT3_buf(allocBufWrapper(devAccIn, maxTriplets, queue)) #ifdef CUT_VALUE_DEBUG - ,zOut_buf(allocBufWrapper(devAccIn, maxTriplets)), - rtOut_buf(allocBufWrapper(devAccIn, maxTriplets)), - deltaPhiPos_buf(allocBufWrapper(devAccIn, maxTriplets)), - deltaPhi_buf(allocBufWrapper(devAccIn, maxTriplets)), - zLo_buf(allocBufWrapper(devAccIn, maxTriplets)), - zHi_buf(allocBufWrapper(devAccIn, maxTriplets)), - zLoPointed_buf(allocBufWrapper(devAccIn, maxTriplets)), - zHiPointed_buf(allocBufWrapper(devAccIn, maxTriplets)), - sdlCut_buf(allocBufWrapper(devAccIn, maxTriplets)), - betaInCut_buf(allocBufWrapper(devAccIn, maxTriplets)), - betaOutCut_buf(allocBufWrapper(devAccIn, maxTriplets)), - deltaBetaCut_buf(allocBufWrapper(devAccIn, maxTriplets)), - rtLo_buf(allocBufWrapper(devAccIn, maxTriplets)), - rtHi_buf(allocBufWrapper(devAccIn, maxTriplets)), - kZ_buf(allocBufWrapper(devAccIn, maxTriplets)) + ,zOut_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + rtOut_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + deltaPhiPos_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + deltaPhi_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + zLo_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + zHi_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + zLoPointed_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + zHiPointed_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + sdlCut_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + betaInCut_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + betaOutCut_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + deltaBetaCut_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + rtLo_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + rtHi_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + kZ_buf(allocBufWrapper(devAccIn, maxTriplets, queue)) #endif { alpaka::memset(queue, nTriplets_buf, 0, nLowerModules); diff --git a/setup_cgpu.sh b/setup_cgpu.sh index fbff025e..a30c0bf8 100644 --- a/setup_cgpu.sh +++ b/setup_cgpu.sh @@ -36,7 +36,7 @@ export LSTPERFORMANCEWEBDIR="/home/users/phchang/public_html/LSTPerformanceWeb" export LATEST_CPU_BENCHMARK_EFF_MUONGUN="/data2/segmentlinking/muonGun_cpu_efficiencies.root" export LATEST_CPU_BENCHMARK_EFF_PU200="/data2/segmentlinking/pu200_cpu_efficiencies.root" -source /cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220902-e80d13b043e1608b43d2007d06ad7e2f/etc/profile.d/init.sh +source /cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220621-4e96939afa0cdb62448c73ead2bb07e0/etc/profile.d/init.sh export BOOST_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/boost/1.78.0-12075919175e8d078539685f9234134a" -export ALPAKA_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220902-e80d13b043e1608b43d2007d06ad7e2f" +export ALPAKA_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220621-4e96939afa0cdb62448c73ead2bb07e0" #eof From 8d907e47958e0b3c83f86b1290a42b3b21156989 Mon Sep 17 00:00:00 2001 From: Gavin Niendorf Date: Tue, 27 Jun 2023 20:34:01 -0700 Subject: [PATCH 36/44] cleanup --- SDL/TrackCandidate.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SDL/TrackCandidate.cuh b/SDL/TrackCandidate.cuh index 9bdfd799..250f4558 100644 --- a/SDL/TrackCandidate.cuh +++ b/SDL/TrackCandidate.cuh @@ -94,7 +94,7 @@ namespace SDL lowerModuleIndices_buf(allocBufWrapper(devAccIn, 7 * maxTrackCandidates, queue)), centerX_buf(allocBufWrapper(devAccIn, maxTrackCandidates, queue)), centerY_buf(allocBufWrapper(devAccIn, maxTrackCandidates, queue)), - radius_buf(allocBufWrapper(devAccIn, maxTrackCandidates)) + radius_buf(allocBufWrapper(devAccIn, maxTrackCandidates, queue)) { alpaka::memset(queue, nTrackCandidates_buf, 0, 1); alpaka::memset(queue, nTrackCandidatesT5_buf, 0, 1); From dd3b92a84e12c2f838bae57d3bf0d9417902671a Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Wed, 5 Jul 2023 12:10:47 -0400 Subject: [PATCH 37/44] setup for lnx7188 --- README.md | 6 ++++++ setup_lnx7188.sh | 14 ++++++++------ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index a6b8d96c..8d6d967a 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,12 @@ For lnx7188 this needs to be done once cd /cdat/tem/${USER}/ git clone git@github.com:SegmentLinking/LSTPerformanceWeb.git +### Setting up container (only for lnx7188) + +For lnx7188 this needs to be done before compiling or running the code: + + singularity shell --nv --bind /mnt/data1:/data --bind /data2/segmentlinking/ --bind /opt --bind /nfs --bind /mnt --bind /cvmfs /cvmfs/unpacked.cern.ch/registry.hub.docker.com/cmssw/el8:x86_64 + ### Running the code git clone --recursive git@github.com:SegmentLinking/TrackLooper.git diff --git a/setup_lnx7188.sh b/setup_lnx7188.sh index 73c42f60..a56a41c3 100644 --- a/setup_lnx7188.sh +++ b/setup_lnx7188.sh @@ -5,8 +5,11 @@ ########################################################################################################### DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" source $DIR/code/rooutil/thisrooutil.sh -export SCRAM_ARCH=slc7_amd64_gcc900 -export CMSSW_VERSION=CMSSW_11_2_0_pre5 + +export SCRAM_ARCH=el8_amd64_gcc10 +export CMSSW_VERSION=CMSSW_12_5_0_pre5 +export CUDA_HOME=/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/cuda/11.5.2-c927b7e765e06433950d8a7eab9eddb4/ + source /cvmfs/cms.cern.ch/cmsset_default.sh cd /cvmfs/cms.cern.ch/$SCRAM_ARCH/cms/cmssw/$CMSSW_VERSION/src eval `scramv1 runtime -sh` @@ -14,7 +17,6 @@ cd - > /dev/null echo "Setup following ROOT. Make sure it's slc7 variant. Otherwise the looper won't compile." which root -export CUDA_HOME=/cvmfs/cms.cern.ch/slc7_amd64_gcc900/external/cuda/11.0.3/ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" export LD_LIBRARY_PATH=$DIR:$LD_LIBRARY_PATH export PATH=$DIR/bin:$PATH @@ -34,7 +36,7 @@ export LSTPERFORMANCEWEBDIR="/cdat/tem/${USER}/LSTPerformanceWeb" export LATEST_CPU_BENCHMARK_EFF_MUONGUN="/data2/segmentlinking/muonGun_cpu_efficiencies.root" export LATEST_CPU_BENCHMARK_EFF_PU200="/data2/segmentlinking/pu200_cpu_efficiencies.root" -source /cvmfs/cms.cern.ch/slc7_amd64_gcc900/external/alpaka/0.5.0/etc/profile.d/init.sh -export BOOST_ROOT="/cvmfs/cms.cern.ch/slc7_amd64_gcc900/external/boost/1.72.0-ghbfee3" -export ALPAKA_ROOT="/cvmfs/cms.cern.ch/slc7_amd64_gcc900/external/alpaka/0.7.0-09bef105568314b218f2a8410a876785" +source /cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220621-4e96939afa0cdb62448c73ead2bb07e0/etc/profile.d/init.sh +export BOOST_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/boost/1.78.0-12075919175e8d078539685f9234134a" +export ALPAKA_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220621-4e96939afa0cdb62448c73ead2bb07e0" #eof From aaf1e60127f60b51ea27002690eb14059e443fcd Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Wed, 5 Jul 2023 18:31:02 -0400 Subject: [PATCH 38/44] fix caching allocator bug --- SDL/Event.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/SDL/Event.cu b/SDL/Event.cu index 36e09d15..98fef91e 100644 --- a/SDL/Event.cu +++ b/SDL/Event.cu @@ -706,6 +706,7 @@ void SDL::Event::createTrackCandidates() // Pull nEligibleT5Modules from the device. auto nEligibleModules_buf = allocBufWrapper(devHost, 1, queue); alpaka::memcpy(queue, nEligibleModules_buf, rangesBuffers->nEligibleT5Modules_buf, 1); + alpaka::wait(queue); uint16_t nEligibleModules = *alpaka::getPtrNative(nEligibleModules_buf); Vec const threadsPerBlock_crossCleanpT3(static_cast(1), static_cast(16), static_cast(64)); From 54c1103dd9ab2f6ed15895ba92f7b5a79c8355a9 Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Mon, 10 Jul 2023 13:48:42 -0400 Subject: [PATCH 39/44] move to most recent cmssw alpaka interface + newer alpaka version --- code/alpaka_interface/CachingAllocator.h | 4 +- code/alpaka_interface/TransferToHost.h | 21 ++ .../getDeviceCachingAllocator.h | 1 + code/alpaka_interface/host.h | 4 +- code/alpaka_interface/memory.h | 29 +- code/alpaka_interface/traits.h | 19 +- code/alpaka_interface/vec.h | 42 +++ code/alpaka_interface/workdivision.h | 266 ++++++++++++++++++ setup_cgpu.sh | 6 +- setup_lnx7188.sh | 6 +- 10 files changed, 374 insertions(+), 24 deletions(-) create mode 100644 code/alpaka_interface/TransferToHost.h create mode 100644 code/alpaka_interface/vec.h create mode 100644 code/alpaka_interface/workdivision.h diff --git a/code/alpaka_interface/CachingAllocator.h b/code/alpaka_interface/CachingAllocator.h index 72a52694..0a0dee82 100644 --- a/code/alpaka_interface/CachingAllocator.h +++ b/code/alpaka_interface/CachingAllocator.h @@ -336,8 +336,8 @@ namespace cms::alpakatools { // allocate device memory return alpaka::allocBuf(device_, bytes); } else if constexpr (std::is_same_v) { - // allocate pinned host memory - return alpaka::allocMappedBuf(device_, alpaka::getDev(queue), bytes); + // allocate pinned host memory accessible by the queue's platform + return alpaka::allocMappedBuf>, std::byte, size_t>(device_, bytes); } else { // unsupported combination static_assert(std::is_same_v> or std::is_same_v, diff --git a/code/alpaka_interface/TransferToHost.h b/code/alpaka_interface/TransferToHost.h new file mode 100644 index 00000000..e6bacef1 --- /dev/null +++ b/code/alpaka_interface/TransferToHost.h @@ -0,0 +1,21 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_TransferToHost_h +#define HeterogeneousCore_AlpakaInterface_interface_TransferToHost_h + +// TODO: better package? + +namespace cms::alpakatools { + // TODO: would a more informative error message from compiler than "indeterminate type" be helpful? + template + struct TransferToHost; + + // specialization expected to define + // using HostDataType = + // + // template + // static HostDataType transferAsync(TQueue& queue, TDeviceData const& deviceData); + // + // The function should allocate a HostDataType object and launch the + // transfers +} // namespace cms::alpakatools + +#endif diff --git a/code/alpaka_interface/getDeviceCachingAllocator.h b/code/alpaka_interface/getDeviceCachingAllocator.h index 94e0e7cc..ee466f94 100644 --- a/code/alpaka_interface/getDeviceCachingAllocator.h +++ b/code/alpaka_interface/getDeviceCachingAllocator.h @@ -1,6 +1,7 @@ #ifndef HeterogeneousCore_AlpakaInterface_interface_getDeviceCachingAllocator_h #define HeterogeneousCore_AlpakaInterface_interface_getDeviceCachingAllocator_h +#include #include #include "thread_safety_macros.h" diff --git a/code/alpaka_interface/host.h b/code/alpaka_interface/host.h index 0303313d..acb9c9a9 100644 --- a/code/alpaka_interface/host.h +++ b/code/alpaka_interface/host.h @@ -1,6 +1,8 @@ #ifndef HeterogeneousCore_AlpakaInterface_interface_host_h #define HeterogeneousCore_AlpakaInterface_interface_host_h +#include + namespace cms::alpakatools { namespace detail { @@ -19,7 +21,7 @@ namespace cms::alpakatools { } // namespace detail // returns the alpaka host device - static inline alpaka::DevCpu const& host() { + inline alpaka::DevCpu const& host() { static const auto host = detail::enumerate_host(); return host; } diff --git a/code/alpaka_interface/memory.h b/code/alpaka_interface/memory.h index cbdc6fc0..f572ab45 100644 --- a/code/alpaka_interface/memory.h +++ b/code/alpaka_interface/memory.h @@ -77,6 +77,26 @@ namespace cms::alpakatools { return alpaka::allocBuf, Idx>(host(), Vec1D{std::extent_v}); } + // non-cached, pinned, scalar and 1-dimensional host buffers + // the memory is pinned according to the device associated to the platform + + template + std::enable_if_t, host_buffer> make_host_buffer() { + return alpaka::allocMappedBuf(host(), Scalar{}); + } + + template + std::enable_if_t and not std::is_array_v>, host_buffer> + make_host_buffer(Extent extent) { + return alpaka::allocMappedBuf, Idx>(host(), Vec1D{extent}); + } + + template + std::enable_if_t and not std::is_array_v>, host_buffer> + make_host_buffer() { + return alpaka::allocMappedBuf, Idx>(host(), Vec1D{std::extent_v}); + } + // potentially cached, pinned, scalar and 1-dimensional host buffers, associated to a work queue // the memory is pinned according to the device associated to the queue @@ -85,7 +105,7 @@ namespace cms::alpakatools { if constexpr (allocator_policy> == AllocatorPolicy::Caching) { return allocCachedBuf(host(), queue, Scalar{}); } else { - return alpaka::allocMappedBuf(host(), alpaka::getDev(queue), Scalar{}); + return alpaka::allocMappedBuf>, T, Idx>(host(), Scalar{}); } } @@ -96,7 +116,8 @@ namespace cms::alpakatools { if constexpr (allocator_policy> == AllocatorPolicy::Caching) { return allocCachedBuf, Idx>(host(), queue, Vec1D{extent}); } else { - return alpaka::allocMappedBuf, Idx>(host(), alpaka::getDev(queue), Vec1D{extent}); + return alpaka::allocMappedBuf>, std::remove_extent_t, Idx>(host(), + Vec1D{extent}); } } @@ -107,8 +128,8 @@ namespace cms::alpakatools { if constexpr (allocator_policy> == AllocatorPolicy::Caching) { return allocCachedBuf, Idx>(host(), queue, Vec1D{std::extent_v}); } else { - return alpaka::allocMappedBuf, Idx>( - host(), alpaka::getDev(queue), Vec1D{std::extent_v}); + return alpaka::allocMappedBuf>, std::remove_extent_t, Idx>( + host(), Vec1D{std::extent_v}); } } diff --git a/code/alpaka_interface/traits.h b/code/alpaka_interface/traits.h index 8235a416..3083cda7 100644 --- a/code/alpaka_interface/traits.h +++ b/code/alpaka_interface/traits.h @@ -32,37 +32,34 @@ namespace cms::alpakatools { // is_platform template - struct is_platform - : std::integral_constant::value> {}; + using is_platform = alpaka::concepts::ImplementsConcept; template - constexpr bool is_platform_v = is_platform::value; + inline constexpr bool is_platform_v = is_platform::value; // is_device template - struct is_device : std::integral_constant::value> {}; + using is_device = alpaka::concepts::ImplementsConcept; template - constexpr bool is_device_v = is_device::value; + inline constexpr bool is_device_v = is_device::value; // is_accelerator template - struct is_accelerator - : std::integral_constant::value> {}; + using is_accelerator = alpaka::concepts::ImplementsConcept; template - constexpr bool is_accelerator_v = is_accelerator::value; + inline constexpr bool is_accelerator_v = is_accelerator::value; // is_queue template - struct is_queue : std::integral_constant::value> { - }; + using is_queue = alpaka::concepts::ImplementsConcept; template - constexpr bool is_queue_v = is_queue::value; + inline constexpr bool is_queue_v = is_queue::value; } // namespace cms::alpakatools diff --git a/code/alpaka_interface/vec.h b/code/alpaka_interface/vec.h new file mode 100644 index 00000000..4126eecf --- /dev/null +++ b/code/alpaka_interface/vec.h @@ -0,0 +1,42 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_vec_h +#define HeterogeneousCore_AlpakaInterface_interface_vec_h + +#include + +#include + +namespace alpaka { + + //! \return The element-wise minimum of one or more vectors. + ALPAKA_NO_HOST_ACC_WARNING + template , Vecs> && ...)>> + ALPAKA_FN_HOST_ACC constexpr auto elementwise_min(Vec const& p, Vecs const&... qs) -> Vec { + Vec r; + if constexpr (TDim::value > 0) { + for (typename TDim::value_type i = 0; i < TDim::value; ++i) + r[i] = std::min({p[i], qs[i]...}); + } + return r; + } + + //! \return The element-wise maximum of one or more vectors. + ALPAKA_NO_HOST_ACC_WARNING + template , Vecs> && ...)>> + ALPAKA_FN_HOST_ACC constexpr auto elementwise_max(Vec const& p, Vecs const&... qs) -> Vec { + Vec r; + if constexpr (TDim::value > 0) { + for (typename TDim::value_type i = 0; i < TDim::value; ++i) + r[i] = std::max({p[i], qs[i]...}); + } + return r; + } + +} // namespace alpaka + +#endif // HeterogeneousCore_AlpakaInterface_interface_vec_h diff --git a/code/alpaka_interface/workdivision.h b/code/alpaka_interface/workdivision.h new file mode 100644 index 00000000..fd3e10b3 --- /dev/null +++ b/code/alpaka_interface/workdivision.h @@ -0,0 +1,266 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_workdivision_h +#define HeterogeneousCore_AlpakaInterface_interface_workdivision_h + +#include + +#include + +#include "config.h" +#include "traits.h" +#include "vec.h" + +namespace cms::alpakatools { + + using namespace alpaka_common; + + // If the first argument is not a multiple of the second argument, round it up to the next multiple + inline constexpr Idx round_up_by(Idx value, Idx divisor) { return (value + divisor - 1) / divisor * divisor; } + + // Return the integer division of the first argument by the second argument, rounded up to the next integer + inline constexpr Idx divide_up_by(Idx value, Idx divisor) { return (value + divisor - 1) / divisor; } + + // Create an accelerator-dependent work division for 1-dimensional kernels + template and alpaka::Dim::value == 1>> + inline WorkDiv make_workdiv(Idx blocks, Idx elements) { +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + if constexpr (std::is_same_v>) { + // On GPU backends, each thread is looking at a single element: + // - the number of threads per block is "elements"; + // - the number of elements per thread is always 1. + return WorkDiv(blocks, elements, Idx{1}); + } else +#endif // ALPAKA_ACC_GPU_CUDA_ENABLED +#if ALPAKA_ACC_GPU_HIP_ENABLED + if constexpr (std::is_same_v>) { + // On GPU backends, each thread is looking at a single element: + // - the number of threads per block is "elements"; + // - the number of elements per thread is always 1. + return WorkDiv(blocks, elements, Idx{1}); + } else +#endif // ALPAKA_ACC_GPU_HIP_ENABLED + { + // On CPU backends, run serially with a single thread per block: + // - the number of threads per block is always 1; + // - the number of elements per thread is "elements". + return WorkDiv(blocks, Idx{1}, elements); + } + } + + // Create the accelerator-dependent workdiv for N-dimensional kernels + template >> + inline WorkDiv> make_workdiv(const Vec>& blocks, + const Vec>& elements) { + using Dim = alpaka::Dim; +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + if constexpr (std::is_same_v>) { + // On GPU backends, each thread is looking at a single element: + // - the number of threads per block is "elements"; + // - the number of elements per thread is always 1. + return WorkDiv(blocks, elements, Vec::ones()); + } else +#endif // ALPAKA_ACC_GPU_CUDA_ENABLED +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + if constexpr (std::is_same_v>) { + // On GPU backends, each thread is looking at a single element: + // - the number of threads per block is "elements"; + // - the number of elements per thread is always 1. + return WorkDiv(blocks, elements, Vec::ones()); + } else +#endif // ALPAKA_ACC_GPU_HIP_ENABLED + { + // On CPU backends, run serially with a single thread per block: + // - the number of threads per block is always 1; + // - the number of elements per thread is "elements". + return WorkDiv(blocks, Vec::ones(), elements); + } + } + + template and alpaka::Dim::value == 1>> + class elements_with_stride { + public: + ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc) + : elements_{alpaka::getWorkDiv(acc)[0u]}, + first_{alpaka::getIdx(acc)[0u] * elements_}, + stride_{alpaka::getWorkDiv(acc)[0u] * elements_}, + extent_{stride_} {} + + ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc, Idx extent) + : elements_{alpaka::getWorkDiv(acc)[0u]}, + first_{alpaka::getIdx(acc)[0u] * elements_}, + stride_{alpaka::getWorkDiv(acc)[0u] * elements_}, + extent_{extent} {} + + class iterator { + friend class elements_with_stride; + + ALPAKA_FN_ACC inline iterator(Idx elements, Idx stride, Idx extent, Idx first) + : elements_{elements}, + stride_{stride}, + extent_{extent}, + first_{std::min(first, extent)}, + index_{first_}, + last_{std::min(first + elements, extent)} {} + + public: + ALPAKA_FN_ACC inline Idx operator*() const { return index_; } + + // pre-increment the iterator + ALPAKA_FN_ACC inline iterator& operator++() { + // increment the index along the elements processed by the current thread + ++index_; + if (index_ < last_) + return *this; + + // increment the thread index with the grid stride + first_ += stride_ * elements_; + index_ = first_; + last_ = std::min(first_ + elements_, extent_); + if (index_ < extent_) + return *this; + + // the iterator has reached or passed the end of the extent, clamp it to the extent + first_ = extent_; + index_ = extent_; + last_ = extent_; + return *this; + } + + // post-increment the iterator + ALPAKA_FN_ACC inline iterator operator++(int) { + iterator old = *this; + ++(*this); + return old; + } + + ALPAKA_FN_ACC inline bool operator==(iterator const& other) const { + return (index_ == other.index_) and (first_ == other.first_); + } + + ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); } + + private: + // non-const to support iterator copy and assignment + Idx elements_; + Idx stride_; + Idx extent_; + // modified by the pre/post-increment operator + Idx first_; + Idx index_; + Idx last_; + }; + + ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, first_); } + + ALPAKA_FN_ACC inline iterator end() const { return iterator(elements_, stride_, extent_, extent_); } + + private: + const Idx elements_; + const Idx first_; + const Idx stride_; + const Idx extent_; + }; + + template and (alpaka::Dim::value > 0)>> + class elements_with_stride_nd { + public: + using Dim = alpaka::Dim; + using Vec = alpaka::Vec; + + ALPAKA_FN_ACC inline elements_with_stride_nd(TAcc const& acc) + : elements_{alpaka::getWorkDiv(acc)}, + first_{alpaka::getIdx(acc) * elements_}, + stride_{alpaka::getWorkDiv(acc) * elements_}, + extent_{stride_} {} + + ALPAKA_FN_ACC inline elements_with_stride_nd(TAcc const& acc, Vec extent) + : elements_{alpaka::getWorkDiv(acc)}, + first_{alpaka::getIdx(acc) * elements_}, + stride_{alpaka::getWorkDiv(acc) * elements_}, + extent_{extent} {} + + class iterator { + friend class elements_with_stride_nd; + constexpr static const auto last_dimension = Dim::value - 1; + + ALPAKA_FN_ACC inline iterator(Vec elements, Vec stride, Vec extent, Vec first) + : elements_{elements}, + stride_{stride}, + extent_{extent}, + first_{alpaka::elementwise_min(first, extent)}, + index_{first_}, + last_{std::min(first[last_dimension] + elements[last_dimension], extent[last_dimension])} {} + + public: + ALPAKA_FN_ACC inline Vec operator*() const { return index_; } + + // pre-increment the iterator + ALPAKA_FN_ACC inline iterator& operator++() { + // increment the index along the elements processed by the current thread + ++index_[last_dimension]; + if (index_[last_dimension] < last_) + return *this; + + // increment the thread index along with the last dimension with the grid stride + first_[last_dimension] += stride_[last_dimension] * elements_[last_dimension]; + index_[last_dimension] = first_[last_dimension]; + last_ = std::min(first_[last_dimension] + elements_[last_dimension], extent_[last_dimension]); + if (index_[last_dimension] < extent_[last_dimension]) + return *this; + + // increment the thread index along the outer dimensions with the grid stride + if constexpr (last_dimension > 0) + for (auto dimension = last_dimension - 1; dimension >= 0; --dimension) { + first_[dimension] += stride_[dimension]; + index_[dimension] = first_[dimension]; + if (index_[dimension] < extent_[dimension]) + return *this; + } + + // the iterator has reached or passed the end of the extent, clamp it to the extent + first_ = extent_; + index_ = extent_; + last_ = extent_[last_dimension]; + return *this; + } + + // post-increment the iterator + ALPAKA_FN_ACC inline iterator operator++(int) { + iterator old = *this; + ++(*this); + return old; + } + + ALPAKA_FN_ACC inline bool operator==(iterator const& other) const { + return (index_ == other.index_) and (first_ == other.first_); + } + + ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); } + + private: + // non-const to support iterator copy and assignment + Vec elements_; + Vec stride_; + Vec extent_; + // modified by the pre/post-increment operator + Vec first_; + Vec index_; + Idx last_; + }; + + ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, first_); } + + ALPAKA_FN_ACC inline iterator end() const { return iterator(elements_, stride_, extent_, extent_); } + + private: + const Vec elements_; + const Vec first_; + const Vec stride_; + const Vec extent_; + }; + +} // namespace cms::alpakatools + +#endif // HeterogeneousCore_AlpakaInterface_interface_workdivision_h diff --git a/setup_cgpu.sh b/setup_cgpu.sh index a30c0bf8..3d9b909f 100644 --- a/setup_cgpu.sh +++ b/setup_cgpu.sh @@ -7,7 +7,7 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" source $DIR/code/rooutil/thisrooutil.sh export SCRAM_ARCH=el8_amd64_gcc10 -export CMSSW_VERSION=CMSSW_12_5_0_pre5 +export CMSSW_VERSION=CMSSW_13_0_0_pre2 export CUDA_HOME=/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/cuda/11.5.2-c927b7e765e06433950d8a7eab9eddb4/ source /cvmfs/cms.cern.ch/cmsset_default.sh @@ -36,7 +36,7 @@ export LSTPERFORMANCEWEBDIR="/home/users/phchang/public_html/LSTPerformanceWeb" export LATEST_CPU_BENCHMARK_EFF_MUONGUN="/data2/segmentlinking/muonGun_cpu_efficiencies.root" export LATEST_CPU_BENCHMARK_EFF_PU200="/data2/segmentlinking/pu200_cpu_efficiencies.root" -source /cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220621-4e96939afa0cdb62448c73ead2bb07e0/etc/profile.d/init.sh +source /cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220902-e80d13b043e1608b43d2007d06ad7e2f/etc/profile.d/init.sh export BOOST_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/boost/1.78.0-12075919175e8d078539685f9234134a" -export ALPAKA_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220621-4e96939afa0cdb62448c73ead2bb07e0" +export ALPAKA_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220902-e80d13b043e1608b43d2007d06ad7e2f" #eof diff --git a/setup_lnx7188.sh b/setup_lnx7188.sh index a56a41c3..69ae75a0 100644 --- a/setup_lnx7188.sh +++ b/setup_lnx7188.sh @@ -7,7 +7,7 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" source $DIR/code/rooutil/thisrooutil.sh export SCRAM_ARCH=el8_amd64_gcc10 -export CMSSW_VERSION=CMSSW_12_5_0_pre5 +export CMSSW_VERSION=CMSSW_13_0_0_pre2 export CUDA_HOME=/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/cuda/11.5.2-c927b7e765e06433950d8a7eab9eddb4/ source /cvmfs/cms.cern.ch/cmsset_default.sh @@ -36,7 +36,7 @@ export LSTPERFORMANCEWEBDIR="/cdat/tem/${USER}/LSTPerformanceWeb" export LATEST_CPU_BENCHMARK_EFF_MUONGUN="/data2/segmentlinking/muonGun_cpu_efficiencies.root" export LATEST_CPU_BENCHMARK_EFF_PU200="/data2/segmentlinking/pu200_cpu_efficiencies.root" -source /cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220621-4e96939afa0cdb62448c73ead2bb07e0/etc/profile.d/init.sh +source /cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220902-e80d13b043e1608b43d2007d06ad7e2f/etc/profile.d/init.sh export BOOST_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/boost/1.78.0-12075919175e8d078539685f9234134a" -export ALPAKA_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220621-4e96939afa0cdb62448c73ead2bb07e0" +export ALPAKA_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220902-e80d13b043e1608b43d2007d06ad7e2f" #eof From 37d4217c5e3d38f7d414aadc0d4f169f1fe3fd4f Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Thu, 13 Jul 2023 21:33:08 -0400 Subject: [PATCH 40/44] remove no_host_acc_warnings and cleanup --- SDL/Hit.cuh | 2 -- SDL/Kernels.cuh | 5 ----- SDL/MiniDoublet.cuh | 7 ------- SDL/PixelTriplet.cuh | 2 -- SDL/Quintuplet.cuh | 3 --- SDL/Segment.cuh | 4 ---- SDL/TrackCandidate.cuh | 7 ------- SDL/Triplet.cuh | 3 --- 8 files changed, 33 deletions(-) diff --git a/SDL/Hit.cuh b/SDL/Hit.cuh index df348127..f4651e1a 100644 --- a/SDL/Hit.cuh +++ b/SDL/Hit.cuh @@ -215,7 +215,6 @@ namespace SDL struct moduleRangesKernel { - ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -246,7 +245,6 @@ namespace SDL struct hitLoopKernel { - ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, diff --git a/SDL/Kernels.cuh b/SDL/Kernels.cuh index 51ff3e95..a3f761eb 100644 --- a/SDL/Kernels.cuh +++ b/SDL/Kernels.cuh @@ -202,7 +202,6 @@ namespace SDL struct removeDupQuintupletsInGPUAfterBuild { - ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -273,7 +272,6 @@ namespace SDL struct removeDupQuintupletsInGPUBeforeTC { - ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -359,7 +357,6 @@ namespace SDL struct removeDupPixelTripletsInGPUFromMap { - ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -407,7 +404,6 @@ namespace SDL struct removeDupPixelQuintupletsInGPUFromMap { - ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -452,7 +448,6 @@ namespace SDL struct checkHitspLS { - ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, diff --git a/SDL/MiniDoublet.cuh b/SDL/MiniDoublet.cuh index 1574a662..ef8abcd8 100644 --- a/SDL/MiniDoublet.cuh +++ b/SDL/MiniDoublet.cuh @@ -770,7 +770,6 @@ namespace SDL struct createMiniDoubletsInGPUv2 { - ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -792,10 +791,6 @@ namespace SDL int nLowerHits = hitsInGPU.hitRangesnLower[lowerModuleIndex]; int nUpperHits = hitsInGPU.hitRangesnUpper[lowerModuleIndex]; if(hitsInGPU.hitRangesLower[lowerModuleIndex] == -1) continue; - if(hitsInGPU.hitRangesLower[lowerModuleIndex] == -1) - { - printf("IS THIS EVER RUN"); - } const int maxHits = alpaka::math::max(acc, nUpperHits, nLowerHits); unsigned int upHitArrayIndex = hitsInGPU.hitRangesUpper[lowerModuleIndex]; unsigned int loHitArrayIndex = hitsInGPU.hitRangesLower[lowerModuleIndex]; @@ -844,7 +839,6 @@ namespace SDL struct createMDArrayRangesGPU { - ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -914,7 +908,6 @@ namespace SDL struct addMiniDoubletRangesToEventExplicit { - ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, diff --git a/SDL/PixelTriplet.cuh b/SDL/PixelTriplet.cuh index a8e76c2e..7f0a1f16 100644 --- a/SDL/PixelTriplet.cuh +++ b/SDL/PixelTriplet.cuh @@ -846,7 +846,6 @@ namespace SDL struct createPixelTripletsInGPUFromMapv2 { - ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -2246,7 +2245,6 @@ namespace SDL struct createPixelQuintupletsInGPUFromMapv2 { - ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, diff --git a/SDL/Quintuplet.cuh b/SDL/Quintuplet.cuh index e29d8bde..67e9fb54 100644 --- a/SDL/Quintuplet.cuh +++ b/SDL/Quintuplet.cuh @@ -2164,7 +2164,6 @@ namespace SDL struct createQuintupletsInGPUv2 { - ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -2259,7 +2258,6 @@ namespace SDL struct createEligibleModulesListForQuintupletsGPU { - ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -2335,7 +2333,6 @@ namespace SDL struct addQuintupletRangesToEventExplicit { - ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, diff --git a/SDL/Segment.cuh b/SDL/Segment.cuh index ab8dd13d..c3d2475f 100644 --- a/SDL/Segment.cuh +++ b/SDL/Segment.cuh @@ -678,7 +678,6 @@ namespace SDL struct createSegmentsInGPUv2 { - ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -755,7 +754,6 @@ namespace SDL struct createSegmentArrayRanges { - ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -833,7 +831,6 @@ namespace SDL struct addSegmentRangesToEventExplicit { - ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -866,7 +863,6 @@ namespace SDL struct addPixelSegmentToEventKernel { - ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, diff --git a/SDL/TrackCandidate.cuh b/SDL/TrackCandidate.cuh index 250f4558..12a602f2 100644 --- a/SDL/TrackCandidate.cuh +++ b/SDL/TrackCandidate.cuh @@ -190,7 +190,6 @@ namespace SDL struct crossCleanpT3 { - ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -239,7 +238,6 @@ namespace SDL struct crossCleanT5 { - ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -306,7 +304,6 @@ namespace SDL // This will eliminate the need for another kernel just for adding the pLS, because we can __syncthreads() struct crossCleanpLS { - ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -393,7 +390,6 @@ namespace SDL struct addpT3asTrackCandidatesInGPU { - ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -429,7 +425,6 @@ namespace SDL struct addT5asTrackCandidateInGPU { - ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -467,7 +462,6 @@ namespace SDL struct addpLSasTrackCandidateInGPU { - ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -497,7 +491,6 @@ namespace SDL struct addpT5asTrackCandidateInGPU { - ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, diff --git a/SDL/Triplet.cuh b/SDL/Triplet.cuh index 08c59d34..49454a90 100644 --- a/SDL/Triplet.cuh +++ b/SDL/Triplet.cuh @@ -1296,7 +1296,6 @@ namespace SDL struct createTripletsInGPUv2 { - ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -1372,7 +1371,6 @@ namespace SDL struct createTripletArrayRanges { - ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, @@ -1449,7 +1447,6 @@ namespace SDL struct addTripletRangesToEventExplicit { - ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC void operator()( TAcc const & acc, From 37cd75fe7b705ef16e05b8f462f0407da96d6732 Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Fri, 14 Jul 2023 12:59:25 -0400 Subject: [PATCH 41/44] turn off half precision code --- SDL/Constants.cuh | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/SDL/Constants.cuh b/SDL/Constants.cuh index d3b698cd..854fe419 100644 --- a/SDL/Constants.cuh +++ b/SDL/Constants.cuh @@ -6,56 +6,54 @@ // CUDA headers. Will be removed soon. #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED -#include #include -#include #endif //This changes pT5 and pT3 and T3 completely. T5 for non regression parameters -#if defined(FP16_Base) && defined(ALPAKA_ACC_GPU_CUDA_ENABLED) -#define __F2H __float2half -#define __H2F __half2float -typedef __half FPX; +#if defined(FP16_Base) +#define __F2H //__float2half +#define __H2F //__half2float +typedef float FPX; #else #define __F2H #define __H2F typedef float FPX; #endif -#if defined(FP16_T5) && defined(ALPAKA_ACC_GPU_CUDA_ENABLED) // changes T5 regression values -#define __F2H_T5 __float2half -#define __H2F_T5 __half2float -typedef __half FPX_T5; +#if defined(FP16_T5) // changes T5 regression values +#define __F2H_T5 //__float2half +#define __H2F_T5 //__half2float +typedef float FPX_T5; #else #define __F2H_T5 #define __H2F_T5 typedef float FPX_T5; #endif -#if defined(FP16_dPhi) && defined(ALPAKA_ACC_GPU_CUDA_ENABLED) // changes segment dPhi values -#define __F2H_dPhi __float2half -#define __H2F_dPhi __half2float -typedef __half FPX_dPhi; +#if defined(FP16_dPhi) // changes segment dPhi values +#define __F2H_dPhi //__float2half +#define __H2F_dPhi //__half2float +typedef float FPX_dPhi; #else #define __F2H_dPhi #define __H2F_dPhi typedef float FPX_dPhi; #endif -#if defined(FP16_circle) && defined(ALPAKA_ACC_GPU_CUDA_ENABLED) // changes segment circle values -#define __F2H_circle __float2half -#define __H2F_circle __half2float -typedef __half FPX_circle; +#if defined(FP16_circle) // changes segment circle values +#define __F2H_circle //__float2half +#define __H2F_circle //__half2float +typedef float FPX_circle; #else #define __F2H_circle #define __H2F_circle typedef float FPX_circle; #endif -#if defined(FP16_seg) && defined(ALPAKA_ACC_GPU_CUDA_ENABLED) // changes segment values -#define __F2H_seg __float2half -#define __H2F_seg __half2float -typedef __half FPX_seg; +#if defined(FP16_seg) // changes segment values +#define __F2H_seg //__float2half +#define __H2F_seg //__half2float +typedef float FPX_seg; #else #define __F2H_seg #define __H2F_seg From 8b966b7cc7d61c461115c15e8c3936a9b8121752 Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Fri, 14 Jul 2023 13:21:50 -0400 Subject: [PATCH 42/44] remove unused wrapper functions --- SDL/Constants.cuh | 51 +++++------------------------------------------ 1 file changed, 5 insertions(+), 46 deletions(-) diff --git a/SDL/Constants.cuh b/SDL/Constants.cuh index 854fe419..e52de941 100644 --- a/SDL/Constants.cuh +++ b/SDL/Constants.cuh @@ -4,60 +4,19 @@ #include #include "../code/alpaka_interface/CachedBufAlloc.h" -// CUDA headers. Will be removed soon. #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED #include #endif -//This changes pT5 and pT3 and T3 completely. T5 for non regression parameters +// Half precision wrapper functions, turned off. #if defined(FP16_Base) -#define __F2H //__float2half -#define __H2F //__half2float -typedef float FPX; +#define __F2H //__float2half +#define __H2F //__half2float +typedef /*__half*/ float FPX; #else #define __F2H #define __H2F -typedef float FPX; -#endif - -#if defined(FP16_T5) // changes T5 regression values -#define __F2H_T5 //__float2half -#define __H2F_T5 //__half2float -typedef float FPX_T5; -#else -#define __F2H_T5 -#define __H2F_T5 -typedef float FPX_T5; -#endif - -#if defined(FP16_dPhi) // changes segment dPhi values -#define __F2H_dPhi //__float2half -#define __H2F_dPhi //__half2float -typedef float FPX_dPhi; -#else -#define __F2H_dPhi -#define __H2F_dPhi -typedef float FPX_dPhi; -#endif - -#if defined(FP16_circle) // changes segment circle values -#define __F2H_circle //__float2half -#define __H2F_circle //__half2float -typedef float FPX_circle; -#else -#define __F2H_circle -#define __H2F_circle -typedef float FPX_circle; -#endif - -#if defined(FP16_seg) // changes segment values -#define __F2H_seg //__float2half -#define __H2F_seg //__half2float -typedef float FPX_seg; -#else -#define __F2H_seg -#define __H2F_seg -typedef float FPX_seg; +typedef float FPX; #endif using Idx = std::size_t; From 9a84b03a5aad32a38a6ccce3ee3fea9be4ac9947 Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Sun, 16 Jul 2023 12:49:53 -0400 Subject: [PATCH 43/44] removed unused score variables --- SDL/Kernels.cuh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/SDL/Kernels.cuh b/SDL/Kernels.cuh index a3f761eb..ed845382 100644 --- a/SDL/Kernels.cuh +++ b/SDL/Kernels.cuh @@ -372,15 +372,13 @@ namespace SDL for (unsigned int ix = globalThreadIdx[1]; ix < *pixelTripletsInGPU.nPixelTriplets; ix += gridThreadExtent[1]) { - float score1 = __H2F(pixelTripletsInGPU.score[ix]); for(unsigned int jx = globalThreadIdx[2]; jx < *pixelTripletsInGPU.nPixelTriplets; jx += gridThreadExtent[2]) { - float score2 = __H2F(pixelTripletsInGPU.score[jx]); if(ix == jx) continue; int nMatched[2]; - checkHitspT3(ix,jx,pixelTripletsInGPU,nMatched); + checkHitspT3(ix, jx, pixelTripletsInGPU, nMatched); if((nMatched[0] + nMatched[1]) >= 5) { // Check the layers From 46b91b8e5fae97b960786b5a1ee64fce341fb085 Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Sun, 16 Jul 2023 15:41:13 -0400 Subject: [PATCH 44/44] group addXtoEventExplicit functions --- SDL/Event.cu | 140 +++++++++++++++++++++++++-------------------------- 1 file changed, 70 insertions(+), 70 deletions(-) diff --git a/SDL/Event.cu b/SDL/Event.cu index 98fef91e..46595d63 100644 --- a/SDL/Event.cu +++ b/SDL/Event.cu @@ -388,76 +388,6 @@ void SDL::Event::addPixelSegmentToEvent(std::vector hitIndices0,st alpaka::wait(queue); } -void SDL::Event::addMiniDoubletsToEventExplicit() -{ - auto nMDsCPU_buf = allocBufWrapper(devHost, nLowerModules, queue); - alpaka::memcpy(queue, nMDsCPU_buf, miniDoubletsBuffers->nMDs_buf, nLowerModules); - - auto module_subdets_buf = allocBufWrapper(devHost, nLowerModules, queue); - alpaka::memcpy(queue, module_subdets_buf, modulesBuffers->subdets_buf, nLowerModules); - - auto module_layers_buf = allocBufWrapper(devHost, nLowerModules, queue); - alpaka::memcpy(queue, module_layers_buf, modulesBuffers->layers_buf, nLowerModules); - - auto module_hitRanges_buf = allocBufWrapper(devHost, nLowerModules*2, queue); - alpaka::memcpy(queue, module_hitRanges_buf, hitsBuffers->hitRanges_buf, nLowerModules*2); - - alpaka::wait(queue); - - int* nMDsCPU = alpaka::getPtrNative(nMDsCPU_buf); - short* module_subdets = alpaka::getPtrNative(module_subdets_buf); - short* module_layers = alpaka::getPtrNative(module_layers_buf); - int* module_hitRanges = alpaka::getPtrNative(module_hitRanges_buf); - - for(unsigned int i = 0; i(devHost, nLowerModules, queue); - alpaka::memcpy(queue, nSegmentsCPU_buf, segmentsBuffers->nSegments_buf, nLowerModules); - - auto module_subdets_buf = allocBufWrapper(devHost, nLowerModules, queue); - alpaka::memcpy(queue, module_subdets_buf, modulesBuffers->subdets_buf, nLowerModules); - - auto module_layers_buf = allocBufWrapper(devHost, nLowerModules, queue); - alpaka::memcpy(queue, module_layers_buf, modulesBuffers->layers_buf, nLowerModules); - - alpaka::wait(queue); - - int* nSegmentsCPU = alpaka::getPtrNative(nSegmentsCPU_buf); - short* module_subdets = alpaka::getPtrNative(module_subdets_buf); - short* module_layers = alpaka::getPtrNative(module_layers_buf); - - for(unsigned int i = 0; iminiDoubletModuleOccupancy @@ -1207,6 +1137,76 @@ void SDL::Event::createPixelQuintuplets() #endif } +void SDL::Event::addMiniDoubletsToEventExplicit() +{ + auto nMDsCPU_buf = allocBufWrapper(devHost, nLowerModules, queue); + alpaka::memcpy(queue, nMDsCPU_buf, miniDoubletsBuffers->nMDs_buf, nLowerModules); + + auto module_subdets_buf = allocBufWrapper(devHost, nLowerModules, queue); + alpaka::memcpy(queue, module_subdets_buf, modulesBuffers->subdets_buf, nLowerModules); + + auto module_layers_buf = allocBufWrapper(devHost, nLowerModules, queue); + alpaka::memcpy(queue, module_layers_buf, modulesBuffers->layers_buf, nLowerModules); + + auto module_hitRanges_buf = allocBufWrapper(devHost, nLowerModules*2, queue); + alpaka::memcpy(queue, module_hitRanges_buf, hitsBuffers->hitRanges_buf, nLowerModules*2); + + alpaka::wait(queue); + + int* nMDsCPU = alpaka::getPtrNative(nMDsCPU_buf); + short* module_subdets = alpaka::getPtrNative(module_subdets_buf); + short* module_layers = alpaka::getPtrNative(module_layers_buf); + int* module_hitRanges = alpaka::getPtrNative(module_hitRanges_buf); + + for(unsigned int i = 0; i(devHost, nLowerModules, queue); + alpaka::memcpy(queue, nSegmentsCPU_buf, segmentsBuffers->nSegments_buf, nLowerModules); + + auto module_subdets_buf = allocBufWrapper(devHost, nLowerModules, queue); + alpaka::memcpy(queue, module_subdets_buf, modulesBuffers->subdets_buf, nLowerModules); + + auto module_layers_buf = allocBufWrapper(devHost, nLowerModules, queue); + alpaka::memcpy(queue, module_layers_buf, modulesBuffers->layers_buf, nLowerModules); + + alpaka::wait(queue); + + int* nSegmentsCPU = alpaka::getPtrNative(nSegmentsCPU_buf); + short* module_subdets = alpaka::getPtrNative(module_subdets_buf); + short* module_layers = alpaka::getPtrNative(module_layers_buf); + + for(unsigned int i = 0; i(devHost, nLowerModules, queue);