From cacd983117d2ef70def8df3ef57d66416d905ccc Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Thu, 18 May 2023 19:12:08 -0400
Subject: [PATCH 01/44] remove unused memory functions

---
 SDL/MiniDoublet.cu     |  8 --------
 SDL/MiniDoublet.cuh    |  1 -
 SDL/Module.cu          | 33 ---------------------------------
 SDL/Module.cuh         |  1 -
 SDL/PixelTriplet.cu    | 25 -------------------------
 SDL/PixelTriplet.cuh   |  2 --
 SDL/Segment.cu         | 22 ----------------------
 SDL/Segment.cuh        |  1 -
 SDL/TrackCandidate.cu  | 20 --------------------
 SDL/TrackCandidate.cuh |  1 -
 SDL/Triplet.cu         | 11 -----------
 SDL/Triplet.cuh        |  1 -
 12 files changed, 126 deletions(-)

diff --git a/SDL/MiniDoublet.cu b/SDL/MiniDoublet.cu
index f3fb98bf..3cfab8dd 100644
--- a/SDL/MiniDoublet.cu
+++ b/SDL/MiniDoublet.cu
@@ -1,13 +1,5 @@
 #include "MiniDoublet.cuh"
 
-void SDL::miniDoublets::resetMemory(unsigned int nMemoryLocationsx, unsigned int nLowerModules,cudaStream_t stream)
-{
-    cudaMemsetAsync(anchorHitIndices,0, nMemoryLocationsx * 3 * sizeof(unsigned int),stream);
-    cudaMemsetAsync(dphichanges,0, nMemoryLocationsx * 9 * sizeof(float),stream);
-    cudaMemsetAsync(nMDs,0, (nLowerModules + 1) * sizeof(int),stream);
-    cudaMemsetAsync(totOccupancyMDs,0, (nLowerModules + 1) * sizeof(unsigned int),stream);
-}
-
 //FIXME:Add memory locations for the pixel MDs here!
 void SDL::createMDsInExplicitMemory(struct miniDoublets& mdsInGPU, unsigned int nMemoryLocations, uint16_t nLowerModules, unsigned int maxPixelMDs,cudaStream_t stream)
 {
diff --git a/SDL/MiniDoublet.cuh b/SDL/MiniDoublet.cuh
index 858a7fec..b8db05ca 100644
--- a/SDL/MiniDoublet.cuh
+++ b/SDL/MiniDoublet.cuh
@@ -64,7 +64,6 @@ namespace SDL
         ~miniDoublets();
       	void freeMemory(cudaStream_t stream);
       	void freeMemoryCache();
-        void resetMemory(unsigned int nMemoryLocations, unsigned int nModules,cudaStream_t stream);
     };
 
     void createMDsInExplicitMemory(struct miniDoublets& mdsInGPU, unsigned int maxMDs,uint16_t nLowerModules, unsigned int maxPixelMDs,cudaStream_t stream);
diff --git a/SDL/Module.cu b/SDL/Module.cu
index 44427b52..e26eb899 100644
--- a/SDL/Module.cu
+++ b/SDL/Module.cu
@@ -163,39 +163,6 @@ void SDL::objectRanges::freeMemory()
     cudaFree(device_nTotalQuints);
 }
 
-void SDL::freeModulesCache(struct modules& modulesInGPU,struct pixelMap& pixelMapping)
-{
-    int dev;
-    cudaGetDevice(&dev);
-    cms::cuda::free_device(dev,modulesInGPU.detIds);
-    cms::cuda::free_device(dev,modulesInGPU.moduleMap);
-    cms::cuda::free_device(dev,modulesInGPU.mapIdx);
-    cms::cuda::free_device(dev,modulesInGPU.mapdetId);
-    cms::cuda::free_device(dev,modulesInGPU.nConnectedModules);
-    cms::cuda::free_device(dev,modulesInGPU.drdzs);
-    cms::cuda::free_device(dev,modulesInGPU.slopes);
-    cms::cuda::free_device(dev,modulesInGPU.nModules);
-    cms::cuda::free_device(dev,modulesInGPU.nLowerModules);
-    cms::cuda::free_device(dev,modulesInGPU.layers);
-    cms::cuda::free_device(dev,modulesInGPU.rings);
-    cms::cuda::free_device(dev,modulesInGPU.modules);
-    cms::cuda::free_device(dev,modulesInGPU.rods);
-    cms::cuda::free_device(dev,modulesInGPU.subdets);
-    cms::cuda::free_device(dev,modulesInGPU.sides);
-    cms::cuda::free_device(dev,modulesInGPU.isInverted);
-    cms::cuda::free_device(dev,modulesInGPU.isLower);
-    cms::cuda::free_device(dev,modulesInGPU.isAnchor);
-    cms::cuda::free_device(dev,modulesInGPU.moduleType);
-    cms::cuda::free_device(dev,modulesInGPU.moduleLayerType);
-    cms::cuda::free_device(dev,modulesInGPU.connectedPixels);
-    cudaFreeHost(pixelMapping.connectedPixelsSizes);
-    cudaFreeHost(pixelMapping.connectedPixelsSizesPos);
-    cudaFreeHost(pixelMapping.connectedPixelsSizesNeg);
-    cudaFreeHost(pixelMapping.connectedPixelsIndex);
-    cudaFreeHost(pixelMapping.connectedPixelsIndexPos);
-    cudaFreeHost(pixelMapping.connectedPixelsIndexNeg);
-}
-
 void SDL::freeModules(struct modules& modulesInGPU, struct pixelMap& pixelMapping)
 {
     cudaFree(modulesInGPU.detIds);
diff --git a/SDL/Module.cuh b/SDL/Module.cuh
index c68d6a77..6e48abaf 100644
--- a/SDL/Module.cuh
+++ b/SDL/Module.cuh
@@ -143,7 +143,6 @@ namespace SDL
     void loadModulesFromFile(struct modules& modulesInGPU, uint16_t& nModules,uint16_t& nLowerModules,struct pixelMap& pixelMapping,cudaStream_t stream, const char* moduleMetaDataFilePath="data/centroid.txt");
     void createModulesInExplicitMemory(struct modules& modulesInGPU,unsigned int nModules,cudaStream_t stream);
     void freeModules(struct modules& modulesInGPU,struct pixelMap& pixelMapping);
-    void freeModulesCache(struct modules& modulesInGPU,struct pixelMap& pixelMapping);
     void fillPixelMap(struct modules& modulesInGPU,struct pixelMap& pixelMapping,cudaStream_t stream);
     void fillConnectedModuleArrayExplicit(struct modules& modulesInGPU, unsigned int nModules,cudaStream_t stream);
     void fillMapArraysExplicit(struct modules& modulesInGPU, unsigned int nModules,cudaStream_t stream);
diff --git a/SDL/PixelTriplet.cu b/SDL/PixelTriplet.cu
index 456eb7d7..dc0a2496 100644
--- a/SDL/PixelTriplet.cu
+++ b/SDL/PixelTriplet.cu
@@ -69,19 +69,6 @@ SDL::pixelTriplets::~pixelTriplets()
 {
 }
 
-void SDL::pixelTriplets::resetMemory(unsigned int maxPixelTriplets,cudaStream_t stream)
-{
-    cudaMemsetAsync(pixelSegmentIndices,0, maxPixelTriplets * sizeof(unsigned int),stream);
-    cudaMemsetAsync(tripletIndices, 0,maxPixelTriplets * sizeof(unsigned int),stream);
-    cudaMemsetAsync(nPixelTriplets, 0,sizeof(int),stream);
-    cudaMemsetAsync(totOccupancyPixelTriplets, 0,sizeof(int),stream);
-    cudaMemsetAsync(pixelRadius, 0,maxPixelTriplets * sizeof(FPX),stream);
-    cudaMemsetAsync(tripletRadius, 0,maxPixelTriplets * sizeof(FPX),stream);
-    cudaMemsetAsync(pt, 0,maxPixelTriplets * 6*sizeof(FPX),stream);
-    cudaMemsetAsync(isDup, 0,maxPixelTriplets * sizeof(bool),stream);
-    cudaMemsetAsync(partOfPT5, 0,maxPixelTriplets * sizeof(bool),stream);
-}
-
 void SDL::createPixelTripletsInExplicitMemory(struct pixelTriplets& pixelTripletsInGPU, unsigned int maxPixelTriplets, cudaStream_t stream)
 {
 #ifdef CACHE_ALLOC
@@ -205,18 +192,6 @@ void SDL::pixelQuintuplets::freeMemory(cudaStream_t stream)
     cudaStreamSynchronize(stream);
 }
 
-void SDL::pixelQuintuplets::resetMemory(unsigned int maxPixelQuintuplets,cudaStream_t stream)
-{
-    cudaMemsetAsync(pixelIndices,0, maxPixelQuintuplets * sizeof(unsigned int),stream);
-    cudaMemsetAsync(T5Indices,0, maxPixelQuintuplets * sizeof(unsigned int),stream);
-    cudaMemsetAsync(nPixelQuintuplets,0, sizeof(int),stream);
-    cudaMemsetAsync(totOccupancyPixelQuintuplets,0, sizeof(int),stream);
-    cudaMemsetAsync(isDup,0, maxPixelQuintuplets * sizeof(bool),stream);
-    cudaMemsetAsync(score,0, maxPixelQuintuplets * sizeof(FPX),stream);
-    cudaMemsetAsync(eta , 0, maxPixelQuintuplets * sizeof(FPX),stream);
-    cudaMemsetAsync(phi , 0, maxPixelQuintuplets * sizeof(FPX),stream);
-}
-
 void SDL::createPixelQuintupletsInExplicitMemory(struct SDL::pixelQuintuplets& pixelQuintupletsInGPU, unsigned int maxPixelQuintuplets,cudaStream_t stream)
 {
 #ifdef CACHE_ALLOC
diff --git a/SDL/PixelTriplet.cuh b/SDL/PixelTriplet.cuh
index 5fe49aa3..0f884bae 100644
--- a/SDL/PixelTriplet.cuh
+++ b/SDL/PixelTriplet.cuh
@@ -44,7 +44,6 @@ namespace SDL
         ~pixelTriplets();
         void freeMemory(cudaStream_t stream);
         void freeMemoryCache();
-        void resetMemory(unsigned int maxPixelTriplets,cudaStream_t stream);
     };
 
     void createPixelTripletsInExplicitMemory(struct pixelTriplets& pixelTripletsinGPU, unsigned int maxPixelTriplets, cudaStream_t stream);
@@ -1381,7 +1380,6 @@ namespace SDL
         ~pixelQuintuplets();
         void freeMemory(cudaStream_t stream);
         void freeMemoryCache();
-        void resetMemory(unsigned int maxPixelQuintuplets,cudaStream_t stream);
 
     };
 
diff --git a/SDL/Segment.cu b/SDL/Segment.cu
index 79bd89a1..3d5f38eb 100644
--- a/SDL/Segment.cu
+++ b/SDL/Segment.cu
@@ -2,28 +2,6 @@
 
 ///FIXME:NOTICE THE NEW maxPixelSegments!
 
-void SDL::segments::resetMemory(unsigned int nMemoryLocationsx, unsigned int nLowerModules, unsigned int maxPixelSegments,cudaStream_t stream)
-{
-    cudaMemsetAsync(mdIndices,0, nMemoryLocationsx * 2 * sizeof(unsigned int),stream);
-    cudaMemsetAsync(innerLowerModuleIndices,0, nMemoryLocationsx * 2 * sizeof(uint16_t),stream);
-    cudaMemsetAsync(nSegments, 0,(nLowerModules+1) * sizeof(int),stream);
-    cudaMemsetAsync(totOccupancySegments, 0,(nLowerModules+1) * sizeof(int),stream);
-    cudaMemsetAsync(dPhis, 0,(nMemoryLocationsx * 6 )*sizeof(FPX),stream);
-    cudaMemsetAsync(ptIn, 0,(maxPixelSegments * 8)*sizeof(float),stream);
-    cudaMemsetAsync(superbin, 0,(maxPixelSegments )*sizeof(int),stream);
-    cudaMemsetAsync(pixelType, 0,(maxPixelSegments )*sizeof(int8_t),stream);
-    cudaMemsetAsync(isQuad, 0,(maxPixelSegments )*sizeof(char),stream);
-    cudaMemsetAsync(isDup, 0,(maxPixelSegments )*sizeof(bool),stream);
-    cudaMemsetAsync(score, 0,(maxPixelSegments )*sizeof(float),stream);
-    cudaMemsetAsync(charge, 0,maxPixelSegments * sizeof(int),stream);
-    cudaMemsetAsync(seedIdx, 0,maxPixelSegments * sizeof(unsigned int),stream);
-    cudaMemsetAsync(circleCenterX, 0,maxPixelSegments * sizeof(float),stream);
-    cudaMemsetAsync(circleCenterY, 0,maxPixelSegments * sizeof(float),stream);
-    cudaMemsetAsync(circleRadius, 0,maxPixelSegments * sizeof(float),stream);
-    cudaMemsetAsync(partOfPT5, 0,maxPixelSegments * sizeof(bool),stream);
-    cudaMemsetAsync(pLSHitsIdxs, 0,maxPixelSegments * sizeof(uint4),stream);
-}
-
 void SDL::createSegmentsInExplicitMemory(struct segments& segmentsInGPU, unsigned int nMemoryLocations, uint16_t nLowerModules, unsigned int maxPixelSegments, cudaStream_t stream)
 {
     //FIXME:Since the number of pixel segments is 10x the number of regular segments per module, we need to provide
diff --git a/SDL/Segment.cuh b/SDL/Segment.cuh
index 105e7ff1..0a9830cf 100644
--- a/SDL/Segment.cuh
+++ b/SDL/Segment.cuh
@@ -56,7 +56,6 @@ namespace SDL
 
         void freeMemory(cudaStream_t stream);
         void freeMemoryCache();
-        void resetMemory(unsigned int nMemoryLocationsx, unsigned int nModules, unsigned int maxPixelSegments,cudaStream_t stream);
     };
 
     void createSegmentsInExplicitMemory(struct segments& segmentsInGPU, unsigned int maxSegments, uint16_t nLowerModules, unsigned int maxPixelSegments,cudaStream_t stream);
diff --git a/SDL/TrackCandidate.cu b/SDL/TrackCandidate.cu
index d7c6dfdf..7853de30 100644
--- a/SDL/TrackCandidate.cu
+++ b/SDL/TrackCandidate.cu
@@ -1,25 +1,5 @@
 #include "TrackCandidate.cuh"
 
-void SDL::trackCandidates::resetMemory(unsigned int maxTrackCandidates,cudaStream_t stream)
-{
-    cudaMemsetAsync(trackCandidateType,0, maxTrackCandidates * sizeof(short),stream);
-    cudaMemsetAsync(directObjectIndices, 0, maxTrackCandidates * sizeof(unsigned int),stream);
-    cudaMemsetAsync(objectIndices, 0,2 * maxTrackCandidates * sizeof(unsigned int),stream);
-    cudaMemsetAsync(nTrackCandidates, 0,sizeof(int),stream);
-    cudaMemsetAsync(nTrackCandidatespT3, 0,sizeof(int),stream);
-    cudaMemsetAsync(nTrackCandidatesT5, 0,sizeof(int),stream);
-    cudaMemsetAsync(nTrackCandidatespT5,0, sizeof(int),stream);
-    cudaMemsetAsync(nTrackCandidatespLS, 0,sizeof(int),stream);
-
-    cudaMemsetAsync(logicalLayers, 0, 7 * maxTrackCandidates * sizeof(uint8_t), stream);
-    cudaMemsetAsync(lowerModuleIndices, 0, 7 * maxTrackCandidates * sizeof(uint16_t), stream);
-    cudaMemsetAsync(hitIndices, 0, 14 * maxTrackCandidates * sizeof(unsigned int), stream);
-    cudaMemsetAsync(pixelSeedIndex, 0, maxTrackCandidates * sizeof(int), stream);
-    cudaMemsetAsync(centerX, 0, maxTrackCandidates * sizeof(FPX), stream);
-    cudaMemsetAsync(centerY, 0, maxTrackCandidates * sizeof(FPX), stream);
-    cudaMemsetAsync(radius , 0, maxTrackCandidates * sizeof(FPX), stream);
-}
-
 void SDL::createTrackCandidatesInExplicitMemory(struct trackCandidates& trackCandidatesInGPU, unsigned int maxTrackCandidates,cudaStream_t stream)
 {
 #ifdef CACHE_ALLOC
diff --git a/SDL/TrackCandidate.cuh b/SDL/TrackCandidate.cuh
index d8221ee2..d81a570d 100644
--- a/SDL/TrackCandidate.cuh
+++ b/SDL/TrackCandidate.cuh
@@ -36,7 +36,6 @@ namespace SDL
         ~trackCandidates();
         void freeMemory(cudaStream_t stream);
         void freeMemoryCache();
-        void resetMemory(unsigned int maxTrackCandidates,cudaStream_t stream);
     };
 
     void createTrackCandidatesInExplicitMemory(struct trackCandidates& trackCandidatesInGPU, unsigned int maxTrackCandidates,cudaStream_t stream);
diff --git a/SDL/Triplet.cu b/SDL/Triplet.cu
index e568cc0a..218880e2 100644
--- a/SDL/Triplet.cu
+++ b/SDL/Triplet.cu
@@ -1,16 +1,5 @@
 #include "Triplet.cuh"
 
-void SDL::triplets::resetMemory(unsigned int maxTriplets, unsigned int nLowerModules,cudaStream_t stream)
-{
-    cudaMemsetAsync(segmentIndices,0, 5 * maxTriplets * sizeof(unsigned int),stream);
-    cudaMemsetAsync(nTriplets,0, nLowerModules * sizeof(unsigned int),stream);
-    cudaMemsetAsync(totOccupancyTriplets,0, nLowerModules * sizeof(unsigned int),stream);
-    cudaMemsetAsync(betaIn,0, maxTriplets * 3 * sizeof(FPX),stream);
-    cudaMemsetAsync(partOfPT5,0, maxTriplets * sizeof(bool),stream);
-    cudaMemsetAsync(partOfT5,0, maxTriplets * sizeof(bool), stream);
-    cudaMemsetAsync(partOfPT3, 0, maxTriplets * sizeof(bool), stream);
-}
-
 void SDL::createTripletsInExplicitMemory(struct triplets& tripletsInGPU, unsigned int maxTriplets, uint16_t nLowerModules, cudaStream_t stream)
 {
 #ifdef CACHE_ALLOC
diff --git a/SDL/Triplet.cuh b/SDL/Triplet.cuh
index a0278931..16ea085d 100644
--- a/SDL/Triplet.cuh
+++ b/SDL/Triplet.cuh
@@ -54,7 +54,6 @@ namespace SDL
         ~triplets();
         void freeMemory(cudaStream_t stream);
         void freeMemoryCache();
-        void resetMemory(unsigned int maxTriplets, unsigned int nLowerModules,cudaStream_t stream);
     };
 
     void createTripletsInExplicitMemory(struct triplets& tripletsInGPU, unsigned int maxTriplets, uint16_t nLowerModules,cudaStream_t stream);

From efa60af2d43f4072ca0b7fb58c5465045bca73d0 Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Wed, 31 May 2023 13:24:07 -0400
Subject: [PATCH 02/44] first working segment memory w/o ntuple

---
 Makefile          |  10 +--
 SDL/Constants.cuh |  48 ++++++++++++-
 SDL/Event.cu      |  93 +++++++++++-------------
 SDL/Event.cuh     |  14 +---
 SDL/Makefile      |   4 +-
 SDL/Segment.cu    | 179 ----------------------------------------------
 SDL/Segment.cuh   | 159 +++++++++++++++++++++++++++++++++++-----
 7 files changed, 236 insertions(+), 271 deletions(-)
 delete mode 100644 SDL/Segment.cu

diff --git a/Makefile b/Makefile
index 2f4210b5..e4f18272 100644
--- a/Makefile
+++ b/Makefile
@@ -19,7 +19,9 @@ CXXFLAGS    = -g -O2 -Wall -fPIC -Wshadow -Woverloaded-virtual
 LDFLAGS     = -g -O2
 ROOTLIBS    = $(shell root-config --libs)
 ROOTCFLAGS  = $(foreach option, $(shell root-config --cflags), --compiler-options $(option))
-CFLAGS      = $(ROOTCFLAGS) --compiler-options -Wall --compiler-options -Wno-unused-function --compiler-options -g --compiler-options -O2 --compiler-options -fPIC --compiler-options -fno-var-tracking -ISDL -I$(shell pwd) -Icode  -Icode/core -I/mnt/data1/dsr/cub -I${CUDA_HOME}/include --compiler-options -fopenmp -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include
+ALPAKAINCLUDE = -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
+ALPAKAFLAGS = -DALPAKA_ACC_GPU_CUDA_ENABLED -DALPAKA_ACC_GPU_CUDA_ONLY  --expt-relaxed-constexpr -DALPAKA_DEBUG=0
+CFLAGS      = $(ROOTCFLAGS) --compiler-options -Wall --compiler-options -Wno-unused-function --compiler-options -g --compiler-options -O0 --compiler-options -fPIC --compiler-options -fno-var-tracking -ISDL -I$(shell pwd) -Icode  -Icode/core -I/mnt/data1/dsr/cub -I${CUDA_HOME}/include --compiler-options -fopenmp -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include
 EXTRACFLAGS = $(shell rooutil-config)
 EXTRAFLAGS  = -fPIC -ITMultiDrawTreePlayer -Wunused-variable -lTMVA -lEG -lGenVector -lXMLIO -lMLP -lTreePlayer -L${CUDA_HOME}/lib64 -lcudart -fopenmp -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include
 DOQUINTUPLET = -DFP16_Base -DFP16_dPhi #-DFP16_circle -DFP16_seg -DFP16_T5 #-DDO_QUINTUPLET #-DDO_QUADRUPLET
@@ -46,13 +48,13 @@ cutvalue_primitive: $(ROOUTIL) efficiency $(EXES)
 
 
 bin/doAnalysis: bin/doAnalysis.o $(OBJECTS)
-	$(CC) $(PT0P8) $(T3T3EXTENSION) $(LDFLAGS) $^ $(ROOTLIBS) $(EXTRACFLAGS) $(CUTVALUEFLAG) $(PRIMITIVEFLAG) $(EXTRAFLAGS) $(DOQUINTUPLET) -o $@
+	$(CC) $(PT0P8) $(T3T3EXTENSION) $(LDFLAGS) $^ $(ROOTLIBS) $(EXTRACFLAGS) $(CUTVALUEFLAG) $(PRIMITIVEFLAG) $(EXTRAFLAGS) $(DOQUINTUPLET) $(ALPAKAINCLUDE) -o $@
 
 bin/sdl: bin/sdl.o $(OBJECTS)
-	$(LD) $(PT0P8) $(T3T3EXTENSION) $(LDFLAGS) $^ $(ROOTLIBS) $(EXTRACFLAGS) $(CUTVALUEFLAG) $(PRIMITIVEFLAG) $(EXTRAFLAGS) $(DOQUINTUPLET) -o $@
+	$(LD) $(PT0P8) $(T3T3EXTENSION) $(LDFLAGS) $^ $(ROOTLIBS) $(EXTRACFLAGS) $(CUTVALUEFLAG) $(PRIMITIVEFLAG) $(EXTRAFLAGS) $(DOQUINTUPLET) $(ALPAKAINCLUDE) -o $@
 
 %.o: %.cc
-	$(CC) $(PT0P8) $(T3T3EXTENSION) $(CFLAGS) $(EXTRACFLAGS) $(CUTVALUEFLAG) $(PRIMITIVEFLAG) $(DOQUINTUPLET) $< -dc -o $@
+	$(CC) $(PT0P8) $(T3T3EXTENSION) $(CFLAGS) $(EXTRACFLAGS) $(CUTVALUEFLAG) $(PRIMITIVEFLAG) $(DOQUINTUPLET) $(ALPAKAINCLUDE) $< -dc -o $@
 
 $(ROOUTIL):
 	$(MAKE) -C code/rooutil/
diff --git a/SDL/Constants.cuh b/SDL/Constants.cuh
index 9990a402..b45e45e4 100644
--- a/SDL/Constants.cuh
+++ b/SDL/Constants.cuh
@@ -1,9 +1,8 @@
 #ifndef Constants_cuh
 #define Constants_cuh
 
-#include <alpaka/alpaka.hpp>
-
 #include <cuda_fp16.h>
+#include <alpaka/alpaka.hpp>
 
 #ifdef FP16_Base //This changes pT5 and pT3 and T3 completely. T5 for non regression parameters
 #define __F2H __float2half  
@@ -51,6 +50,51 @@ typedef __half FPX_seg;
 typedef float FPX_seg; 
 #endif
 
+using Idx = std::size_t;
+using Dim = alpaka::DimInt<3u>;
+using Dim1d = alpaka::DimInt<1u>;
+using Vec = alpaka::Vec<Dim,Idx>;
+using Vec1d = alpaka::Vec<Dim1d,Idx>;
+using QueueProperty = alpaka::NonBlocking;
+using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
+
+// Select the accelerator type from the enabled alpaka back-end, in priority order:
+// AccGpuCudaRt, AccCpuThreads, AccCpuFibers, AccCpuSerial. A build with none enabled is rejected.
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+    using Acc = alpaka::AccGpuCudaRt<Dim, Idx>;
+#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
+    using Acc = alpaka::AccCpuThreads<Dim, Idx>;
+#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLED)
+    using Acc = alpaka::AccCpuFibers<Dim, Idx>;
+#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
+    using Acc = alpaka::AccCpuSerial<Dim, Idx>;
+#else
+#error "No alpaka accelerator back-end enabled: define one of the ALPAKA_ACC_*_ENABLED macros"
+#endif
+
+auto const devAcc = alpaka::getDevByIdx<Acc>(0u);
+using QueueAcc = alpaka::Queue<Acc, QueueProperty>;
+
+// Typical Buffer types used in the code.
+using float_Buf = alpaka::Buf<Acc, float, Dim1d, Idx>;
+using int_Buf = alpaka::Buf<Acc, int, Dim1d, Idx>;
+using uint_Buf = alpaka::Buf<Acc, unsigned int, Dim1d, Idx>;
+using int8_t_Buf = alpaka::Buf<Acc, int8_t, Dim1d, Idx>;
+using uint16_t_Buf = alpaka::Buf<Acc, uint16_t, Dim1d, Idx>;
+using char_Buf = alpaka::Buf<Acc, char, Dim1d, Idx>;
+using bool_Buf = alpaka::Buf<Acc, bool, Dim1d, Idx>;
+
+using FPX_Buf = alpaka::Buf<Acc, FPX, Dim1d, Idx>;
+using FPX_T5_Buf = alpaka::Buf<Acc, FPX_T5, Dim1d, Idx>;
+using FPX_dPhi_Buf = alpaka::Buf<Acc, FPX_dPhi, Dim1d, Idx>;
+using FPX_circle_Buf = alpaka::Buf<Acc, FPX_circle, Dim1d, Idx>;
+using FPX_seg_Buf = alpaka::Buf<Acc, FPX_seg, Dim1d, Idx>;
+
+// Allocate a 1D buffer of nElements objects of type T on devAccIn; the buffer type follows that device, not the global Acc.
+template<typename T, typename TAcc, typename TSize>
+inline alpaka::Buf<alpaka::Dev<TAcc>, T, Dim1d, Idx> allocBufWrapper(TAcc const & devAccIn, TSize nElements)
+{ return alpaka::allocBuf<T, Idx>(devAccIn, alpaka::Vec<Dim1d, Idx>(static_cast<Idx>(nElements))); }
+
 const unsigned int MAX_BLOCKS = 80;
 const unsigned int MAX_CONNECTED_MODULES = 40;
 const unsigned int N_MAX_PIXEL_MD_PER_MODULES = 100000;
diff --git a/SDL/Event.cu b/SDL/Event.cu
index 7bafc30e..a46c1a9a 100644
--- a/SDL/Event.cu
+++ b/SDL/Event.cu
@@ -66,18 +66,15 @@ SDL::Event::~Event()
     if(rangesInGPU){rangesInGPU->freeMemoryCache();}
     if(hitsInGPU){hitsInGPU->freeMemoryCache();}
     if(mdsInGPU){mdsInGPU->freeMemoryCache();}
-    if(segmentsInGPU){segmentsInGPU->freeMemoryCache();}
     if(tripletsInGPU){tripletsInGPU->freeMemoryCache();}
     if(quintupletsInGPU){quintupletsInGPU->freeMemoryCache();}
     if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemoryCache();}
     if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemoryCache();}
     if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemoryCache();}
 #else
-
     if(rangesInGPU){rangesInGPU->freeMemory();}
     if(hitsInGPU){hitsInGPU->freeMemory();}
     if(mdsInGPU){mdsInGPU->freeMemory(stream);}
-    if(segmentsInGPU){segmentsInGPU->freeMemory(stream);}
     if(tripletsInGPU){tripletsInGPU->freeMemory(stream);}
     if(quintupletsInGPU){quintupletsInGPU->freeMemory(stream);}
     if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemory(stream);}
@@ -266,18 +263,15 @@ void SDL::Event::resetEvent()
     if(mdsInGPU){mdsInGPU->freeMemoryCache();}
     if(quintupletsInGPU){quintupletsInGPU->freeMemoryCache();}
     if(rangesInGPU){rangesInGPU->freeMemoryCache();}
-    if(segmentsInGPU){segmentsInGPU->freeMemoryCache();}
     if(tripletsInGPU){tripletsInGPU->freeMemoryCache();}
     if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemoryCache();}
     if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemoryCache();}
     if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemoryCache();}
-
 #else
     if(hitsInGPU){hitsInGPU->freeMemory();}
     if(quintupletsInGPU){quintupletsInGPU->freeMemory(stream);}
     if(rangesInGPU){rangesInGPU->freeMemory();}
     if(mdsInGPU){mdsInGPU->freeMemory(stream);}
-    if(segmentsInGPU){segmentsInGPU->freeMemory(stream);}
     if(tripletsInGPU){tripletsInGPU->freeMemory(stream);}
     if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemory(stream);}
     if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemory(stream);}
@@ -308,7 +302,7 @@ void SDL::Event::resetEvent()
       mdsInGPU = nullptr;}
     if(rangesInGPU){cms::cuda::free_host(rangesInGPU);
       rangesInGPU = nullptr;}
-    if(segmentsInGPU){cms::cuda::free_host(segmentsInGPU);
+    if(segmentsInGPU){delete segmentsInGPU;
       segmentsInGPU = nullptr;}
     if(tripletsInGPU){cms::cuda::free_host(tripletsInGPU);
       tripletsInGPU = nullptr;}
@@ -747,7 +741,7 @@ struct addPixelSegmentToEventKernel
 
 void SDL::Event::addPixelSegmentToEvent(std::vector<unsigned int> hitIndices0,std::vector<unsigned int> hitIndices1,std::vector<unsigned int> hitIndices2,std::vector<unsigned int> hitIndices3, std::vector<float> dPhiChange, std::vector<float> ptIn, std::vector<float> ptErr, std::vector<float> px, std::vector<float> py, std::vector<float> pz, std::vector<float> eta, std::vector<float> etaErr, std::vector<float> phi, std::vector<int> charge, std::vector<unsigned int> seedIdx, std::vector<int> superbin, std::vector<int8_t> pixelType, std::vector<char> isQuad)
 {
-    const int size = ptIn.size();
+    int size = ptIn.size();
     unsigned int mdSize = 2 * size;
     uint16_t pixelModuleIndex = (*detIdToIndex)[1];
 
@@ -782,7 +776,6 @@ void SDL::Event::addPixelSegmentToEvent(std::vector<unsigned int> hitIndices0,st
     }
     if(segmentsInGPU == nullptr)
     {
-        segmentsInGPU = (SDL::segments*)cms::cuda::allocate_host(sizeof(SDL::segments), stream);
         // can be optimized here: because we didn't distinguish pixel segments and outer-tracker segments and call them both "segments", so they use the index continuously.
         // If we want to further study the memory footprint in detail, we can separate the two and allocate different memories to them
 
@@ -804,43 +797,47 @@ void SDL::Event::addPixelSegmentToEvent(std::vector<unsigned int> hitIndices0,st
         cudaMemcpyAsync(&nTotalSegments,rangesInGPU->device_nTotalSegs,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream);
         cudaStreamSynchronize(stream);
         nTotalSegments += N_MAX_PIXEL_SEGMENTS_PER_MODULE;
-        createSegmentsInExplicitMemory(*segmentsInGPU, nTotalSegments, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE,stream);
+
+        segmentsInGPU = new SDL::segments(nTotalSegments, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE, devAcc, queue);
 
         cudaMemcpyAsync(segmentsInGPU->nMemoryLocations, &nTotalSegments, sizeof(unsigned int), cudaMemcpyHostToDevice, stream);;
         cudaStreamSynchronize(stream);
     }
 
-    unsigned int* hitIndices0_dev = (unsigned int*)cms::cuda::allocate_device(dev, size*sizeof(unsigned int), stream);
-    unsigned int* hitIndices1_dev = (unsigned int*)cms::cuda::allocate_device(dev, size*sizeof(unsigned int), stream);
-    unsigned int* hitIndices2_dev = (unsigned int*)cms::cuda::allocate_device(dev, size*sizeof(unsigned int), stream);
-    unsigned int* hitIndices3_dev = (unsigned int*)cms::cuda::allocate_device(dev, size*sizeof(unsigned int), stream);
-    float* dPhiChange_dev = (float*)cms::cuda::allocate_device(dev, size*sizeof(float), stream);
-
-    cudaMemcpyAsync(hitIndices0_dev, &hitIndices0[0], size*sizeof(unsigned int), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(hitIndices1_dev, &hitIndices1[0], size*sizeof(unsigned int), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(hitIndices2_dev, &hitIndices2[0], size*sizeof(unsigned int), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(hitIndices3_dev, &hitIndices3[0], size*sizeof(unsigned int), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(dPhiChange_dev, &dPhiChange[0], size*sizeof(float), cudaMemcpyHostToDevice, stream);
-
-    cudaMemcpyAsync(segmentsInGPU->isQuad, &isQuad[0], size*sizeof(char), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(segmentsInGPU->ptIn, &ptIn[0], size*sizeof(float), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(segmentsInGPU->ptErr, &ptErr[0], size*sizeof(float), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(segmentsInGPU->px, &px[0], size*sizeof(float), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(segmentsInGPU->py, &py[0], size*sizeof(float), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(segmentsInGPU->pz, &pz[0], size*sizeof(float), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(segmentsInGPU->etaErr, &etaErr[0], size*sizeof(float), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(segmentsInGPU->eta, &eta[0], size*sizeof(float), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(segmentsInGPU->phi, &phi[0], size*sizeof(float), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(segmentsInGPU->charge, &charge[0], size*sizeof(int), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(segmentsInGPU->seedIdx, &seedIdx[0], size*sizeof(unsigned int), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(segmentsInGPU->superbin, &superbin[0], size*sizeof(int), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(segmentsInGPU->pixelType, &pixelType[0], size*sizeof(int8_t), cudaMemcpyHostToDevice, stream);
+    alpaka::Vec<Dim1d, Idx> const extent(static_cast<Idx>(size));
+
+    auto hitIndices0_dev = alpaka::allocBuf<unsigned int, Idx>(devAcc, extent);
+    auto hitIndices1_dev = alpaka::allocBuf<unsigned int, Idx>(devAcc, extent);
+    auto hitIndices2_dev = alpaka::allocBuf<unsigned int, Idx>(devAcc, extent);
+    auto hitIndices3_dev = alpaka::allocBuf<unsigned int, Idx>(devAcc, extent);
+    auto dPhiChange_dev = alpaka::allocBuf<float, Idx>(devAcc, extent);
+
+    alpaka::memcpy(queue, hitIndices0_dev, hitIndices0, size);
+    alpaka::memcpy(queue, hitIndices1_dev, hitIndices1, size);
+    alpaka::memcpy(queue, hitIndices2_dev, hitIndices2, size);
+    alpaka::memcpy(queue, hitIndices3_dev, hitIndices3, size);
+    alpaka::memcpy(queue, dPhiChange_dev, dPhiChange, size);
+
+    alpaka::memcpy(queue, segmentsInGPU->ptIn_buf, ptIn, size);
+    alpaka::memcpy(queue, segmentsInGPU->ptErr_buf, ptErr, size);
+    alpaka::memcpy(queue, segmentsInGPU->px_buf, px, size);
+    alpaka::memcpy(queue, segmentsInGPU->py_buf, py, size);
+    alpaka::memcpy(queue, segmentsInGPU->pz_buf, pz, size);
+    alpaka::memcpy(queue, segmentsInGPU->etaErr_buf, etaErr, size);
+    alpaka::memcpy(queue, segmentsInGPU->isQuad_buf, isQuad, size);
+    alpaka::memcpy(queue, segmentsInGPU->eta_buf, eta, size);
+    alpaka::memcpy(queue, segmentsInGPU->phi_buf, phi, size);
+    alpaka::memcpy(queue, segmentsInGPU->charge_buf, charge, size);
+    alpaka::memcpy(queue, segmentsInGPU->seedIdx_buf, seedIdx, size);
+    alpaka::memcpy(queue, segmentsInGPU->superbin_buf, superbin, size);
+    alpaka::memcpy(queue, segmentsInGPU->pixelType_buf, pixelType, size);
 
     cudaMemcpyAsync(&(segmentsInGPU->nSegments)[pixelModuleIndex], &size, sizeof(int), cudaMemcpyHostToDevice, stream);
     cudaMemcpyAsync(&(segmentsInGPU->totOccupancySegments)[pixelModuleIndex], &size, sizeof(int), cudaMemcpyHostToDevice, stream);
     cudaMemcpyAsync(&(mdsInGPU->nMDs)[pixelModuleIndex], &mdSize, sizeof(unsigned int), cudaMemcpyHostToDevice, stream);
     cudaMemcpyAsync(&(mdsInGPU->totOccupancyMDs)[pixelModuleIndex], &mdSize, sizeof(unsigned int), cudaMemcpyHostToDevice, stream);
     cudaStreamSynchronize(stream);
+    alpaka::wait(queue);
 
     Vec const threadsPerBlock(static_cast<Idx>(1), static_cast<Idx>(1), static_cast<Idx>(256));
     Vec const blocksPerGrid(static_cast<Idx>(1), static_cast<Idx>(1), static_cast<Idx>(MAX_BLOCKS));
@@ -855,23 +852,16 @@ void SDL::Event::addPixelSegmentToEvent(std::vector<unsigned int> hitIndices0,st
         *hitsInGPU,
         *mdsInGPU,
         *segmentsInGPU,
-        hitIndices0_dev,
-        hitIndices1_dev,
-        hitIndices2_dev,
-        hitIndices3_dev,
-        dPhiChange_dev,
+        alpaka::getPtrNative(hitIndices0_dev),
+        alpaka::getPtrNative(hitIndices1_dev),
+        alpaka::getPtrNative(hitIndices2_dev),
+        alpaka::getPtrNative(hitIndices3_dev),
+        alpaka::getPtrNative(dPhiChange_dev),
         pixelModuleIndex,
         size));
 
     alpaka::enqueue(queue, addPixelSegmentToEvent_task);
     alpaka::wait(queue);
-
-    cms::cuda::free_device(dev, hitIndices0_dev);
-    cms::cuda::free_device(dev, hitIndices1_dev);
-    cms::cuda::free_device(dev, hitIndices2_dev);
-    cms::cuda::free_device(dev, hitIndices3_dev);
-    cms::cuda::free_device(dev, dPhiChange_dev);
-    cudaStreamSynchronize(stream);
 }
 
 void SDL::Event::addMiniDoubletsToEventExplicit()
@@ -1055,8 +1045,7 @@ void SDL::Event::createSegmentsWithModuleMap()
 {
     if(segmentsInGPU == nullptr)
     {
-        segmentsInGPU = (SDL::segments*)cms::cuda::allocate_host(sizeof(SDL::segments), stream);
-        createSegmentsInExplicitMemory(*segmentsInGPU, nTotalSegments, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE,stream);
+        segmentsInGPU = new SDL::segments(nTotalSegments, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE, devAcc, queue);
     }
 
     Vec const threadsPerBlockCreateSeg(static_cast<Idx>(1), static_cast<Idx>(1), static_cast<Idx>(64));
@@ -1133,7 +1122,7 @@ void SDL::Event::createTriplets()
     uint16_t *index_gpu;
     index_gpu = (uint16_t*)cms::cuda::allocate_device(dev, nLowerModules*sizeof(uint16_t), stream);
     unsigned int *nSegments = (unsigned int*)malloc(nLowerModules*sizeof(unsigned int));
-    cudaMemcpyAsync((void *)nSegments, segmentsInGPU->nSegments, nLowerModules*sizeof(unsigned int), cudaMemcpyDeviceToHost,stream); 
+    cudaMemcpyAsync((void *)nSegments, segmentsInGPU->nSegments, nLowerModules*sizeof(unsigned int), cudaMemcpyDeviceToHost,stream);
     cudaStreamSynchronize(stream);
 
     uint16_t* module_nConnectedModules;
@@ -1141,7 +1130,7 @@ void SDL::Event::createTriplets()
     cudaMemcpyAsync(module_nConnectedModules,modulesInGPU->nConnectedModules,nLowerModules*sizeof(uint16_t),cudaMemcpyDeviceToHost,stream);
     cudaStreamSynchronize(stream);
 
-    for (uint16_t innerLowerModuleIndex = 0; innerLowerModuleIndex <nLowerModules; innerLowerModuleIndex++) 
+    for (uint16_t innerLowerModuleIndex = 0; innerLowerModuleIndex <nLowerModules; innerLowerModuleIndex++)
     {
         uint16_t nConnectedModules = module_nConnectedModules[innerLowerModuleIndex];
         unsigned int nInnerSegments = nSegments[innerLowerModuleIndex];
@@ -2129,7 +2118,7 @@ SDL::segments* SDL::Event::getSegments()
 {
     if(segmentsInCPU == nullptr)
     {
-        segmentsInCPU = new SDL::segments;
+        segmentsInCPU = new SDL::segments(nTotalSegments, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE, devAcc, queue);
         
         segmentsInCPU->nSegments = new int[nLowerModules+1];
         cudaMemcpyAsync(segmentsInCPU->nSegments, segmentsInGPU->nSegments, (nLowerModules+1) * sizeof(int), cudaMemcpyDeviceToHost,stream);
diff --git a/SDL/Event.cuh b/SDL/Event.cuh
index dad2c933..52e14448 100644
--- a/SDL/Event.cuh
+++ b/SDL/Event.cuh
@@ -14,27 +14,15 @@
 
 #include "allocate.h"
 
-// Temporary alpaka statements
-using Dim = alpaka::DimInt<3u>;
-using Idx = std::size_t;
-using Vec = alpaka::Vec<Dim,Idx>;
-using QueueProperty = alpaka::NonBlocking;
-using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
-
 namespace SDL
 {
     class Event
     {
     private:
+        QueueAcc queue;
         cudaStream_t stream;
         bool addObjects;
 
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-        using Acc = alpaka::AccGpuCudaRt<Dim, Idx>;
-        using QueueAcc = alpaka::Queue<Acc, QueueProperty>;
-        QueueAcc queue;
-#endif
-
         std::array<unsigned int, 6> n_hits_by_layer_barrel_;
         std::array<unsigned int, 5> n_hits_by_layer_endcap_;
         std::array<unsigned int, 6> n_minidoublets_by_layer_barrel_;
diff --git a/SDL/Makefile b/SDL/Makefile
index a146a93d..c518ee68 100644
--- a/SDL/Makefile
+++ b/SDL/Makefile
@@ -20,7 +20,7 @@ LIB=libsdl.so
 CXX                  = nvcc
 CXXFLAGS             =  -g --compiler-options -Wall --compiler-options -Wshadow --compiler-options -Woverloaded-virtual --compiler-options -fPIC --compiler-options -fopenmp -dc -lineinfo --ptxas-options=-v --cudart shared -arch=compute_70 -I/mnt/data1/dsr/cub --use_fast_math --default-stream per-thread -I..
 ROOTCFLAGS           = --compiler-options -pthread --compiler-options -std=c++17 -m64 -I/cvmfs/cms.cern.ch/slc7_amd64_gcc900/cms/cmssw/CMSSW_11_2_0_pre5/external/slc7_amd64_gcc900/bin/../../../../../../../slc7_amd64_gcc900/lcg/root/6.20.06-ghbfee3/include
-ALPAKAINCLUDE        = -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include
+ALPAKAINCLUDE        = -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
 ALPAKAFLAGS          = -DALPAKA_ACC_GPU_CUDA_ENABLED -DALPAKA_ACC_GPU_CUDA_ONLY  --expt-relaxed-constexpr -DALPAKA_DEBUG=0
 LD                   = nvcc 
 SOFLAGS              = -g -shared --compiler-options -fPIC --cudart shared -arch=compute_70 -code=sm_72
@@ -45,7 +45,7 @@ CUTVALUEFLAG_FLAGS = -DCUT_VALUE_DEBUG
 	$(LD) -x cu $(PT0P8) $(PRELOAD) $(T3T3EXTENSION) $(CXXFLAGS) $(LDFLAGS) $(ROOTLIBS) $(MEMFLAG) $(PRINTFLAG) $(CACHEFLAG) $(CUDALAUNCHFLAG) $(CUTVALUEFLAG) $(DUPLICATES) $(ALPAKAINCLUDE) $(ALPAKAFLAGS) $< -o $@
 
 %_cpu.o : %.cc %.h
-	$(LD) -O2   $(PT0P8) $(PRELOAD) $(T3T3EXTENSION) $(CXXFLAGS) $(LDFLAGS) $(ROOTLIBS) $(MEMFLAG) $(PRINTFLAG) $(CACHEFLAG) $(CUDALAUNCHFLAG) $(DUPLICATES) $(ROOTCFLAGS) $(ALPAKAINCLUDE) $< -o $@
+	$(LD) -O0   $(PT0P8) $(PRELOAD) $(T3T3EXTENSION) $(CXXFLAGS) $(LDFLAGS) $(ROOTLIBS) $(MEMFLAG) $(PRINTFLAG) $(CACHEFLAG) $(CUDALAUNCHFLAG) $(DUPLICATES) $(ROOTCFLAGS) $(ALPAKAINCLUDE) $< -o $@
 
 $(LIB):$(CCOBJECTS) $(CUOBJECTS)
 #$(LIB):$(CUOBJECTS)
diff --git a/SDL/Segment.cu b/SDL/Segment.cu
deleted file mode 100644
index 3d5f38eb..00000000
--- a/SDL/Segment.cu
+++ /dev/null
@@ -1,179 +0,0 @@
-#include "Segment.cuh"
-
-///FIXME:NOTICE THE NEW maxPixelSegments!
-
-void SDL::createSegmentsInExplicitMemory(struct segments& segmentsInGPU, unsigned int nMemoryLocations, uint16_t nLowerModules, unsigned int maxPixelSegments, cudaStream_t stream)
-{
-    //FIXME:Since the number of pixel segments is 10x the number of regular segments per module, we need to provide
-    //extra memory to the pixel segments
-#ifdef CACHE_ALLOC
-    int dev;
-    cudaGetDevice(&dev);
-    segmentsInGPU.mdIndices = (unsigned int*)cms::cuda::allocate_device(dev,nMemoryLocations*4 *sizeof(unsigned int),stream);
-    segmentsInGPU.innerLowerModuleIndices = (uint16_t*)cms::cuda::allocate_device(dev,nMemoryLocations*2 *sizeof(uint16_t),stream);
-    segmentsInGPU.nSegments = (int*)cms::cuda::allocate_device(dev, (nLowerModules + 1) *sizeof(int),stream);
-    segmentsInGPU.totOccupancySegments = (int*)cms::cuda::allocate_device(dev, (nLowerModules + 1) *sizeof(unsigned int),stream);
-    segmentsInGPU.dPhis = (FPX*)cms::cuda::allocate_device(dev,nMemoryLocations*6 *sizeof(FPX),stream);
-    segmentsInGPU.ptIn = (float*)cms::cuda::allocate_device(dev, maxPixelSegments * 8 *sizeof(float),stream);
-    segmentsInGPU.superbin = (int*)cms::cuda::allocate_device(dev,(maxPixelSegments) *sizeof(int),stream);
-    segmentsInGPU.pixelType = (int8_t*)cms::cuda::allocate_device(dev,(maxPixelSegments) *sizeof(int8_t),stream);
-    segmentsInGPU.isQuad = (char*)cms::cuda::allocate_device(dev,(maxPixelSegments) *sizeof(char),stream);
-    segmentsInGPU.isDup = (bool*)cms::cuda::allocate_device(dev,(maxPixelSegments) *sizeof(bool),stream);
-    segmentsInGPU.score = (float*)cms::cuda::allocate_device(dev,(maxPixelSegments) *sizeof(float),stream);
-    segmentsInGPU.charge = (int*)cms::cuda::allocate_device(dev, maxPixelSegments * sizeof(int), stream);
-    segmentsInGPU.seedIdx = (unsigned int*)cms::cuda::allocate_device(dev, maxPixelSegments * sizeof(unsigned int), stream);
-    segmentsInGPU.circleCenterX = (float*)cms::cuda::allocate_device(dev, maxPixelSegments * sizeof(float), stream);
-    segmentsInGPU.circleCenterY = (float*)cms::cuda::allocate_device(dev, maxPixelSegments * sizeof(float), stream);
-    segmentsInGPU.circleRadius = (float*)cms::cuda::allocate_device(dev, maxPixelSegments * sizeof(float), stream);
-    segmentsInGPU.partOfPT5 = (bool*)cms::cuda::allocate_device(dev, maxPixelSegments * sizeof(bool), stream);
-    segmentsInGPU.pLSHitsIdxs = (uint4*)cms::cuda::allocate_device(dev, maxPixelSegments * sizeof(uint4), stream);
-    segmentsInGPU.nMemoryLocations = (unsigned int*)cms::cuda::allocate_device(dev, sizeof(unsigned int), stream);
-#else
-    cudaMalloc(&segmentsInGPU.mdIndices, nMemoryLocations * 4 * sizeof(unsigned int));
-    cudaMalloc(&segmentsInGPU.innerLowerModuleIndices, nMemoryLocations * 2 * sizeof(uint16_t));
-    cudaMalloc(&segmentsInGPU.nSegments, (nLowerModules + 1) * sizeof(int));
-    cudaMalloc(&segmentsInGPU.totOccupancySegments, (nLowerModules + 1) * sizeof(int));
-    cudaMalloc(&segmentsInGPU.dPhis, nMemoryLocations * 6 *sizeof(FPX));
-    cudaMalloc(&segmentsInGPU.ptIn, maxPixelSegments * 8*sizeof(float));
-    cudaMalloc(&segmentsInGPU.superbin, (maxPixelSegments )*sizeof(int));
-    cudaMalloc(&segmentsInGPU.pixelType, (maxPixelSegments )*sizeof(int8_t));
-    cudaMalloc(&segmentsInGPU.isQuad, (maxPixelSegments )*sizeof(char));
-    cudaMalloc(&segmentsInGPU.isDup, (maxPixelSegments )*sizeof(bool));
-    cudaMalloc(&segmentsInGPU.score, (maxPixelSegments )*sizeof(float));
-    cudaMalloc(&segmentsInGPU.charge, maxPixelSegments * sizeof(int));
-    cudaMalloc(&segmentsInGPU.seedIdx, maxPixelSegments * sizeof(unsigned int));
-    cudaMalloc(&segmentsInGPU.circleCenterX, maxPixelSegments * sizeof(float));
-    cudaMalloc(&segmentsInGPU.circleCenterY, maxPixelSegments * sizeof(float));
-    cudaMalloc(&segmentsInGPU.circleRadius, maxPixelSegments * sizeof(float));
-    cudaMalloc(&segmentsInGPU.partOfPT5, maxPixelSegments * sizeof(bool));
-    cudaMalloc(&segmentsInGPU.pLSHitsIdxs, maxPixelSegments * sizeof(uint4));
-    cudaMalloc(&segmentsInGPU.nMemoryLocations, sizeof(unsigned int));
-#endif
-    segmentsInGPU.outerLowerModuleIndices = segmentsInGPU.innerLowerModuleIndices + nMemoryLocations;
-    segmentsInGPU.innerMiniDoubletAnchorHitIndices = segmentsInGPU.mdIndices + nMemoryLocations * 2;
-    segmentsInGPU.outerMiniDoubletAnchorHitIndices = segmentsInGPU.mdIndices + nMemoryLocations * 3;
-
-    segmentsInGPU.dPhiMins = segmentsInGPU.dPhis + nMemoryLocations;
-    segmentsInGPU.dPhiMaxs = segmentsInGPU.dPhis + nMemoryLocations * 2;
-    segmentsInGPU.dPhiChanges = segmentsInGPU.dPhis + nMemoryLocations * 3;
-    segmentsInGPU.dPhiChangeMins = segmentsInGPU.dPhis + nMemoryLocations * 4;
-    segmentsInGPU.dPhiChangeMaxs = segmentsInGPU.dPhis + nMemoryLocations * 5;
-
-    segmentsInGPU.ptErr  = segmentsInGPU.ptIn + maxPixelSegments;
-    segmentsInGPU.px     = segmentsInGPU.ptIn + maxPixelSegments * 2;
-    segmentsInGPU.py     = segmentsInGPU.ptIn + maxPixelSegments * 3;
-    segmentsInGPU.pz     = segmentsInGPU.ptIn + maxPixelSegments * 4;
-    segmentsInGPU.etaErr = segmentsInGPU.ptIn + maxPixelSegments * 5;
-    segmentsInGPU.eta    = segmentsInGPU.ptIn + maxPixelSegments * 6;
-    segmentsInGPU.phi    = segmentsInGPU.ptIn + maxPixelSegments * 7;
-
-    cudaMemsetAsync(segmentsInGPU.nSegments,0, (nLowerModules + 1) * sizeof(int),stream);
-    cudaMemsetAsync(segmentsInGPU.totOccupancySegments,0, (nLowerModules + 1) * sizeof(int),stream);
-    cudaMemsetAsync(segmentsInGPU.partOfPT5, false, maxPixelSegments * sizeof(bool),stream);
-    cudaMemsetAsync(segmentsInGPU.pLSHitsIdxs, 0, maxPixelSegments * sizeof(uint4),stream);
-    cudaMemsetAsync(segmentsInGPU.nMemoryLocations, nMemoryLocations, sizeof(unsigned int), stream);
-    cudaStreamSynchronize(stream);
-}
-
-SDL::segments::segments()
-{
-    superbin = nullptr;
-    pixelType = nullptr;
-    isQuad = nullptr;
-    isDup = nullptr;
-    score = nullptr;
-    circleRadius = nullptr;
-    charge = nullptr;
-    seedIdx = nullptr;
-    circleCenterX = nullptr;
-    circleCenterY = nullptr;
-    mdIndices = nullptr;
-    innerLowerModuleIndices = nullptr;
-    outerLowerModuleIndices = nullptr;
-    innerMiniDoubletAnchorHitIndices = nullptr;
-    outerMiniDoubletAnchorHitIndices = nullptr;
-
-    nSegments = nullptr;
-    totOccupancySegments = nullptr;
-    dPhis = nullptr;
-    dPhiMins = nullptr;
-    dPhiMaxs = nullptr;
-    dPhiChanges = nullptr;
-    dPhiChangeMins = nullptr;
-    dPhiChangeMaxs = nullptr;
-    partOfPT5 = nullptr;
-    pLSHitsIdxs = nullptr;
-}
-
-SDL::segments::~segments()
-{
-}
-
-void SDL::segments::freeMemoryCache()
-{
-    int dev;
-    cudaGetDevice(&dev);
-    cms::cuda::free_device(dev,mdIndices);
-    cms::cuda::free_device(dev,innerLowerModuleIndices);
-    cms::cuda::free_device(dev,dPhis);
-    cms::cuda::free_device(dev,ptIn);
-    cms::cuda::free_device(dev,nSegments);
-    cms::cuda::free_device(dev,totOccupancySegments);
-    cms::cuda::free_device(dev, charge);
-    cms::cuda::free_device(dev, seedIdx);
-    cms::cuda::free_device(dev,superbin);
-    cms::cuda::free_device(dev,pixelType);
-    cms::cuda::free_device(dev,isQuad);
-    cms::cuda::free_device(dev,isDup);
-    cms::cuda::free_device(dev,score);
-    cms::cuda::free_device(dev, circleCenterX);
-    cms::cuda::free_device(dev, circleCenterY);
-    cms::cuda::free_device(dev, circleRadius);
-    cms::cuda::free_device(dev, partOfPT5);
-    cms::cuda::free_device(dev, pLSHitsIdxs);
-    cms::cuda::free_device(dev, nMemoryLocations);
-}
-
-void SDL::segments::freeMemory(cudaStream_t stream)
-{
-    cudaFree(mdIndices);
-    cudaFree(innerLowerModuleIndices);
-    cudaFree(nSegments);
-    cudaFree(totOccupancySegments);
-    cudaFree(dPhis);
-    cudaFree(ptIn);
-    cudaFree(superbin);
-    cudaFree(pixelType);
-    cudaFree(isQuad);
-    cudaFree(isDup);
-    cudaFree(score);
-    cudaFree(charge);
-    cudaFree(seedIdx);
-    cudaFree(circleCenterX);
-    cudaFree(circleCenterY);
-    cudaFree(circleRadius);
-    cudaFree(partOfPT5);
-    cudaFree(pLSHitsIdxs);
-    cudaFree(nMemoryLocations);
-}
-
-void SDL::printSegment(struct SDL::segments& segmentsInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::hits& hitsInGPU, struct SDL::modules& modulesInGPU, unsigned int segmentIndex)
-{
-    unsigned int innerMDIndex = segmentsInGPU.mdIndices[segmentIndex * 2];
-    unsigned int outerMDIndex = segmentsInGPU.mdIndices[segmentIndex * 2 + 1];
-    std::cout<<std::endl;
-    std::cout<<"sg_dPhiChange : "<<__H2F(segmentsInGPU.dPhiChanges[segmentIndex]) << std::endl<<std::endl;
-
-    std::cout << "Inner Mini-Doublet" << std::endl;
-    std::cout << "------------------------------" << std::endl;
-    {
-        IndentingOStreambuf indent(std::cout);
-        printMD(mdsInGPU, hitsInGPU, modulesInGPU, innerMDIndex);
-    }
-    std::cout<<std::endl<<" Outer Mini-Doublet" <<std::endl;
-    std::cout << "------------------------------" << std::endl;
-    {
-        IndentingOStreambuf indent(std::cout);
-        printMD(mdsInGPU, hitsInGPU, modulesInGPU, outerMDIndex);
-    }
-}
\ No newline at end of file
diff --git a/SDL/Segment.cuh b/SDL/Segment.cuh
index 0a9830cf..ff789a16 100644
--- a/SDL/Segment.cuh
+++ b/SDL/Segment.cuh
@@ -8,28 +8,75 @@
 #include "Module.cuh"
 #include "Hit.cuh"
 
+using uint4_Buf = alpaka::Buf<Acc, uint4, Dim1d, Idx>;
+
 namespace SDL
 {
     struct segments
     {
-        unsigned int* nMemoryLocations;
+        // Buffer objects for each member variable
+        FPX_Buf dPhis_buf;
+        FPX_Buf dPhiMins_buf;
+        FPX_Buf dPhiMaxs_buf;
+        FPX_Buf dPhiChanges_buf;
+        FPX_Buf dPhiChangeMins_buf;
+        FPX_Buf dPhiChangeMaxs_buf;
+
+        uint16_t_Buf innerLowerModuleIndices_buf;
+        uint16_t_Buf outerLowerModuleIndices_buf;
+
+        uint_Buf seedIdx_buf;
+        uint_Buf mdIndices_buf;
+        uint_Buf innerMiniDoubletAnchorHitIndices_buf;
+        uint_Buf outerMiniDoubletAnchorHitIndices_buf;
+        uint_Buf nMemoryLocations_buf;
+
+        int_Buf nSegments_buf;
+        int_Buf totOccupancySegments_buf;
+        int_Buf charge_buf;
+        int_Buf superbin_buf;
+
+        uint4_Buf pLSHitsIdxs_buf;
+
+        int8_t_Buf pixelType_buf;
+
+        char_Buf isQuad_buf;
+
+        bool_Buf isDup_buf;
+        bool_Buf partOfPT5_buf;
+
+        float_Buf ptIn_buf;
+        float_Buf ptErr_buf;
+        float_Buf px_buf;
+        float_Buf py_buf;
+        float_Buf pz_buf;
+        float_Buf etaErr_buf;
+        float_Buf eta_buf;
+        float_Buf phi_buf;
+        float_Buf score_buf;
+        float_Buf circleCenterX_buf;
+        float_Buf circleCenterY_buf;
+        float_Buf circleRadius_buf;
+
+        // Pointers towards the data of each buffer
+        FPX* dPhis;
+        FPX* dPhiMins;
+        FPX* dPhiMaxs;
+        FPX* dPhiChanges;
+        FPX* dPhiChangeMins;
+        FPX* dPhiChangeMaxs;
 
-        unsigned int* mdIndices;
         uint16_t* innerLowerModuleIndices;
         uint16_t* outerLowerModuleIndices;
+
+        unsigned int* mdIndices;
+        unsigned int* nMemoryLocations;
         unsigned int* innerMiniDoubletAnchorHitIndices;
         unsigned int* outerMiniDoubletAnchorHitIndices;
-        
+
         int* nSegments; //number of segments per inner lower module
         int* totOccupancySegments; //number of segments per inner lower module
-        FPX* dPhis;
-        FPX* dPhiMins;
-        FPX* dPhiMaxs;
-        FPX* dPhiChanges;
-        FPX* dPhiChangeMins;
-        FPX* dPhiChangeMaxs;
 
-        //not so optional pixel dudes
         float* ptIn;
         float* ptErr;
         float* px;
@@ -51,15 +98,91 @@ namespace SDL
         bool* partOfPT5;
         uint4* pLSHitsIdxs;
 
-        segments();
-        ~segments();
-
-        void freeMemory(cudaStream_t stream);
-        void freeMemoryCache();
+        // Allocates one device buffer per member, caches the raw pointers, zero-fills the counters and the
+        // partOfPT5 / pLSHitsIdxs arrays, stores nMemoryLocationsIn on the device, then waits on queue.
+        template<typename TAcc, typename TQueue>
+        segments(unsigned int nMemoryLocationsIn, uint16_t nLowerModules, unsigned int maxPixelSegments, TAcc const & devAcc, TQueue& queue) :
+            dPhis_buf(allocBufWrapper<FPX>(devAcc, nMemoryLocationsIn)), // initializers follow declaration order (-Wreorder)
+            dPhiMins_buf(allocBufWrapper<FPX>(devAcc, nMemoryLocationsIn)),
+            dPhiMaxs_buf(allocBufWrapper<FPX>(devAcc, nMemoryLocationsIn)),
+            dPhiChanges_buf(allocBufWrapper<FPX>(devAcc, nMemoryLocationsIn)),
+            dPhiChangeMins_buf(allocBufWrapper<FPX>(devAcc, nMemoryLocationsIn)),
+            dPhiChangeMaxs_buf(allocBufWrapper<FPX>(devAcc, nMemoryLocationsIn)),
+            innerLowerModuleIndices_buf(allocBufWrapper<uint16_t>(devAcc, nMemoryLocationsIn)),
+            outerLowerModuleIndices_buf(allocBufWrapper<uint16_t>(devAcc, nMemoryLocationsIn)),
+            seedIdx_buf(allocBufWrapper<unsigned int>(devAcc, maxPixelSegments)),
+            mdIndices_buf(allocBufWrapper<unsigned int>(devAcc, nMemoryLocationsIn*2)),
+            innerMiniDoubletAnchorHitIndices_buf(allocBufWrapper<unsigned int>(devAcc, nMemoryLocationsIn)),
+            outerMiniDoubletAnchorHitIndices_buf(allocBufWrapper<unsigned int>(devAcc, nMemoryLocationsIn)),
+            nMemoryLocations_buf(allocBufWrapper<unsigned int>(devAcc, 1)),
+            nSegments_buf(allocBufWrapper<int>(devAcc, nLowerModules + 1)),
+            totOccupancySegments_buf(allocBufWrapper<int>(devAcc, nLowerModules + 1)),
+            charge_buf(allocBufWrapper<int>(devAcc, maxPixelSegments)),
+            superbin_buf(allocBufWrapper<int>(devAcc, maxPixelSegments)),
+            pLSHitsIdxs_buf(allocBufWrapper<uint4>(devAcc, maxPixelSegments)),
+            pixelType_buf(allocBufWrapper<int8_t>(devAcc, maxPixelSegments)),
+            isQuad_buf(allocBufWrapper<char>(devAcc, maxPixelSegments)),
+            isDup_buf(allocBufWrapper<bool>(devAcc, maxPixelSegments)),
+            partOfPT5_buf(allocBufWrapper<bool>(devAcc, maxPixelSegments)),
+            ptIn_buf(allocBufWrapper<float>(devAcc, maxPixelSegments)),
+            ptErr_buf(allocBufWrapper<float>(devAcc, maxPixelSegments)),
+            px_buf(allocBufWrapper<float>(devAcc, maxPixelSegments)),
+            py_buf(allocBufWrapper<float>(devAcc, maxPixelSegments)),
+            pz_buf(allocBufWrapper<float>(devAcc, maxPixelSegments)),
+            etaErr_buf(allocBufWrapper<float>(devAcc, maxPixelSegments)),
+            eta_buf(allocBufWrapper<float>(devAcc, maxPixelSegments)),
+            phi_buf(allocBufWrapper<float>(devAcc, maxPixelSegments)),
+            score_buf(allocBufWrapper<float>(devAcc, maxPixelSegments)),
+            circleCenterX_buf(allocBufWrapper<float>(devAcc, maxPixelSegments)),
+            circleCenterY_buf(allocBufWrapper<float>(devAcc, maxPixelSegments)),
+            circleRadius_buf(allocBufWrapper<float>(devAcc, maxPixelSegments))
+        {
+            mdIndices = alpaka::getPtrNative(mdIndices_buf);
+            innerMiniDoubletAnchorHitIndices = alpaka::getPtrNative(innerMiniDoubletAnchorHitIndices_buf);
+            outerMiniDoubletAnchorHitIndices = alpaka::getPtrNative(outerMiniDoubletAnchorHitIndices_buf);
+            innerLowerModuleIndices = alpaka::getPtrNative(innerLowerModuleIndices_buf);
+            outerLowerModuleIndices = alpaka::getPtrNative(outerLowerModuleIndices_buf);
+            nSegments = alpaka::getPtrNative(nSegments_buf);
+            totOccupancySegments = alpaka::getPtrNative(totOccupancySegments_buf);
+            dPhis = alpaka::getPtrNative(dPhis_buf);
+            dPhiMins = alpaka::getPtrNative(dPhiMins_buf);
+            dPhiMaxs = alpaka::getPtrNative(dPhiMaxs_buf);
+            dPhiChanges = alpaka::getPtrNative(dPhiChanges_buf);
+            dPhiChangeMins = alpaka::getPtrNative(dPhiChangeMins_buf);
+            dPhiChangeMaxs = alpaka::getPtrNative(dPhiChangeMaxs_buf);
+            ptIn = alpaka::getPtrNative(ptIn_buf);
+            ptErr = alpaka::getPtrNative(ptErr_buf);
+            px = alpaka::getPtrNative(px_buf);
+            py = alpaka::getPtrNative(py_buf);
+            pz = alpaka::getPtrNative(pz_buf);
+            etaErr = alpaka::getPtrNative(etaErr_buf);
+            eta = alpaka::getPtrNative(eta_buf);
+            phi = alpaka::getPtrNative(phi_buf);
+            superbin = alpaka::getPtrNative(superbin_buf);
+            pixelType = alpaka::getPtrNative(pixelType_buf);
+            isQuad = alpaka::getPtrNative(isQuad_buf);
+            isDup = alpaka::getPtrNative(isDup_buf);
+            score = alpaka::getPtrNative(score_buf);
+            charge = alpaka::getPtrNative(charge_buf);
+            seedIdx = alpaka::getPtrNative(seedIdx_buf);
+            circleCenterX = alpaka::getPtrNative(circleCenterX_buf);
+            circleCenterY = alpaka::getPtrNative(circleCenterY_buf);
+            circleRadius = alpaka::getPtrNative(circleRadius_buf);
+            partOfPT5 = alpaka::getPtrNative(partOfPT5_buf);
+            pLSHitsIdxs = alpaka::getPtrNative(pLSHitsIdxs_buf);
+            nMemoryLocations = alpaka::getPtrNative(nMemoryLocations_buf);
+            alpaka::memset(queue, nSegments_buf, 0u, nLowerModules + 1);
+            alpaka::memset(queue, totOccupancySegments_buf, 0u, nLowerModules + 1);
+            alpaka::memset(queue, partOfPT5_buf, 0u, maxPixelSegments);
+            alpaka::memset(queue, pLSHitsIdxs_buf, 0u, maxPixelSegments);
+            // memset is a byte fill (fine for 0 above); the count is a full unsigned int, so copy it from a host view.
+            auto const cpuDev = alpaka::getDevByIdx<alpaka::PltfCpu>(0u);
+            alpaka::ViewPlainPtr<alpaka::DevCpu, unsigned int, Dim1d, Idx> nMemoryLocationsIn_view(&nMemoryLocationsIn, cpuDev, alpaka::Vec<Dim1d, Idx>(static_cast<Idx>(1u)));
+            alpaka::memcpy(queue, nMemoryLocations_buf, nMemoryLocationsIn_view, alpaka::Vec<Dim1d, Idx>(static_cast<Idx>(1u)));
+            alpaka::wait(queue); // also keeps the stack-resident copy source alive until the transfer is done
+        }
     };
 
-    void createSegmentsInExplicitMemory(struct segments& segmentsInGPU, unsigned int maxSegments, uint16_t nLowerModules, unsigned int maxPixelSegments,cudaStream_t stream);
-
     ALPAKA_FN_ACC ALPAKA_FN_INLINE float isTighterTiltedModules_seg(struct modules& modulesInGPU, unsigned int moduleIndex)
     {
         // The "tighter" tilted modules are the subset of tilted modules that have smaller spacing
@@ -563,8 +686,6 @@ namespace SDL
         }
     };
 
-    void printSegment(struct segments& segmentsInGPU, struct miniDoublets& mdsInGPU, struct hits& hitsInGPU, struct modules& modulesInGPU, unsigned int segmentIndex);
-
     struct createSegmentsInGPUv2
     {
         template<typename TAcc>

From 8ef1c923422563a607f822a459545ea63f623c02 Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Wed, 31 May 2023 15:21:23 -0400
Subject: [PATCH 03/44] temporary fix for ntuple writing

---
 SDL/Event.cu                  |  8 +++----
 SDL/Event.cuh                 |  4 ++--
 SDL/Segment.cuh               | 43 +++++++++++++++++++++++++++++++++++
 code/core/AccessHelper.cc     |  4 ++--
 code/core/write_sdl_ntuple.cc | 20 ++++++++--------
 5 files changed, 61 insertions(+), 18 deletions(-)

diff --git a/SDL/Event.cu b/SDL/Event.cu
index a46c1a9a..aba4381e 100644
--- a/SDL/Event.cu
+++ b/SDL/Event.cu
@@ -2114,15 +2114,15 @@ SDL::miniDoublets* SDL::Event::getMiniDoublets()
     return mdsInCPU;
 }
 
-SDL::segments* SDL::Event::getSegments()
+SDL::segments_temp* SDL::Event::getSegments()
 {
     if(segmentsInCPU == nullptr)
     {
-        segmentsInCPU = new SDL::segments(nTotalSegments, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE, devAcc, queue);
-        
+        segmentsInCPU = new SDL::segments_temp;
+
         segmentsInCPU->nSegments = new int[nLowerModules+1];
         cudaMemcpyAsync(segmentsInCPU->nSegments, segmentsInGPU->nSegments, (nLowerModules+1) * sizeof(int), cudaMemcpyDeviceToHost,stream);
-        
+
         segmentsInCPU->nMemoryLocations = new unsigned int;
         cudaMemcpyAsync(segmentsInCPU->nMemoryLocations, segmentsInGPU->nMemoryLocations, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream);
         cudaStreamSynchronize(stream);
diff --git a/SDL/Event.cuh b/SDL/Event.cuh
index 52e14448..38e6e38e 100644
--- a/SDL/Event.cuh
+++ b/SDL/Event.cuh
@@ -53,7 +53,7 @@ namespace SDL
         objectRanges* rangesInCPU;
         hits* hitsInCPU;
         miniDoublets* mdsInCPU;
-        segments* segmentsInCPU;
+        segments_temp* segmentsInCPU;
         triplets* tripletsInCPU;
         trackCandidates* trackCandidatesInCPU;
         modules* modulesInCPU;
@@ -133,7 +133,7 @@ namespace SDL
         hits* getHits();
         hits* getHitsInCMSSW();
         miniDoublets* getMiniDoublets();
-        segments* getSegments() ;
+        segments_temp* getSegments() ;
         triplets* getTriplets();
         quintuplets* getQuintuplets();
         trackCandidates* getTrackCandidates();
diff --git a/SDL/Segment.cuh b/SDL/Segment.cuh
index ff789a16..020ea55a 100644
--- a/SDL/Segment.cuh
+++ b/SDL/Segment.cuh
@@ -12,6 +12,49 @@ using uint4_Buf = alpaka::Buf<Acc, uint4, Dim1d, Idx>;
 
 namespace SDL
 {
+    // Temporary struct to handle ntuple writing: a plain host-side mirror of the pointer members of
+    // SDL::segments. Every pointer starts out null, so `new segments_temp` never yields garbage pointers.
+    struct segments_temp
+    {
+        unsigned int* nMemoryLocations = nullptr;
+        unsigned int* mdIndices = nullptr;
+        uint16_t* innerLowerModuleIndices = nullptr;
+        uint16_t* outerLowerModuleIndices = nullptr;
+        unsigned int* innerMiniDoubletAnchorHitIndices = nullptr;
+        unsigned int* outerMiniDoubletAnchorHitIndices = nullptr;
+
+        int* nSegments = nullptr; //number of segments per inner lower module
+        int* totOccupancySegments = nullptr; //number of segments per inner lower module
+        FPX* dPhis = nullptr;
+        FPX* dPhiMins = nullptr;
+        FPX* dPhiMaxs = nullptr;
+        FPX* dPhiChanges = nullptr;
+        FPX* dPhiChangeMins = nullptr;
+        FPX* dPhiChangeMaxs = nullptr;
+
+        // Pixel-segment quantities (the matching SDL::segments buffers are sized by maxPixelSegments)
+        float* ptIn = nullptr;
+        float* ptErr = nullptr;
+        float* px = nullptr;
+        float* py = nullptr;
+        float* pz = nullptr;
+        float* etaErr = nullptr;
+        float* eta = nullptr;
+        float* phi = nullptr;
+        int* charge = nullptr;
+        unsigned int* seedIdx = nullptr;
+        int* superbin = nullptr;
+        int8_t* pixelType = nullptr;
+        char* isQuad = nullptr;
+        bool* isDup = nullptr;
+        float* score = nullptr;
+        float* circleCenterX = nullptr;
+        float* circleCenterY = nullptr;
+        float* circleRadius = nullptr;
+        bool* partOfPT5 = nullptr;
+        uint4* pLSHitsIdxs = nullptr;
+    };
+
     struct segments
     {
         // Buffer objects for each member variable
diff --git a/code/core/AccessHelper.cc b/code/core/AccessHelper.cc
index 2c19eb84..c721316c 100644
--- a/code/core/AccessHelper.cc
+++ b/code/core/AccessHelper.cc
@@ -28,7 +28,7 @@ std::tuple<std::vector<unsigned int>, std::vector<unsigned int>> convertHitsToHi
 //____________________________________________________________________________________________
 std::vector<unsigned int> getPixelHitsFrompLS(SDL::Event* event, unsigned int pLS)
 {
-    SDL::segments& segments_ = *(event->getSegments());
+    SDL::segments_temp& segments_ = *(event->getSegments());
     SDL::miniDoublets& miniDoublets_ = *(event->getMiniDoublets());
     SDL::objectRanges& rangesInGPU = (*event->getRanges());
     SDL::modules& modulesInGPU = (*event->getModules());
@@ -96,7 +96,7 @@ std::tuple<std::vector<unsigned int>, std::vector<unsigned int>> getHitIdxsAndHi
 //____________________________________________________________________________________________
 std::vector<unsigned int> getMDsFromLS(SDL::Event* event, unsigned int LS)
 {
-    SDL::segments& segments_ = *(event->getSegments());
+    SDL::segments_temp& segments_ = *(event->getSegments());
     unsigned int MD_1 = segments_.mdIndices[2 * LS];
     unsigned int MD_2 = segments_.mdIndices[2 * LS + 1];
     return {MD_1, MD_2};
diff --git a/code/core/write_sdl_ntuple.cc b/code/core/write_sdl_ntuple.cc
index 824dd9db..da7ff47f 100644
--- a/code/core/write_sdl_ntuple.cc
+++ b/code/core/write_sdl_ntuple.cc
@@ -307,7 +307,7 @@ void setPixelQuintupletOutputBranches(SDL::Event* event)
     // ============ pT5 =============
     SDL::pixelQuintuplets& pixelQuintupletsInGPU = (*event->getPixelQuintuplets());
     SDL::quintuplets& quintupletsInGPU = (*event->getQuintuplets());
-    SDL::segments& segmentsInGPU = (*event->getSegments());
+    SDL::segments_temp& segmentsInGPU = (*event->getSegments());
     SDL::modules& modulesInGPU = (*event->getModules());
     int n_accepted_simtrk = ana.tx->getBranch<vector<int>>("sim_TC_matched").size();
 
@@ -476,7 +476,7 @@ void setPixelTripletOutputBranches(SDL::Event* event)
     SDL::pixelTriplets& pixelTripletsInGPU = (*event->getPixelTriplets());
     SDL::triplets& tripletsInGPU = *(event->getTriplets());
     SDL::modules& modulesInGPU = *(event->getModules());
-    SDL::segments& segmentsInGPU = *(event->getSegments());
+    SDL::segments_temp& segmentsInGPU = *(event->getSegments());
     SDL::hits& hitsInGPU = *(event->getHits());
     int n_accepted_simtrk = ana.tx->getBranch<vector<int>>("sim_TC_matched").size();
 
@@ -559,7 +559,7 @@ void setPixelTripletOutputBranches(SDL::Event* event)
 void setGnnNtupleBranches(SDL::Event* event)
 {
     // Get relevant information
-    SDL::segments& segmentsInGPU = (*event->getSegments());
+    SDL::segments_temp& segmentsInGPU = (*event->getSegments());
     SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
     SDL::hits& hitsInGPU = (*event->getHits());
     SDL::modules& modulesInGPU = (*event->getModules());
@@ -821,7 +821,7 @@ std::tuple<float, float, float, vector<unsigned int>, vector<unsigned int>> pars
     // Get relevant information
     SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates());
     SDL::triplets& tripletsInGPU = (*event->getTriplets());
-    SDL::segments& segmentsInGPU = (*event->getSegments());
+    SDL::segments_temp& segmentsInGPU = (*event->getSegments());
     SDL::hits& hitsInGPU = (*event->getHits());
 
     //
@@ -959,7 +959,7 @@ std::tuple<float, float, float, vector<unsigned int>, vector<unsigned int>> pars
     // Get relevant information
     SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates());
     SDL::triplets& tripletsInGPU = (*event->getTriplets());
-    SDL::segments& segmentsInGPU = (*event->getSegments());
+    SDL::segments_temp& segmentsInGPU = (*event->getSegments());
     SDL::hits& hitsInGPU = (*event->getHits());
 
     //
@@ -1059,7 +1059,7 @@ std::tuple<float, float, float, vector<unsigned int>, vector<unsigned int>> pars
 std::tuple<float, float, float, vector<unsigned int>, vector<unsigned int>> parsepLS(SDL::Event* event, unsigned int idx)
 {
     SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates());
-    SDL::segments& segmentsInGPU = (*event->getSegments());
+    SDL::segments_temp& segmentsInGPU = (*event->getSegments());
 
     // Getting pLS index
     unsigned int pLS = trackCandidatesInGPU.directObjectIndices[idx];
@@ -1174,7 +1174,7 @@ void printMDs(SDL::Event* event)
 //________________________________________________________________________________________________________________________________
 void printLSs(SDL::Event* event)
 {
-    SDL::segments& segmentsInGPU = (*event->getSegments());
+    SDL::segments_temp& segmentsInGPU = (*event->getSegments());
     SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
     SDL::hits& hitsInGPU = (*event->getHits());
     SDL::modules& modulesInGPU = (*event->getModules());
@@ -1207,7 +1207,7 @@ void printLSs(SDL::Event* event)
 //________________________________________________________________________________________________________________________________
 void printpLSs(SDL::Event* event)
 {
-    SDL::segments& segmentsInGPU = (*event->getSegments());
+    SDL::segments_temp& segmentsInGPU = (*event->getSegments());
     SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
     SDL::hits& hitsInGPU = (*event->getHits());
     SDL::modules& modulesInGPU = (*event->getModules());
@@ -1238,7 +1238,7 @@ void printpLSs(SDL::Event* event)
 void printT3s(SDL::Event* event)
 {
     SDL::triplets& tripletsInGPU = (*event->getTriplets());
-    SDL::segments& segmentsInGPU = (*event->getSegments());
+    SDL::segments_temp& segmentsInGPU = (*event->getSegments());
     SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
     SDL::hits& hitsInGPU = (*event->getHits());
     SDL::modules& modulesInGPU = (*event->getModules());
@@ -1281,7 +1281,7 @@ void debugPrintOutlierMultiplicities(SDL::Event* event)
 {
     SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates());
     SDL::triplets& tripletsInGPU = (*event->getTriplets());
-    SDL::segments& segmentsInGPU = (*event->getSegments());
+    SDL::segments_temp& segmentsInGPU = (*event->getSegments());
     SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
     //SDL::hits& hitsInGPU = (*event->getHits());
     SDL::modules& modulesInGPU = (*event->getModules());

From 11372df3ca56f89e5cc4fe704485c47705fa66c0 Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Wed, 31 May 2023 15:53:00 -0400
Subject: [PATCH 04/44] remove unused print util files

---
 SDL/Hit.cuh               |  1 -
 SDL/MiniDoublet.cu        | 25 -------------------------
 SDL/MiniDoublet.cuh       |  2 --
 code/core/Hit.cc          | 10 ----------
 code/core/Hit.h           |  1 -
 {SDL => cpu}/PrintUtil.cc |  0
 {SDL => cpu}/PrintUtil.h  |  0
 7 files changed, 39 deletions(-)
 rename {SDL => cpu}/PrintUtil.cc (100%)
 rename {SDL => cpu}/PrintUtil.h (100%)

diff --git a/SDL/Hit.cuh b/SDL/Hit.cuh
index fe631b9e..12fc7bf2 100644
--- a/SDL/Hit.cuh
+++ b/SDL/Hit.cuh
@@ -6,7 +6,6 @@
 #include "Constants.cuh"
 #include "Module.cuh"
 #include "allocate.h"
-#include "PrintUtil.h"
 
 namespace SDL
 {
diff --git a/SDL/MiniDoublet.cu b/SDL/MiniDoublet.cu
index 3cfab8dd..3fd6d23a 100644
--- a/SDL/MiniDoublet.cu
+++ b/SDL/MiniDoublet.cu
@@ -137,28 +137,3 @@ void SDL::miniDoublets::freeMemory(cudaStream_t stream)
     cudaFree(outerHighEdgeX);
     cudaFree(nMemoryLocations);
 }
-
-void SDL::printMD(struct miniDoublets& mdsInGPU, struct hits& hitsInGPU, SDL::modules& modulesInGPU, unsigned int mdIndex)
-{
-    std::cout<<std::endl;
-    std::cout << "dz " << mdsInGPU.dzs[mdIndex] << std::endl;
-    std::cout << "dphi " << mdsInGPU.dphis[mdIndex] << std::endl;
-    std::cout << "dphinoshift " << mdsInGPU.noShiftedDphis[mdIndex] << std::endl;
-    std::cout << "dphichange " << mdsInGPU.dphichanges[mdIndex] << std::endl;
-    std::cout << "dphichangenoshift " << mdsInGPU.noShiftedDphiChanges[mdIndex] << std::endl;
-    std::cout << std::endl;
-    std::cout << "Anchor Hit " << std::endl;
-    std::cout << "------------------------------" << std::endl;
-    unsigned int lowerHitIndex = mdsInGPU.anchorHitIndices[mdIndex];
-    unsigned int upperHitIndex = mdsInGPU.outerHitIndices[mdIndex];
-    {
-        IndentingOStreambuf indent(std::cout);
-        printHit(hitsInGPU, modulesInGPU, lowerHitIndex);
-    }
-    std::cout << "Non-anchor Hit " << std::endl;
-    std::cout << "------------------------------" << std::endl;
-    {
-        IndentingOStreambuf indent(std::cout);
-        printHit(hitsInGPU, modulesInGPU, upperHitIndex);
-    }
-}
diff --git a/SDL/MiniDoublet.cuh b/SDL/MiniDoublet.cuh
index b8db05ca..4f136336 100644
--- a/SDL/MiniDoublet.cuh
+++ b/SDL/MiniDoublet.cuh
@@ -68,8 +68,6 @@ namespace SDL
 
     void createMDsInExplicitMemory(struct miniDoublets& mdsInGPU, unsigned int maxMDs,uint16_t nLowerModules, unsigned int maxPixelMDs,cudaStream_t stream);
 
-    void printMD(struct miniDoublets& mdsInGPU, struct hits& hitsInGPU, SDL::modules& modulesInGPU, unsigned int mdIndex);
-
     ALPAKA_FN_ACC ALPAKA_FN_INLINE void addMDToMemory(struct miniDoublets& mdsInGPU, struct hits& hitsInGPU, struct modules& modulesInGPU, unsigned int lowerHitIdx, unsigned int upperHitIdx, uint16_t& lowerModuleIdx, float dz, float dPhi, float dPhiChange, float shiftedX, float shiftedY, float shiftedZ, float noShiftedDz, float noShiftedDphi, float noShiftedDPhiChange, unsigned int idx)
     {
         //the index into which this MD needs to be written will be computed in the kernel
diff --git a/code/core/Hit.cc b/code/core/Hit.cc
index 1be51f19..019f2f90 100644
--- a/code/core/Hit.cc
+++ b/code/core/Hit.cc
@@ -120,19 +120,11 @@ const int& SDL::CPU::Hit::idx() const
 // Set the boundary hits where the hits are shifted
 const SDL::CPU::Hit* SDL::CPU::Hit::getHitHighEdgePtr() const
 {
-    if (not hit_high_edge_)
-    {
-        SDL::CPU::cout << "Error:: hit_high_edge_ does not exist but was asked" << std::endl;
-    }
     return hit_high_edge_;
 }
 
 const SDL::CPU::Hit* SDL::CPU::Hit::getHitLowEdgePtr() const
 {
-    if (not hit_low_edge_)
-    {
-        SDL::CPU::cout << "Error:: hit_low_edge_ does not exist but was asked" << std::endl;
-    }
     return hit_low_edge_;
 }
 
@@ -164,8 +156,6 @@ float SDL::CPU::Hit::deltaPhiChange(const SDL::CPU::Hit& hit) const
 
 bool SDL::CPU::Hit::isIdxMatched(const SDL::CPU::Hit& hit) const
 {
-    if (idx() == -1)
-        SDL::CPU::cout << "Warning:: SDL::CPU::Hit::isIdxMatched() idx of this hit is not set. Cannot perform a match." << std::endl;
     if (hit.idx() == idx())
         return true;
     return false;
diff --git a/code/core/Hit.h b/code/core/Hit.h
index c7fa5dc4..617050d3 100644
--- a/code/core/Hit.h
+++ b/code/core/Hit.h
@@ -6,7 +6,6 @@
 #include <vector>
 
 #include "MathUtil.h"
-#include "PrintUtil.h"
 
 namespace SDL
 {
diff --git a/SDL/PrintUtil.cc b/cpu/PrintUtil.cc
similarity index 100%
rename from SDL/PrintUtil.cc
rename to cpu/PrintUtil.cc
diff --git a/SDL/PrintUtil.h b/cpu/PrintUtil.h
similarity index 100%
rename from SDL/PrintUtil.h
rename to cpu/PrintUtil.h

From b937556b9920bf19c9671c87f764b9cae778a590 Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Wed, 31 May 2023 18:07:05 -0400
Subject: [PATCH 05/44] move remaining buffers to wrapper

---
 SDL/Event.cu | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/SDL/Event.cu b/SDL/Event.cu
index aba4381e..e17c19b0 100644
--- a/SDL/Event.cu
+++ b/SDL/Event.cu
@@ -741,7 +741,7 @@ struct addPixelSegmentToEventKernel
 
 void SDL::Event::addPixelSegmentToEvent(std::vector<unsigned int> hitIndices0,std::vector<unsigned int> hitIndices1,std::vector<unsigned int> hitIndices2,std::vector<unsigned int> hitIndices3, std::vector<float> dPhiChange, std::vector<float> ptIn, std::vector<float> ptErr, std::vector<float> px, std::vector<float> py, std::vector<float> pz, std::vector<float> eta, std::vector<float> etaErr, std::vector<float> phi, std::vector<int> charge, std::vector<unsigned int> seedIdx, std::vector<int> superbin, std::vector<int8_t> pixelType, std::vector<char> isQuad)
 {
-    int size = ptIn.size();
+    const int size = ptIn.size();
     unsigned int mdSize = 2 * size;
     uint16_t pixelModuleIndex = (*detIdToIndex)[1];
 
@@ -804,13 +804,11 @@ void SDL::Event::addPixelSegmentToEvent(std::vector<unsigned int> hitIndices0,st
         cudaStreamSynchronize(stream);
     }
 
-    alpaka::Vec<Dim1d, Idx> const extent(static_cast<Idx>(size));
-
-    auto hitIndices0_dev = alpaka::allocBuf<unsigned int, Idx>(devAcc, extent);
-    auto hitIndices1_dev = alpaka::allocBuf<unsigned int, Idx>(devAcc, extent);
-    auto hitIndices2_dev = alpaka::allocBuf<unsigned int, Idx>(devAcc, extent);
-    auto hitIndices3_dev = alpaka::allocBuf<unsigned int, Idx>(devAcc, extent);
-    auto dPhiChange_dev = alpaka::allocBuf<float, Idx>(devAcc, extent);
+    auto hitIndices0_dev = allocBufWrapper<unsigned int>(devAcc, size);
+    auto hitIndices1_dev = allocBufWrapper<unsigned int>(devAcc, size);
+    auto hitIndices2_dev = allocBufWrapper<unsigned int>(devAcc, size);
+    auto hitIndices3_dev = allocBufWrapper<unsigned int>(devAcc, size);
+    auto dPhiChange_dev = allocBufWrapper<float>(devAcc, size);
 
     alpaka::memcpy(queue, hitIndices0_dev, hitIndices0, size);
     alpaka::memcpy(queue, hitIndices1_dev, hitIndices1, size);
@@ -2118,6 +2116,7 @@ SDL::segments_temp* SDL::Event::getSegments()
 {
     if(segmentsInCPU == nullptr)
     {
+        std::cout << "run" << std::endl;
         segmentsInCPU = new SDL::segments_temp;
 
         segmentsInCPU->nSegments = new int[nLowerModules+1];

From a68cdb91fc510f3fbb96ecc07a08f11504612432 Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Wed, 31 May 2023 18:10:38 -0400
Subject: [PATCH 06/44] debug cleanup

---
 SDL/Event.cu | 1 -
 1 file changed, 1 deletion(-)

diff --git a/SDL/Event.cu b/SDL/Event.cu
index e17c19b0..df31dbed 100644
--- a/SDL/Event.cu
+++ b/SDL/Event.cu
@@ -2116,7 +2116,6 @@ SDL::segments_temp* SDL::Event::getSegments()
 {
     if(segmentsInCPU == nullptr)
     {
-        std::cout << "run" << std::endl;
         segmentsInCPU = new SDL::segments_temp;
 
         segmentsInCPU->nSegments = new int[nLowerModules+1];

From 0286d4340b9973bdd391c36bef84f8ca27e9567a Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Wed, 31 May 2023 18:13:40 -0400
Subject: [PATCH 07/44] more debug removal

---
 Makefile     | 2 +-
 SDL/Makefile | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index e4f18272..a49be49e 100644
--- a/Makefile
+++ b/Makefile
@@ -21,7 +21,7 @@ ROOTLIBS    = $(shell root-config --libs)
 ROOTCFLAGS  = $(foreach option, $(shell root-config --cflags), --compiler-options $(option))
 ALPAKAINCLUDE = -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
 ALPAKAFLAGS = -DALPAKA_ACC_GPU_CUDA_ENABLED -DALPAKA_ACC_GPU_CUDA_ONLY  --expt-relaxed-constexpr -DALPAKA_DEBUG=0
-CFLAGS      = $(ROOTCFLAGS) --compiler-options -Wall --compiler-options -Wno-unused-function --compiler-options -g --compiler-options -O0 --compiler-options -fPIC --compiler-options -fno-var-tracking -ISDL -I$(shell pwd) -Icode  -Icode/core -I/mnt/data1/dsr/cub -I${CUDA_HOME}/include --compiler-options -fopenmp -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include
+CFLAGS      = $(ROOTCFLAGS) --compiler-options -Wall --compiler-options -Wno-unused-function --compiler-options -g --compiler-options -O2 --compiler-options -fPIC --compiler-options -fno-var-tracking -ISDL -I$(shell pwd) -Icode  -Icode/core -I/mnt/data1/dsr/cub -I${CUDA_HOME}/include --compiler-options -fopenmp -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include
 EXTRACFLAGS = $(shell rooutil-config)
 EXTRAFLAGS  = -fPIC -ITMultiDrawTreePlayer -Wunused-variable -lTMVA -lEG -lGenVector -lXMLIO -lMLP -lTreePlayer -L${CUDA_HOME}/lib64 -lcudart -fopenmp -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include
 DOQUINTUPLET = -DFP16_Base -DFP16_dPhi #-DFP16_circle -DFP16_seg -DFP16_T5 #-DDO_QUINTUPLET #-DDO_QUADRUPLET
diff --git a/SDL/Makefile b/SDL/Makefile
index c518ee68..abc9a160 100644
--- a/SDL/Makefile
+++ b/SDL/Makefile
@@ -45,7 +45,7 @@ CUTVALUEFLAG_FLAGS = -DCUT_VALUE_DEBUG
 	$(LD) -x cu $(PT0P8) $(PRELOAD) $(T3T3EXTENSION) $(CXXFLAGS) $(LDFLAGS) $(ROOTLIBS) $(MEMFLAG) $(PRINTFLAG) $(CACHEFLAG) $(CUDALAUNCHFLAG) $(CUTVALUEFLAG) $(DUPLICATES) $(ALPAKAINCLUDE) $(ALPAKAFLAGS) $< -o $@
 
 %_cpu.o : %.cc %.h
-	$(LD) -O0   $(PT0P8) $(PRELOAD) $(T3T3EXTENSION) $(CXXFLAGS) $(LDFLAGS) $(ROOTLIBS) $(MEMFLAG) $(PRINTFLAG) $(CACHEFLAG) $(CUDALAUNCHFLAG) $(DUPLICATES) $(ROOTCFLAGS) $(ALPAKAINCLUDE) $< -o $@
+	$(LD) -O2   $(PT0P8) $(PRELOAD) $(T3T3EXTENSION) $(CXXFLAGS) $(LDFLAGS) $(ROOTLIBS) $(MEMFLAG) $(PRINTFLAG) $(CACHEFLAG) $(CUDALAUNCHFLAG) $(DUPLICATES) $(ROOTCFLAGS) $(ALPAKAINCLUDE) $< -o $@
 
 $(LIB):$(CCOBJECTS) $(CUOBJECTS)
 #$(LIB):$(CUOBJECTS)

From fbccc37912a1e850135d75ba542f585f995cb40f Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Wed, 31 May 2023 19:45:45 -0400
Subject: [PATCH 08/44] generalize to host allocations

---
 SDL/Constants.cuh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/SDL/Constants.cuh b/SDL/Constants.cuh
index b45e45e4..8ce135d7 100644
--- a/SDL/Constants.cuh
+++ b/SDL/Constants.cuh
@@ -72,6 +72,7 @@ using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
     using Acc = alpaka::AccCpuSerial<Dim, Idx>;
 #endif
 
+auto const devHost = alpaka::getDevByIdx<alpaka::DevCpu>(0u);
 auto const devAcc = alpaka::getDevByIdx<Acc>(0u);
 using QueueAcc = alpaka::Queue<Acc, QueueProperty>;
 
@@ -91,7 +92,7 @@ using FPX_circle_Buf = alpaka::Buf<Acc, FPX_circle, Dim1d, Idx>;
 using FPX_seg_Buf = alpaka::Buf<Acc, FPX_seg, Dim1d, Idx>;
 
 template<typename T, typename TAcc, typename TSize>
-alpaka::Buf<Acc, T, Dim1d, Idx> inline allocBufWrapper(TAcc const & devAcc, TSize nElements) {
+alpaka::Buf<TAcc, T, Dim1d, Idx> inline allocBufWrapper(TAcc const & devAcc, TSize nElements) {
     return alpaka::allocBuf<T, Idx>(devAcc, alpaka::Vec<Dim1d, Idx>(static_cast<Idx>(nElements)));
 }
 

From 9a61b02ebf280a45d3a4af8586c2477a2280d382 Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Thu, 1 Jun 2023 11:30:26 -0400
Subject: [PATCH 09/44] templated buffer type

---
 SDL/Constants.cuh | 17 ++--------
 SDL/Segment.cuh   | 86 +++++++++++++++++++++++------------------------
 2 files changed, 45 insertions(+), 58 deletions(-)

diff --git a/SDL/Constants.cuh b/SDL/Constants.cuh
index 8ce135d7..07a6ba79 100644
--- a/SDL/Constants.cuh
+++ b/SDL/Constants.cuh
@@ -76,20 +76,9 @@ auto const devHost = alpaka::getDevByIdx<alpaka::DevCpu>(0u);
 auto const devAcc = alpaka::getDevByIdx<Acc>(0u);
 using QueueAcc = alpaka::Queue<Acc, QueueProperty>;
 
-// Typical Buffer types used in the code.
-using float_Buf = alpaka::Buf<Acc, float, Dim1d, Idx>;
-using int_Buf = alpaka::Buf<Acc, int, Dim1d, Idx>;
-using uint_Buf = alpaka::Buf<Acc, unsigned int, Dim1d, Idx>;
-using int8_t_Buf = alpaka::Buf<Acc, int8_t, Dim1d, Idx>;
-using uint16_t_Buf = alpaka::Buf<Acc, uint16_t, Dim1d, Idx>;
-using char_Buf = alpaka::Buf<Acc, char, Dim1d, Idx>;
-using bool_Buf = alpaka::Buf<Acc, bool, Dim1d, Idx>;
-
-using FPX_Buf = alpaka::Buf<Acc, FPX, Dim1d, Idx>;
-using FPX_T5_Buf = alpaka::Buf<Acc, FPX_T5, Dim1d, Idx>;
-using FPX_dPhi_Buf = alpaka::Buf<Acc, FPX_dPhi, Dim1d, Idx>;
-using FPX_circle_Buf = alpaka::Buf<Acc, FPX_circle, Dim1d, Idx>;
-using FPX_seg_Buf = alpaka::Buf<Acc, FPX_seg, Dim1d, Idx>;
+// Buffer type for allocations where auto type can't be used.
+template<typename TAcc, typename TData>
+using Buf = alpaka::Buf<TAcc, TData, Dim1d, Idx>;
 
 template<typename T, typename TAcc, typename TSize>
 alpaka::Buf<TAcc, T, Dim1d, Idx> inline allocBufWrapper(TAcc const & devAcc, TSize nElements) {
diff --git a/SDL/Segment.cuh b/SDL/Segment.cuh
index 020ea55a..2dd34f0e 100644
--- a/SDL/Segment.cuh
+++ b/SDL/Segment.cuh
@@ -8,8 +8,6 @@
 #include "Module.cuh"
 #include "Hit.cuh"
 
-using uint4_Buf = alpaka::Buf<Acc, uint4, Dim1d, Idx>;
-
 namespace SDL
 {
     // Temporary struct to handle ntuple writing
@@ -58,48 +56,48 @@ namespace SDL
     struct segments
     {
         // Buffer objects for each member variable
-        FPX_Buf dPhis_buf;
-        FPX_Buf dPhiMins_buf;
-        FPX_Buf dPhiMaxs_buf;
-        FPX_Buf dPhiChanges_buf;
-        FPX_Buf dPhiChangeMins_buf;
-        FPX_Buf dPhiChangeMaxs_buf;
-
-        uint16_t_Buf innerLowerModuleIndices_buf;
-        uint16_t_Buf outerLowerModuleIndices_buf;
-
-        uint_Buf seedIdx_buf;
-        uint_Buf mdIndices_buf;
-        uint_Buf innerMiniDoubletAnchorHitIndices_buf;
-        uint_Buf outerMiniDoubletAnchorHitIndices_buf;
-        uint_Buf nMemoryLocations_buf;
-
-        int_Buf nSegments_buf;
-        int_Buf totOccupancySegments_buf;
-        int_Buf charge_buf;
-        int_Buf superbin_buf;
-
-        uint4_Buf pLSHitsIdxs_buf;
-
-        int8_t_Buf pixelType_buf;
-
-        char_Buf isQuad_buf;
-
-        bool_Buf isDup_buf;
-        bool_Buf partOfPT5_buf;
-
-        float_Buf ptIn_buf;
-        float_Buf ptErr_buf;
-        float_Buf px_buf;
-        float_Buf py_buf;
-        float_Buf pz_buf;
-        float_Buf etaErr_buf;
-        float_Buf eta_buf;
-        float_Buf phi_buf;
-        float_Buf score_buf;
-        float_Buf circleCenterX_buf;
-        float_Buf circleCenterY_buf;
-        float_Buf circleRadius_buf;
+        Buf<Acc, FPX> dPhis_buf;
+        Buf<Acc, FPX> dPhiMins_buf;
+        Buf<Acc, FPX> dPhiMaxs_buf;
+        Buf<Acc, FPX> dPhiChanges_buf;
+        Buf<Acc, FPX> dPhiChangeMins_buf;
+        Buf<Acc, FPX> dPhiChangeMaxs_buf;
+
+        Buf<Acc, uint16_t> innerLowerModuleIndices_buf;
+        Buf<Acc, uint16_t> outerLowerModuleIndices_buf;
+
+        Buf<Acc, unsigned int> seedIdx_buf;
+        Buf<Acc, unsigned int> mdIndices_buf;
+        Buf<Acc, unsigned int> innerMiniDoubletAnchorHitIndices_buf;
+        Buf<Acc, unsigned int> outerMiniDoubletAnchorHitIndices_buf;
+        Buf<Acc, unsigned int> nMemoryLocations_buf;
+
+        Buf<Acc, int> nSegments_buf;
+        Buf<Acc, int> totOccupancySegments_buf;
+        Buf<Acc, int> charge_buf;
+        Buf<Acc, int> superbin_buf;
+
+        Buf<Acc, uint4> pLSHitsIdxs_buf; // Please ensure that the 'uint4' type is defined and available in your scope.
+
+        Buf<Acc, int8_t> pixelType_buf;
+
+        Buf<Acc, char> isQuad_buf;
+
+        Buf<Acc, bool> isDup_buf;
+        Buf<Acc, bool> partOfPT5_buf;
+
+        Buf<Acc, float> ptIn_buf;
+        Buf<Acc, float> ptErr_buf;
+        Buf<Acc, float> px_buf;
+        Buf<Acc, float> py_buf;
+        Buf<Acc, float> pz_buf;
+        Buf<Acc, float> etaErr_buf;
+        Buf<Acc, float> eta_buf;
+        Buf<Acc, float> phi_buf;
+        Buf<Acc, float> score_buf;
+        Buf<Acc, float> circleCenterX_buf;
+        Buf<Acc, float> circleCenterY_buf;
+        Buf<Acc, float> circleRadius_buf;
 
         // Pointers towards the data of each buffer
         FPX* dPhis;

From 96d3b8b1815d65c624ddd600d8f9ae508f9afed2 Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Thu, 1 Jun 2023 15:33:57 -0400
Subject: [PATCH 10/44] working ntuple writing with templated segments

---
 SDL/Constants.cuh             |   4 +-
 SDL/Event.cu                  |  79 ++++---------
 SDL/Event.cuh                 |   6 +-
 SDL/Kernels.cuh               |   5 +-
 SDL/PixelTriplet.cuh          |  26 +++--
 SDL/Quintuplet.cuh            |  15 +--
 SDL/Segment.cuh               | 209 ++++++++++++++--------------------
 SDL/TrackCandidate.cuh        |  13 ++-
 SDL/Triplet.cuh               |  33 +++---
 code/core/AccessHelper.cc     |   4 +-
 code/core/write_sdl_ntuple.cc |  20 ++--
 11 files changed, 176 insertions(+), 238 deletions(-)

diff --git a/SDL/Constants.cuh b/SDL/Constants.cuh
index 07a6ba79..007e022c 100644
--- a/SDL/Constants.cuh
+++ b/SDL/Constants.cuh
@@ -81,8 +81,8 @@ template<typename TAcc, typename TData>
 using Buf = alpaka::Buf<TAcc, TData, Dim1d, Idx>;
 
 template<typename T, typename TAcc, typename TSize>
-alpaka::Buf<TAcc, T, Dim1d, Idx> inline allocBufWrapper(TAcc const & devAcc, TSize nElements) {
-    return alpaka::allocBuf<T, Idx>(devAcc, alpaka::Vec<Dim1d, Idx>(static_cast<Idx>(nElements)));
+Buf<TAcc, T> inline allocBufWrapper(TAcc const & devAccIn, TSize nElements) {
+    return alpaka::allocBuf<T, Idx>(devAccIn, Vec1d(static_cast<Idx>(nElements)));
 }
 
 const unsigned int MAX_BLOCKS = 80;
diff --git a/SDL/Event.cu b/SDL/Event.cu
index df31dbed..9db88f40 100644
--- a/SDL/Event.cu
+++ b/SDL/Event.cu
@@ -118,15 +118,6 @@ SDL::Event::~Event()
 
     if(segmentsInCPU != nullptr)
     {
-        delete[] segmentsInCPU->mdIndices;
-        delete[] segmentsInCPU->nSegments;
-        delete[] segmentsInCPU->totOccupancySegments;
-        delete[] segmentsInCPU->innerMiniDoubletAnchorHitIndices;
-        delete[] segmentsInCPU->outerMiniDoubletAnchorHitIndices;
-        delete[] segmentsInCPU->ptIn;
-        delete[] segmentsInCPU->eta;
-        delete[] segmentsInCPU->phi;
-        delete segmentsInCPU->nMemoryLocations;
         delete segmentsInCPU;
     }
 
@@ -345,14 +336,6 @@ void SDL::Event::resetEvent()
 
     if(segmentsInCPU != nullptr)
     {
-        delete[] segmentsInCPU->mdIndices;
-        delete[] segmentsInCPU->nSegments;
-        delete[] segmentsInCPU->totOccupancySegments;
-        delete[] segmentsInCPU->innerMiniDoubletAnchorHitIndices;
-        delete[] segmentsInCPU->outerMiniDoubletAnchorHitIndices;
-        delete[] segmentsInCPU->ptIn;
-        delete[] segmentsInCPU->eta;
-        delete[] segmentsInCPU->phi;
         delete segmentsInCPU;
         segmentsInCPU = nullptr;
     }
@@ -698,7 +681,7 @@ struct addPixelSegmentToEventKernel
         struct SDL::objectRanges& rangesInGPU,
         struct SDL::hits& hitsInGPU,
         struct SDL::miniDoublets& mdsInGPU,
-        struct SDL::segments& segmentsInGPU,
+        SDL::segments<TAcc>& segmentsInGPU,
         unsigned int* hitIndices0,
         unsigned int* hitIndices1,
         unsigned int* hitIndices2,
@@ -798,7 +781,7 @@ void SDL::Event::addPixelSegmentToEvent(std::vector<unsigned int> hitIndices0,st
         cudaStreamSynchronize(stream);
         nTotalSegments += N_MAX_PIXEL_SEGMENTS_PER_MODULE;
 
-        segmentsInGPU = new SDL::segments(nTotalSegments, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE, devAcc, queue);
+        segmentsInGPU = new SDL::segments<Acc>(nTotalSegments, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE, devAcc, queue);
 
         cudaMemcpyAsync(segmentsInGPU->nMemoryLocations, &nTotalSegments, sizeof(unsigned int), cudaMemcpyHostToDevice, stream);;
         cudaStreamSynchronize(stream);
@@ -1043,7 +1026,7 @@ void SDL::Event::createSegmentsWithModuleMap()
 {
     if(segmentsInGPU == nullptr)
     {
-        segmentsInGPU = new SDL::segments(nTotalSegments, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE, devAcc, queue);
+        segmentsInGPU = new SDL::segments<Acc>(nTotalSegments, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE, devAcc, queue);
     }
 
     Vec const threadsPerBlockCreateSeg(static_cast<Idx>(1), static_cast<Idx>(1), static_cast<Idx>(64));
@@ -2112,44 +2095,32 @@ SDL::miniDoublets* SDL::Event::getMiniDoublets()
     return mdsInCPU;
 }
 
-SDL::segments_temp* SDL::Event::getSegments()
+SDL::segments<alpaka::DevCpu>* SDL::Event::getSegments()
 {
     if(segmentsInCPU == nullptr)
     {
-        segmentsInCPU = new SDL::segments_temp;
-
-        segmentsInCPU->nSegments = new int[nLowerModules+1];
-        cudaMemcpyAsync(segmentsInCPU->nSegments, segmentsInGPU->nSegments, (nLowerModules+1) * sizeof(int), cudaMemcpyDeviceToHost,stream);
-
-        segmentsInCPU->nMemoryLocations = new unsigned int;
-        cudaMemcpyAsync(segmentsInCPU->nMemoryLocations, segmentsInGPU->nMemoryLocations, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream);
-        cudaStreamSynchronize(stream);
+        // Get nMemoryLocations parameter to initilize host based segmentsInCPU
+        auto nMemLocal_buf = allocBufWrapper<unsigned int>(devHost, 1);
+        alpaka::memcpy(queue, nMemLocal_buf, segmentsInGPU->nMemoryLocations_buf, 1);
+        alpaka::wait(queue);
 
-        segmentsInCPU->mdIndices = new unsigned int[2 * *(segmentsInCPU->nMemoryLocations)];
-        segmentsInCPU->innerMiniDoubletAnchorHitIndices = new unsigned int[*(segmentsInCPU->nMemoryLocations)];
-        segmentsInCPU->outerMiniDoubletAnchorHitIndices = new unsigned int[*(segmentsInCPU->nMemoryLocations)];
-        segmentsInCPU->totOccupancySegments = new int[nLowerModules+1];
-
-        segmentsInCPU->ptIn = new float[N_MAX_PIXEL_SEGMENTS_PER_MODULE];
-        segmentsInCPU->eta = new float[N_MAX_PIXEL_SEGMENTS_PER_MODULE];
-        segmentsInCPU->phi = new float[N_MAX_PIXEL_SEGMENTS_PER_MODULE];
-        segmentsInCPU->seedIdx = new unsigned int[N_MAX_PIXEL_SEGMENTS_PER_MODULE];
-        segmentsInCPU->isDup = new bool[N_MAX_PIXEL_SEGMENTS_PER_MODULE];
-        segmentsInCPU->isQuad = new char[N_MAX_PIXEL_SEGMENTS_PER_MODULE];
-        segmentsInCPU->score = new float[N_MAX_PIXEL_SEGMENTS_PER_MODULE];
-
-        cudaMemcpyAsync(segmentsInCPU->mdIndices, segmentsInGPU->mdIndices, 2 * *(segmentsInCPU->nMemoryLocations) * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(segmentsInCPU->innerMiniDoubletAnchorHitIndices, segmentsInGPU->innerMiniDoubletAnchorHitIndices, *(segmentsInCPU->nMemoryLocations) * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(segmentsInCPU->outerMiniDoubletAnchorHitIndices, segmentsInGPU->outerMiniDoubletAnchorHitIndices, *(segmentsInCPU->nMemoryLocations) * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(segmentsInCPU->totOccupancySegments, segmentsInGPU->totOccupancySegments, (nLowerModules+1) * sizeof(int), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(segmentsInCPU->ptIn, segmentsInGPU->ptIn, N_MAX_PIXEL_SEGMENTS_PER_MODULE * sizeof(float), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(segmentsInCPU->eta, segmentsInGPU->eta, N_MAX_PIXEL_SEGMENTS_PER_MODULE * sizeof(float), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(segmentsInCPU->phi, segmentsInGPU->phi, N_MAX_PIXEL_SEGMENTS_PER_MODULE * sizeof(float), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(segmentsInCPU->seedIdx, segmentsInGPU->seedIdx, N_MAX_PIXEL_SEGMENTS_PER_MODULE * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(segmentsInCPU->isDup, segmentsInGPU->isDup, N_MAX_PIXEL_SEGMENTS_PER_MODULE * sizeof(bool), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(segmentsInCPU->isQuad, segmentsInGPU->isQuad, N_MAX_PIXEL_SEGMENTS_PER_MODULE * sizeof(char), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(segmentsInCPU->score, segmentsInGPU->score, N_MAX_PIXEL_SEGMENTS_PER_MODULE * sizeof(float), cudaMemcpyDeviceToHost,stream);
-        cudaStreamSynchronize(stream);
+        unsigned int nMemLocal = *alpaka::getPtrNative(nMemLocal_buf);
+        segmentsInCPU = new SDL::segments<alpaka::DevCpu>(nMemLocal, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE, devHost, queue);
+
+        *alpaka::getPtrNative(segmentsInCPU->nMemoryLocations_buf) = nMemLocal;
+        alpaka::memcpy(queue, segmentsInCPU->nSegments_buf, segmentsInGPU->nSegments_buf, (nLowerModules+1));
+        alpaka::memcpy(queue, segmentsInCPU->mdIndices_buf, segmentsInGPU->mdIndices_buf, 2 * nMemLocal);
+        alpaka::memcpy(queue, segmentsInCPU->innerMiniDoubletAnchorHitIndices_buf, segmentsInGPU->innerMiniDoubletAnchorHitIndices_buf, nMemLocal);
+        alpaka::memcpy(queue, segmentsInCPU->outerMiniDoubletAnchorHitIndices_buf, segmentsInGPU->outerMiniDoubletAnchorHitIndices_buf, nMemLocal);
+        alpaka::memcpy(queue, segmentsInCPU->totOccupancySegments_buf, segmentsInGPU->totOccupancySegments_buf, (nLowerModules+1));
+        alpaka::memcpy(queue, segmentsInCPU->ptIn_buf, segmentsInGPU->ptIn_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
+        alpaka::memcpy(queue, segmentsInCPU->eta_buf, segmentsInGPU->eta_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
+        alpaka::memcpy(queue, segmentsInCPU->phi_buf, segmentsInGPU->phi_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
+        alpaka::memcpy(queue, segmentsInCPU->seedIdx_buf, segmentsInGPU->seedIdx_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
+        alpaka::memcpy(queue, segmentsInCPU->isDup_buf, segmentsInGPU->isDup_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
+        alpaka::memcpy(queue, segmentsInCPU->isQuad_buf, segmentsInGPU->isQuad_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
+        alpaka::memcpy(queue, segmentsInCPU->score_buf, segmentsInGPU->score_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
+        alpaka::wait(queue);
     }
     return segmentsInCPU;
 }
diff --git a/SDL/Event.cuh b/SDL/Event.cuh
index 38e6e38e..f5b8327c 100644
--- a/SDL/Event.cuh
+++ b/SDL/Event.cuh
@@ -42,7 +42,7 @@ namespace SDL
         struct objectRanges* rangesInGPU;
         struct hits* hitsInGPU;
         struct miniDoublets* mdsInGPU;
-        struct segments* segmentsInGPU;
+        struct segments<Acc>* segmentsInGPU;
         struct triplets* tripletsInGPU;
         struct quintuplets* quintupletsInGPU;
         struct trackCandidates* trackCandidatesInGPU;
@@ -53,7 +53,7 @@ namespace SDL
         objectRanges* rangesInCPU;
         hits* hitsInCPU;
         miniDoublets* mdsInCPU;
-        segments_temp* segmentsInCPU;
+        segments<alpaka::DevCpu>* segmentsInCPU;
         triplets* tripletsInCPU;
         trackCandidates* trackCandidatesInCPU;
         modules* modulesInCPU;
@@ -133,7 +133,7 @@ namespace SDL
         hits* getHits();
         hits* getHitsInCMSSW();
         miniDoublets* getMiniDoublets();
-        segments_temp* getSegments() ;
+        segments<alpaka::DevCpu>* getSegments() ;
         triplets* getTriplets();
         quintuplets* getQuintuplets();
         trackCandidates* getTrackCandidates();
diff --git a/SDL/Kernels.cuh b/SDL/Kernels.cuh
index 8fd7d952..068e66f6 100644
--- a/SDL/Kernels.cuh
+++ b/SDL/Kernels.cuh
@@ -27,7 +27,8 @@ namespace SDL
         pixelQuintupletsInGPU.isDup[pixelQuintupletIndex] = 1;
     };
 
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE void rmPixelSegmentFromMemory(struct SDL::segments& segmentsInGPU, unsigned int pixelSegmentArrayIndex)
+    template<typename TAcc>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void rmPixelSegmentFromMemory(SDL::segments<TAcc>& segmentsInGPU, unsigned int pixelSegmentArrayIndex)
     {
         segmentsInGPU.isDup[pixelSegmentArrayIndex] = 1;
     };
@@ -452,7 +453,7 @@ namespace SDL
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
                 struct SDL::modules& modulesInGPU,
-                struct SDL::segments& segmentsInGPU,
+                SDL::segments<TAcc>& segmentsInGPU,
                 bool secondpass) const
         {
             using Dim = alpaka::Dim<TAcc>;
diff --git a/SDL/PixelTriplet.cuh b/SDL/PixelTriplet.cuh
index 0f884bae..d8d37fef 100644
--- a/SDL/PixelTriplet.cuh
+++ b/SDL/PixelTriplet.cuh
@@ -48,7 +48,8 @@ namespace SDL
 
     void createPixelTripletsInExplicitMemory(struct pixelTriplets& pixelTripletsinGPU, unsigned int maxPixelTriplets, cudaStream_t stream);
 
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, struct triplets& tripletsInGPU, struct pixelTriplets& pixelTripletsInGPU, unsigned int pixelSegmentIndex, unsigned int tripletIndex, float pixelRadius, float tripletRadius, float centerX, float centerY, float rPhiChiSquared, float rPhiChiSquaredInwards, float rzChiSquared, unsigned int pixelTripletIndex, float pt, float eta, float phi, float eta_pix, float phi_pix,float score)
+    template<typename TAcc>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, struct triplets& tripletsInGPU, struct pixelTriplets& pixelTripletsInGPU, unsigned int pixelSegmentIndex, unsigned int tripletIndex, float pixelRadius, float tripletRadius, float centerX, float centerY, float rPhiChiSquared, float rPhiChiSquaredInwards, float rzChiSquared, unsigned int pixelTripletIndex, float pt, float eta, float phi, float eta_pix, float phi_pix,float score)
     {
         pixelTripletsInGPU.pixelSegmentIndices[pixelTripletIndex] = pixelSegmentIndex;
         pixelTripletsInGPU.tripletIndices[pixelTripletIndex] = tripletIndex;
@@ -130,7 +131,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTrackletDefaultAlgopT3(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& pixelLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTrackletDefaultAlgopT3(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& pixelLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
     {
         zLo = -999;
         zHi = -999;
@@ -663,7 +664,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTripletDefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& pixelSegmentIndex, unsigned int tripletIndex, float& pixelRadius, float& pixelRadiusError, float& tripletRadius, float& centerX, float& centerY, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, bool runChiSquaredCuts = true)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTripletDefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& pixelSegmentIndex, unsigned int tripletIndex, float& pixelRadius, float& pixelRadiusError, float& tripletRadius, float& centerX, float& centerY, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, bool runChiSquaredCuts = true)
     {
         bool pass = true;
 
@@ -768,7 +769,7 @@ namespace SDL
                 struct SDL::modules& modulesInGPU,
                 struct SDL::objectRanges& rangesInGPU,
                 struct SDL::miniDoublets& mdsInGPU,
-                struct SDL::segments& segmentsInGPU,
+                SDL::segments<TAcc>& segmentsInGPU,
                 struct SDL::triplets& tripletsInGPU,
                 struct SDL::pixelTriplets& pixelTripletsInGPU,
                 unsigned int* connectedPixelSize,
@@ -911,7 +912,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPBB(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int thirdMDIndex, unsigned int& fourthMDIndex, float& /*z_OutLo*/, float& /*rt_OutLo*/, float& dPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaOutCut, float& deltaBetaCut) // pixel to BB and BE segments
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPBB(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int thirdMDIndex, unsigned int& fourthMDIndex, float& /*z_OutLo*/, float& /*rt_OutLo*/, float& dPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaOutCut, float& deltaBetaCut) // pixel to BB and BE segments
     {
         bool pass = true;
 
@@ -1124,7 +1125,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPEE(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU ,struct segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& /*z_OutLo*/, float& /*rt_OutLo*/, float& deltaPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) // pixel to EE segments
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPEE(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU ,SDL::segments<TAcc>& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& /*z_OutLo*/, float& /*rt_OutLo*/, float& deltaPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) // pixel to EE segments
     {
         bool pass = true;
         bool isPS_OutLo = (modulesInGPU.moduleType[outerInnerLowerModuleIndex] == SDL::PS);
@@ -1385,7 +1386,8 @@ namespace SDL
 
     void createPixelQuintupletsInExplicitMemory(struct SDL::pixelQuintuplets& pixelQuintupletsInGPU, unsigned int maxPixelQuintuplets, cudaStream_t stream);
 
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelQuintupletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, struct quintuplets& quintupletsInGPU, struct pixelQuintuplets& pixelQuintupletsInGPU, unsigned int pixelIndex, unsigned int T5Index, unsigned int pixelQuintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float score, float eta, float phi, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY)
+    template<typename TAcc>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelQuintupletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, struct quintuplets& quintupletsInGPU, struct pixelQuintuplets& pixelQuintupletsInGPU, unsigned int pixelIndex, unsigned int T5Index, unsigned int pixelQuintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float score, float eta, float phi, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY)
     {
         pixelQuintupletsInGPU.pixelIndices[pixelQuintupletIndex] = pixelIndex;
         pixelQuintupletsInGPU.T5Indices[pixelQuintupletIndex] = T5Index;
@@ -1966,7 +1968,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelQuintupletDefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, struct triplets& tripletsInGPU, struct quintuplets& quintupletsInGPU, unsigned int& pixelSegmentIndex, unsigned int& quintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY, unsigned int pixelSegmentArrayIndex)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelQuintupletDefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, struct triplets& tripletsInGPU, struct quintuplets& quintupletsInGPU, unsigned int& pixelSegmentIndex, unsigned int& quintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY, unsigned int pixelSegmentArrayIndex)
     {
         bool pass = true;
 
@@ -2100,7 +2102,7 @@ namespace SDL
                 TAcc const & acc,
                 struct SDL::modules& modulesInGPU,
                 struct SDL::miniDoublets& mdsInGPU,
-                struct SDL::segments& segmentsInGPU,
+                SDL::segments<TAcc>& segmentsInGPU,
                 struct SDL::triplets& tripletsInGPU,
                 struct SDL::quintuplets& quintupletsInGPU,
                 struct SDL::pixelQuintuplets& pixelQuintupletsInGPU,
@@ -2226,7 +2228,7 @@ namespace SDL
     };
  
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgoPPBB(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int thirdMDIndex, unsigned int& fourthMDIndex, float& z_OutLo, float& rt_OutLo, float& dPhiPos, float& dPhi, float& betaIn,  float& betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaOutCut, float& deltaBetaCut) // pixel to BB and BE segments
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgoPPBB(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int thirdMDIndex, unsigned int& fourthMDIndex, float& z_OutLo, float& rt_OutLo, float& dPhiPos, float& dPhi, float& betaIn,  float& betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaOutCut, float& deltaBetaCut) // pixel to BB and BE segments
     {
         bool pass = true;
 
@@ -2433,7 +2435,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgoPPEE(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU ,struct segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& z_OutLo, float& rt_OutLo, float& deltaPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) // pixel to EE segments
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgoPPEE(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU ,SDL::segments<TAcc>& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& z_OutLo, float& rt_OutLo, float& deltaPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) // pixel to EE segments
     {
         bool pass = true;
         bool isPS_OutLo = (modulesInGPU.moduleType[outerInnerLowerModuleIndex] == SDL::PS);
@@ -2648,7 +2650,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& pixelLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& pixelLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
     {
         zLo = -999;
         zHi = -999;
diff --git a/SDL/Quintuplet.cuh b/SDL/Quintuplet.cuh
index e6384152..1a34d763 100644
--- a/SDL/Quintuplet.cuh
+++ b/SDL/Quintuplet.cuh
@@ -644,7 +644,8 @@ namespace SDL
         return true;
     };
 
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool T5HasCommonMiniDoublet(struct SDL::triplets& tripletsInGPU, struct SDL::segments& segmentsInGPU, unsigned int innerTripletIndex, unsigned int outerTripletIndex)
+    template<typename TAcc>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool T5HasCommonMiniDoublet(struct SDL::triplets& tripletsInGPU, SDL::segments<TAcc>& segmentsInGPU, unsigned int innerTripletIndex, unsigned int outerTripletIndex)
     {
         unsigned int innerOuterSegmentIndex = tripletsInGPU.segmentIndices[2 * innerTripletIndex + 1];
         unsigned int outerInnerSegmentIndex = tripletsInGPU.segmentIndices[2 * outerTripletIndex];
@@ -1204,7 +1205,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoBBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoBBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut)
     {
         bool pass = true;
 
@@ -1397,7 +1398,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoBBEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoBBEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
     {
         bool pass = true;
         bool isPS_InLo = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS);
@@ -1608,7 +1609,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoEEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoEEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
     {
         bool pass = true;
 
@@ -1815,7 +1816,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletAlgoSelector(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletAlgoSelector(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
     {
         bool pass = false;
 
@@ -1873,7 +1874,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct SDL::triplets& tripletsInGPU, uint16_t& lowerModuleIndex1, uint16_t& lowerModuleIndex2, uint16_t& lowerModuleIndex3, uint16_t& lowerModuleIndex4, uint16_t& lowerModuleIndex5, unsigned int& innerTripletIndex, unsigned int& outerTripletIndex, float& innerRadius, float& outerRadius, float& bridgeRadius, float& regressionG, float& regressionF, float& regressionRadius, float& rzChiSquared, float& chiSquared, float& nonAnchorChiSquared, bool& TightCutFlag)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, struct SDL::triplets& tripletsInGPU, uint16_t& lowerModuleIndex1, uint16_t& lowerModuleIndex2, uint16_t& lowerModuleIndex3, uint16_t& lowerModuleIndex4, uint16_t& lowerModuleIndex5, unsigned int& innerTripletIndex, unsigned int& outerTripletIndex, float& innerRadius, float& outerRadius, float& bridgeRadius, float& regressionG, float& regressionF, float& regressionRadius, float& rzChiSquared, float& chiSquared, float& nonAnchorChiSquared, bool& TightCutFlag)
     {
         bool pass = true;
         unsigned int firstSegmentIndex = tripletsInGPU.segmentIndices[2 * innerTripletIndex];
@@ -2077,7 +2078,7 @@ namespace SDL
                 TAcc const & acc,
                 struct SDL::modules& modulesInGPU,
                 struct SDL::miniDoublets& mdsInGPU,
-                struct SDL::segments& segmentsInGPU,
+                SDL::segments<TAcc>& segmentsInGPU,
                 struct SDL::triplets& tripletsInGPU,
                 struct SDL::quintuplets& quintupletsInGPU,
                 struct SDL::objectRanges& rangesInGPU,
diff --git a/SDL/Segment.cuh b/SDL/Segment.cuh
index 2dd34f0e..f7e1104e 100644
--- a/SDL/Segment.cuh
+++ b/SDL/Segment.cuh
@@ -10,94 +10,52 @@
 
 namespace SDL
 {
-    // Temporary struct to handle ntuple writing
-    struct segments_temp
-    {
-        unsigned int* nMemoryLocations;
-
-        unsigned int* mdIndices;
-        uint16_t* innerLowerModuleIndices;
-        uint16_t* outerLowerModuleIndices;
-        unsigned int* innerMiniDoubletAnchorHitIndices;
-        unsigned int* outerMiniDoubletAnchorHitIndices;
-
-        int* nSegments; //number of segments per inner lower module
-        int* totOccupancySegments; //number of segments per inner lower module
-        FPX* dPhis;
-        FPX* dPhiMins;
-        FPX* dPhiMaxs;
-        FPX* dPhiChanges;
-        FPX* dPhiChangeMins;
-        FPX* dPhiChangeMaxs;
-
-        //not so optional pixel dudes
-        float* ptIn;
-        float* ptErr;
-        float* px;
-        float* py;
-        float* pz;
-        float* etaErr;
-        float* eta;
-        float* phi;
-        int* charge;
-        unsigned int* seedIdx;
-        int* superbin;
-        int8_t* pixelType;
-        char* isQuad;
-        bool* isDup;
-        float* score;
-        float* circleCenterX;
-        float* circleCenterY;
-        float* circleRadius;
-        bool* partOfPT5;
-        uint4* pLSHitsIdxs;
-    };
-
+    template<typename TAcc>
     struct segments
     {
         // Buffer objects for each member variable
-        Buf<Acc, FPX> dPhis_buf;
-        Buf<Acc, FPX> dPhiMins_buf;
-        Buf<Acc, FPX> dPhiMaxs_buf;
-        Buf<Acc, FPX> dPhiChanges_buf;
-        Buf<Acc, FPX> dPhiChangeMins_buf;
-        Buf<Acc, FPX> dPhiChangeMaxs_buf;
-
-        Buf<Acc, uint16_t> innerLowerModuleIndices_buf;
-        Buf<Acc, uint16_t> outerLowerModuleIndices_buf;
-
-        Buf<Acc, unsigned int> seedIdx_buf;
-        Buf<Acc, unsigned int> mdIndices_buf;
-        Buf<Acc, unsigned int> innerMiniDoubletAnchorHitIndices_buf;
-        Buf<Acc, unsigned int> outerMiniDoubletAnchorHitIndices_buf;
-        Buf<Acc, unsigned int> nMemoryLocations_buf;
-
-        Buf<Acc, int> nSegments_buf;
-        Buf<Acc, int> totOccupancySegments_buf;
-        Buf<Acc, int> charge_buf;
-        Buf<Acc, int> superbin_buf;
-
-        Buf<Acc, uint4> pLSHitsIdxs_buf; // Please ensure that the 'uint4' type is defined and available in your scope.
-
-        Buf<Acc, int8_t> pixelType_buf;
-
-        Buf<Acc, char> isQuad_buf;
-
-        Buf<Acc, bool> isDup_buf;
-        Buf<Acc, bool> partOfPT5_buf;
-
-        Buf<Acc, float> ptIn_buf;
-        Buf<Acc, float> ptErr_buf;
-        Buf<Acc, float> px_buf;
-        Buf<Acc, float> py_buf;
-        Buf<Acc, float> pz_buf;
-        Buf<Acc, float> etaErr_buf;
-        Buf<Acc, float> eta_buf;
-        Buf<Acc, float> phi_buf;
-        Buf<Acc, float> score_buf;
-        Buf<Acc, float> circleCenterX_buf;
-        Buf<Acc, float> circleCenterY_buf;
-        Buf<Acc, float> circleRadius_buf;
+        Buf<TAcc, FPX> dPhis_buf;
+        Buf<TAcc, FPX> dPhiMins_buf;
+        Buf<TAcc, FPX> dPhiMaxs_buf;
+        Buf<TAcc, FPX> dPhiChanges_buf;
+        Buf<TAcc, FPX> dPhiChangeMins_buf;
+        Buf<TAcc, FPX> dPhiChangeMaxs_buf;
+
+        Buf<TAcc, uint16_t> innerLowerModuleIndices_buf;
+        Buf<TAcc, uint16_t> outerLowerModuleIndices_buf;
+
+        Buf<TAcc, unsigned int> seedIdx_buf;
+        Buf<TAcc, unsigned int> mdIndices_buf;
+        Buf<TAcc, unsigned int> innerMiniDoubletAnchorHitIndices_buf;
+        Buf<TAcc, unsigned int> outerMiniDoubletAnchorHitIndices_buf;
+        Buf<TAcc, unsigned int> nMemoryLocations_buf;
+
+        Buf<TAcc, int> nSegments_buf;
+        Buf<TAcc, int> totOccupancySegments_buf;
+        Buf<TAcc, int> charge_buf;
+        Buf<TAcc, int> superbin_buf;
+
+        Buf<TAcc, uint4> pLSHitsIdxs_buf;
+
+        Buf<TAcc, int8_t> pixelType_buf;
+
+        Buf<TAcc, char> isQuad_buf;
+
+        Buf<TAcc, bool> isDup_buf;
+        Buf<TAcc, bool> partOfPT5_buf;
+
+        Buf<TAcc, float> ptIn_buf;
+        Buf<TAcc, float> ptErr_buf;
+        Buf<TAcc, float> px_buf;
+        Buf<TAcc, float> py_buf;
+        Buf<TAcc, float> pz_buf;
+        Buf<TAcc, float> etaErr_buf;
+        Buf<TAcc, float> eta_buf;
+        Buf<TAcc, float> phi_buf;
+        Buf<TAcc, float> score_buf;
+        Buf<TAcc, float> circleCenterX_buf;
+        Buf<TAcc, float> circleCenterY_buf;
+        Buf<TAcc, float> circleRadius_buf;
 
         // Pointers towards the data of each buffer
         FPX* dPhis;
@@ -139,46 +97,46 @@ namespace SDL
         bool* partOfPT5;
         uint4* pLSHitsIdxs;
 
-        template<typename TAcc, typename TQueue>
+        template<typename TQueue, typename TDevAcc>
         segments(unsigned int nMemoryLocationsIn,
                     uint16_t nLowerModules,
                     unsigned int maxPixelSegments,
-                    TAcc const & devAcc,
+                    TDevAcc const & devAccIn,
                     TQueue& queue) :
-            mdIndices_buf(allocBufWrapper<unsigned int>(devAcc, nMemoryLocationsIn*2)),
-            innerMiniDoubletAnchorHitIndices_buf(allocBufWrapper<unsigned int>(devAcc, nMemoryLocationsIn)),
-            outerMiniDoubletAnchorHitIndices_buf(allocBufWrapper<unsigned int>(devAcc, nMemoryLocationsIn)),
-            innerLowerModuleIndices_buf(allocBufWrapper<uint16_t>(devAcc, nMemoryLocationsIn)),
-            outerLowerModuleIndices_buf(allocBufWrapper<uint16_t>(devAcc, nMemoryLocationsIn)),
-            nSegments_buf(allocBufWrapper<int>(devAcc, nLowerModules + 1)),
-            totOccupancySegments_buf(allocBufWrapper<int>(devAcc, nLowerModules + 1)),
-            dPhis_buf(allocBufWrapper<FPX>(devAcc, nMemoryLocationsIn)),
-            dPhiMins_buf(allocBufWrapper<FPX>(devAcc, nMemoryLocationsIn)),
-            dPhiMaxs_buf(allocBufWrapper<FPX>(devAcc, nMemoryLocationsIn)),
-            dPhiChanges_buf(allocBufWrapper<FPX>(devAcc, nMemoryLocationsIn)),
-            dPhiChangeMins_buf(allocBufWrapper<FPX>(devAcc, nMemoryLocationsIn)),
-            dPhiChangeMaxs_buf(allocBufWrapper<FPX>(devAcc, nMemoryLocationsIn)),
-            ptIn_buf(allocBufWrapper<float>(devAcc, maxPixelSegments)),
-            ptErr_buf(allocBufWrapper<float>(devAcc, maxPixelSegments)),
-            px_buf(allocBufWrapper<float>(devAcc, maxPixelSegments)),
-            py_buf(allocBufWrapper<float>(devAcc, maxPixelSegments)),
-            pz_buf(allocBufWrapper<float>(devAcc, maxPixelSegments)),
-            etaErr_buf(allocBufWrapper<float>(devAcc, maxPixelSegments)),
-            eta_buf(allocBufWrapper<float>(devAcc, maxPixelSegments)),
-            phi_buf(allocBufWrapper<float>(devAcc, maxPixelSegments)),
-            superbin_buf(allocBufWrapper<int>(devAcc, maxPixelSegments)),
-            pixelType_buf(allocBufWrapper<int8_t>(devAcc, maxPixelSegments)),
-            isQuad_buf(allocBufWrapper<char>(devAcc, maxPixelSegments)),
-            isDup_buf(allocBufWrapper<bool>(devAcc, maxPixelSegments)),
-            score_buf(allocBufWrapper<float>(devAcc, maxPixelSegments)),
-            charge_buf(allocBufWrapper<int>(devAcc, maxPixelSegments)),
-            seedIdx_buf(allocBufWrapper<unsigned int>(devAcc, maxPixelSegments)),
-            circleCenterX_buf(allocBufWrapper<float>(devAcc, maxPixelSegments)),
-            circleCenterY_buf(allocBufWrapper<float>(devAcc, maxPixelSegments)),
-            circleRadius_buf(allocBufWrapper<float>(devAcc, maxPixelSegments)),
-            partOfPT5_buf(allocBufWrapper<bool>(devAcc, maxPixelSegments)),
-            pLSHitsIdxs_buf(allocBufWrapper<uint4>(devAcc, maxPixelSegments)),
-            nMemoryLocations_buf(allocBufWrapper<unsigned int>(devAcc, 1))
+            mdIndices_buf(allocBufWrapper<unsigned int>(devAccIn, nMemoryLocationsIn*2)),
+            innerMiniDoubletAnchorHitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, nMemoryLocationsIn)),
+            outerMiniDoubletAnchorHitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, nMemoryLocationsIn)),
+            innerLowerModuleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, nMemoryLocationsIn)),
+            outerLowerModuleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, nMemoryLocationsIn)),
+            nSegments_buf(allocBufWrapper<int>(devAccIn, nLowerModules + 1)),
+            totOccupancySegments_buf(allocBufWrapper<int>(devAccIn, nLowerModules + 1)),
+            dPhis_buf(allocBufWrapper<FPX>(devAccIn, nMemoryLocationsIn)),
+            dPhiMins_buf(allocBufWrapper<FPX>(devAccIn, nMemoryLocationsIn)),
+            dPhiMaxs_buf(allocBufWrapper<FPX>(devAccIn, nMemoryLocationsIn)),
+            dPhiChanges_buf(allocBufWrapper<FPX>(devAccIn, nMemoryLocationsIn)),
+            dPhiChangeMins_buf(allocBufWrapper<FPX>(devAccIn, nMemoryLocationsIn)),
+            dPhiChangeMaxs_buf(allocBufWrapper<FPX>(devAccIn, nMemoryLocationsIn)),
+            ptIn_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
+            ptErr_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
+            px_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
+            py_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
+            pz_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
+            etaErr_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
+            eta_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
+            phi_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
+            superbin_buf(allocBufWrapper<int>(devAccIn, maxPixelSegments)),
+            pixelType_buf(allocBufWrapper<int8_t>(devAccIn, maxPixelSegments)),
+            isQuad_buf(allocBufWrapper<char>(devAccIn, maxPixelSegments)),
+            isDup_buf(allocBufWrapper<bool>(devAccIn, maxPixelSegments)),
+            score_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
+            charge_buf(allocBufWrapper<int>(devAccIn, maxPixelSegments)),
+            seedIdx_buf(allocBufWrapper<unsigned int>(devAccIn, maxPixelSegments)),
+            circleCenterX_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
+            circleCenterY_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
+            circleRadius_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
+            partOfPT5_buf(allocBufWrapper<bool>(devAccIn, maxPixelSegments)),
+            pLSHitsIdxs_buf(allocBufWrapper<uint4>(devAccIn, maxPixelSegments)),
+            nMemoryLocations_buf(allocBufWrapper<unsigned int>(devAccIn, 1))
         {
             mdIndices = alpaka::getPtrNative(mdIndices_buf);
             innerMiniDoubletAnchorHitIndices = alpaka::getPtrNative(innerMiniDoubletAnchorHitIndices_buf);
@@ -481,7 +439,8 @@ namespace SDL
         dAlphaThresholdValues[2] = dAlpha_Bfield + alpaka::math::sqrt(acc, dAlpha_res * dAlpha_res + sdMuls * sdMuls);
     };
 
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addSegmentToMemory(struct segments& segmentsInGPU, unsigned int lowerMDIndex, unsigned int upperMDIndex, uint16_t innerLowerModuleIndex, uint16_t outerLowerModuleIndex, unsigned int innerMDAnchorHitIndex, unsigned int outerMDAnchorHitIndex, float& dPhi, float& dPhiMin, float& dPhiMax, float& dPhiChange, float& dPhiChangeMin, float& dPhiChangeMax, unsigned int idx)
+    template<typename TAcc>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addSegmentToMemory(SDL::segments<TAcc>& segmentsInGPU, unsigned int lowerMDIndex, unsigned int upperMDIndex, uint16_t innerLowerModuleIndex, uint16_t outerLowerModuleIndex, unsigned int innerMDAnchorHitIndex, unsigned int outerMDAnchorHitIndex, float& dPhi, float& dPhiMin, float& dPhiMax, float& dPhiChange, float& dPhiChangeMin, float& dPhiChangeMax, unsigned int idx)
     {
         //idx will be computed in the kernel, which is the index into which the 
         //segment will be written
@@ -503,7 +462,7 @@ namespace SDL
     }
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelSegmentToMemory(TAcc const & acc, struct segments& segmentsInGPU, struct miniDoublets& mdsInGPU, unsigned int innerMDIndex, unsigned int outerMDIndex, uint16_t pixelModuleIndex, unsigned int hitIdxs[4], unsigned int innerAnchorHitIndex, unsigned int outerAnchorHitIndex, float dPhiChange, unsigned int idx, unsigned int pixelSegmentArrayIndex, float score)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelSegmentToMemory(TAcc const & acc, SDL::segments<TAcc>& segmentsInGPU, struct miniDoublets& mdsInGPU, unsigned int innerMDIndex, unsigned int outerMDIndex, uint16_t pixelModuleIndex, unsigned int hitIdxs[4], unsigned int innerAnchorHitIndex, unsigned int outerAnchorHitIndex, float dPhiChange, unsigned int idx, unsigned int pixelSegmentArrayIndex, float score)
     {
         segmentsInGPU.mdIndices[idx * 2] = innerMDIndex;
         segmentsInGPU.mdIndices[idx * 2 + 1] = outerMDIndex;
@@ -734,7 +693,7 @@ namespace SDL
                 TAcc const & acc,
                 struct SDL::modules& modulesInGPU,
                 struct SDL::miniDoublets& mdsInGPU,
-                struct SDL::segments& segmentsInGPU,
+                struct SDL::segments<TAcc>& segmentsInGPU,
                 struct SDL::objectRanges& rangesInGPU) const
         {
             using Dim = alpaka::Dim<TAcc>;
@@ -886,7 +845,7 @@ namespace SDL
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
                 struct modules& modulesInGPU,
-                struct segments& segmentsInGPU,
+                SDL::segments<TAcc>& segmentsInGPU,
                 struct objectRanges& rangesInGPU) const
         {
             using Dim = alpaka::Dim<TAcc>;
diff --git a/SDL/TrackCandidate.cuh b/SDL/TrackCandidate.cuh
index d81a570d..2e48c48c 100644
--- a/SDL/TrackCandidate.cuh
+++ b/SDL/TrackCandidate.cuh
@@ -81,7 +81,8 @@ namespace SDL
         trackCandidatesInGPU.radius[trackCandidateIndex]  = __F2H(radius);
     };
 
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE int checkPixelHits(unsigned int ix, unsigned int jx, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct SDL::hits& hitsInGPU)
+    template<typename TAcc>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE int checkPixelHits(unsigned int ix, unsigned int jx, struct SDL::miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, struct SDL::hits& hitsInGPU)
     {
         int phits1[4] = {-1,-1,-1,-1};
         int phits2[4] = {-1,-1,-1,-1};
@@ -127,7 +128,7 @@ namespace SDL
                 struct SDL::modules& modulesInGPU,
                 struct SDL::objectRanges& rangesInGPU,
                 struct SDL::pixelTriplets& pixelTripletsInGPU,
-                struct SDL::segments& segmentsInGPU,
+                SDL::segments<TAcc>& segmentsInGPU,
                 struct SDL::pixelQuintuplets& pixelQuintupletsInGPU) const
         {
             using Dim = alpaka::Dim<TAcc>;
@@ -242,7 +243,7 @@ namespace SDL
                 struct SDL::objectRanges& rangesInGPU,
                 struct SDL::pixelTriplets& pixelTripletsInGPU,
                 struct SDL::trackCandidates& trackCandidatesInGPU,
-                struct SDL::segments& segmentsInGPU,
+                SDL::segments<TAcc>& segmentsInGPU,
                 struct SDL::miniDoublets& mdsInGPU,
                 struct SDL::hits& hitsInGPU,
                 struct SDL::quintuplets& quintupletsInGPU) const
@@ -327,7 +328,7 @@ namespace SDL
                 uint16_t nLowerModules,
                 struct SDL::pixelTriplets& pixelTripletsInGPU,
                 struct SDL::trackCandidates& trackCandidatesInGPU,
-                struct SDL::segments& segmentsInGPU,
+                SDL::segments<TAcc>& segmentsInGPU,
                 struct SDL::objectRanges& rangesInGPU) const
         {
             using Dim = alpaka::Dim<TAcc>;
@@ -398,7 +399,7 @@ namespace SDL
                 TAcc const & acc,
                 uint16_t nLowerModules,
                 struct SDL::trackCandidates& trackCandidatesInGPU,
-                struct SDL::segments& segmentsInGPU) const
+                SDL::segments<TAcc>& segmentsInGPU) const
         {
             using Dim = alpaka::Dim<TAcc>;
             using Idx = alpaka::Idx<TAcc>;
@@ -428,7 +429,7 @@ namespace SDL
                 uint16_t nLowerModules,
                 struct SDL::pixelQuintuplets& pixelQuintupletsInGPU,
                 struct SDL::trackCandidates& trackCandidatesInGPU,
-                struct SDL::segments& segmentsInGPU,
+                SDL::segments<TAcc>& segmentsInGPU,
                 struct SDL::objectRanges& rangesInGPU) const
         {
             using Dim = alpaka::Dim<TAcc>;
diff --git a/SDL/Triplet.cuh b/SDL/Triplet.cuh
index 16ea085d..c548ee21 100644
--- a/SDL/Triplet.cuh
+++ b/SDL/Triplet.cuh
@@ -59,9 +59,11 @@ namespace SDL
     void createTripletsInExplicitMemory(struct triplets& tripletsInGPU, unsigned int maxTriplets, uint16_t nLowerModules,cudaStream_t stream);
 
 #ifdef CUT_VALUE_DEBUG
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float&zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ, unsigned int& tripletIndex)
+    template<typename TAcc>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float&zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ, unsigned int& tripletIndex)
 #else
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& betaIn, float& betaOut, float& pt_beta, unsigned int& tripletIndex)
+    template<typename TAcc>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& betaIn, float& betaOut, float& pt_beta, unsigned int& tripletIndex)
 #endif
     {
         tripletsInGPU.segmentIndices[tripletIndex * 2] = innerSegmentIndex;
@@ -108,7 +110,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passRZConstraint(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex) 
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passRZConstraint(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex) 
     {
         //get the rt and z
         const float& r1 = mdsInGPU.anchorRt[firstMDIndex];
@@ -189,7 +191,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut)
     {
         bool pass = true;
         bool isPSIn = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS);
@@ -248,7 +250,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut)
     {
         bool pass = true;
         //unsigned int outerInnerLowerModuleIndex = middleLowerModuleIndex;
@@ -327,7 +329,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut)
     {
         bool pass = true;
         bool isPSIn = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS);
@@ -407,7 +409,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraint(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraint(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut)
     {
         short innerInnerLowerModuleSubdet = modulesInGPU.subdets[innerInnerLowerModuleIndex];
         short middleLowerModuleSubdet = modulesInGPU.subdets[middleLowerModuleIndex];
@@ -442,7 +444,8 @@ namespace SDL
         return false; // failsafe    
     };
 
-    void printTriplet(struct triplets& tripletsInGPU, struct segments& segmentsInGPU, struct miniDoublets& mdsInGPU, struct hits& hitsInGPU, struct modules& modulesInGPU, unsigned int tripletIndex);
+    template<typename TAcc>
+    void printTriplet(struct triplets& tripletsInGPU, SDL::segments<TAcc>& segmentsInGPU, struct miniDoublets& mdsInGPU, struct hits& hitsInGPU, struct modules& modulesInGPU, unsigned int tripletIndex);
 
     template<typename TAcc>
     ALPAKA_FN_ACC ALPAKA_FN_INLINE void runDeltaBetaIterationsT3(TAcc const & acc, float& betaIn, float& betaOut, float& betaAv, float & pt_beta, float sdIn_dr, float sdOut_dr, float dr, float lIn)
@@ -490,7 +493,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoBBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoBBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut)
     {
         bool pass = true;
 
@@ -686,7 +689,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoBBEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoBBEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
     {
         bool pass = true;
         bool isPS_InLo = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS);
@@ -901,7 +904,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoEEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoEEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
     {
         bool pass = true;
 
@@ -1110,7 +1113,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
     {
         bool pass = false;
 
@@ -1175,7 +1178,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletConstraintsAndAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float &zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletConstraintsAndAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float &zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
     {
         bool pass = true;
 
@@ -1202,7 +1205,7 @@ namespace SDL
                 TAcc const & acc,
                 struct SDL::modules& modulesInGPU,
                 struct SDL::miniDoublets& mdsInGPU,
-                struct SDL::segments& segmentsInGPU,
+                SDL::segments<TAcc>& segmentsInGPU,
                 struct SDL::triplets& tripletsInGPU,
                 struct SDL::objectRanges& rangesInGPU,
                 uint16_t *index_gpu,
@@ -1277,7 +1280,7 @@ namespace SDL
                 TAcc const & acc,
                 struct modules& modulesInGPU,
                 struct objectRanges& rangesInGPU,
-                struct segments& segmentsInGPU) const
+                SDL::segments<TAcc>& segmentsInGPU) const
         {
             using Dim = alpaka::Dim<TAcc>;
             using Idx = alpaka::Idx<TAcc>;
diff --git a/code/core/AccessHelper.cc b/code/core/AccessHelper.cc
index c721316c..2bf534b2 100644
--- a/code/core/AccessHelper.cc
+++ b/code/core/AccessHelper.cc
@@ -28,7 +28,7 @@ std::tuple<std::vector<unsigned int>, std::vector<unsigned int>> convertHitsToHi
 //____________________________________________________________________________________________
 std::vector<unsigned int> getPixelHitsFrompLS(SDL::Event* event, unsigned int pLS)
 {
-    SDL::segments_temp& segments_ = *(event->getSegments());
+    SDL::segments<alpaka::DevCpu>& segments_ = *(event->getSegments());
     SDL::miniDoublets& miniDoublets_ = *(event->getMiniDoublets());
     SDL::objectRanges& rangesInGPU = (*event->getRanges());
     SDL::modules& modulesInGPU = (*event->getModules());
@@ -96,7 +96,7 @@ std::tuple<std::vector<unsigned int>, std::vector<unsigned int>> getHitIdxsAndHi
 //____________________________________________________________________________________________
 std::vector<unsigned int> getMDsFromLS(SDL::Event* event, unsigned int LS)
 {
-    SDL::segments_temp& segments_ = *(event->getSegments());
+    SDL::segments<alpaka::DevCpu>& segments_ = *(event->getSegments());
     unsigned int MD_1 = segments_.mdIndices[2 * LS];
     unsigned int MD_2 = segments_.mdIndices[2 * LS + 1];
     return {MD_1, MD_2};
diff --git a/code/core/write_sdl_ntuple.cc b/code/core/write_sdl_ntuple.cc
index da7ff47f..dcc9f070 100644
--- a/code/core/write_sdl_ntuple.cc
+++ b/code/core/write_sdl_ntuple.cc
@@ -307,7 +307,7 @@ void setPixelQuintupletOutputBranches(SDL::Event* event)
     // ============ pT5 =============
     SDL::pixelQuintuplets& pixelQuintupletsInGPU = (*event->getPixelQuintuplets());
     SDL::quintuplets& quintupletsInGPU = (*event->getQuintuplets());
-    SDL::segments_temp& segmentsInGPU = (*event->getSegments());
+    SDL::segments<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::modules& modulesInGPU = (*event->getModules());
     int n_accepted_simtrk = ana.tx->getBranch<vector<int>>("sim_TC_matched").size();
 
@@ -476,7 +476,7 @@ void setPixelTripletOutputBranches(SDL::Event* event)
     SDL::pixelTriplets& pixelTripletsInGPU = (*event->getPixelTriplets());
     SDL::triplets& tripletsInGPU = *(event->getTriplets());
     SDL::modules& modulesInGPU = *(event->getModules());
-    SDL::segments_temp& segmentsInGPU = *(event->getSegments());
+    SDL::segments<alpaka::DevCpu>& segmentsInGPU = *(event->getSegments());
     SDL::hits& hitsInGPU = *(event->getHits());
     int n_accepted_simtrk = ana.tx->getBranch<vector<int>>("sim_TC_matched").size();
 
@@ -559,7 +559,7 @@ void setPixelTripletOutputBranches(SDL::Event* event)
 void setGnnNtupleBranches(SDL::Event* event)
 {
     // Get relevant information
-    SDL::segments_temp& segmentsInGPU = (*event->getSegments());
+    SDL::segments<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
     SDL::hits& hitsInGPU = (*event->getHits());
     SDL::modules& modulesInGPU = (*event->getModules());
@@ -821,7 +821,7 @@ std::tuple<float, float, float, vector<unsigned int>, vector<unsigned int>> pars
     // Get relevant information
     SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates());
     SDL::triplets& tripletsInGPU = (*event->getTriplets());
-    SDL::segments_temp& segmentsInGPU = (*event->getSegments());
+    SDL::segments<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::hits& hitsInGPU = (*event->getHits());
 
     //
@@ -959,7 +959,7 @@ std::tuple<float, float, float, vector<unsigned int>, vector<unsigned int>> pars
     // Get relevant information
     SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates());
     SDL::triplets& tripletsInGPU = (*event->getTriplets());
-    SDL::segments_temp& segmentsInGPU = (*event->getSegments());
+    SDL::segments<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::hits& hitsInGPU = (*event->getHits());
 
     //
@@ -1059,7 +1059,7 @@ std::tuple<float, float, float, vector<unsigned int>, vector<unsigned int>> pars
 std::tuple<float, float, float, vector<unsigned int>, vector<unsigned int>> parsepLS(SDL::Event* event, unsigned int idx)
 {
     SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates());
-    SDL::segments_temp& segmentsInGPU = (*event->getSegments());
+    SDL::segments<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
 
     // Getting pLS index
     unsigned int pLS = trackCandidatesInGPU.directObjectIndices[idx];
@@ -1174,7 +1174,7 @@ void printMDs(SDL::Event* event)
 //________________________________________________________________________________________________________________________________
 void printLSs(SDL::Event* event)
 {
-    SDL::segments_temp& segmentsInGPU = (*event->getSegments());
+    SDL::segments<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
     SDL::hits& hitsInGPU = (*event->getHits());
     SDL::modules& modulesInGPU = (*event->getModules());
@@ -1207,7 +1207,7 @@ void printLSs(SDL::Event* event)
 //________________________________________________________________________________________________________________________________
 void printpLSs(SDL::Event* event)
 {
-    SDL::segments_temp& segmentsInGPU = (*event->getSegments());
+    SDL::segments<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
     SDL::hits& hitsInGPU = (*event->getHits());
     SDL::modules& modulesInGPU = (*event->getModules());
@@ -1238,7 +1238,7 @@ void printpLSs(SDL::Event* event)
 void printT3s(SDL::Event* event)
 {
     SDL::triplets& tripletsInGPU = (*event->getTriplets());
-    SDL::segments_temp& segmentsInGPU = (*event->getSegments());
+    SDL::segments<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
     SDL::hits& hitsInGPU = (*event->getHits());
     SDL::modules& modulesInGPU = (*event->getModules());
@@ -1281,7 +1281,7 @@ void debugPrintOutlierMultiplicities(SDL::Event* event)
 {
     SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates());
     SDL::triplets& tripletsInGPU = (*event->getTriplets());
-    SDL::segments_temp& segmentsInGPU = (*event->getSegments());
+    SDL::segments<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
     //SDL::hits& hitsInGPU = (*event->getHits());
     SDL::modules& modulesInGPU = (*event->getModules());

From 7670310e9521af2c3fc3b03683a7bf6e88f2aced Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Thu, 1 Jun 2023 18:04:45 -0400
Subject: [PATCH 11/44] formatting fixes

---
 SDL/Constants.cuh      |  2 +-
 SDL/Event.cu           |  2 +-
 SDL/Kernels.cuh        |  4 ++--
 SDL/PixelTriplet.cuh   | 24 ++++++++++++------------
 SDL/Quintuplet.cuh     | 14 +++++++-------
 SDL/Segment.cuh        |  6 +++---
 SDL/TrackCandidate.cuh | 12 ++++++------
 SDL/Triplet.cuh        | 30 +++++++++++++++---------------
 8 files changed, 47 insertions(+), 47 deletions(-)

diff --git a/SDL/Constants.cuh b/SDL/Constants.cuh
index 007e022c..7dfbbbe8 100644
--- a/SDL/Constants.cuh
+++ b/SDL/Constants.cuh
@@ -81,7 +81,7 @@ template<typename TAcc, typename TData>
 using Buf = alpaka::Buf<TAcc, TData, Dim1d, Idx>;
 
 template<typename T, typename TAcc, typename TSize>
-Buf<TAcc, T> inline allocBufWrapper(TAcc const & devAccIn, TSize nElements) {
+ALPAKA_FN_HOST ALPAKA_FN_INLINE Buf<TAcc, T> allocBufWrapper(TAcc const & devAccIn, TSize nElements) {
     return alpaka::allocBuf<T, Idx>(devAccIn, Vec1d(static_cast<Idx>(nElements)));
 }
 
diff --git a/SDL/Event.cu b/SDL/Event.cu
index 9db88f40..d7a89ec6 100644
--- a/SDL/Event.cu
+++ b/SDL/Event.cu
@@ -681,7 +681,7 @@ struct addPixelSegmentToEventKernel
         struct SDL::objectRanges& rangesInGPU,
         struct SDL::hits& hitsInGPU,
         struct SDL::miniDoublets& mdsInGPU,
-        SDL::segments<TAcc>& segmentsInGPU,
+        struct SDL::segments<TAcc>& segmentsInGPU,
         unsigned int* hitIndices0,
         unsigned int* hitIndices1,
         unsigned int* hitIndices2,
diff --git a/SDL/Kernels.cuh b/SDL/Kernels.cuh
index 068e66f6..4cc1310d 100644
--- a/SDL/Kernels.cuh
+++ b/SDL/Kernels.cuh
@@ -28,7 +28,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE void rmPixelSegmentFromMemory(SDL::segments<TAcc>& segmentsInGPU, unsigned int pixelSegmentArrayIndex)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void rmPixelSegmentFromMemory(struct SDL::segments<TAcc>& segmentsInGPU, unsigned int pixelSegmentArrayIndex)
     {
         segmentsInGPU.isDup[pixelSegmentArrayIndex] = 1;
     };
@@ -453,7 +453,7 @@ namespace SDL
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
                 struct SDL::modules& modulesInGPU,
-                SDL::segments<TAcc>& segmentsInGPU,
+                struct SDL::segments<TAcc>& segmentsInGPU,
                 bool secondpass) const
         {
             using Dim = alpaka::Dim<TAcc>;
diff --git a/SDL/PixelTriplet.cuh b/SDL/PixelTriplet.cuh
index d8d37fef..f7aaa54e 100644
--- a/SDL/PixelTriplet.cuh
+++ b/SDL/PixelTriplet.cuh
@@ -49,7 +49,7 @@ namespace SDL
     void createPixelTripletsInExplicitMemory(struct pixelTriplets& pixelTripletsinGPU, unsigned int maxPixelTriplets, cudaStream_t stream);
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, struct triplets& tripletsInGPU, struct pixelTriplets& pixelTripletsInGPU, unsigned int pixelSegmentIndex, unsigned int tripletIndex, float pixelRadius, float tripletRadius, float centerX, float centerY, float rPhiChiSquared, float rPhiChiSquaredInwards, float rzChiSquared, unsigned int pixelTripletIndex, float pt, float eta, float phi, float eta_pix, float phi_pix,float score)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, struct triplets& tripletsInGPU, struct pixelTriplets& pixelTripletsInGPU, unsigned int pixelSegmentIndex, unsigned int tripletIndex, float pixelRadius, float tripletRadius, float centerX, float centerY, float rPhiChiSquared, float rPhiChiSquaredInwards, float rzChiSquared, unsigned int pixelTripletIndex, float pt, float eta, float phi, float eta_pix, float phi_pix,float score)
     {
         pixelTripletsInGPU.pixelSegmentIndices[pixelTripletIndex] = pixelSegmentIndex;
         pixelTripletsInGPU.tripletIndices[pixelTripletIndex] = tripletIndex;
@@ -131,7 +131,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTrackletDefaultAlgopT3(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& pixelLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTrackletDefaultAlgopT3(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& pixelLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
     {
         zLo = -999;
         zHi = -999;
@@ -664,7 +664,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTripletDefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& pixelSegmentIndex, unsigned int tripletIndex, float& pixelRadius, float& pixelRadiusError, float& tripletRadius, float& centerX, float& centerY, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, bool runChiSquaredCuts = true)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTripletDefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& pixelSegmentIndex, unsigned int tripletIndex, float& pixelRadius, float& pixelRadiusError, float& tripletRadius, float& centerX, float& centerY, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, bool runChiSquaredCuts = true)
     {
         bool pass = true;
 
@@ -769,7 +769,7 @@ namespace SDL
                 struct SDL::modules& modulesInGPU,
                 struct SDL::objectRanges& rangesInGPU,
                 struct SDL::miniDoublets& mdsInGPU,
-                SDL::segments<TAcc>& segmentsInGPU,
+                struct SDL::segments<TAcc>& segmentsInGPU,
                 struct SDL::triplets& tripletsInGPU,
                 struct SDL::pixelTriplets& pixelTripletsInGPU,
                 unsigned int* connectedPixelSize,
@@ -912,7 +912,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPBB(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int thirdMDIndex, unsigned int& fourthMDIndex, float& /*z_OutLo*/, float& /*rt_OutLo*/, float& dPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaOutCut, float& deltaBetaCut) // pixel to BB and BE segments
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPBB(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int thirdMDIndex, unsigned int& fourthMDIndex, float& /*z_OutLo*/, float& /*rt_OutLo*/, float& dPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaOutCut, float& deltaBetaCut) // pixel to BB and BE segments
     {
         bool pass = true;
 
@@ -1125,7 +1125,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPEE(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU ,SDL::segments<TAcc>& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& /*z_OutLo*/, float& /*rt_OutLo*/, float& deltaPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) // pixel to EE segments
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPEE(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU ,struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& /*z_OutLo*/, float& /*rt_OutLo*/, float& deltaPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) // pixel to EE segments
     {
         bool pass = true;
         bool isPS_OutLo = (modulesInGPU.moduleType[outerInnerLowerModuleIndex] == SDL::PS);
@@ -1387,7 +1387,7 @@ namespace SDL
     void createPixelQuintupletsInExplicitMemory(struct SDL::pixelQuintuplets& pixelQuintupletsInGPU, unsigned int maxPixelQuintuplets, cudaStream_t stream);
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelQuintupletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, struct quintuplets& quintupletsInGPU, struct pixelQuintuplets& pixelQuintupletsInGPU, unsigned int pixelIndex, unsigned int T5Index, unsigned int pixelQuintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float score, float eta, float phi, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelQuintupletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, struct quintuplets& quintupletsInGPU, struct pixelQuintuplets& pixelQuintupletsInGPU, unsigned int pixelIndex, unsigned int T5Index, unsigned int pixelQuintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float score, float eta, float phi, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY)
     {
         pixelQuintupletsInGPU.pixelIndices[pixelQuintupletIndex] = pixelIndex;
         pixelQuintupletsInGPU.T5Indices[pixelQuintupletIndex] = T5Index;
@@ -1968,7 +1968,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelQuintupletDefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, struct triplets& tripletsInGPU, struct quintuplets& quintupletsInGPU, unsigned int& pixelSegmentIndex, unsigned int& quintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY, unsigned int pixelSegmentArrayIndex)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelQuintupletDefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, struct triplets& tripletsInGPU, struct quintuplets& quintupletsInGPU, unsigned int& pixelSegmentIndex, unsigned int& quintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY, unsigned int pixelSegmentArrayIndex)
     {
         bool pass = true;
 
@@ -2102,7 +2102,7 @@ namespace SDL
                 TAcc const & acc,
                 struct SDL::modules& modulesInGPU,
                 struct SDL::miniDoublets& mdsInGPU,
-                SDL::segments<TAcc>& segmentsInGPU,
+                struct SDL::segments<TAcc>& segmentsInGPU,
                 struct SDL::triplets& tripletsInGPU,
                 struct SDL::quintuplets& quintupletsInGPU,
                 struct SDL::pixelQuintuplets& pixelQuintupletsInGPU,
@@ -2228,7 +2228,7 @@ namespace SDL
     };
  
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgoPPBB(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int thirdMDIndex, unsigned int& fourthMDIndex, float& z_OutLo, float& rt_OutLo, float& dPhiPos, float& dPhi, float& betaIn,  float& betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaOutCut, float& deltaBetaCut) // pixel to BB and BE segments
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgoPPBB(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int thirdMDIndex, unsigned int& fourthMDIndex, float& z_OutLo, float& rt_OutLo, float& dPhiPos, float& dPhi, float& betaIn,  float& betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaOutCut, float& deltaBetaCut) // pixel to BB and BE segments
     {
         bool pass = true;
 
@@ -2435,7 +2435,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgoPPEE(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU ,SDL::segments<TAcc>& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& z_OutLo, float& rt_OutLo, float& deltaPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) // pixel to EE segments
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgoPPEE(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU ,struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& z_OutLo, float& rt_OutLo, float& deltaPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) // pixel to EE segments
     {
         bool pass = true;
         bool isPS_OutLo = (modulesInGPU.moduleType[outerInnerLowerModuleIndex] == SDL::PS);
@@ -2650,7 +2650,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& pixelLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& pixelLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
     {
         zLo = -999;
         zHi = -999;
diff --git a/SDL/Quintuplet.cuh b/SDL/Quintuplet.cuh
index 1a34d763..b4fe6b3d 100644
--- a/SDL/Quintuplet.cuh
+++ b/SDL/Quintuplet.cuh
@@ -645,7 +645,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool T5HasCommonMiniDoublet(struct SDL::triplets& tripletsInGPU, SDL::segments<TAcc>& segmentsInGPU, unsigned int innerTripletIndex, unsigned int outerTripletIndex)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool T5HasCommonMiniDoublet(struct SDL::triplets& tripletsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, unsigned int innerTripletIndex, unsigned int outerTripletIndex)
     {
         unsigned int innerOuterSegmentIndex = tripletsInGPU.segmentIndices[2 * innerTripletIndex + 1];
         unsigned int outerInnerSegmentIndex = tripletsInGPU.segmentIndices[2 * outerTripletIndex];
@@ -1205,7 +1205,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoBBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoBBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut)
     {
         bool pass = true;
 
@@ -1398,7 +1398,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoBBEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoBBEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
     {
         bool pass = true;
         bool isPS_InLo = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS);
@@ -1609,7 +1609,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoEEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoEEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
     {
         bool pass = true;
 
@@ -1816,7 +1816,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletAlgoSelector(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletAlgoSelector(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
     {
         bool pass = false;
 
@@ -1874,7 +1874,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, struct SDL::triplets& tripletsInGPU, uint16_t& lowerModuleIndex1, uint16_t& lowerModuleIndex2, uint16_t& lowerModuleIndex3, uint16_t& lowerModuleIndex4, uint16_t& lowerModuleIndex5, unsigned int& innerTripletIndex, unsigned int& outerTripletIndex, float& innerRadius, float& outerRadius, float& bridgeRadius, float& regressionG, float& regressionF, float& regressionRadius, float& rzChiSquared, float& chiSquared, float& nonAnchorChiSquared, bool& TightCutFlag)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, struct SDL::triplets& tripletsInGPU, uint16_t& lowerModuleIndex1, uint16_t& lowerModuleIndex2, uint16_t& lowerModuleIndex3, uint16_t& lowerModuleIndex4, uint16_t& lowerModuleIndex5, unsigned int& innerTripletIndex, unsigned int& outerTripletIndex, float& innerRadius, float& outerRadius, float& bridgeRadius, float& regressionG, float& regressionF, float& regressionRadius, float& rzChiSquared, float& chiSquared, float& nonAnchorChiSquared, bool& TightCutFlag)
     {
         bool pass = true;
         unsigned int firstSegmentIndex = tripletsInGPU.segmentIndices[2 * innerTripletIndex];
@@ -2078,7 +2078,7 @@ namespace SDL
                 TAcc const & acc,
                 struct SDL::modules& modulesInGPU,
                 struct SDL::miniDoublets& mdsInGPU,
-                SDL::segments<TAcc>& segmentsInGPU,
+                struct SDL::segments<TAcc>& segmentsInGPU,
                 struct SDL::triplets& tripletsInGPU,
                 struct SDL::quintuplets& quintupletsInGPU,
                 struct SDL::objectRanges& rangesInGPU,
diff --git a/SDL/Segment.cuh b/SDL/Segment.cuh
index f7e1104e..5eb74b30 100644
--- a/SDL/Segment.cuh
+++ b/SDL/Segment.cuh
@@ -440,7 +440,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addSegmentToMemory(SDL::segments<TAcc>& segmentsInGPU, unsigned int lowerMDIndex, unsigned int upperMDIndex, uint16_t innerLowerModuleIndex, uint16_t outerLowerModuleIndex, unsigned int innerMDAnchorHitIndex, unsigned int outerMDAnchorHitIndex, float& dPhi, float& dPhiMin, float& dPhiMax, float& dPhiChange, float& dPhiChangeMin, float& dPhiChangeMax, unsigned int idx)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addSegmentToMemory(struct SDL::segments<TAcc>& segmentsInGPU, unsigned int lowerMDIndex, unsigned int upperMDIndex, uint16_t innerLowerModuleIndex, uint16_t outerLowerModuleIndex, unsigned int innerMDAnchorHitIndex, unsigned int outerMDAnchorHitIndex, float& dPhi, float& dPhiMin, float& dPhiMax, float& dPhiChange, float& dPhiChangeMin, float& dPhiChangeMax, unsigned int idx)
     {
         //idx will be computed in the kernel, which is the index into which the 
         //segment will be written
@@ -462,7 +462,7 @@ namespace SDL
     }
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelSegmentToMemory(TAcc const & acc, SDL::segments<TAcc>& segmentsInGPU, struct miniDoublets& mdsInGPU, unsigned int innerMDIndex, unsigned int outerMDIndex, uint16_t pixelModuleIndex, unsigned int hitIdxs[4], unsigned int innerAnchorHitIndex, unsigned int outerAnchorHitIndex, float dPhiChange, unsigned int idx, unsigned int pixelSegmentArrayIndex, float score)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelSegmentToMemory(TAcc const & acc, struct SDL::segments<TAcc>& segmentsInGPU, struct miniDoublets& mdsInGPU, unsigned int innerMDIndex, unsigned int outerMDIndex, uint16_t pixelModuleIndex, unsigned int hitIdxs[4], unsigned int innerAnchorHitIndex, unsigned int outerAnchorHitIndex, float dPhiChange, unsigned int idx, unsigned int pixelSegmentArrayIndex, float score)
     {
         segmentsInGPU.mdIndices[idx * 2] = innerMDIndex;
         segmentsInGPU.mdIndices[idx * 2 + 1] = outerMDIndex;
@@ -845,7 +845,7 @@ namespace SDL
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
                 struct modules& modulesInGPU,
-                SDL::segments<TAcc>& segmentsInGPU,
+                struct SDL::segments<TAcc>& segmentsInGPU,
                 struct objectRanges& rangesInGPU) const
         {
             using Dim = alpaka::Dim<TAcc>;
diff --git a/SDL/TrackCandidate.cuh b/SDL/TrackCandidate.cuh
index 2e48c48c..738037fa 100644
--- a/SDL/TrackCandidate.cuh
+++ b/SDL/TrackCandidate.cuh
@@ -82,7 +82,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE int checkPixelHits(unsigned int ix, unsigned int jx, struct SDL::miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, struct SDL::hits& hitsInGPU)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE int checkPixelHits(unsigned int ix, unsigned int jx, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, struct SDL::hits& hitsInGPU)
     {
         int phits1[4] = {-1,-1,-1,-1};
         int phits2[4] = {-1,-1,-1,-1};
@@ -128,7 +128,7 @@ namespace SDL
                 struct SDL::modules& modulesInGPU,
                 struct SDL::objectRanges& rangesInGPU,
                 struct SDL::pixelTriplets& pixelTripletsInGPU,
-                SDL::segments<TAcc>& segmentsInGPU,
+                struct SDL::segments<TAcc>& segmentsInGPU,
                 struct SDL::pixelQuintuplets& pixelQuintupletsInGPU) const
         {
             using Dim = alpaka::Dim<TAcc>;
@@ -243,7 +243,7 @@ namespace SDL
                 struct SDL::objectRanges& rangesInGPU,
                 struct SDL::pixelTriplets& pixelTripletsInGPU,
                 struct SDL::trackCandidates& trackCandidatesInGPU,
-                SDL::segments<TAcc>& segmentsInGPU,
+                struct SDL::segments<TAcc>& segmentsInGPU,
                 struct SDL::miniDoublets& mdsInGPU,
                 struct SDL::hits& hitsInGPU,
                 struct SDL::quintuplets& quintupletsInGPU) const
@@ -328,7 +328,7 @@ namespace SDL
                 uint16_t nLowerModules,
                 struct SDL::pixelTriplets& pixelTripletsInGPU,
                 struct SDL::trackCandidates& trackCandidatesInGPU,
-                SDL::segments<TAcc>& segmentsInGPU,
+                struct SDL::segments<TAcc>& segmentsInGPU,
                 struct SDL::objectRanges& rangesInGPU) const
         {
             using Dim = alpaka::Dim<TAcc>;
@@ -399,7 +399,7 @@ namespace SDL
                 TAcc const & acc,
                 uint16_t nLowerModules,
                 struct SDL::trackCandidates& trackCandidatesInGPU,
-                SDL::segments<TAcc>& segmentsInGPU) const
+                struct SDL::segments<TAcc>& segmentsInGPU) const
         {
             using Dim = alpaka::Dim<TAcc>;
             using Idx = alpaka::Idx<TAcc>;
@@ -429,7 +429,7 @@ namespace SDL
                 uint16_t nLowerModules,
                 struct SDL::pixelQuintuplets& pixelQuintupletsInGPU,
                 struct SDL::trackCandidates& trackCandidatesInGPU,
-                SDL::segments<TAcc>& segmentsInGPU,
+                struct SDL::segments<TAcc>& segmentsInGPU,
                 struct SDL::objectRanges& rangesInGPU) const
         {
             using Dim = alpaka::Dim<TAcc>;
diff --git a/SDL/Triplet.cuh b/SDL/Triplet.cuh
index c548ee21..5baa5a3f 100644
--- a/SDL/Triplet.cuh
+++ b/SDL/Triplet.cuh
@@ -60,10 +60,10 @@ namespace SDL
 
 #ifdef CUT_VALUE_DEBUG
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float&zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ, unsigned int& tripletIndex)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float&zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ, unsigned int& tripletIndex)
 #else
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& betaIn, float& betaOut, float& pt_beta, unsigned int& tripletIndex)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& betaIn, float& betaOut, float& pt_beta, unsigned int& tripletIndex)
 #endif
     {
         tripletsInGPU.segmentIndices[tripletIndex * 2] = innerSegmentIndex;
@@ -110,7 +110,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passRZConstraint(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex) 
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passRZConstraint(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex) 
     {
         //get the rt and z
         const float& r1 = mdsInGPU.anchorRt[firstMDIndex];
@@ -191,7 +191,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut)
     {
         bool pass = true;
         bool isPSIn = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS);
@@ -250,7 +250,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut)
     {
         bool pass = true;
         //unsigned int outerInnerLowerModuleIndex = middleLowerModuleIndex;
@@ -329,7 +329,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut)
     {
         bool pass = true;
         bool isPSIn = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS);
@@ -409,7 +409,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraint(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraint(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut)
     {
         short innerInnerLowerModuleSubdet = modulesInGPU.subdets[innerInnerLowerModuleIndex];
         short middleLowerModuleSubdet = modulesInGPU.subdets[middleLowerModuleIndex];
@@ -445,7 +445,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    void printTriplet(struct triplets& tripletsInGPU, SDL::segments<TAcc>& segmentsInGPU, struct miniDoublets& mdsInGPU, struct hits& hitsInGPU, struct modules& modulesInGPU, unsigned int tripletIndex);
+    void printTriplet(struct triplets& tripletsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, struct miniDoublets& mdsInGPU, struct hits& hitsInGPU, struct modules& modulesInGPU, unsigned int tripletIndex);
 
     template<typename TAcc>
     ALPAKA_FN_ACC ALPAKA_FN_INLINE void runDeltaBetaIterationsT3(TAcc const & acc, float& betaIn, float& betaOut, float& betaAv, float & pt_beta, float sdIn_dr, float sdOut_dr, float dr, float lIn)
@@ -493,7 +493,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoBBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoBBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut)
     {
         bool pass = true;
 
@@ -689,7 +689,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoBBEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoBBEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
     {
         bool pass = true;
         bool isPS_InLo = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS);
@@ -904,7 +904,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoEEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoEEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
     {
         bool pass = true;
 
@@ -1113,7 +1113,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
     {
         bool pass = false;
 
@@ -1178,7 +1178,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletConstraintsAndAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float &zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletConstraintsAndAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float &zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
     {
         bool pass = true;
 
@@ -1205,7 +1205,7 @@ namespace SDL
                 TAcc const & acc,
                 struct SDL::modules& modulesInGPU,
                 struct SDL::miniDoublets& mdsInGPU,
-                SDL::segments<TAcc>& segmentsInGPU,
+                struct SDL::segments<TAcc>& segmentsInGPU,
                 struct SDL::triplets& tripletsInGPU,
                 struct SDL::objectRanges& rangesInGPU,
                 uint16_t *index_gpu,
@@ -1280,7 +1280,7 @@ namespace SDL
                 TAcc const & acc,
                 struct modules& modulesInGPU,
                 struct objectRanges& rangesInGPU,
-                SDL::segments<TAcc>& segmentsInGPU) const
+                struct SDL::segments<TAcc>& segmentsInGPU) const
         {
             using Dim = alpaka::Dim<TAcc>;
             using Idx = alpaka::Idx<TAcc>;

From 955c3f027c16adf4856726873dc901487b27e2fe Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Thu, 1 Jun 2023 18:19:22 -0400
Subject: [PATCH 12/44] remove extra alpaka flags

---
 Makefile | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index a49be49e..44c520d9 100644
--- a/Makefile
+++ b/Makefile
@@ -20,10 +20,9 @@ LDFLAGS     = -g -O2
 ROOTLIBS    = $(shell root-config --libs)
 ROOTCFLAGS  = $(foreach option, $(shell root-config --cflags), --compiler-options $(option))
 ALPAKAINCLUDE = -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
-ALPAKAFLAGS = -DALPAKA_ACC_GPU_CUDA_ENABLED -DALPAKA_ACC_GPU_CUDA_ONLY  --expt-relaxed-constexpr -DALPAKA_DEBUG=0
-CFLAGS      = $(ROOTCFLAGS) --compiler-options -Wall --compiler-options -Wno-unused-function --compiler-options -g --compiler-options -O2 --compiler-options -fPIC --compiler-options -fno-var-tracking -ISDL -I$(shell pwd) -Icode  -Icode/core -I/mnt/data1/dsr/cub -I${CUDA_HOME}/include --compiler-options -fopenmp -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include
+CFLAGS      = $(ROOTCFLAGS) --compiler-options -Wall --compiler-options -Wno-unused-function --compiler-options -g --compiler-options -O2 --compiler-options -fPIC --compiler-options -fno-var-tracking -ISDL -I$(shell pwd) -Icode  -Icode/core -I/mnt/data1/dsr/cub -I${CUDA_HOME}/include --compiler-options -fopenmp
 EXTRACFLAGS = $(shell rooutil-config)
-EXTRAFLAGS  = -fPIC -ITMultiDrawTreePlayer -Wunused-variable -lTMVA -lEG -lGenVector -lXMLIO -lMLP -lTreePlayer -L${CUDA_HOME}/lib64 -lcudart -fopenmp -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include
+EXTRAFLAGS  = -fPIC -ITMultiDrawTreePlayer -Wunused-variable -lTMVA -lEG -lGenVector -lXMLIO -lMLP -lTreePlayer -L${CUDA_HOME}/lib64 -lcudart -fopenmp
 DOQUINTUPLET = -DFP16_Base -DFP16_dPhi #-DFP16_circle -DFP16_seg -DFP16_T5 #-DDO_QUINTUPLET #-DDO_QUADRUPLET
 PT0P8       =
 T3T3EXTENSION=

From e4827f6a2bf4927c7b03202b47064e5920e5bc24 Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Thu, 1 Jun 2023 18:51:13 -0400
Subject: [PATCH 13/44] move elementsPerThread to Constants.cuh

---
 SDL/Constants.cuh | 2 ++
 SDL/Event.cu      | 3 ---
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/SDL/Constants.cuh b/SDL/Constants.cuh
index 7dfbbbe8..66a874a7 100644
--- a/SDL/Constants.cuh
+++ b/SDL/Constants.cuh
@@ -58,6 +58,8 @@ using Vec1d = alpaka::Vec<Dim1d,Idx>;
 using QueueProperty = alpaka::NonBlocking;
 using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
 
+Vec const elementsPerThread(Vec::all(static_cast<Idx>(1)));
+
 // - AccGpuCudaRt
 // - AccCpuThreads
 // - AccCpuFibers
diff --git a/SDL/Event.cu b/SDL/Event.cu
index d7a89ec6..ae1e9d00 100644
--- a/SDL/Event.cu
+++ b/SDL/Event.cu
@@ -5,9 +5,6 @@ struct SDL::pixelMap* SDL::pixelMapping = nullptr;
 uint16_t SDL::nModules;
 uint16_t SDL::nLowerModules;
 
-// Temporary alpaka statements
-Vec const elementsPerThread(Vec::all(static_cast<Idx>(1)));
-
 SDL::Event::Event(cudaStream_t estream, bool verbose): queue(alpaka::getDevByIdx<Acc>(0u))
 {
     int version;

From 1813b982e08ea48aa6fe74df59cd41f658d8fd21 Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Sat, 3 Jun 2023 12:31:41 -0400
Subject: [PATCH 14/44] fix cuda Malloc/Free bug, formatting fixes

---
 SDL/Event.cu    |   2 +-
 SDL/Segment.cuh | 100 +++++++++++++++++++++++++++---------------------
 2 files changed, 57 insertions(+), 45 deletions(-)

diff --git a/SDL/Event.cu b/SDL/Event.cu
index ae1e9d00..694399d5 100644
--- a/SDL/Event.cu
+++ b/SDL/Event.cu
@@ -80,7 +80,7 @@ SDL::Event::~Event()
 #endif
     if(rangesInGPU != nullptr){cms::cuda::free_host(rangesInGPU);}
     if(mdsInGPU != nullptr){cms::cuda::free_host(mdsInGPU);}
-    if(segmentsInGPU!= nullptr){cms::cuda::free_host(segmentsInGPU);}
+    if(segmentsInGPU != nullptr){delete segmentsInGPU;}
     if(tripletsInGPU!= nullptr){cms::cuda::free_host(tripletsInGPU);}
     if(trackCandidatesInGPU!= nullptr){cms::cuda::free_host(trackCandidatesInGPU);}
     if(hitsInGPU!= nullptr){cms::cuda::free_host(hitsInGPU);}
diff --git a/SDL/Segment.cuh b/SDL/Segment.cuh
index 5eb74b30..065bee14 100644
--- a/SDL/Segment.cuh
+++ b/SDL/Segment.cuh
@@ -26,14 +26,14 @@ namespace SDL
 
         Buf<TAcc, unsigned int> seedIdx_buf;
         Buf<TAcc, unsigned int> mdIndices_buf;
+        Buf<TAcc, unsigned int> nMemoryLocations_buf;
         Buf<TAcc, unsigned int> innerMiniDoubletAnchorHitIndices_buf;
         Buf<TAcc, unsigned int> outerMiniDoubletAnchorHitIndices_buf;
-        Buf<TAcc, unsigned int> nMemoryLocations_buf;
 
-        Buf<TAcc, int> nSegments_buf;
-        Buf<TAcc, int> totOccupancySegments_buf;
         Buf<TAcc, int> charge_buf;
         Buf<TAcc, int> superbin_buf;
+        Buf<TAcc, int> nSegments_buf;
+        Buf<TAcc, int> totOccupancySegments_buf;
 
         Buf<TAcc, uint4> pLSHitsIdxs_buf;
 
@@ -68,14 +68,26 @@ namespace SDL
         uint16_t* innerLowerModuleIndices;
         uint16_t* outerLowerModuleIndices;
 
+        unsigned int* seedIdx;
         unsigned int* mdIndices;
         unsigned int* nMemoryLocations;
         unsigned int* innerMiniDoubletAnchorHitIndices;
         unsigned int* outerMiniDoubletAnchorHitIndices;
 
+        int* charge;
+        int* superbin;
         int* nSegments; //number of segments per inner lower module
         int* totOccupancySegments; //number of segments per inner lower module
 
+        uint4* pLSHitsIdxs;
+
+        int8_t* pixelType;
+
+        char* isQuad;
+
+        bool* isDup;
+        bool* partOfPT5;
+
         float* ptIn;
         float* ptErr;
         float* px;
@@ -84,18 +96,10 @@ namespace SDL
         float* etaErr;
         float* eta;
         float* phi;
-        int* charge;
-        unsigned int* seedIdx;
-        int* superbin;
-        int8_t* pixelType;
-        char* isQuad;
-        bool* isDup;
         float* score;
         float* circleCenterX;
         float* circleCenterY;
         float* circleRadius;
-        bool* partOfPT5;
-        uint4* pLSHitsIdxs;
 
         template<typename TQueue, typename TDevAcc>
         segments(unsigned int nMemoryLocationsIn,
@@ -103,19 +107,28 @@ namespace SDL
                     unsigned int maxPixelSegments,
                     TDevAcc const & devAccIn,
                     TQueue& queue) :
-            mdIndices_buf(allocBufWrapper<unsigned int>(devAccIn, nMemoryLocationsIn*2)),
-            innerMiniDoubletAnchorHitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, nMemoryLocationsIn)),
-            outerMiniDoubletAnchorHitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, nMemoryLocationsIn)),
-            innerLowerModuleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, nMemoryLocationsIn)),
-            outerLowerModuleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, nMemoryLocationsIn)),
-            nSegments_buf(allocBufWrapper<int>(devAccIn, nLowerModules + 1)),
-            totOccupancySegments_buf(allocBufWrapper<int>(devAccIn, nLowerModules + 1)),
             dPhis_buf(allocBufWrapper<FPX>(devAccIn, nMemoryLocationsIn)),
             dPhiMins_buf(allocBufWrapper<FPX>(devAccIn, nMemoryLocationsIn)),
             dPhiMaxs_buf(allocBufWrapper<FPX>(devAccIn, nMemoryLocationsIn)),
             dPhiChanges_buf(allocBufWrapper<FPX>(devAccIn, nMemoryLocationsIn)),
             dPhiChangeMins_buf(allocBufWrapper<FPX>(devAccIn, nMemoryLocationsIn)),
             dPhiChangeMaxs_buf(allocBufWrapper<FPX>(devAccIn, nMemoryLocationsIn)),
+            innerLowerModuleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, nMemoryLocationsIn)),
+            outerLowerModuleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, nMemoryLocationsIn)),
+            seedIdx_buf(allocBufWrapper<unsigned int>(devAccIn, maxPixelSegments)),
+            mdIndices_buf(allocBufWrapper<unsigned int>(devAccIn, nMemoryLocationsIn*2)),
+            innerMiniDoubletAnchorHitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, nMemoryLocationsIn)),
+            outerMiniDoubletAnchorHitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, nMemoryLocationsIn)),
+            nMemoryLocations_buf(allocBufWrapper<unsigned int>(devAccIn, 1)),
+            nSegments_buf(allocBufWrapper<int>(devAccIn, nLowerModules + 1)),
+            totOccupancySegments_buf(allocBufWrapper<int>(devAccIn, nLowerModules + 1)),
+            charge_buf(allocBufWrapper<int>(devAccIn, maxPixelSegments)),
+            superbin_buf(allocBufWrapper<int>(devAccIn, maxPixelSegments)),
+            pLSHitsIdxs_buf(allocBufWrapper<uint4>(devAccIn, maxPixelSegments)),
+            pixelType_buf(allocBufWrapper<int8_t>(devAccIn, maxPixelSegments)),
+            isQuad_buf(allocBufWrapper<char>(devAccIn, maxPixelSegments)),
+            isDup_buf(allocBufWrapper<bool>(devAccIn, maxPixelSegments)),
+            partOfPT5_buf(allocBufWrapper<bool>(devAccIn, maxPixelSegments)),
             ptIn_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
             ptErr_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
             px_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
@@ -124,33 +137,41 @@ namespace SDL
             etaErr_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
             eta_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
             phi_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
-            superbin_buf(allocBufWrapper<int>(devAccIn, maxPixelSegments)),
-            pixelType_buf(allocBufWrapper<int8_t>(devAccIn, maxPixelSegments)),
-            isQuad_buf(allocBufWrapper<char>(devAccIn, maxPixelSegments)),
-            isDup_buf(allocBufWrapper<bool>(devAccIn, maxPixelSegments)),
             score_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
-            charge_buf(allocBufWrapper<int>(devAccIn, maxPixelSegments)),
-            seedIdx_buf(allocBufWrapper<unsigned int>(devAccIn, maxPixelSegments)),
             circleCenterX_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
             circleCenterY_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
-            circleRadius_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
-            partOfPT5_buf(allocBufWrapper<bool>(devAccIn, maxPixelSegments)),
-            pLSHitsIdxs_buf(allocBufWrapper<uint4>(devAccIn, maxPixelSegments)),
-            nMemoryLocations_buf(allocBufWrapper<unsigned int>(devAccIn, 1))
+            circleRadius_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments))
         {
-            mdIndices = alpaka::getPtrNative(mdIndices_buf);
-            innerMiniDoubletAnchorHitIndices = alpaka::getPtrNative(innerMiniDoubletAnchorHitIndices_buf);
-            outerMiniDoubletAnchorHitIndices = alpaka::getPtrNative(outerMiniDoubletAnchorHitIndices_buf);
-            innerLowerModuleIndices = alpaka::getPtrNative(innerLowerModuleIndices_buf);
-            outerLowerModuleIndices = alpaka::getPtrNative(outerLowerModuleIndices_buf);
-            nSegments = alpaka::getPtrNative(nSegments_buf);
-            totOccupancySegments = alpaka::getPtrNative(totOccupancySegments_buf);
             dPhis = alpaka::getPtrNative(dPhis_buf);
             dPhiMins = alpaka::getPtrNative(dPhiMins_buf);
             dPhiMaxs = alpaka::getPtrNative(dPhiMaxs_buf);
             dPhiChanges = alpaka::getPtrNative(dPhiChanges_buf);
             dPhiChangeMins = alpaka::getPtrNative(dPhiChangeMins_buf);
             dPhiChangeMaxs = alpaka::getPtrNative(dPhiChangeMaxs_buf);
+
+            innerLowerModuleIndices = alpaka::getPtrNative(innerLowerModuleIndices_buf);
+            outerLowerModuleIndices = alpaka::getPtrNative(outerLowerModuleIndices_buf);
+
+            seedIdx = alpaka::getPtrNative(seedIdx_buf);
+            mdIndices = alpaka::getPtrNative(mdIndices_buf);
+            nMemoryLocations = alpaka::getPtrNative(nMemoryLocations_buf);
+            innerMiniDoubletAnchorHitIndices = alpaka::getPtrNative(innerMiniDoubletAnchorHitIndices_buf);
+            outerMiniDoubletAnchorHitIndices = alpaka::getPtrNative(outerMiniDoubletAnchorHitIndices_buf);
+
+            charge = alpaka::getPtrNative(charge_buf);
+            superbin = alpaka::getPtrNative(superbin_buf);
+            nSegments = alpaka::getPtrNative(nSegments_buf);
+            totOccupancySegments = alpaka::getPtrNative(totOccupancySegments_buf);
+
+            pLSHitsIdxs = alpaka::getPtrNative(pLSHitsIdxs_buf);
+
+            pixelType = alpaka::getPtrNative(pixelType_buf);
+
+            isQuad = alpaka::getPtrNative(isQuad_buf);
+
+            isDup = alpaka::getPtrNative(isDup_buf);
+            partOfPT5 = alpaka::getPtrNative(partOfPT5_buf);
+
             ptIn = alpaka::getPtrNative(ptIn_buf);
             ptErr = alpaka::getPtrNative(ptErr_buf);
             px = alpaka::getPtrNative(px_buf);
@@ -159,19 +180,10 @@ namespace SDL
             etaErr = alpaka::getPtrNative(etaErr_buf);
             eta = alpaka::getPtrNative(eta_buf);
             phi = alpaka::getPtrNative(phi_buf);
-            superbin = alpaka::getPtrNative(superbin_buf);
-            pixelType = alpaka::getPtrNative(pixelType_buf);
-            isQuad = alpaka::getPtrNative(isQuad_buf);
-            isDup = alpaka::getPtrNative(isDup_buf);
             score = alpaka::getPtrNative(score_buf);
-            charge = alpaka::getPtrNative(charge_buf);
-            seedIdx = alpaka::getPtrNative(seedIdx_buf);
             circleCenterX = alpaka::getPtrNative(circleCenterX_buf);
             circleCenterY = alpaka::getPtrNative(circleCenterY_buf);
             circleRadius = alpaka::getPtrNative(circleRadius_buf);
-            partOfPT5 = alpaka::getPtrNative(partOfPT5_buf);
-            pLSHitsIdxs = alpaka::getPtrNative(pLSHitsIdxs_buf);
-            nMemoryLocations = alpaka::getPtrNative(nMemoryLocations_buf);
 
             alpaka::memset(queue, nSegments_buf, 0u, nLowerModules + 1);
             alpaka::memset(queue, totOccupancySegments_buf, 0u, nLowerModules + 1);

From d7d466ed38907a70d89a1c5747e33885f7b00b2d Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Sun, 4 Jun 2023 17:57:26 -0400
Subject: [PATCH 15/44] first working hits.cu to alpaka memory

---
 SDL/Event.cu                  | 293 ++++++----------------------------
 SDL/Event.cuh                 |   8 +-
 SDL/Hit.cu                    | 149 -----------------
 SDL/Hit.cuh                   | 222 +++++++++++++++++++++++---
 SDL/LST.cc                    |   2 +-
 SDL/MiniDoublet.cuh           |  11 +-
 SDL/Segment.cuh               |  52 +++++-
 SDL/TrackCandidate.cuh        |   4 +-
 SDL/Triplet.cuh               |   3 -
 code/core/AccessHelper.cc     |  16 +-
 code/core/write_sdl_ntuple.cc |  22 ++-
 11 files changed, 332 insertions(+), 450 deletions(-)
 delete mode 100644 SDL/Hit.cu

diff --git a/SDL/Event.cu b/SDL/Event.cu
index 694399d5..cb8e4747 100644
--- a/SDL/Event.cu
+++ b/SDL/Event.cu
@@ -61,7 +61,6 @@ SDL::Event::~Event()
 {
 #ifdef CACHE_ALLOC
     if(rangesInGPU){rangesInGPU->freeMemoryCache();}
-    if(hitsInGPU){hitsInGPU->freeMemoryCache();}
     if(mdsInGPU){mdsInGPU->freeMemoryCache();}
     if(tripletsInGPU){tripletsInGPU->freeMemoryCache();}
     if(quintupletsInGPU){quintupletsInGPU->freeMemoryCache();}
@@ -70,7 +69,6 @@ SDL::Event::~Event()
     if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemoryCache();}
 #else
     if(rangesInGPU){rangesInGPU->freeMemory();}
-    if(hitsInGPU){hitsInGPU->freeMemory();}
     if(mdsInGPU){mdsInGPU->freeMemory(stream);}
     if(tripletsInGPU){tripletsInGPU->freeMemory(stream);}
     if(quintupletsInGPU){quintupletsInGPU->freeMemory(stream);}
@@ -83,19 +81,13 @@ SDL::Event::~Event()
     if(segmentsInGPU != nullptr){delete segmentsInGPU;}
     if(tripletsInGPU!= nullptr){cms::cuda::free_host(tripletsInGPU);}
     if(trackCandidatesInGPU!= nullptr){cms::cuda::free_host(trackCandidatesInGPU);}
-    if(hitsInGPU!= nullptr){cms::cuda::free_host(hitsInGPU);}
+    if(hitsInGPU!= nullptr){delete hitsInGPU;}
     if(pixelTripletsInGPU!= nullptr){cms::cuda::free_host(pixelTripletsInGPU);}
     if(pixelQuintupletsInGPU!= nullptr){cms::cuda::free_host(pixelQuintupletsInGPU);}
     if(quintupletsInGPU!= nullptr){cms::cuda::free_host(quintupletsInGPU);}
 
     if(hitsInCPU != nullptr)
     {
-        delete[] hitsInCPU->idxs;
-        delete[] hitsInCPU->xs;
-        delete[] hitsInCPU->ys;
-        delete[] hitsInCPU->zs;
-        delete[] hitsInCPU->moduleIndices;
-        delete hitsInCPU->nHits;
         delete hitsInCPU;
     }
     if(rangesInCPU != nullptr)
@@ -200,7 +192,6 @@ SDL::Event::~Event()
         delete trackCandidatesInCPU;
     }
 
-
     if(modulesInCPU != nullptr)
     {
         delete[] modulesInCPU->nLowerModules;
@@ -247,7 +238,6 @@ SDL::Event::~Event()
 void SDL::Event::resetEvent()
 {
 #ifdef CACHE_ALLOC
-    if(hitsInGPU){hitsInGPU->freeMemoryCache();}
     if(mdsInGPU){mdsInGPU->freeMemoryCache();}
     if(quintupletsInGPU){quintupletsInGPU->freeMemoryCache();}
     if(rangesInGPU){rangesInGPU->freeMemoryCache();}
@@ -256,7 +246,6 @@ void SDL::Event::resetEvent()
     if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemoryCache();}
     if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemoryCache();}
 #else
-    if(hitsInGPU){hitsInGPU->freeMemory();}
     if(quintupletsInGPU){quintupletsInGPU->freeMemory(stream);}
     if(rangesInGPU){rangesInGPU->freeMemory();}
     if(mdsInGPU){mdsInGPU->freeMemory(stream);}
@@ -284,7 +273,7 @@ void SDL::Event::resetEvent()
             n_quintuplets_by_layer_endcap_[i] = 0;
         }
     }
-    if(hitsInGPU){cms::cuda::free_host(hitsInGPU);
+    if(hitsInGPU){delete hitsInGPU;
       hitsInGPU = nullptr;}
     if(mdsInGPU){cms::cuda::free_host(mdsInGPU);
       mdsInGPU = nullptr;}
@@ -305,12 +294,6 @@ void SDL::Event::resetEvent()
 
     if(hitsInCPU != nullptr)
     {
-        delete[] hitsInCPU->idxs;
-        delete[] hitsInCPU->xs;
-        delete[] hitsInCPU->ys;
-        delete[] hitsInCPU->zs;
-        delete[] hitsInCPU->moduleIndices;
-        delete hitsInCPU->nHits;
         delete hitsInCPU;
         hitsInCPU = nullptr;
     }
@@ -321,7 +304,6 @@ void SDL::Event::resetEvent()
         delete rangesInCPU;
         rangesInCPU = nullptr;
     }
-
     if(mdsInCPU != nullptr)
     {
         delete[] mdsInCPU->anchorHitIndices;
@@ -330,13 +312,11 @@ void SDL::Event::resetEvent()
         delete mdsInCPU;
         mdsInCPU = nullptr;
     }
-
     if(segmentsInCPU != nullptr)
     {
         delete segmentsInCPU;
         segmentsInCPU = nullptr;
     }
-
     if(tripletsInCPU != nullptr)
     {
         delete[] tripletsInCPU->segmentIndices;
@@ -381,7 +361,6 @@ void SDL::Event::resetEvent()
         delete pixelTripletsInCPU;
         pixelTripletsInCPU = nullptr;
     }
-
     if(pixelQuintupletsInCPU != nullptr)
     {
         delete[] pixelQuintupletsInCPU->pixelIndices;
@@ -407,7 +386,6 @@ void SDL::Event::resetEvent()
         delete trackCandidatesInCPU;
         trackCandidatesInCPU = nullptr;
     }
-
     if(modulesInCPU != nullptr)
     {
         delete[] modulesInCPU->nLowerModules;
@@ -444,14 +422,11 @@ void SDL::Event::resetEvent()
         delete[] modulesInCPUFull->r;
         delete[] modulesInCPUFull->isInverted;
         delete[] modulesInCPUFull->isLower;
-
-
         delete[] modulesInCPUFull->moduleType;
         delete[] modulesInCPUFull->moduleLayerType;
         delete[] modulesInCPUFull;
         modulesInCPUFull = nullptr;
     }
-
 }
 
 void SDL::initModules(const char* moduleMetaDataFilePath)
@@ -480,154 +455,39 @@ void SDL::Event::resetObjectsInModule()
     resetObjectRanges(*rangesInGPU,nModules,stream);
 }
 
-ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE int binary_search(
-    unsigned int *data, // Array that we are searching over
-    unsigned int search_val, // Value we want to find in data array
-    unsigned int ndata) // Number of elements in data array
-{
-    unsigned int low = 0;
-    unsigned int high = ndata - 1;
-
-    while(low <= high)
-    {
-        unsigned int mid = (low + high)/2;
-        unsigned int test_val = data[mid];
-        if (test_val == search_val)
-            return mid;
-        else if (test_val > search_val)
-            high = mid - 1;
-        else
-            low = mid + 1;
-    }
-    // Couldn't find search value in array.
-    return -1;
-}
-
-struct moduleRangesKernel
-{
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TAcc>
-    ALPAKA_FN_ACC void operator()(
-        TAcc const & acc,
-        struct SDL::modules *modulesInGPU,
-        struct SDL::hits *hitsInGPU,
-        int const & nLowerModules) const
-    {
-        using Dim = alpaka::Dim<TAcc>;
-        using Idx = alpaka::Idx<TAcc>;
-        using Vec = alpaka::Vec<Dim, Idx>;
-
-        Vec const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        Vec const gridThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
-
-        for(int lowerIndex = globalThreadIdx[2]; lowerIndex < nLowerModules; lowerIndex += gridThreadExtent[2])
-        {
-            uint16_t upperIndex = modulesInGPU->partnerModuleIndices[lowerIndex];
-            if (hitsInGPU->hitRanges[lowerIndex * 2] != -1 && hitsInGPU->hitRanges[upperIndex * 2] != -1)
-            {
-                hitsInGPU->hitRangesLower[lowerIndex] =  hitsInGPU->hitRanges[lowerIndex * 2]; 
-                hitsInGPU->hitRangesUpper[lowerIndex] =  hitsInGPU->hitRanges[upperIndex * 2];
-                hitsInGPU->hitRangesnLower[lowerIndex] = hitsInGPU->hitRanges[lowerIndex * 2 + 1] - hitsInGPU->hitRanges[lowerIndex * 2] + 1;
-                hitsInGPU->hitRangesnUpper[lowerIndex] = hitsInGPU->hitRanges[upperIndex * 2 + 1] - hitsInGPU->hitRanges[upperIndex * 2] + 1;
-            }
-        }
-    }
-};
-
-struct hitLoopKernel
-{
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TAcc>
-    ALPAKA_FN_ACC void operator()(
-        TAcc const & acc,
-        uint16_t Endcap, // Integer corresponding to endcap in module subdets
-        uint16_t TwoS, // Integer corresponding to TwoS in moduleType
-        unsigned int nModules, // Number of modules
-        unsigned int nEndCapMap, // Number of elements in endcap map
-        unsigned int* geoMapDetId, // DetId's from endcap map
-        float* geoMapPhi, // Phi values from endcap map
-        struct SDL::modules *modulesInGPU,
-        struct SDL::hits *hitsInGPU,
-        int const & nHits) const // Total number of hits in event
-    {
-        using Dim = alpaka::Dim<TAcc>;
-        using Idx = alpaka::Idx<TAcc>;
-        using Vec = alpaka::Vec<Dim, Idx>;
-
-        Vec const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        Vec const gridThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
-
-        for(int ihit = globalThreadIdx[2]; ihit < nHits; ihit += gridThreadExtent[2])
-        {
-            float ihit_x = hitsInGPU->xs[ihit];
-            float ihit_y = hitsInGPU->ys[ihit];
-            float ihit_z = hitsInGPU->zs[ihit];
-            int iDetId = hitsInGPU->detid[ihit];
-    
-            hitsInGPU->rts[ihit] = alpaka::math::sqrt(acc, ihit_x*ihit_x + ihit_y*ihit_y);
-            hitsInGPU->phis[ihit] = SDL::phi(acc, ihit_x,ihit_y);
-            // Acosh has no supported implementation in Alpaka right now.
-            hitsInGPU->etas[ihit] = ((ihit_z>0)-(ihit_z<0)) * SDL::temp_acosh(acc, alpaka::math::sqrt(acc, ihit_x*ihit_x+ihit_y*ihit_y+ihit_z*ihit_z)/hitsInGPU->rts[ihit]);
-            int found_index = binary_search(modulesInGPU->mapdetId, iDetId, nModules);
-            uint16_t lastModuleIndex = modulesInGPU->mapIdx[found_index];
-    
-            hitsInGPU->moduleIndices[ihit] = lastModuleIndex;
-    
-            if(modulesInGPU->subdets[lastModuleIndex] == Endcap && modulesInGPU->moduleType[lastModuleIndex] == TwoS)
-            {
-                int found_index = binary_search(geoMapDetId, iDetId, nEndCapMap);
-                float phi = 0;
-                // Unclear why these are not in map, but CPU map returns phi = 0 for all exceptions.
-                if (found_index != -1)
-                    phi = geoMapPhi[found_index];
-                float cos_phi = alpaka::math::cos(acc, phi);
-                hitsInGPU->highEdgeXs[ihit] = ihit_x + 2.5f * cos_phi;
-                hitsInGPU->lowEdgeXs[ihit] = ihit_x - 2.5f * cos_phi;
-                float sin_phi = alpaka::math::sin(acc, phi);
-                hitsInGPU->highEdgeYs[ihit] = ihit_y + 2.5f * sin_phi;
-                hitsInGPU->lowEdgeYs[ihit] = ihit_y - 2.5f * sin_phi;
-            }
-            // Need to set initial value if index hasn't been seen before.
-            int old = alpaka::atomicOp<alpaka::AtomicCas>(acc, &(hitsInGPU->hitRanges[lastModuleIndex * 2]), -1, ihit);
-            // For subsequent visits, stores the min value.
-            if (old != -1)
-                alpaka::atomicOp<alpaka::AtomicMin>(acc, &hitsInGPU->hitRanges[lastModuleIndex * 2], ihit);
-
-            alpaka::atomicOp<alpaka::AtomicMax>(acc, &hitsInGPU->hitRanges[lastModuleIndex * 2 + 1], ihit);
-        }
-    }
-};
-
 void SDL::Event::addHitToEvent(std::vector<float> x, std::vector<float> y, std::vector<float> z, std::vector<unsigned int> detId, std::vector<unsigned int> idxInNtuple)
 {
     // Use the actual number of hits instead of a max.
     const int nHits = x.size();
 
+    // Needed for the memcpy to hitsInGPU below.
+    auto nHits_buf = allocBufWrapper<unsigned int>(devHost, 1);
+    *alpaka::getPtrNative(nHits_buf) = nHits;
+
     // Get current device for future use.
     cudaGetDevice(&dev);
 
     // Initialize space on device/host for next event.
     if (hitsInGPU == nullptr)
     {
-        hitsInGPU = (SDL::hits*)cms::cuda::allocate_host(sizeof(SDL::hits), stream);
-        // Unclear why but this has to be 2*nHits to avoid crashing.
-        createHitsInExplicitMemory(*hitsInGPU, nModules, 2*nHits, stream, 1);
+        hitsInGPU = new SDL::hits<Acc>(nModules, nHits, devAcc, queue);
     }
+
     if (rangesInGPU == nullptr)
     {
         rangesInGPU = (SDL::objectRanges*)cms::cuda::allocate_host(sizeof(SDL::objectRanges), stream);
-    	createRangesInExplicitMemory(*rangesInGPU, nModules, stream, nLowerModules);
+        createRangesInExplicitMemory(*rangesInGPU, nModules, stream, nLowerModules);
         resetObjectsInModule();
     }
-    cudaStreamSynchronize(stream);
+
     // Copy the host arrays to the GPU.
-    cudaMemcpyAsync(hitsInGPU->xs, &x[0], nHits*sizeof(float), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(hitsInGPU->ys, &y[0], nHits*sizeof(float), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(hitsInGPU->zs, &z[0], nHits*sizeof(float), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(hitsInGPU->detid, &detId[0], nHits*sizeof(unsigned int), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(hitsInGPU->idxs, &idxInNtuple[0], nHits*sizeof(unsigned int), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(hitsInGPU->nHits, &nHits, sizeof(unsigned int), cudaMemcpyHostToDevice, stream);
-    cudaStreamSynchronize(stream);
+    alpaka::memcpy(queue, hitsInGPU->xs_buf, x, nHits);
+    alpaka::memcpy(queue, hitsInGPU->ys_buf, y, nHits);
+    alpaka::memcpy(queue, hitsInGPU->zs_buf, z, nHits);
+    alpaka::memcpy(queue, hitsInGPU->detid_buf, detId, nHits);
+    alpaka::memcpy(queue, hitsInGPU->idxs_buf, idxInNtuple, nHits);
+    alpaka::memcpy(queue, hitsInGPU->nHits_buf, nHits_buf, 1);
+    alpaka::wait(queue);
 
     Vec const threadsPerBlock1(static_cast<Idx>(1), static_cast<Idx>(1), static_cast<Idx>(256));
     Vec const blocksPerGrid1(static_cast<Idx>(1), static_cast<Idx>(1), static_cast<Idx>(MAX_BLOCKS));
@@ -643,12 +503,11 @@ void SDL::Event::addHitToEvent(std::vector<float> x, std::vector<float> y, std::
         SDL::endcapGeometry.nEndCapMap,
         SDL::endcapGeometry.geoMapDetId,
         SDL::endcapGeometry.geoMapPhi,
-        modulesInGPU,
-        hitsInGPU,
+        *modulesInGPU,
+        *hitsInGPU,
         nHits));
 
     alpaka::enqueue(queue, hit_loop_task);
-    alpaka::wait(queue);
 
     Vec const threadsPerBlock2(static_cast<Idx>(1), static_cast<Idx>(1), static_cast<Idx>(256));
     Vec const blocksPerGrid2(static_cast<Idx>(1), static_cast<Idx>(1), static_cast<Idx>(MAX_BLOCKS));
@@ -658,8 +517,8 @@ void SDL::Event::addHitToEvent(std::vector<float> x, std::vector<float> y, std::
     auto const module_ranges_task(alpaka::createTaskKernel<Acc>(
         module_ranges_workdiv,
         module_ranges_kernel,
-        modulesInGPU,
-        hitsInGPU,
+        *modulesInGPU,
+        *hitsInGPU,
         nLowerModules));
 
     // Waiting isn't needed after second kernel call. Saves ~100 us.
@@ -668,57 +527,6 @@ void SDL::Event::addHitToEvent(std::vector<float> x, std::vector<float> y, std::
     alpaka::enqueue(queue, module_ranges_task);
 }
 
-struct addPixelSegmentToEventKernel
-{
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TAcc>
-    ALPAKA_FN_ACC void operator()(
-        TAcc const & acc,
-        struct SDL::modules& modulesInGPU,
-        struct SDL::objectRanges& rangesInGPU,
-        struct SDL::hits& hitsInGPU,
-        struct SDL::miniDoublets& mdsInGPU,
-        struct SDL::segments<TAcc>& segmentsInGPU,
-        unsigned int* hitIndices0,
-        unsigned int* hitIndices1,
-        unsigned int* hitIndices2,
-        unsigned int* hitIndices3,
-        float* dPhiChange,
-        uint16_t pixelModuleIndex,
-        const int size) const
-    {
-        using Dim = alpaka::Dim<TAcc>;
-        using Idx = alpaka::Idx<TAcc>;
-        using Vec = alpaka::Vec<Dim, Idx>;
-
-        Vec const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        Vec const gridThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
-
-        for(int tid = globalThreadIdx[2]; tid < size; tid += gridThreadExtent[2])
-        {
-            unsigned int innerMDIndex = rangesInGPU.miniDoubletModuleIndices[pixelModuleIndex] + 2*(tid);
-            unsigned int outerMDIndex = rangesInGPU.miniDoubletModuleIndices[pixelModuleIndex] + 2*(tid) +1;
-            unsigned int pixelSegmentIndex = rangesInGPU.segmentModuleIndices[pixelModuleIndex] + tid;
-
-            addMDToMemory(mdsInGPU, hitsInGPU, modulesInGPU, hitIndices0[tid], hitIndices1[tid], pixelModuleIndex, 0,0,0,0,0,0,0,0,0,innerMDIndex);
-            addMDToMemory(mdsInGPU, hitsInGPU, modulesInGPU, hitIndices2[tid], hitIndices3[tid], pixelModuleIndex, 0,0,0,0,0,0,0,0,0,outerMDIndex);
-
-            //in outer hits - pt, eta, phi
-            float slope = SDL::temp_sinh(acc, hitsInGPU.ys[mdsInGPU.outerHitIndices[innerMDIndex]]);
-            float intercept = hitsInGPU.zs[mdsInGPU.anchorHitIndices[innerMDIndex]] - slope * hitsInGPU.rts[mdsInGPU.anchorHitIndices[innerMDIndex]];
-            float score_lsq=(hitsInGPU.rts[mdsInGPU.anchorHitIndices[outerMDIndex]] * slope + intercept) - (hitsInGPU.zs[mdsInGPU.anchorHitIndices[outerMDIndex]]);
-            score_lsq = score_lsq * score_lsq;
-
-            unsigned int hits1[4];
-            hits1[0] = hitsInGPU.idxs[mdsInGPU.anchorHitIndices[innerMDIndex]];
-            hits1[1] = hitsInGPU.idxs[mdsInGPU.anchorHitIndices[outerMDIndex]];
-            hits1[2] = hitsInGPU.idxs[mdsInGPU.outerHitIndices[innerMDIndex]];
-            hits1[3] = hitsInGPU.idxs[mdsInGPU.outerHitIndices[outerMDIndex]];
-            addPixelSegmentToMemory(acc, segmentsInGPU, mdsInGPU, innerMDIndex, outerMDIndex, pixelModuleIndex, hits1, hitIndices0[tid], hitIndices2[tid], dPhiChange[tid], pixelSegmentIndex, tid, score_lsq);
-        }
-    }
-};
-
 void SDL::Event::addPixelSegmentToEvent(std::vector<unsigned int> hitIndices0,std::vector<unsigned int> hitIndices1,std::vector<unsigned int> hitIndices2,std::vector<unsigned int> hitIndices3, std::vector<float> dPhiChange, std::vector<float> ptIn, std::vector<float> ptErr, std::vector<float> px, std::vector<float> py, std::vector<float> pz, std::vector<float> eta, std::vector<float> etaErr, std::vector<float> phi, std::vector<int> charge, std::vector<unsigned int> seedIdx, std::vector<int> superbin, std::vector<int8_t> pixelType, std::vector<char> isQuad)
 {
     const int size = ptIn.size();
@@ -1016,7 +824,6 @@ void SDL::Event::createMiniDoublets()
     {
         addMiniDoubletsToEventExplicit();
     }
-
 }
 
 void SDL::Event::createSegmentsWithModuleMap()
@@ -1414,7 +1221,6 @@ void SDL::Event::createPixelTriplets()
     cms::cuda::free_device(dev, connectedPixelSize_dev);
     cms::cuda::free_device(dev, connectedPixelIndex_dev);
 
-
 #ifdef Warnings
     int nPixelTriplets;
     cudaMemcpyAsync(&nPixelTriplets, pixelTripletsInGPU->nPixelTriplets,  sizeof(int), cudaMemcpyDeviceToHost,stream);
@@ -1916,6 +1722,7 @@ int SDL::Event::getNumberOfPixelQuintuplets()
     cudaStreamSynchronize(stream);
     return nPixelQuintuplets;
 }
+
 unsigned int SDL::Event::getNumberOfQuintuplets()
 {
     unsigned int quintuplets = 0;
@@ -2003,45 +1810,43 @@ int SDL::Event::getNumberOfT5TrackCandidates()
     return nTrackCandidatesT5; 
 }
 
-SDL::hits* SDL::Event::getHits() //std::shared_ptr should take care of garbage collection
+SDL::hits<alpaka::DevCpu>* SDL::Event::getHits() //std::shared_ptr should take care of garbage collection
 {
     if(hitsInCPU == nullptr)
     {
-        hitsInCPU = new SDL::hits;
-        hitsInCPU->nHits = new unsigned int;
-        unsigned int nHits;
-        cudaMemcpyAsync(&nHits, hitsInGPU->nHits, sizeof(unsigned int), cudaMemcpyDeviceToHost,stream);
-        cudaStreamSynchronize(stream);
-        *(hitsInCPU->nHits) = nHits;
-        hitsInCPU->idxs = new unsigned int[nHits];
-        hitsInCPU->detid = new unsigned int[nHits];
-        hitsInCPU->xs = new float[nHits];
-        hitsInCPU->ys = new float[nHits];
-        hitsInCPU->zs = new float[nHits];
-        hitsInCPU->moduleIndices = new uint16_t[nHits];
-        cudaMemcpyAsync(hitsInCPU->idxs, hitsInGPU->idxs,sizeof(unsigned int) * nHits, cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(hitsInCPU->detid, hitsInGPU->detid, sizeof(unsigned int) * nHits, cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(hitsInCPU->xs, hitsInGPU->xs, sizeof(float) * nHits, cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(hitsInCPU->ys, hitsInGPU->ys, sizeof(float) * nHits, cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(hitsInCPU->zs, hitsInGPU->zs, sizeof(float) * nHits, cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(hitsInCPU->moduleIndices, hitsInGPU->moduleIndices, sizeof(uint16_t) * nHits, cudaMemcpyDeviceToHost,stream);
-        cudaStreamSynchronize(stream);
+        auto nHits_buf = allocBufWrapper<unsigned int>(devHost, 1);
+        alpaka::memcpy(queue, nHits_buf, hitsInGPU->nHits_buf, 1);
+        alpaka::wait(queue);
+
+        unsigned int nHits = *alpaka::getPtrNative(nHits_buf);
+        hitsInCPU = new SDL::hits<alpaka::DevCpu>(nModules, nHits, devHost, queue);
+
+        *alpaka::getPtrNative(hitsInCPU->nHits_buf) = nHits;
+        alpaka::memcpy(queue, hitsInCPU->idxs_buf, hitsInGPU->idxs_buf, nHits);
+        alpaka::memcpy(queue, hitsInCPU->detid_buf, hitsInGPU->detid_buf, nHits);
+        alpaka::memcpy(queue, hitsInCPU->xs_buf, hitsInGPU->xs_buf, nHits);
+        alpaka::memcpy(queue, hitsInCPU->ys_buf, hitsInGPU->ys_buf, nHits);
+        alpaka::memcpy(queue, hitsInCPU->zs_buf, hitsInGPU->zs_buf, nHits);
+        alpaka::memcpy(queue, hitsInCPU->moduleIndices_buf, hitsInGPU->moduleIndices_buf, nHits);
+        alpaka::wait(queue);
     }
     return hitsInCPU;
 }
 
-SDL::hits* SDL::Event::getHitsInCMSSW()
+SDL::hits<alpaka::DevCpu>* SDL::Event::getHitsInCMSSW()
 {
     if(hitsInCPU == nullptr)
     {
-        hitsInCPU = new SDL::hits;
-        hitsInCPU->nHits = new unsigned int;
-        unsigned int nHits;
-        cudaMemcpyAsync(&nHits, hitsInGPU->nHits, sizeof(unsigned int), cudaMemcpyDeviceToHost,stream);
-        cudaStreamSynchronize(stream);
-        hitsInCPU->idxs = new unsigned int[nHits];
-        cudaMemcpyAsync(hitsInCPU->idxs, hitsInGPU->idxs,sizeof(unsigned int) * nHits, cudaMemcpyDeviceToHost,stream);
-        cudaStreamSynchronize(stream);
+        auto nHits_buf = allocBufWrapper<unsigned int>(devHost, 1);
+        alpaka::memcpy(queue, nHits_buf, hitsInGPU->nHits_buf, 1);
+        alpaka::wait(queue);
+
+        unsigned int nHits = *alpaka::getPtrNative(nHits_buf);
+        hitsInCPU = new SDL::hits<alpaka::DevCpu>(nModules, nHits, devHost, queue);
+
+        *alpaka::getPtrNative(hitsInCPU->nHits_buf) = nHits;
+        alpaka::memcpy(queue, hitsInCPU->idxs_buf, hitsInGPU->idxs_buf, nHits);
+        alpaka::wait(queue);
     }
     return hitsInCPU;
 }
diff --git a/SDL/Event.cuh b/SDL/Event.cuh
index f5b8327c..4b431d33 100644
--- a/SDL/Event.cuh
+++ b/SDL/Event.cuh
@@ -40,7 +40,7 @@ namespace SDL
         int dev;
         int nTotalSegments;
         struct objectRanges* rangesInGPU;
-        struct hits* hitsInGPU;
+        struct hits<Acc>* hitsInGPU;
         struct miniDoublets* mdsInGPU;
         struct segments<Acc>* segmentsInGPU;
         struct triplets* tripletsInGPU;
@@ -51,7 +51,7 @@ namespace SDL
 
         //CPU interface stuff
         objectRanges* rangesInCPU;
-        hits* hitsInCPU;
+        hits<alpaka::DevCpu>* hitsInCPU;
         miniDoublets* mdsInCPU;
         segments<alpaka::DevCpu>* segmentsInCPU;
         triplets* tripletsInCPU;
@@ -130,8 +130,8 @@ namespace SDL
         unsigned int getNumberOfT3T3ExtendedTracks();
 
         objectRanges* getRanges();
-        hits* getHits();
-        hits* getHitsInCMSSW();
+        hits<alpaka::DevCpu>* getHits();
+        hits<alpaka::DevCpu>* getHitsInCMSSW();
         miniDoublets* getMiniDoublets();
         segments<alpaka::DevCpu>* getSegments() ;
         triplets* getTriplets();
diff --git a/SDL/Hit.cu b/SDL/Hit.cu
deleted file mode 100644
index 79f887e0..00000000
--- a/SDL/Hit.cu
+++ /dev/null
@@ -1,149 +0,0 @@
-# include "Hit.cuh"
-
-SDL::hits::hits()
-{
-    nHits = nullptr;
-    xs = nullptr;
-    ys = nullptr;
-    zs = nullptr;
-    moduleIndices = nullptr;
-    detid = nullptr;
-    rts = nullptr;
-    phis = nullptr;
-    etas = nullptr;
-    highEdgeXs = nullptr;
-    highEdgeYs = nullptr;
-    lowEdgeXs = nullptr;
-    lowEdgeYs = nullptr;
-    hitRanges = nullptr;
-    hitRangesLower = nullptr;
-    hitRangesUpper = nullptr;
-    hitRangesnLower = nullptr;
-    hitRangesnUpper = nullptr;
-}
-
-SDL::hits::~hits()
-{
-}
-
-void SDL::createHitsInExplicitMemory(struct hits& hitsInGPU, int nModules, unsigned int nMaxHits,cudaStream_t stream,unsigned int evtnum)
-{
-#if defined(CACHE_ALLOC)
-    int dev;
-    cudaGetDevice(&dev);
-    hitsInGPU.xs = (float*)cms::cuda::allocate_device(dev,nMaxHits*sizeof(float),stream);
-    hitsInGPU.ys = (float*)cms::cuda::allocate_device(dev,nMaxHits*sizeof(float),stream);
-    hitsInGPU.zs = (float*)cms::cuda::allocate_device(dev,nMaxHits*sizeof(float),stream);
-
-    hitsInGPU.rts = (float*)cms::cuda::allocate_device(dev,nMaxHits*sizeof(float),stream);
-    hitsInGPU.phis = (float*)cms::cuda::allocate_device(dev,nMaxHits*sizeof(float),stream);
-    hitsInGPU.etas = (float*)cms::cuda::allocate_device(dev,nMaxHits*sizeof(float),stream);
-
-    hitsInGPU.moduleIndices = (uint16_t*)cms::cuda::allocate_device(dev,nMaxHits*sizeof(uint16_t),stream);
-    hitsInGPU.idxs = (unsigned int*)cms::cuda::allocate_device(dev,nMaxHits*sizeof(unsigned int),stream);
-    hitsInGPU.detid = (unsigned int*)cms::cuda::allocate_device(dev,nMaxHits*sizeof(unsigned int),stream);
-
-    hitsInGPU.highEdgeXs = (float*)cms::cuda::allocate_device(dev,nMaxHits*sizeof(float),stream);
-    hitsInGPU.highEdgeYs = (float*)cms::cuda::allocate_device(dev,nMaxHits*sizeof(float),stream);
-    hitsInGPU.lowEdgeXs = (float*)cms::cuda::allocate_device(dev,nMaxHits*sizeof(float),stream);
-    hitsInGPU.lowEdgeYs = (float*)cms::cuda::allocate_device(dev,nMaxHits*sizeof(float),stream);
-
-    hitsInGPU.nHits = (unsigned int*)cms::cuda::allocate_device(dev,evtnum*sizeof(unsigned int),stream);
-
-    hitsInGPU.hitRanges =                  (int*)cms::cuda::allocate_device(dev,         evtnum*nModules * 2 * sizeof(int),stream);
-    hitsInGPU.hitRangesLower =                  (int*)cms::cuda::allocate_device(dev,    evtnum*nModules * sizeof(int),stream);
-    hitsInGPU.hitRangesUpper =                  (int*)cms::cuda::allocate_device(dev,    evtnum*nModules * sizeof(int),stream);
-    hitsInGPU.hitRangesnLower =                  (int8_t*)cms::cuda::allocate_device(dev,evtnum*nModules * sizeof(int8_t),stream);
-    hitsInGPU.hitRangesnUpper =                  (int8_t*)cms::cuda::allocate_device(dev,evtnum*nModules * sizeof(int8_t),stream);
-#else
-    cudaMalloc(&hitsInGPU.xs, nMaxHits * sizeof(float));
-    cudaMalloc(&hitsInGPU.ys, nMaxHits * sizeof(float));
-    cudaMalloc(&hitsInGPU.zs, nMaxHits * sizeof(float));
-
-    cudaMalloc(&hitsInGPU.moduleIndices, nMaxHits * sizeof(uint16_t));
-    cudaMalloc(&hitsInGPU.idxs, nMaxHits * sizeof(unsigned int));
-    cudaMalloc(&hitsInGPU.detid, nMaxHits * sizeof(unsigned int));
-
-    cudaMalloc(&hitsInGPU.rts, nMaxHits * sizeof(float));
-    cudaMalloc(&hitsInGPU.phis, nMaxHits * sizeof(float));
-    cudaMalloc(&hitsInGPU.etas, nMaxHits * sizeof(float));
-
-    cudaMalloc(&hitsInGPU.highEdgeXs, nMaxHits * sizeof(float));
-    cudaMalloc(&hitsInGPU.highEdgeYs, nMaxHits * sizeof(float));
-    cudaMalloc(&hitsInGPU.lowEdgeXs, nMaxHits * sizeof(float));
-    cudaMalloc(&hitsInGPU.lowEdgeYs, nMaxHits * sizeof(float));
-
-    //counters
-    cudaMalloc(&hitsInGPU.nHits,evtnum* sizeof(unsigned int));
-
-    cudaMalloc(&hitsInGPU.hitRanges,evtnum*nModules * 2 * sizeof(int));
-    cudaMalloc(&hitsInGPU.hitRangesLower,evtnum*nModules  * sizeof(int));
-    cudaMalloc(&hitsInGPU.hitRangesUpper,evtnum*nModules  * sizeof(int));
-    cudaMalloc(&hitsInGPU.hitRangesnLower,evtnum*nModules  * sizeof(int8_t));
-    cudaMalloc(&hitsInGPU.hitRangesnUpper,evtnum* nModules  * sizeof(int8_t));
-#endif
-    cudaMemsetAsync(hitsInGPU.nHits,0,evtnum*sizeof(unsigned int),stream);
-    cudaMemsetAsync(hitsInGPU.hitRanges, -1,      evtnum*nModules*2*sizeof(int),stream);
-    cudaMemsetAsync(hitsInGPU.hitRangesLower, -1, evtnum*nModules*sizeof(int),stream);
-    cudaMemsetAsync(hitsInGPU.hitRangesUpper, -1, evtnum*nModules*sizeof(int),stream);
-    cudaMemsetAsync(hitsInGPU.hitRangesnLower, -1,evtnum*nModules*sizeof(int8_t),stream);
-    cudaMemsetAsync(hitsInGPU.hitRangesnUpper, -1,evtnum*nModules*sizeof(int8_t),stream);
-    cudaStreamSynchronize(stream);
-}
-
-void SDL::printHit(struct hits& hitsInGPU, struct modules& modulesInGPU, unsigned int hitIndex)
-{
-    std::cout << "Hit(x=" << hitsInGPU.xs[hitIndex] << ", y=" << hitsInGPU.ys[hitIndex] << ", z=" << hitsInGPU.zs[hitIndex] << ", rt=" << hitsInGPU.rts[hitIndex] << ", phi=" << hitsInGPU.phis[hitIndex] <<", module subdet = "<<modulesInGPU.subdets[hitsInGPU.moduleIndices[hitIndex]]<<", module layer = "<< modulesInGPU.layers[hitsInGPU.moduleIndices[hitIndex]]<<", module ring = "<< modulesInGPU.rings[hitsInGPU.moduleIndices[hitIndex]]<<" )"<<std::endl;
-}
-
-void SDL::hits::freeMemoryCache()
-{
-    int dev;
-    cudaGetDevice(&dev);
-    cms::cuda::free_device(dev,nHits);
-    cms::cuda::free_device(dev,xs);
-    cms::cuda::free_device(dev,ys);
-    cms::cuda::free_device(dev,zs);
-    cms::cuda::free_device(dev,moduleIndices);
-    cms::cuda::free_device(dev,rts);
-    cms::cuda::free_device(dev,idxs);
-    cms::cuda::free_device(dev,detid);
-    cms::cuda::free_device(dev,phis);
-    cms::cuda::free_device(dev,etas);
-
-    cms::cuda::free_device(dev,highEdgeXs);
-    cms::cuda::free_device(dev,highEdgeYs);
-    cms::cuda::free_device(dev,lowEdgeXs);
-    cms::cuda::free_device(dev,lowEdgeYs);
-    
-    cms::cuda::free_device(dev,hitRanges);
-    cms::cuda::free_device(dev,hitRangesLower);
-    cms::cuda::free_device(dev,hitRangesnLower);
-    cms::cuda::free_device(dev,hitRangesUpper);
-    cms::cuda::free_device(dev,hitRangesnUpper);
-}
-
-void SDL::hits::freeMemory()
-{
-    cudaFree(nHits);
-    cudaFree(xs);
-    cudaFree(ys);
-    cudaFree(zs);
-    cudaFree(moduleIndices);
-    cudaFree(rts);
-    cudaFree(idxs);
-    cudaFree(detid);
-    cudaFree(phis);
-    cudaFree(etas);
-
-    cudaFree(highEdgeXs);
-    cudaFree(highEdgeYs);
-    cudaFree(lowEdgeXs);
-    cudaFree(lowEdgeYs);
-    
-    cudaFree(hitRanges);
-    cudaFree(hitRangesLower);
-    cudaFree(hitRangesnLower);
-    cudaFree(hitRangesUpper);
-    cudaFree(hitRangesnUpper);
-}
\ No newline at end of file
diff --git a/SDL/Hit.cuh b/SDL/Hit.cuh
index 12fc7bf2..a2502d11 100644
--- a/SDL/Hit.cuh
+++ b/SDL/Hit.cuh
@@ -1,48 +1,107 @@
 #ifndef Hit_cuh
 #define Hit_cuh
 
-#include <iostream>
-
 #include "Constants.cuh"
 #include "Module.cuh"
-#include "allocate.h"
 
 namespace SDL
 {
+    template<typename TAcc>
     struct hits
     {
-        unsigned int *nHits; //single number
-        float *xs;
-        float *ys;
-        float *zs;
+        Buf<TAcc, unsigned int> nHits_buf;
+        Buf<TAcc, float> xs_buf;
+        Buf<TAcc, float> ys_buf;
+        Buf<TAcc, float> zs_buf;
+        Buf<TAcc, uint16_t> moduleIndices_buf;
+        Buf<TAcc, unsigned int> idxs_buf;
+        Buf<TAcc, unsigned int> detid_buf;
+        Buf<TAcc, float> rts_buf;
+        Buf<TAcc, float> phis_buf;
+        Buf<TAcc, float> etas_buf;
+        Buf<TAcc, float> highEdgeXs_buf;
+        Buf<TAcc, float> highEdgeYs_buf;
+        Buf<TAcc, float> lowEdgeXs_buf;
+        Buf<TAcc, float> lowEdgeYs_buf;
+        Buf<TAcc, int> hitRanges_buf;
+        Buf<TAcc, int> hitRangesLower_buf;
+        Buf<TAcc, int> hitRangesUpper_buf;
+        Buf<TAcc, int8_t> hitRangesnLower_buf;
+        Buf<TAcc, int8_t> hitRangesnUpper_buf;
 
+        unsigned int* nHits;
+        float* xs;
+        float* ys;
+        float* zs;
         uint16_t* moduleIndices;
         unsigned int* idxs;
         unsigned int* detid;
-        
-        float *rts;
+        float* rts;
         float* phis;
         float* etas;
-
-        float *highEdgeXs;
-        float *highEdgeYs;
-        float *lowEdgeXs;
-        float *lowEdgeYs;
-
+        float* highEdgeXs;
+        float* highEdgeYs;
+        float* lowEdgeXs;
+        float* lowEdgeYs;
         int* hitRanges;
         int* hitRangesLower;
         int* hitRangesUpper;
         int8_t* hitRangesnLower;
         int8_t* hitRangesnUpper;
-        
-        hits();
-        void freeMemory();
-        void freeMemoryCache();
-        ~hits();
-    };
 
-    void printHit(struct hits& hitsInGPU, struct modules& modulesInGPU, unsigned int hitIndex);
-    void createHitsInExplicitMemory(struct hits& hitsInGPU, int nModules, unsigned int maxHits,cudaStream_t stream,unsigned int evtnum);
+        template<typename TQueue, typename TDevAcc>
+        hits(unsigned int nModules,
+             unsigned int nMaxHits,
+             TDevAcc const & devAccIn,
+             TQueue& queue) :
+            nHits_buf(allocBufWrapper<unsigned int>(devAccIn, 1)),
+            xs_buf(allocBufWrapper<float>(devAccIn, nMaxHits)),
+            ys_buf(allocBufWrapper<float>(devAccIn, nMaxHits)),
+            zs_buf(allocBufWrapper<float>(devAccIn, nMaxHits)),
+            moduleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, nMaxHits)),
+            idxs_buf(allocBufWrapper<unsigned int>(devAccIn, nMaxHits)),
+            detid_buf(allocBufWrapper<unsigned int>(devAccIn, nMaxHits)),
+            rts_buf(allocBufWrapper<float>(devAccIn, nMaxHits)),
+            phis_buf(allocBufWrapper<float>(devAccIn, nMaxHits)),
+            etas_buf(allocBufWrapper<float>(devAccIn, nMaxHits)),
+            highEdgeXs_buf(allocBufWrapper<float>(devAccIn, nMaxHits)),
+            highEdgeYs_buf(allocBufWrapper<float>(devAccIn, nMaxHits)),
+            lowEdgeXs_buf(allocBufWrapper<float>(devAccIn, nMaxHits)),
+            lowEdgeYs_buf(allocBufWrapper<float>(devAccIn, nMaxHits)),
+            hitRanges_buf(allocBufWrapper<int>(devAccIn, nModules*2)),
+            hitRangesLower_buf(allocBufWrapper<int>(devAccIn, nModules)),
+            hitRangesUpper_buf(allocBufWrapper<int>(devAccIn, nModules)),
+            hitRangesnLower_buf(allocBufWrapper<int8_t>(devAccIn, nModules)),
+            hitRangesnUpper_buf(allocBufWrapper<int8_t>(devAccIn, nModules))
+        {
+            nHits = alpaka::getPtrNative(nHits_buf);
+            xs = alpaka::getPtrNative(xs_buf);
+            ys = alpaka::getPtrNative(ys_buf);
+            zs = alpaka::getPtrNative(zs_buf);
+            moduleIndices = alpaka::getPtrNative(moduleIndices_buf);
+            idxs = alpaka::getPtrNative(idxs_buf);
+            detid = alpaka::getPtrNative(detid_buf);
+            rts = alpaka::getPtrNative(rts_buf);
+            phis = alpaka::getPtrNative(phis_buf);
+            etas = alpaka::getPtrNative(etas_buf);
+            highEdgeXs = alpaka::getPtrNative(highEdgeXs_buf);
+            highEdgeYs = alpaka::getPtrNative(highEdgeYs_buf);
+            lowEdgeXs = alpaka::getPtrNative(lowEdgeXs_buf);
+            lowEdgeYs = alpaka::getPtrNative(lowEdgeYs_buf);
+            hitRanges = alpaka::getPtrNative(hitRanges_buf);
+            hitRangesLower = alpaka::getPtrNative(hitRangesLower_buf);
+            hitRangesUpper = alpaka::getPtrNative(hitRangesUpper_buf);
+            hitRangesnLower = alpaka::getPtrNative(hitRangesnLower_buf);
+            hitRangesnUpper = alpaka::getPtrNative(hitRangesnUpper_buf);
+
+            alpaka::memset(queue, hitRanges_buf, -1, nModules*2);
+            alpaka::memset(queue, hitRangesLower_buf, -1, nModules);
+            alpaka::memset(queue, hitRangesUpper_buf, -1, nModules);
+            alpaka::memset(queue, hitRangesnLower_buf, -1, nModules);
+            alpaka::memset(queue, hitRangesnUpper_buf, -1, nModules);
+            alpaka::wait(queue);
+        }
+    };
 
     // Hyperbolic functions were just merged into Alpaka early 2023,
     // so we have to make use of temporary functions for now.
@@ -123,5 +182,122 @@ namespace SDL
 
         return dPhi;
     };
+
+    // Binary search; assumes data is sorted ascending. Returns the match index or -1.
+    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE int binary_search(
+        unsigned int *data, // Array that we are searching over
+        unsigned int search_val, // Value we want to find in data array
+        unsigned int ndata) // Number of elements in data array
+    {
+        // Half-open range [low, high): the unsigned bounds can never wrap below
+        // zero, so ndata == 0 or a search_val below data[0] terminates cleanly.
+        unsigned int low = 0, high = ndata;
+        while(low < high)
+        {
+            unsigned int mid = low + (high - low)/2; // Cannot overflow, unlike (low + high)/2.
+            unsigned int test_val = data[mid];
+            if (test_val == search_val)
+                return mid;
+            else if (test_val > search_val)
+                high = mid;
+            else
+                low = mid + 1;
+        }
+        return -1; // Couldn't find search value in array.
+    };
+
+    struct moduleRangesKernel // Copies per-module hit ranges into arrays indexed by lower module.
+    {
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename TAcc>
+        ALPAKA_FN_ACC void operator()(
+            TAcc const & acc,
+            struct SDL::modules& modulesInGPU, // Read here: partnerModuleIndices.
+            struct SDL::hits<TAcc>& hitsInGPU, // Read: hitRanges. Written: hitRangesLower/Upper/nLower/nUpper.
+            int const & nLowerModules) const // Exclusive upper bound of the lowerIndex loop.
+        {
+            using Dim = alpaka::Dim<TAcc>;
+            using Idx = alpaka::Idx<TAcc>;
+            using Vec = alpaka::Vec<Dim, Idx>;
+            // Each thread starts at its global index in dimension 2 and advances by the grid's thread extent.
+            Vec const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+            Vec const gridThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
+            // hitRanges holds two ints per module, at [2*m] and [2*m + 1]; [2*m] is compared against -1 below.
+            for(int lowerIndex = globalThreadIdx[2]; lowerIndex < nLowerModules; lowerIndex += gridThreadExtent[2])
+            {
+                uint16_t upperIndex = modulesInGPU.partnerModuleIndices[lowerIndex];
+                if (hitsInGPU.hitRanges[lowerIndex * 2] != -1 && hitsInGPU.hitRanges[upperIndex * 2] != -1) // Both this module and its partner must have a range.
+                {
+                    hitsInGPU.hitRangesLower[lowerIndex] =  hitsInGPU.hitRanges[lowerIndex * 2]; // First entry of the lower module's pair.
+                    hitsInGPU.hitRangesUpper[lowerIndex] =  hitsInGPU.hitRanges[upperIndex * 2]; // First entry of the partner module's pair.
+                    hitsInGPU.hitRangesnLower[lowerIndex] = hitsInGPU.hitRanges[lowerIndex * 2 + 1] - hitsInGPU.hitRanges[lowerIndex * 2] + 1; // Inclusive length: second - first + 1.
+                    hitsInGPU.hitRangesnUpper[lowerIndex] = hitsInGPU.hitRanges[upperIndex * 2 + 1] - hitsInGPU.hitRanges[upperIndex * 2] + 1; // NOTE(review): the n* buffers are allocated as int8_t; confirm counts cannot exceed 127.
+                }
+            }
+        }
+    };
+
+    struct hitLoopKernel // Fills per-hit derived values (rt, phi, eta, module index, edge points) and per-module hit index ranges.
+    {
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename TAcc>
+        ALPAKA_FN_ACC void operator()(
+            TAcc const & acc,
+            uint16_t Endcap, // Integer corresponding to endcap in module subdets
+            uint16_t TwoS, // Integer corresponding to TwoS in moduleType
+            unsigned int nModules, // Number of modules
+            unsigned int nEndCapMap, // Number of elements in endcap map
+            unsigned int* geoMapDetId, // DetId's from endcap map
+            float* geoMapPhi, // Phi values from endcap map
+            struct SDL::modules& modulesInGPU,
+            struct SDL::hits<TAcc>& hitsInGPU,
+            unsigned int const & nHits) const // Total number of hits in event
+        {
+            using Dim = alpaka::Dim<TAcc>;
+            using Idx = alpaka::Idx<TAcc>;
+            using Vec = alpaka::Vec<Dim, Idx>;
+            // Each thread starts at its global index in dimension 2 and advances by the grid's thread extent.
+            Vec const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+            Vec const gridThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
+            for(int ihit = globalThreadIdx[2]; ihit < nHits; ihit += gridThreadExtent[2]) // NOTE(review): signed ihit is compared against unsigned nHits.
+            {
+                float ihit_x = hitsInGPU.xs[ihit];
+                float ihit_y = hitsInGPU.ys[ihit];
+                float ihit_z = hitsInGPU.zs[ihit];
+                int iDetId = hitsInGPU.detid[ihit]; // NOTE(review): held as int but passed to binary_search's unsigned parameter below.
+                // rt = sqrt(x^2 + y^2); phi comes from SDL::phi(x, y).
+                hitsInGPU.rts[ihit] = alpaka::math::sqrt(acc, ihit_x*ihit_x + ihit_y*ihit_y);
+                hitsInGPU.phis[ihit] = SDL::phi(acc, ihit_x,ihit_y);
+                // eta = sign(z) * acosh(sqrt(x^2 + y^2 + z^2) / rt). Acosh has no supported implementation in Alpaka right now.
+                hitsInGPU.etas[ihit] = ((ihit_z>0)-(ihit_z<0)) * SDL::temp_acosh(acc, alpaka::math::sqrt(acc, ihit_x*ihit_x+ihit_y*ihit_y+ihit_z*ihit_z)/hitsInGPU.rts[ihit]);
+                int found_index = binary_search(modulesInGPU.mapdetId, iDetId, nModules);
+                uint16_t lastModuleIndex = modulesInGPU.mapIdx[found_index]; // NOTE(review): found_index is not checked for -1 here (it is for the endcap lookup below); confirm every detid is present in mapdetId.
+                // Record which module this hit was resolved to.
+                hitsInGPU.moduleIndices[ihit] = lastModuleIndex;
+                // Only Endcap TwoS modules get edge points: the hit offset by +/-2.5 along (cos phi, sin phi).
+                if(modulesInGPU.subdets[lastModuleIndex] == Endcap && modulesInGPU.moduleType[lastModuleIndex] == TwoS)
+                {
+                    found_index = binary_search(geoMapDetId, iDetId, nEndCapMap);
+                    float phi = 0;
+                    // Unclear why these are not in map, but CPU map returns phi = 0 for all exceptions.
+                    if (found_index != -1)
+                        phi = geoMapPhi[found_index];
+                    float cos_phi = alpaka::math::cos(acc, phi);
+                    hitsInGPU.highEdgeXs[ihit] = ihit_x + 2.5f * cos_phi;
+                    hitsInGPU.lowEdgeXs[ihit] = ihit_x - 2.5f * cos_phi;
+                    float sin_phi = alpaka::math::sin(acc, phi);
+                    hitsInGPU.highEdgeYs[ihit] = ihit_y + 2.5f * sin_phi;
+                    hitsInGPU.lowEdgeYs[ihit] = ihit_y - 2.5f * sin_phi;
+                }
+                // Need to set initial value if index hasn't been seen before (compare-and-swap against -1).
+                int old = alpaka::atomicOp<alpaka::AtomicCas>(acc, &(hitsInGPU.hitRanges[lastModuleIndex * 2]), -1, ihit);
+                // For subsequent visits, stores the min value.
+                if (old != -1)
+                    alpaka::atomicOp<alpaka::AtomicMin>(acc, &hitsInGPU.hitRanges[lastModuleIndex * 2], ihit);
+                // The second slot of the pair keeps the largest hit index seen for this module.
+                alpaka::atomicOp<alpaka::AtomicMax>(acc, &hitsInGPU.hitRanges[lastModuleIndex * 2 + 1], ihit);
+            }
+        }
+    };
 }
 #endif
\ No newline at end of file
diff --git a/SDL/LST.cc b/SDL/LST.cc
index 23253a95..83481428 100644
--- a/SDL/LST.cc
+++ b/SDL/LST.cc
@@ -403,7 +403,7 @@ void SDL::LST::getOutput(SDL::Event& event) {
     std::vector<int> tc_seedIdx_;
     std::vector<short> tc_trackCandidateType_;
 
-    SDL::hits& hitsInGPU = (*event.getHitsInCMSSW());
+    SDL::hits<alpaka::DevCpu>& hitsInGPU = (*event.getHitsInCMSSW());
     SDL::trackCandidates& trackCandidatesInGPU = (*event.getTrackCandidatesInCMSSW());
 
     unsigned int nTrackCandidates = *trackCandidatesInGPU.nTrackCandidates;
diff --git a/SDL/MiniDoublet.cuh b/SDL/MiniDoublet.cuh
index 4f136336..cfc81f55 100644
--- a/SDL/MiniDoublet.cuh
+++ b/SDL/MiniDoublet.cuh
@@ -68,7 +68,8 @@ namespace SDL
 
     void createMDsInExplicitMemory(struct miniDoublets& mdsInGPU, unsigned int maxMDs,uint16_t nLowerModules, unsigned int maxPixelMDs,cudaStream_t stream);
 
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addMDToMemory(struct miniDoublets& mdsInGPU, struct hits& hitsInGPU, struct modules& modulesInGPU, unsigned int lowerHitIdx, unsigned int upperHitIdx, uint16_t& lowerModuleIdx, float dz, float dPhi, float dPhiChange, float shiftedX, float shiftedY, float shiftedZ, float noShiftedDz, float noShiftedDphi, float noShiftedDPhiChange, unsigned int idx)
+    template<typename TAcc>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addMDToMemory(struct miniDoublets& mdsInGPU, struct SDL::hits<TAcc>& hitsInGPU, struct modules& modulesInGPU, unsigned int lowerHitIdx, unsigned int upperHitIdx, uint16_t& lowerModuleIdx, float dz, float dPhi, float dPhiChange, float shiftedX, float shiftedY, float shiftedZ, float noShiftedDz, float noShiftedDphi, float noShiftedDPhiChange, unsigned int idx)
     {
         //the index into which this MD needs to be written will be computed in the kernel
         //nMDs variable will be incremented in the kernel, no need to worry about that here
@@ -659,7 +660,7 @@ namespace SDL
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
                 struct SDL::modules& modulesInGPU,
-                struct SDL::hits& hitsInGPU,
+                struct SDL::hits<TAcc>& hitsInGPU,
                 struct SDL::miniDoublets& mdsInGPU,
                 struct SDL::objectRanges& rangesInGPU) const
         {
@@ -676,6 +677,10 @@ namespace SDL
                 int nLowerHits = hitsInGPU.hitRangesnLower[lowerModuleIndex];
                 int nUpperHits = hitsInGPU.hitRangesnUpper[lowerModuleIndex];
                 if(hitsInGPU.hitRangesLower[lowerModuleIndex] == -1) continue;
+                if(hitsInGPU.hitRangesLower[lowerModuleIndex] == -1)
+                {
+                    printf("IS THIS EVER RUN");
+                }
                 const int maxHits = alpaka::math::max(acc, nUpperHits, nLowerHits);
                 unsigned int upHitArrayIndex = hitsInGPU.hitRangesUpper[lowerModuleIndex];
                 unsigned int loHitArrayIndex = hitsInGPU.hitRangesLower[lowerModuleIndex];
@@ -799,7 +804,7 @@ namespace SDL
                 struct modules& modulesInGPU,
                 struct miniDoublets& mdsInGPU,
                 struct objectRanges& rangesInGPU,
-                struct hits& hitsInGPU) const
+                struct SDL::hits<TAcc>& hitsInGPU) const
         {
             using Dim = alpaka::Dim<TAcc>;
             using Idx = alpaka::Idx<TAcc>;
diff --git a/SDL/Segment.cuh b/SDL/Segment.cuh
index 065bee14..85a1df54 100644
--- a/SDL/Segment.cuh
+++ b/SDL/Segment.cuh
@@ -189,7 +189,6 @@ namespace SDL
             alpaka::memset(queue, totOccupancySegments_buf, 0u, nLowerModules + 1);
             alpaka::memset(queue, partOfPT5_buf, 0u, maxPixelSegments);
             alpaka::memset(queue, pLSHitsIdxs_buf, 0u, maxPixelSegments);
-            alpaka::memset(queue, nMemoryLocations_buf, nMemoryLocationsIn, 1);
             alpaka::wait(queue);
         }
     };
@@ -882,6 +881,57 @@ namespace SDL
             }
         }
     };
+
+    struct addPixelSegmentToEventKernel // For each of `size` entries, writes two mini-doublets and one pixel segment on pixelModuleIndex.
+    {
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename TAcc>
+        ALPAKA_FN_ACC void operator()(
+            TAcc const & acc,
+            struct SDL::modules& modulesInGPU,
+            struct SDL::objectRanges& rangesInGPU, // Read here: miniDoubletModuleIndices, segmentModuleIndices.
+            struct SDL::hits<TAcc>& hitsInGPU,
+            struct SDL::miniDoublets& mdsInGPU,
+            struct SDL::segments<TAcc>& segmentsInGPU,
+            unsigned int* hitIndices0, // Indexed by tid; first hit passed for the inner mini-doublet.
+            unsigned int* hitIndices1, // Indexed by tid; second hit passed for the inner mini-doublet.
+            unsigned int* hitIndices2, // Indexed by tid; first hit passed for the outer mini-doublet.
+            unsigned int* hitIndices3, // Indexed by tid; second hit passed for the outer mini-doublet.
+            float* dPhiChange, // Indexed by tid; forwarded to addPixelSegmentToMemory.
+            uint16_t pixelModuleIndex, // Module whose MD/segment offsets are used for every entry.
+            const int size) const // Number of entries to process (exclusive loop bound).
+        {
+            using Dim = alpaka::Dim<TAcc>;
+            using Idx = alpaka::Idx<TAcc>;
+            using Vec = alpaka::Vec<Dim, Idx>;
+            // Each thread starts at its global index in dimension 2 and advances by the grid's thread extent.
+            Vec const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+            Vec const gridThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
+            // Entry tid owns MD slots 2*tid and 2*tid + 1, and segment slot tid, past the module's offsets.
+            for(int tid = globalThreadIdx[2]; tid < size; tid += gridThreadExtent[2])
+            {
+                unsigned int innerMDIndex = rangesInGPU.miniDoubletModuleIndices[pixelModuleIndex] + 2*(tid);
+                unsigned int outerMDIndex = rangesInGPU.miniDoubletModuleIndices[pixelModuleIndex] + 2*(tid) +1;
+                unsigned int pixelSegmentIndex = rangesInGPU.segmentModuleIndices[pixelModuleIndex] + tid;
+                // Both MDs are stored with all nine float arguments set to zero.
+                addMDToMemory(mdsInGPU, hitsInGPU, modulesInGPU, hitIndices0[tid], hitIndices1[tid], pixelModuleIndex, 0,0,0,0,0,0,0,0,0,innerMDIndex);
+                addMDToMemory(mdsInGPU, hitsInGPU, modulesInGPU, hitIndices2[tid], hitIndices3[tid], pixelModuleIndex, 0,0,0,0,0,0,0,0,0,outerMDIndex);
+                // NOTE(review): presumably the outer hit's ys slot carries eta for these hits (see comment below); confirm against the caller.
+                //in outer hits - pt, eta, phi
+                float slope = SDL::temp_sinh(acc, hitsInGPU.ys[mdsInGPU.outerHitIndices[innerMDIndex]]);
+                float intercept = hitsInGPU.zs[mdsInGPU.anchorHitIndices[innerMDIndex]] - slope * hitsInGPU.rts[mdsInGPU.anchorHitIndices[innerMDIndex]];
+                float score_lsq=(hitsInGPU.rts[mdsInGPU.anchorHitIndices[outerMDIndex]] * slope + intercept) - (hitsInGPU.zs[mdsInGPU.anchorHitIndices[outerMDIndex]]);
+                score_lsq = score_lsq * score_lsq;
+                // hits1 order: inner anchor, outer anchor, inner outer-hit, outer outer-hit (values taken from hitsInGPU.idxs).
+                unsigned int hits1[4];
+                hits1[0] = hitsInGPU.idxs[mdsInGPU.anchorHitIndices[innerMDIndex]];
+                hits1[1] = hitsInGPU.idxs[mdsInGPU.anchorHitIndices[outerMDIndex]];
+                hits1[2] = hitsInGPU.idxs[mdsInGPU.outerHitIndices[innerMDIndex]];
+                hits1[3] = hitsInGPU.idxs[mdsInGPU.outerHitIndices[outerMDIndex]];
+                addPixelSegmentToMemory(acc, segmentsInGPU, mdsInGPU, innerMDIndex, outerMDIndex, pixelModuleIndex, hits1, hitIndices0[tid], hitIndices2[tid], dPhiChange[tid], pixelSegmentIndex, tid, score_lsq);
+            }
+        }
+    };
 }
 
 #endif
diff --git a/SDL/TrackCandidate.cuh b/SDL/TrackCandidate.cuh
index 738037fa..b4564106 100644
--- a/SDL/TrackCandidate.cuh
+++ b/SDL/TrackCandidate.cuh
@@ -82,7 +82,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE int checkPixelHits(unsigned int ix, unsigned int jx, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, struct SDL::hits& hitsInGPU)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE int checkPixelHits(unsigned int ix, unsigned int jx, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, struct SDL::hits<TAcc>& hitsInGPU)
     {
         int phits1[4] = {-1,-1,-1,-1};
         int phits2[4] = {-1,-1,-1,-1};
@@ -245,7 +245,7 @@ namespace SDL
                 struct SDL::trackCandidates& trackCandidatesInGPU,
                 struct SDL::segments<TAcc>& segmentsInGPU,
                 struct SDL::miniDoublets& mdsInGPU,
-                struct SDL::hits& hitsInGPU,
+                struct SDL::hits<TAcc>& hitsInGPU,
                 struct SDL::quintuplets& quintupletsInGPU) const
         {
             using Dim = alpaka::Dim<TAcc>;
diff --git a/SDL/Triplet.cuh b/SDL/Triplet.cuh
index 5baa5a3f..1b1d1063 100644
--- a/SDL/Triplet.cuh
+++ b/SDL/Triplet.cuh
@@ -444,9 +444,6 @@ namespace SDL
         return false; // failsafe    
     };
 
-    template<typename TAcc>
-    void printTriplet(struct triplets& tripletsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, struct miniDoublets& mdsInGPU, struct hits& hitsInGPU, struct modules& modulesInGPU, unsigned int tripletIndex);
-
     template<typename TAcc>
     ALPAKA_FN_ACC ALPAKA_FN_INLINE void runDeltaBetaIterationsT3(TAcc const & acc, float& betaIn, float& betaOut, float& betaAv, float & pt_beta, float sdIn_dr, float sdOut_dr, float dr, float lIn)
     {
diff --git a/code/core/AccessHelper.cc b/code/core/AccessHelper.cc
index 2bf534b2..df6caf6a 100644
--- a/code/core/AccessHelper.cc
+++ b/code/core/AccessHelper.cc
@@ -7,7 +7,7 @@
 //____________________________________________________________________________________________
 std::tuple<std::vector<unsigned int>, std::vector<unsigned int>> convertHitsToHitIdxsAndHitTypes(SDL::Event* event, std::vector<unsigned int> hits)
 {
-    SDL::hits& hitsInGPU = *(event->getHits());
+    SDL::hits<alpaka::DevCpu>& hitsInGPU = *(event->getHits());
     std::vector<unsigned int> hitidxs;
     std::vector<unsigned int> hittypes;
     for (auto& hit : hits)
@@ -48,7 +48,7 @@ std::vector<unsigned int> getPixelHitsFrompLS(SDL::Event* event, unsigned int pL
 //____________________________________________________________________________________________
 std::vector<unsigned int> getPixelHitIdxsFrompLS(SDL::Event* event, unsigned int pLS)
 {
-    SDL::hits& hitsInGPU = *(event->getHits());
+    SDL::hits<alpaka::DevCpu>& hitsInGPU = *(event->getHits());
     std::vector<unsigned int> hits = getPixelHitsFrompLS(event, pLS);
     std::vector<unsigned int> hitidxs;
     for (auto& hit : hits)
@@ -203,7 +203,7 @@ std::vector<unsigned int> getHitsFromT5(SDL::Event* event, unsigned int T5)
 //____________________________________________________________________________________________
 std::vector<unsigned int> getHitIdxsFromT5(SDL::Event* event, unsigned int T5)
 {
-    SDL::hits& hitsInGPU = *(event->getHits());
+    SDL::hits<alpaka::DevCpu>& hitsInGPU = *(event->getHits());
     std::vector<unsigned int> hits = getHitsFromT5(event, T5);
     std::vector<unsigned int> hitidxs;
     for (auto& hit : hits)
@@ -215,7 +215,7 @@ std::vector<unsigned int> getModuleIdxsFromT5(SDL::Event* event, unsigned int T5
 {
     std::vector<unsigned int> hits = getHitsFromT5(event, T5);
     std::vector<unsigned int> module_idxs;
-    SDL::hits& hitsInGPU = *(event->getHits());
+    SDL::hits<alpaka::DevCpu>& hitsInGPU = *(event->getHits());
     for(auto &hitIdx:hits)
     {
         module_idxs.push_back(hitsInGPU.moduleIndices[hitIdx]);
@@ -297,7 +297,7 @@ std::vector<unsigned int> getHitsFrompT3(SDL::Event* event, unsigned int pT3)
 //____________________________________________________________________________________________
 std::vector<unsigned int> getHitIdxsFrompT3(SDL::Event* event, unsigned int pT3)
 {
-    SDL::hits& hitsInGPU = *(event->getHits());
+    SDL::hits<alpaka::DevCpu>& hitsInGPU = *(event->getHits());
     std::vector<unsigned int> hits = getHitsFrompT3(event, pT3);
     std::vector<unsigned int> hitidxs;
     for (auto& hit : hits)
@@ -309,7 +309,7 @@ std::vector<unsigned int> getModuleIdxsFrompT3(SDL::Event* event, unsigned int p
 {
     std::vector<unsigned int> hits = getOuterTrackerHitsFrompT3(event, pT3);
     std::vector<unsigned int> module_idxs;
-    SDL::hits& hitsInGPU = *(event->getHits());
+    SDL::hits<alpaka::DevCpu>& hitsInGPU = *(event->getHits());
     for(auto &hitIdx:hits)
     {
         module_idxs.push_back(hitsInGPU.moduleIndices[hitIdx]);
@@ -405,7 +405,7 @@ std::vector<unsigned int> getHitsFrompT5(SDL::Event* event, unsigned int pT5)
 //____________________________________________________________________________________________
 std::vector<unsigned int> getHitIdxsFrompT5(SDL::Event* event, unsigned int pT5)
 {
-    SDL::hits& hitsInGPU = *(event->getHits());
+    SDL::hits<alpaka::DevCpu>& hitsInGPU = *(event->getHits());
     std::vector<unsigned int> hits = getHitsFrompT5(event, pT5);
     std::vector<unsigned int> hitidxs;
     for (auto& hit : hits)
@@ -418,7 +418,7 @@ std::vector<unsigned int> getModuleIdxsFrompT5(SDL::Event* event, unsigned int p
 {
     std::vector<unsigned int> hits = getOuterTrackerHitsFrompT5(event, pT5);
     std::vector<unsigned int> module_idxs;
-    SDL::hits& hitsInGPU = *(event->getHits());
+    SDL::hits<alpaka::DevCpu>& hitsInGPU = *(event->getHits());
     for(auto &hitIdx:hits)
     {
         module_idxs.push_back(hitsInGPU.moduleIndices[hitIdx]);
diff --git a/code/core/write_sdl_ntuple.cc b/code/core/write_sdl_ntuple.cc
index dcc9f070..cce8b6f1 100644
--- a/code/core/write_sdl_ntuple.cc
+++ b/code/core/write_sdl_ntuple.cc
@@ -477,7 +477,7 @@ void setPixelTripletOutputBranches(SDL::Event* event)
     SDL::triplets& tripletsInGPU = *(event->getTriplets());
     SDL::modules& modulesInGPU = *(event->getModules());
     SDL::segments<alpaka::DevCpu>& segmentsInGPU = *(event->getSegments());
-    SDL::hits& hitsInGPU = *(event->getHits());
+    SDL::hits<alpaka::DevCpu>& hitsInGPU = *(event->getHits());
     int n_accepted_simtrk = ana.tx->getBranch<vector<int>>("sim_TC_matched").size();
 
     unsigned int nPixelTriplets = *pixelTripletsInGPU.nPixelTriplets;
@@ -561,7 +561,7 @@ void setGnnNtupleBranches(SDL::Event* event)
     // Get relevant information
     SDL::segments<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
-    SDL::hits& hitsInGPU = (*event->getHits());
+    SDL::hits<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
     SDL::modules& modulesInGPU = (*event->getModules());
     SDL::objectRanges& rangesInGPU = (*event->getRanges());
     SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates());
@@ -717,7 +717,7 @@ void setGnnNtupleMiniDoublet(SDL::Event* event, unsigned int MD)
 {
     // Get relevant information
     SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
-    SDL::hits& hitsInGPU = (*event->getHits());
+    SDL::hits<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
 
     // Get the hit indices
     unsigned int hit0 = miniDoubletsInGPU.anchorHitIndices[MD];
@@ -822,7 +822,7 @@ std::tuple<float, float, float, vector<unsigned int>, vector<unsigned int>> pars
     SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates());
     SDL::triplets& tripletsInGPU = (*event->getTriplets());
     SDL::segments<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
-    SDL::hits& hitsInGPU = (*event->getHits());
+    SDL::hits<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
 
     //
     // pictorial representation of a pT5
@@ -960,7 +960,7 @@ std::tuple<float, float, float, vector<unsigned int>, vector<unsigned int>> pars
     SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates());
     SDL::triplets& tripletsInGPU = (*event->getTriplets());
     SDL::segments<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
-    SDL::hits& hitsInGPU = (*event->getHits());
+    SDL::hits<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
 
     //
     // pictorial representation of a pT3
@@ -1006,7 +1006,7 @@ std::tuple<float, float, float, vector<unsigned int>, vector<unsigned int>> pars
 {
     SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates());
     SDL::triplets& tripletsInGPU = (*event->getTriplets());
-    SDL::hits& hitsInGPU = (*event->getHits());
+    SDL::hits<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
     unsigned int T5 = trackCandidatesInGPU.directObjectIndices[idx];
     std::vector<unsigned int> T3s = getT3sFromT5(event, T5);
     std::vector<unsigned int> hits = getHitsFromT5(event, T5);
@@ -1106,7 +1106,6 @@ float computeRadiusFromThreeAnchorHits(float x1, float y1, float x2, float y2, f
 //________________________________________________________________________________________________________________________________
 void printHitMultiplicities(SDL::Event* event)
 {
-    //SDL::hits& hitsInGPU = (*event->getHits());
     SDL::modules& modulesInGPU = (*event->getModules());
     SDL::objectRanges& rangesInGPU = (*event->getRanges());
 
@@ -1152,7 +1151,7 @@ void printAllObjects(SDL::Event* event)
 void printMDs(SDL::Event* event)
 {
     SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
-    SDL::hits& hitsInGPU = (*event->getHits());
+    SDL::hits<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
     SDL::modules& modulesInGPU = (*event->getModules());
     SDL::objectRanges& rangesInGPU = (*event->getRanges());
 
@@ -1176,7 +1175,7 @@ void printLSs(SDL::Event* event)
 {
     SDL::segments<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
-    SDL::hits& hitsInGPU = (*event->getHits());
+    SDL::hits<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
     SDL::modules& modulesInGPU = (*event->getModules());
     SDL::objectRanges& rangesInGPU = (*event->getRanges());
 
@@ -1209,7 +1208,7 @@ void printpLSs(SDL::Event* event)
 {
     SDL::segments<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
-    SDL::hits& hitsInGPU = (*event->getHits());
+    SDL::hits<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
     SDL::modules& modulesInGPU = (*event->getModules());
     SDL::objectRanges& rangesInGPU = (*event->getRanges());
 
@@ -1240,7 +1239,7 @@ void printT3s(SDL::Event* event)
     SDL::triplets& tripletsInGPU = (*event->getTriplets());
     SDL::segments<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
-    SDL::hits& hitsInGPU = (*event->getHits());
+    SDL::hits<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
     SDL::modules& modulesInGPU = (*event->getModules());
     int nTriplets = 0;
     for (unsigned int i = 0; i <  *(modulesInGPU.nLowerModules); ++i)
@@ -1283,7 +1282,6 @@ void debugPrintOutlierMultiplicities(SDL::Event* event)
     SDL::triplets& tripletsInGPU = (*event->getTriplets());
     SDL::segments<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
-    //SDL::hits& hitsInGPU = (*event->getHits());
     SDL::modules& modulesInGPU = (*event->getModules());
     SDL::objectRanges& rangesInGPU = (*event->getRanges());
     //int nTrackCandidates = 0;

From c2a60463023499aba088d6830915df2232dfcfa8 Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Tue, 6 Jun 2023 20:58:23 -0400
Subject: [PATCH 16/44] Move Hit.cu + Segments.cu to inheritance technique

---
 SDL/Event.cu                  | 121 +++++++++++----------
 SDL/Event.cuh                 |  16 +--
 SDL/Hit.cuh                   | 101 +++++++++--------
 SDL/Kernels.cuh               |   5 +-
 SDL/LST.cc                    |   2 +-
 SDL/MiniDoublet.cuh           |   7 +-
 SDL/PixelTriplet.cuh          |  26 +++--
 SDL/Quintuplet.cuh            |  14 +--
 SDL/Segment.cuh               | 198 +++++++++++++++-------------------
 SDL/TrackCandidate.cuh        |  15 ++-
 SDL/Triplet.cuh               |  30 +++---
 code/core/AccessHelper.cc     |  20 ++--
 code/core/write_sdl_ntuple.cc |  40 +++----
 13 files changed, 293 insertions(+), 302 deletions(-)

diff --git a/SDL/Event.cu b/SDL/Event.cu
index cb8e4747..01036c57 100644
--- a/SDL/Event.cu
+++ b/SDL/Event.cu
@@ -78,10 +78,10 @@ SDL::Event::~Event()
 #endif
     if(rangesInGPU != nullptr){cms::cuda::free_host(rangesInGPU);}
     if(mdsInGPU != nullptr){cms::cuda::free_host(mdsInGPU);}
-    if(segmentsInGPU != nullptr){delete segmentsInGPU;}
+    if(segmentsInGPU != nullptr){delete segmentsInGPU; delete segmentsBuffers;}
     if(tripletsInGPU!= nullptr){cms::cuda::free_host(tripletsInGPU);}
     if(trackCandidatesInGPU!= nullptr){cms::cuda::free_host(trackCandidatesInGPU);}
-    if(hitsInGPU!= nullptr){delete hitsInGPU;}
+    if(hitsInGPU!= nullptr){delete hitsInGPU; delete hitsBuffers;}
     if(pixelTripletsInGPU!= nullptr){cms::cuda::free_host(pixelTripletsInGPU);}
     if(pixelQuintupletsInGPU!= nullptr){cms::cuda::free_host(pixelQuintupletsInGPU);}
     if(quintupletsInGPU!= nullptr){cms::cuda::free_host(quintupletsInGPU);}
@@ -273,13 +273,13 @@ void SDL::Event::resetEvent()
             n_quintuplets_by_layer_endcap_[i] = 0;
         }
     }
-    if(hitsInGPU){delete hitsInGPU;
+    if(hitsInGPU){delete hitsInGPU; delete hitsBuffers;
       hitsInGPU = nullptr;}
     if(mdsInGPU){cms::cuda::free_host(mdsInGPU);
       mdsInGPU = nullptr;}
     if(rangesInGPU){cms::cuda::free_host(rangesInGPU);
       rangesInGPU = nullptr;}
-    if(segmentsInGPU){delete segmentsInGPU;
+    if(segmentsInGPU){delete segmentsInGPU; delete segmentsBuffers;
       segmentsInGPU = nullptr;}
     if(tripletsInGPU){cms::cuda::free_host(tripletsInGPU);
       tripletsInGPU = nullptr;}
@@ -470,7 +470,9 @@ void SDL::Event::addHitToEvent(std::vector<float> x, std::vector<float> y, std::
     // Initialize space on device/host for next event.
     if (hitsInGPU == nullptr)
     {
-        hitsInGPU = new SDL::hits<Acc>(nModules, nHits, devAcc, queue);
+        hitsInGPU = new SDL::hits();
+        hitsBuffers = new SDL::hitsBuffer<Acc>(nModules, nHits, devAcc, queue);
+        hitsInGPU->setData(*hitsBuffers);
     }
 
     if (rangesInGPU == nullptr)
@@ -481,12 +483,12 @@ void SDL::Event::addHitToEvent(std::vector<float> x, std::vector<float> y, std::
     }
 
     // Copy the host arrays to the GPU.
-    alpaka::memcpy(queue, hitsInGPU->xs_buf, x, nHits);
-    alpaka::memcpy(queue, hitsInGPU->ys_buf, y, nHits);
-    alpaka::memcpy(queue, hitsInGPU->zs_buf, z, nHits);
-    alpaka::memcpy(queue, hitsInGPU->detid_buf, detId, nHits);
-    alpaka::memcpy(queue, hitsInGPU->idxs_buf, idxInNtuple, nHits);
-    alpaka::memcpy(queue, hitsInGPU->nHits_buf, nHits_buf, 1);
+    alpaka::memcpy(queue, hitsBuffers->xs_buf, x, nHits);
+    alpaka::memcpy(queue, hitsBuffers->ys_buf, y, nHits);
+    alpaka::memcpy(queue, hitsBuffers->zs_buf, z, nHits);
+    alpaka::memcpy(queue, hitsBuffers->detid_buf, detId, nHits);
+    alpaka::memcpy(queue, hitsBuffers->idxs_buf, idxInNtuple, nHits);
+    alpaka::memcpy(queue, hitsBuffers->nHits_buf, nHits_buf, 1);
     alpaka::wait(queue);
 
     Vec const threadsPerBlock1(static_cast<Idx>(1), static_cast<Idx>(1), static_cast<Idx>(256));
@@ -522,7 +524,7 @@ void SDL::Event::addHitToEvent(std::vector<float> x, std::vector<float> y, std::
         nLowerModules));
 
     // Waiting isn't needed after second kernel call. Saves ~100 us.
-    // This is because addPixelSegmentToEvent (which is run next) doesn't rely on hitsinGPU->hitrange variables.
+    // This is because addPixelSegmentToEvent (which is run next) doesn't rely on hitsBuffers->hitrange variables.
     // Also, modulesInGPU->partnerModuleIndices is not alterned in addPixelSegmentToEvent.
     alpaka::enqueue(queue, module_ranges_task);
 }
@@ -586,7 +588,9 @@ void SDL::Event::addPixelSegmentToEvent(std::vector<unsigned int> hitIndices0,st
         cudaStreamSynchronize(stream);
         nTotalSegments += N_MAX_PIXEL_SEGMENTS_PER_MODULE;
 
-        segmentsInGPU = new SDL::segments<Acc>(nTotalSegments, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE, devAcc, queue);
+        segmentsInGPU = new SDL::segments();
+        segmentsBuffers = new SDL::segmentsBuffer<Acc>(nTotalSegments, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE, devAcc, queue);
+        segmentsInGPU->setData(*segmentsBuffers);
 
         cudaMemcpyAsync(segmentsInGPU->nMemoryLocations, &nTotalSegments, sizeof(unsigned int), cudaMemcpyHostToDevice, stream);;
         cudaStreamSynchronize(stream);
@@ -604,19 +608,19 @@ void SDL::Event::addPixelSegmentToEvent(std::vector<unsigned int> hitIndices0,st
     alpaka::memcpy(queue, hitIndices3_dev, hitIndices3, size);
     alpaka::memcpy(queue, dPhiChange_dev, dPhiChange, size);
 
-    alpaka::memcpy(queue, segmentsInGPU->ptIn_buf, ptIn, size);
-    alpaka::memcpy(queue, segmentsInGPU->ptErr_buf, ptErr, size);
-    alpaka::memcpy(queue, segmentsInGPU->px_buf, px, size);
-    alpaka::memcpy(queue, segmentsInGPU->py_buf, py, size);
-    alpaka::memcpy(queue, segmentsInGPU->pz_buf, pz, size);
-    alpaka::memcpy(queue, segmentsInGPU->etaErr_buf, etaErr, size);
-    alpaka::memcpy(queue, segmentsInGPU->isQuad_buf, isQuad, size);
-    alpaka::memcpy(queue, segmentsInGPU->eta_buf, eta, size);
-    alpaka::memcpy(queue, segmentsInGPU->phi_buf, phi, size);
-    alpaka::memcpy(queue, segmentsInGPU->charge_buf, charge, size);
-    alpaka::memcpy(queue, segmentsInGPU->seedIdx_buf, seedIdx, size);
-    alpaka::memcpy(queue, segmentsInGPU->superbin_buf, superbin, size);
-    alpaka::memcpy(queue, segmentsInGPU->pixelType_buf, pixelType, size);
+    alpaka::memcpy(queue, segmentsBuffers->ptIn_buf, ptIn, size);
+    alpaka::memcpy(queue, segmentsBuffers->ptErr_buf, ptErr, size);
+    alpaka::memcpy(queue, segmentsBuffers->px_buf, px, size);
+    alpaka::memcpy(queue, segmentsBuffers->py_buf, py, size);
+    alpaka::memcpy(queue, segmentsBuffers->pz_buf, pz, size);
+    alpaka::memcpy(queue, segmentsBuffers->etaErr_buf, etaErr, size);
+    alpaka::memcpy(queue, segmentsBuffers->isQuad_buf, isQuad, size);
+    alpaka::memcpy(queue, segmentsBuffers->eta_buf, eta, size);
+    alpaka::memcpy(queue, segmentsBuffers->phi_buf, phi, size);
+    alpaka::memcpy(queue, segmentsBuffers->charge_buf, charge, size);
+    alpaka::memcpy(queue, segmentsBuffers->seedIdx_buf, seedIdx, size);
+    alpaka::memcpy(queue, segmentsBuffers->superbin_buf, superbin, size);
+    alpaka::memcpy(queue, segmentsBuffers->pixelType_buf, pixelType, size);
 
     cudaMemcpyAsync(&(segmentsInGPU->nSegments)[pixelModuleIndex], &size, sizeof(int), cudaMemcpyHostToDevice, stream);
     cudaMemcpyAsync(&(segmentsInGPU->totOccupancySegments)[pixelModuleIndex], &size, sizeof(int), cudaMemcpyHostToDevice, stream);
@@ -830,7 +834,9 @@ void SDL::Event::createSegmentsWithModuleMap()
 {
     if(segmentsInGPU == nullptr)
     {
-        segmentsInGPU = new SDL::segments<Acc>(nTotalSegments, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE, devAcc, queue);
+        segmentsInGPU = new SDL::segments();
+        segmentsBuffers = new SDL::segmentsBuffer<Acc>(nTotalSegments, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE, devAcc, queue);
+        segmentsInGPU->setData(*segmentsBuffers);
     }
 
     Vec const threadsPerBlockCreateSeg(static_cast<Idx>(1), static_cast<Idx>(1), static_cast<Idx>(64));
@@ -1810,42 +1816,44 @@ int SDL::Event::getNumberOfT5TrackCandidates()
     return nTrackCandidatesT5; 
 }
 
-SDL::hits<alpaka::DevCpu>* SDL::Event::getHits() //std::shared_ptr should take care of garbage collection
+SDL::hitsBuffer<alpaka::DevCpu>* SDL::Event::getHits() //std::shared_ptr should take care of garbage collection
 {
     if(hitsInCPU == nullptr)
     {
         auto nHits_buf = allocBufWrapper<unsigned int>(devHost, 1);
-        alpaka::memcpy(queue, nHits_buf, hitsInGPU->nHits_buf, 1);
+        alpaka::memcpy(queue, nHits_buf, hitsBuffers->nHits_buf, 1);
         alpaka::wait(queue);
 
         unsigned int nHits = *alpaka::getPtrNative(nHits_buf);
-        hitsInCPU = new SDL::hits<alpaka::DevCpu>(nModules, nHits, devHost, queue);
+        hitsInCPU = new SDL::hitsBuffer<alpaka::DevCpu>(nModules, nHits, devHost, queue);
+        hitsInCPU->setData(*hitsInCPU);
 
         *alpaka::getPtrNative(hitsInCPU->nHits_buf) = nHits;
-        alpaka::memcpy(queue, hitsInCPU->idxs_buf, hitsInGPU->idxs_buf, nHits);
-        alpaka::memcpy(queue, hitsInCPU->detid_buf, hitsInGPU->detid_buf, nHits);
-        alpaka::memcpy(queue, hitsInCPU->xs_buf, hitsInGPU->xs_buf, nHits);
-        alpaka::memcpy(queue, hitsInCPU->ys_buf, hitsInGPU->ys_buf, nHits);
-        alpaka::memcpy(queue, hitsInCPU->zs_buf, hitsInGPU->zs_buf, nHits);
-        alpaka::memcpy(queue, hitsInCPU->moduleIndices_buf, hitsInGPU->moduleIndices_buf, nHits);
+        alpaka::memcpy(queue, hitsInCPU->idxs_buf, hitsBuffers->idxs_buf, nHits);
+        alpaka::memcpy(queue, hitsInCPU->detid_buf, hitsBuffers->detid_buf, nHits);
+        alpaka::memcpy(queue, hitsInCPU->xs_buf, hitsBuffers->xs_buf, nHits);
+        alpaka::memcpy(queue, hitsInCPU->ys_buf, hitsBuffers->ys_buf, nHits);
+        alpaka::memcpy(queue, hitsInCPU->zs_buf, hitsBuffers->zs_buf, nHits);
+        alpaka::memcpy(queue, hitsInCPU->moduleIndices_buf, hitsBuffers->moduleIndices_buf, nHits);
         alpaka::wait(queue);
     }
     return hitsInCPU;
 }
 
-SDL::hits<alpaka::DevCpu>* SDL::Event::getHitsInCMSSW()
+SDL::hitsBuffer<alpaka::DevCpu>* SDL::Event::getHitsInCMSSW()
 {
     if(hitsInCPU == nullptr)
     {
         auto nHits_buf = allocBufWrapper<unsigned int>(devHost, 1);
-        alpaka::memcpy(queue, nHits_buf, hitsInGPU->nHits_buf, 1);
+        alpaka::memcpy(queue, nHits_buf, hitsBuffers->nHits_buf, 1);
         alpaka::wait(queue);
 
         unsigned int nHits = *alpaka::getPtrNative(nHits_buf);
-        hitsInCPU = new SDL::hits<alpaka::DevCpu>(nModules, nHits, devHost, queue);
+        hitsInCPU = new SDL::hitsBuffer<alpaka::DevCpu>(nModules, nHits, devHost, queue);
+        hitsInCPU->setData(*hitsInCPU);
 
         *alpaka::getPtrNative(hitsInCPU->nHits_buf) = nHits;
-        alpaka::memcpy(queue, hitsInCPU->idxs_buf, hitsInGPU->idxs_buf, nHits);
+        alpaka::memcpy(queue, hitsInCPU->idxs_buf, hitsBuffers->idxs_buf, nHits);
         alpaka::wait(queue);
     }
     return hitsInCPU;
@@ -1858,7 +1866,7 @@ SDL::objectRanges* SDL::Event::getRanges()
         rangesInCPU = new SDL::objectRanges;
         rangesInCPU->hitRanges = new int[2*nModules];
         rangesInCPU->quintupletModuleIndices = new int[nLowerModules];
-        cudaMemcpyAsync(rangesInCPU->hitRanges, hitsInGPU->hitRanges, 2*nModules * sizeof(int), cudaMemcpyDeviceToHost,stream);
+        cudaMemcpyAsync(rangesInCPU->hitRanges, alpaka::getPtrNative(hitsBuffers->hitRanges_buf), 2*nModules * sizeof(int), cudaMemcpyDeviceToHost,stream); // hitsBuffer's inherited raw pointers are only set by setData(); read the device pointer from the buffer itself
         rangesInCPU->miniDoubletModuleIndices = new int[nLowerModules+1];
         rangesInCPU->segmentModuleIndices = new int[nLowerModules + 1];
         rangesInCPU->tripletModuleIndices = new int[nLowerModules];
@@ -1897,31 +1905,32 @@ SDL::miniDoublets* SDL::Event::getMiniDoublets()
     return mdsInCPU;
 }
 
-SDL::segments<alpaka::DevCpu>* SDL::Event::getSegments()
+SDL::segmentsBuffer<alpaka::DevCpu>* SDL::Event::getSegments()
 {
     if(segmentsInCPU == nullptr)
     {
         // Get nMemoryLocations parameter to initilize host based segmentsInCPU
         auto nMemLocal_buf = allocBufWrapper<unsigned int>(devHost, 1);
-        alpaka::memcpy(queue, nMemLocal_buf, segmentsInGPU->nMemoryLocations_buf, 1);
+        alpaka::memcpy(queue, nMemLocal_buf, segmentsBuffers->nMemoryLocations_buf, 1);
         alpaka::wait(queue);
 
         unsigned int nMemLocal = *alpaka::getPtrNative(nMemLocal_buf);
-        segmentsInCPU = new SDL::segments<alpaka::DevCpu>(nMemLocal, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE, devHost, queue);
+        segmentsInCPU = new SDL::segmentsBuffer<alpaka::DevCpu>(nMemLocal, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE, devHost, queue);
+        segmentsInCPU->setData(*segmentsInCPU);
 
         *alpaka::getPtrNative(segmentsInCPU->nMemoryLocations_buf) = nMemLocal;
-        alpaka::memcpy(queue, segmentsInCPU->nSegments_buf, segmentsInGPU->nSegments_buf, (nLowerModules+1));
-        alpaka::memcpy(queue, segmentsInCPU->mdIndices_buf, segmentsInGPU->mdIndices_buf, 2 * nMemLocal);
-        alpaka::memcpy(queue, segmentsInCPU->innerMiniDoubletAnchorHitIndices_buf, segmentsInGPU->innerMiniDoubletAnchorHitIndices_buf, nMemLocal);
-        alpaka::memcpy(queue, segmentsInCPU->outerMiniDoubletAnchorHitIndices_buf, segmentsInGPU->outerMiniDoubletAnchorHitIndices_buf, nMemLocal);
-        alpaka::memcpy(queue, segmentsInCPU->totOccupancySegments_buf, segmentsInGPU->totOccupancySegments_buf, (nLowerModules+1));
-        alpaka::memcpy(queue, segmentsInCPU->ptIn_buf, segmentsInGPU->ptIn_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
-        alpaka::memcpy(queue, segmentsInCPU->eta_buf, segmentsInGPU->eta_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
-        alpaka::memcpy(queue, segmentsInCPU->phi_buf, segmentsInGPU->phi_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
-        alpaka::memcpy(queue, segmentsInCPU->seedIdx_buf, segmentsInGPU->seedIdx_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
-        alpaka::memcpy(queue, segmentsInCPU->isDup_buf, segmentsInGPU->isDup_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
-        alpaka::memcpy(queue, segmentsInCPU->isQuad_buf, segmentsInGPU->isQuad_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
-        alpaka::memcpy(queue, segmentsInCPU->score_buf, segmentsInGPU->score_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
+        alpaka::memcpy(queue, segmentsInCPU->nSegments_buf, segmentsBuffers->nSegments_buf, (nLowerModules+1));
+        alpaka::memcpy(queue, segmentsInCPU->mdIndices_buf, segmentsBuffers->mdIndices_buf, 2 * nMemLocal);
+        alpaka::memcpy(queue, segmentsInCPU->innerMiniDoubletAnchorHitIndices_buf, segmentsBuffers->innerMiniDoubletAnchorHitIndices_buf, nMemLocal);
+        alpaka::memcpy(queue, segmentsInCPU->outerMiniDoubletAnchorHitIndices_buf, segmentsBuffers->outerMiniDoubletAnchorHitIndices_buf, nMemLocal);
+        alpaka::memcpy(queue, segmentsInCPU->totOccupancySegments_buf, segmentsBuffers->totOccupancySegments_buf, (nLowerModules+1));
+        alpaka::memcpy(queue, segmentsInCPU->ptIn_buf, segmentsBuffers->ptIn_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
+        alpaka::memcpy(queue, segmentsInCPU->eta_buf, segmentsBuffers->eta_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
+        alpaka::memcpy(queue, segmentsInCPU->phi_buf, segmentsBuffers->phi_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
+        alpaka::memcpy(queue, segmentsInCPU->seedIdx_buf, segmentsBuffers->seedIdx_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
+        alpaka::memcpy(queue, segmentsInCPU->isDup_buf, segmentsBuffers->isDup_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
+        alpaka::memcpy(queue, segmentsInCPU->isQuad_buf, segmentsBuffers->isQuad_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
+        alpaka::memcpy(queue, segmentsInCPU->score_buf, segmentsBuffers->score_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
         alpaka::wait(queue);
     }
     return segmentsInCPU;
diff --git a/SDL/Event.cuh b/SDL/Event.cuh
index 4b431d33..0a24a210 100644
--- a/SDL/Event.cuh
+++ b/SDL/Event.cuh
@@ -40,9 +40,11 @@ namespace SDL
         int dev;
         int nTotalSegments;
         struct objectRanges* rangesInGPU;
-        struct hits<Acc>* hitsInGPU;
+        struct hits* hitsInGPU;
+        struct hitsBuffer<Acc>* hitsBuffers;
         struct miniDoublets* mdsInGPU;
-        struct segments<Acc>* segmentsInGPU;
+        struct segments* segmentsInGPU;
+        struct segmentsBuffer<Acc>* segmentsBuffers;
         struct triplets* tripletsInGPU;
         struct quintuplets* quintupletsInGPU;
         struct trackCandidates* trackCandidatesInGPU;
@@ -51,9 +53,9 @@ namespace SDL
 
         //CPU interface stuff
         objectRanges* rangesInCPU;
-        hits<alpaka::DevCpu>* hitsInCPU;
+        hitsBuffer<alpaka::DevCpu>* hitsInCPU;
         miniDoublets* mdsInCPU;
-        segments<alpaka::DevCpu>* segmentsInCPU;
+        segmentsBuffer<alpaka::DevCpu>* segmentsInCPU;
         triplets* tripletsInCPU;
         trackCandidates* trackCandidatesInCPU;
         modules* modulesInCPU;
@@ -130,10 +132,10 @@ namespace SDL
         unsigned int getNumberOfT3T3ExtendedTracks();
 
         objectRanges* getRanges();
-        hits<alpaka::DevCpu>* getHits();
-        hits<alpaka::DevCpu>* getHitsInCMSSW();
+        hitsBuffer<alpaka::DevCpu>* getHits();
+        hitsBuffer<alpaka::DevCpu>* getHitsInCMSSW();
         miniDoublets* getMiniDoublets();
-        segments<alpaka::DevCpu>* getSegments() ;
+        segmentsBuffer<alpaka::DevCpu>* getSegments() ;
         triplets* getTriplets();
         quintuplets* getQuintuplets();
         trackCandidates* getTrackCandidates();
diff --git a/SDL/Hit.cuh b/SDL/Hit.cuh
index a2502d11..61a26cbd 100644
--- a/SDL/Hit.cuh
+++ b/SDL/Hit.cuh
@@ -6,29 +6,8 @@
 
 namespace SDL
 {
-    template<typename TAcc>
     struct hits
     {
-        Buf<TAcc, unsigned int> nHits_buf;
-        Buf<TAcc, float> xs_buf;
-        Buf<TAcc, float> ys_buf;
-        Buf<TAcc, float> zs_buf;
-        Buf<TAcc, uint16_t> moduleIndices_buf;
-        Buf<TAcc, unsigned int> idxs_buf;
-        Buf<TAcc, unsigned int> detid_buf;
-        Buf<TAcc, float> rts_buf;
-        Buf<TAcc, float> phis_buf;
-        Buf<TAcc, float> etas_buf;
-        Buf<TAcc, float> highEdgeXs_buf;
-        Buf<TAcc, float> highEdgeYs_buf;
-        Buf<TAcc, float> lowEdgeXs_buf;
-        Buf<TAcc, float> lowEdgeYs_buf;
-        Buf<TAcc, int> hitRanges_buf;
-        Buf<TAcc, int> hitRangesLower_buf;
-        Buf<TAcc, int> hitRangesUpper_buf;
-        Buf<TAcc, int8_t> hitRangesnLower_buf;
-        Buf<TAcc, int8_t> hitRangesnUpper_buf;
-
         unsigned int* nHits;
         float* xs;
         float* ys;
@@ -49,11 +28,59 @@ namespace SDL
         int8_t* hitRangesnLower;
         int8_t* hitRangesnUpper;
 
+        template<typename TBuff>
+        void setData(TBuff& hitsbuf)
+        {
+            nHits = alpaka::getPtrNative(hitsbuf.nHits_buf);
+            xs = alpaka::getPtrNative(hitsbuf.xs_buf);
+            ys = alpaka::getPtrNative(hitsbuf.ys_buf);
+            zs = alpaka::getPtrNative(hitsbuf.zs_buf);
+            moduleIndices = alpaka::getPtrNative(hitsbuf.moduleIndices_buf);
+            idxs = alpaka::getPtrNative(hitsbuf.idxs_buf);
+            detid = alpaka::getPtrNative(hitsbuf.detid_buf);
+            rts = alpaka::getPtrNative(hitsbuf.rts_buf);
+            phis = alpaka::getPtrNative(hitsbuf.phis_buf);
+            etas = alpaka::getPtrNative(hitsbuf.etas_buf);
+            highEdgeXs = alpaka::getPtrNative(hitsbuf.highEdgeXs_buf);
+            highEdgeYs = alpaka::getPtrNative(hitsbuf.highEdgeYs_buf);
+            lowEdgeXs = alpaka::getPtrNative(hitsbuf.lowEdgeXs_buf);
+            lowEdgeYs = alpaka::getPtrNative(hitsbuf.lowEdgeYs_buf);
+            hitRanges = alpaka::getPtrNative(hitsbuf.hitRanges_buf);
+            hitRangesLower = alpaka::getPtrNative(hitsbuf.hitRangesLower_buf);
+            hitRangesUpper = alpaka::getPtrNative(hitsbuf.hitRangesUpper_buf);
+            hitRangesnLower = alpaka::getPtrNative(hitsbuf.hitRangesnLower_buf);
+            hitRangesnUpper = alpaka::getPtrNative(hitsbuf.hitRangesnUpper_buf);
+        }
+    };
+
+    template<typename TAcc>
+    struct hitsBuffer : hits
+    {
+        Buf<TAcc, unsigned int> nHits_buf;
+        Buf<TAcc, float> xs_buf;
+        Buf<TAcc, float> ys_buf;
+        Buf<TAcc, float> zs_buf;
+        Buf<TAcc, uint16_t> moduleIndices_buf;
+        Buf<TAcc, unsigned int> idxs_buf;
+        Buf<TAcc, unsigned int> detid_buf;
+        Buf<TAcc, float> rts_buf;
+        Buf<TAcc, float> phis_buf;
+        Buf<TAcc, float> etas_buf;
+        Buf<TAcc, float> highEdgeXs_buf;
+        Buf<TAcc, float> highEdgeYs_buf;
+        Buf<TAcc, float> lowEdgeXs_buf;
+        Buf<TAcc, float> lowEdgeYs_buf;
+        Buf<TAcc, int> hitRanges_buf;
+        Buf<TAcc, int> hitRangesLower_buf;
+        Buf<TAcc, int> hitRangesUpper_buf;
+        Buf<TAcc, int8_t> hitRangesnLower_buf;
+        Buf<TAcc, int8_t> hitRangesnUpper_buf;
+
         template<typename TQueue, typename TDevAcc>
-        hits(unsigned int nModules,
-             unsigned int nMaxHits,
-             TDevAcc const & devAccIn,
-             TQueue& queue) :
+        hitsBuffer(unsigned int nModules,
+                    unsigned int nMaxHits,
+                    TDevAcc const & devAccIn,
+                    TQueue& queue) :
             nHits_buf(allocBufWrapper<unsigned int>(devAccIn, 1)),
             xs_buf(allocBufWrapper<float>(devAccIn, nMaxHits)),
             ys_buf(allocBufWrapper<float>(devAccIn, nMaxHits)),
@@ -74,26 +101,6 @@ namespace SDL
             hitRangesnLower_buf(allocBufWrapper<int8_t>(devAccIn, nModules)),
             hitRangesnUpper_buf(allocBufWrapper<int8_t>(devAccIn, nModules))
         {
-            nHits = alpaka::getPtrNative(nHits_buf);
-            xs = alpaka::getPtrNative(xs_buf);
-            ys = alpaka::getPtrNative(ys_buf);
-            zs = alpaka::getPtrNative(zs_buf);
-            moduleIndices = alpaka::getPtrNative(moduleIndices_buf);
-            idxs = alpaka::getPtrNative(idxs_buf);
-            detid = alpaka::getPtrNative(detid_buf);
-            rts = alpaka::getPtrNative(rts_buf);
-            phis = alpaka::getPtrNative(phis_buf);
-            etas = alpaka::getPtrNative(etas_buf);
-            highEdgeXs = alpaka::getPtrNative(highEdgeXs_buf);
-            highEdgeYs = alpaka::getPtrNative(highEdgeYs_buf);
-            lowEdgeXs = alpaka::getPtrNative(lowEdgeXs_buf);
-            lowEdgeYs = alpaka::getPtrNative(lowEdgeYs_buf);
-            hitRanges = alpaka::getPtrNative(hitRanges_buf);
-            hitRangesLower = alpaka::getPtrNative(hitRangesLower_buf);
-            hitRangesUpper = alpaka::getPtrNative(hitRangesUpper_buf);
-            hitRangesnLower = alpaka::getPtrNative(hitRangesnLower_buf);
-            hitRangesnUpper = alpaka::getPtrNative(hitRangesnUpper_buf);
-
             alpaka::memset(queue, hitRanges_buf, -1, nModules*2);
             alpaka::memset(queue, hitRangesLower_buf, -1, nModules);
             alpaka::memset(queue, hitRangesUpper_buf, -1, nModules);
@@ -213,7 +220,7 @@ namespace SDL
         ALPAKA_FN_ACC void operator()(
             TAcc const & acc,
             struct SDL::modules& modulesInGPU,
-            struct SDL::hits<TAcc>& hitsInGPU,
+            struct SDL::hits& hitsInGPU,
             int const & nLowerModules) const
         {
             using Dim = alpaka::Dim<TAcc>;
@@ -250,7 +257,7 @@ namespace SDL
             unsigned int* geoMapDetId, // DetId's from endcap map
             float* geoMapPhi, // Phi values from endcap map
             struct SDL::modules& modulesInGPU,
-            struct SDL::hits<TAcc>& hitsInGPU,
+            struct SDL::hits& hitsInGPU,
             unsigned int const & nHits) const // Total number of hits in event
         {
             using Dim = alpaka::Dim<TAcc>;
diff --git a/SDL/Kernels.cuh b/SDL/Kernels.cuh
index 4cc1310d..8fd7d952 100644
--- a/SDL/Kernels.cuh
+++ b/SDL/Kernels.cuh
@@ -27,8 +27,7 @@ namespace SDL
         pixelQuintupletsInGPU.isDup[pixelQuintupletIndex] = 1;
     };
 
-    template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE void rmPixelSegmentFromMemory(struct SDL::segments<TAcc>& segmentsInGPU, unsigned int pixelSegmentArrayIndex)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void rmPixelSegmentFromMemory(struct SDL::segments& segmentsInGPU, unsigned int pixelSegmentArrayIndex)
     {
         segmentsInGPU.isDup[pixelSegmentArrayIndex] = 1;
     };
@@ -453,7 +452,7 @@ namespace SDL
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
                 struct SDL::modules& modulesInGPU,
-                struct SDL::segments<TAcc>& segmentsInGPU,
+                struct SDL::segments& segmentsInGPU,
                 bool secondpass) const
         {
             using Dim = alpaka::Dim<TAcc>;
diff --git a/SDL/LST.cc b/SDL/LST.cc
index 83481428..a46a6167 100644
--- a/SDL/LST.cc
+++ b/SDL/LST.cc
@@ -403,7 +403,7 @@ void SDL::LST::getOutput(SDL::Event& event) {
     std::vector<int> tc_seedIdx_;
     std::vector<short> tc_trackCandidateType_;
 
-    SDL::hits<alpaka::DevCpu>& hitsInGPU = (*event.getHitsInCMSSW());
+    SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event.getHitsInCMSSW());
     SDL::trackCandidates& trackCandidatesInGPU = (*event.getTrackCandidatesInCMSSW());
 
     unsigned int nTrackCandidates = *trackCandidatesInGPU.nTrackCandidates;
diff --git a/SDL/MiniDoublet.cuh b/SDL/MiniDoublet.cuh
index cfc81f55..850f01d8 100644
--- a/SDL/MiniDoublet.cuh
+++ b/SDL/MiniDoublet.cuh
@@ -68,8 +68,7 @@ namespace SDL
 
     void createMDsInExplicitMemory(struct miniDoublets& mdsInGPU, unsigned int maxMDs,uint16_t nLowerModules, unsigned int maxPixelMDs,cudaStream_t stream);
 
-    template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addMDToMemory(struct miniDoublets& mdsInGPU, struct SDL::hits<TAcc>& hitsInGPU, struct modules& modulesInGPU, unsigned int lowerHitIdx, unsigned int upperHitIdx, uint16_t& lowerModuleIdx, float dz, float dPhi, float dPhiChange, float shiftedX, float shiftedY, float shiftedZ, float noShiftedDz, float noShiftedDphi, float noShiftedDPhiChange, unsigned int idx)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addMDToMemory(struct miniDoublets& mdsInGPU, struct SDL::hits& hitsInGPU, struct modules& modulesInGPU, unsigned int lowerHitIdx, unsigned int upperHitIdx, uint16_t& lowerModuleIdx, float dz, float dPhi, float dPhiChange, float shiftedX, float shiftedY, float shiftedZ, float noShiftedDz, float noShiftedDphi, float noShiftedDPhiChange, unsigned int idx)
     {
         //the index into which this MD needs to be written will be computed in the kernel
         //nMDs variable will be incremented in the kernel, no need to worry about that here
@@ -660,7 +659,7 @@ namespace SDL
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
                 struct SDL::modules& modulesInGPU,
-                struct SDL::hits<TAcc>& hitsInGPU,
+                struct SDL::hits& hitsInGPU,
                 struct SDL::miniDoublets& mdsInGPU,
                 struct SDL::objectRanges& rangesInGPU) const
         {
@@ -804,7 +803,7 @@ namespace SDL
                 struct modules& modulesInGPU,
                 struct miniDoublets& mdsInGPU,
                 struct objectRanges& rangesInGPU,
-                struct SDL::hits<TAcc>& hitsInGPU) const
+                struct SDL::hits& hitsInGPU) const
         {
             using Dim = alpaka::Dim<TAcc>;
             using Idx = alpaka::Idx<TAcc>;
diff --git a/SDL/PixelTriplet.cuh b/SDL/PixelTriplet.cuh
index f7aaa54e..660aaeb7 100644
--- a/SDL/PixelTriplet.cuh
+++ b/SDL/PixelTriplet.cuh
@@ -48,8 +48,7 @@ namespace SDL
 
     void createPixelTripletsInExplicitMemory(struct pixelTriplets& pixelTripletsinGPU, unsigned int maxPixelTriplets, cudaStream_t stream);
 
-    template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, struct triplets& tripletsInGPU, struct pixelTriplets& pixelTripletsInGPU, unsigned int pixelSegmentIndex, unsigned int tripletIndex, float pixelRadius, float tripletRadius, float centerX, float centerY, float rPhiChiSquared, float rPhiChiSquaredInwards, float rzChiSquared, unsigned int pixelTripletIndex, float pt, float eta, float phi, float eta_pix, float phi_pix,float score)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, struct pixelTriplets& pixelTripletsInGPU, unsigned int pixelSegmentIndex, unsigned int tripletIndex, float pixelRadius, float tripletRadius, float centerX, float centerY, float rPhiChiSquared, float rPhiChiSquaredInwards, float rzChiSquared, unsigned int pixelTripletIndex, float pt, float eta, float phi, float eta_pix, float phi_pix,float score)
     {
         pixelTripletsInGPU.pixelSegmentIndices[pixelTripletIndex] = pixelSegmentIndex;
         pixelTripletsInGPU.tripletIndices[pixelTripletIndex] = tripletIndex;
@@ -131,7 +130,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTrackletDefaultAlgopT3(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& pixelLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTrackletDefaultAlgopT3(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& pixelLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
     {
         zLo = -999;
         zHi = -999;
@@ -664,7 +663,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTripletDefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& pixelSegmentIndex, unsigned int tripletIndex, float& pixelRadius, float& pixelRadiusError, float& tripletRadius, float& centerX, float& centerY, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, bool runChiSquaredCuts = true)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTripletDefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& pixelSegmentIndex, unsigned int tripletIndex, float& pixelRadius, float& pixelRadiusError, float& tripletRadius, float& centerX, float& centerY, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, bool runChiSquaredCuts = true)
     {
         bool pass = true;
 
@@ -769,7 +768,7 @@ namespace SDL
                 struct SDL::modules& modulesInGPU,
                 struct SDL::objectRanges& rangesInGPU,
                 struct SDL::miniDoublets& mdsInGPU,
-                struct SDL::segments<TAcc>& segmentsInGPU,
+                struct SDL::segments& segmentsInGPU,
                 struct SDL::triplets& tripletsInGPU,
                 struct SDL::pixelTriplets& pixelTripletsInGPU,
                 unsigned int* connectedPixelSize,
@@ -912,7 +911,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPBB(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int thirdMDIndex, unsigned int& fourthMDIndex, float& /*z_OutLo*/, float& /*rt_OutLo*/, float& dPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaOutCut, float& deltaBetaCut) // pixel to BB and BE segments
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPBB(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int thirdMDIndex, unsigned int& fourthMDIndex, float& /*z_OutLo*/, float& /*rt_OutLo*/, float& dPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaOutCut, float& deltaBetaCut) // pixel to BB and BE segments
     {
         bool pass = true;
 
@@ -1125,7 +1124,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPEE(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU ,struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& /*z_OutLo*/, float& /*rt_OutLo*/, float& deltaPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) // pixel to EE segments
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPEE(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU ,struct SDL::segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& /*z_OutLo*/, float& /*rt_OutLo*/, float& deltaPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) // pixel to EE segments
     {
         bool pass = true;
         bool isPS_OutLo = (modulesInGPU.moduleType[outerInnerLowerModuleIndex] == SDL::PS);
@@ -1386,8 +1385,7 @@ namespace SDL
 
     void createPixelQuintupletsInExplicitMemory(struct SDL::pixelQuintuplets& pixelQuintupletsInGPU, unsigned int maxPixelQuintuplets, cudaStream_t stream);
 
-    template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelQuintupletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, struct quintuplets& quintupletsInGPU, struct pixelQuintuplets& pixelQuintupletsInGPU, unsigned int pixelIndex, unsigned int T5Index, unsigned int pixelQuintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float score, float eta, float phi, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelQuintupletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct quintuplets& quintupletsInGPU, struct pixelQuintuplets& pixelQuintupletsInGPU, unsigned int pixelIndex, unsigned int T5Index, unsigned int pixelQuintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float score, float eta, float phi, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY)
     {
         pixelQuintupletsInGPU.pixelIndices[pixelQuintupletIndex] = pixelIndex;
         pixelQuintupletsInGPU.T5Indices[pixelQuintupletIndex] = T5Index;
@@ -1968,7 +1966,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelQuintupletDefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, struct triplets& tripletsInGPU, struct quintuplets& quintupletsInGPU, unsigned int& pixelSegmentIndex, unsigned int& quintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY, unsigned int pixelSegmentArrayIndex)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelQuintupletDefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, struct quintuplets& quintupletsInGPU, unsigned int& pixelSegmentIndex, unsigned int& quintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY, unsigned int pixelSegmentArrayIndex)
     {
         bool pass = true;
 
@@ -2102,7 +2100,7 @@ namespace SDL
                 TAcc const & acc,
                 struct SDL::modules& modulesInGPU,
                 struct SDL::miniDoublets& mdsInGPU,
-                struct SDL::segments<TAcc>& segmentsInGPU,
+                struct SDL::segments& segmentsInGPU,
                 struct SDL::triplets& tripletsInGPU,
                 struct SDL::quintuplets& quintupletsInGPU,
                 struct SDL::pixelQuintuplets& pixelQuintupletsInGPU,
@@ -2228,7 +2226,7 @@ namespace SDL
     };
  
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgoPPBB(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int thirdMDIndex, unsigned int& fourthMDIndex, float& z_OutLo, float& rt_OutLo, float& dPhiPos, float& dPhi, float& betaIn,  float& betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaOutCut, float& deltaBetaCut) // pixel to BB and BE segments
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgoPPBB(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int thirdMDIndex, unsigned int& fourthMDIndex, float& z_OutLo, float& rt_OutLo, float& dPhiPos, float& dPhi, float& betaIn,  float& betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaOutCut, float& deltaBetaCut) // pixel to BB and BE segments
     {
         bool pass = true;
 
@@ -2435,7 +2433,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgoPPEE(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU ,struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& z_OutLo, float& rt_OutLo, float& deltaPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) // pixel to EE segments
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgoPPEE(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU ,struct SDL::segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& z_OutLo, float& rt_OutLo, float& deltaPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) // pixel to EE segments
     {
         bool pass = true;
         bool isPS_OutLo = (modulesInGPU.moduleType[outerInnerLowerModuleIndex] == SDL::PS);
@@ -2650,7 +2648,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& pixelLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& pixelLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
     {
         zLo = -999;
         zHi = -999;
diff --git a/SDL/Quintuplet.cuh b/SDL/Quintuplet.cuh
index b4fe6b3d..dc7893a0 100644
--- a/SDL/Quintuplet.cuh
+++ b/SDL/Quintuplet.cuh
@@ -645,7 +645,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool T5HasCommonMiniDoublet(struct SDL::triplets& tripletsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, unsigned int innerTripletIndex, unsigned int outerTripletIndex)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool T5HasCommonMiniDoublet(struct SDL::triplets& tripletsInGPU, struct SDL::segments& segmentsInGPU, unsigned int innerTripletIndex, unsigned int outerTripletIndex)
     {
         unsigned int innerOuterSegmentIndex = tripletsInGPU.segmentIndices[2 * innerTripletIndex + 1];
         unsigned int outerInnerSegmentIndex = tripletsInGPU.segmentIndices[2 * outerTripletIndex];
@@ -1205,7 +1205,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoBBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoBBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut)
     {
         bool pass = true;
 
@@ -1398,7 +1398,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoBBEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoBBEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
     {
         bool pass = true;
         bool isPS_InLo = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS);
@@ -1609,7 +1609,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoEEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoEEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
     {
         bool pass = true;
 
@@ -1816,7 +1816,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletAlgoSelector(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletAlgoSelector(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
     {
         bool pass = false;
 
@@ -1874,7 +1874,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, struct SDL::triplets& tripletsInGPU, uint16_t& lowerModuleIndex1, uint16_t& lowerModuleIndex2, uint16_t& lowerModuleIndex3, uint16_t& lowerModuleIndex4, uint16_t& lowerModuleIndex5, unsigned int& innerTripletIndex, unsigned int& outerTripletIndex, float& innerRadius, float& outerRadius, float& bridgeRadius, float& regressionG, float& regressionF, float& regressionRadius, float& rzChiSquared, float& chiSquared, float& nonAnchorChiSquared, bool& TightCutFlag)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct SDL::triplets& tripletsInGPU, uint16_t& lowerModuleIndex1, uint16_t& lowerModuleIndex2, uint16_t& lowerModuleIndex3, uint16_t& lowerModuleIndex4, uint16_t& lowerModuleIndex5, unsigned int& innerTripletIndex, unsigned int& outerTripletIndex, float& innerRadius, float& outerRadius, float& bridgeRadius, float& regressionG, float& regressionF, float& regressionRadius, float& rzChiSquared, float& chiSquared, float& nonAnchorChiSquared, bool& TightCutFlag)
     {
         bool pass = true;
         unsigned int firstSegmentIndex = tripletsInGPU.segmentIndices[2 * innerTripletIndex];
@@ -2078,7 +2078,7 @@ namespace SDL
                 TAcc const & acc,
                 struct SDL::modules& modulesInGPU,
                 struct SDL::miniDoublets& mdsInGPU,
-                struct SDL::segments<TAcc>& segmentsInGPU,
+                struct SDL::segments& segmentsInGPU,
                 struct SDL::triplets& tripletsInGPU,
                 struct SDL::quintuplets& quintupletsInGPU,
                 struct SDL::objectRanges& rangesInGPU,
diff --git a/SDL/Segment.cuh b/SDL/Segment.cuh
index 85a1df54..88f5718f 100644
--- a/SDL/Segment.cuh
+++ b/SDL/Segment.cuh
@@ -10,84 +10,30 @@
 
 namespace SDL
 {
-    template<typename TAcc>
     struct segments
     {
-        // Buffer objects for each member variable
-        Buf<TAcc, FPX> dPhis_buf;
-        Buf<TAcc, FPX> dPhiMins_buf;
-        Buf<TAcc, FPX> dPhiMaxs_buf;
-        Buf<TAcc, FPX> dPhiChanges_buf;
-        Buf<TAcc, FPX> dPhiChangeMins_buf;
-        Buf<TAcc, FPX> dPhiChangeMaxs_buf;
-
-        Buf<TAcc, uint16_t> innerLowerModuleIndices_buf;
-        Buf<TAcc, uint16_t> outerLowerModuleIndices_buf;
-
-        Buf<TAcc, unsigned int> seedIdx_buf;
-        Buf<TAcc, unsigned int> mdIndices_buf;
-        Buf<TAcc, unsigned int> nMemoryLocations_buf;
-        Buf<TAcc, unsigned int> innerMiniDoubletAnchorHitIndices_buf;
-        Buf<TAcc, unsigned int> outerMiniDoubletAnchorHitIndices_buf;
-
-        Buf<TAcc, int> charge_buf;
-        Buf<TAcc, int> superbin_buf;
-        Buf<TAcc, int> nSegments_buf;
-        Buf<TAcc, int> totOccupancySegments_buf;
-
-        Buf<TAcc, uint4> pLSHitsIdxs_buf;
-
-        Buf<TAcc, int8_t> pixelType_buf;
-
-        Buf<TAcc, char> isQuad_buf;
-
-        Buf<TAcc, bool> isDup_buf;
-        Buf<TAcc, bool> partOfPT5_buf;
-
-        Buf<TAcc, float> ptIn_buf;
-        Buf<TAcc, float> ptErr_buf;
-        Buf<TAcc, float> px_buf;
-        Buf<TAcc, float> py_buf;
-        Buf<TAcc, float> pz_buf;
-        Buf<TAcc, float> etaErr_buf;
-        Buf<TAcc, float> eta_buf;
-        Buf<TAcc, float> phi_buf;
-        Buf<TAcc, float> score_buf;
-        Buf<TAcc, float> circleCenterX_buf;
-        Buf<TAcc, float> circleCenterY_buf;
-        Buf<TAcc, float> circleRadius_buf;
-
-        // Pointers towards the data of each buffer
         FPX* dPhis;
         FPX* dPhiMins;
         FPX* dPhiMaxs;
         FPX* dPhiChanges;
         FPX* dPhiChangeMins;
         FPX* dPhiChangeMaxs;
-
         uint16_t* innerLowerModuleIndices;
         uint16_t* outerLowerModuleIndices;
-
         unsigned int* seedIdx;
         unsigned int* mdIndices;
         unsigned int* nMemoryLocations;
         unsigned int* innerMiniDoubletAnchorHitIndices;
         unsigned int* outerMiniDoubletAnchorHitIndices;
-
         int* charge;
         int* superbin;
         int* nSegments; //number of segments per inner lower module
         int* totOccupancySegments; //number of segments per inner lower module
-
         uint4* pLSHitsIdxs;
-
         int8_t* pixelType;
-
         char* isQuad;
-
         bool* isDup;
         bool* partOfPT5;
-
         float* ptIn;
         float* ptErr;
         float* px;
@@ -101,12 +47,90 @@ namespace SDL
         float* circleCenterY;
         float* circleRadius;
 
+        template<typename TBuff>
+        void setData(TBuff& segmentsbuf)
+        {
+            dPhis = alpaka::getPtrNative(segmentsbuf.dPhis_buf);
+            dPhiMins = alpaka::getPtrNative(segmentsbuf.dPhiMins_buf);
+            dPhiMaxs = alpaka::getPtrNative(segmentsbuf.dPhiMaxs_buf);
+            dPhiChanges = alpaka::getPtrNative(segmentsbuf.dPhiChanges_buf);
+            dPhiChangeMins = alpaka::getPtrNative(segmentsbuf.dPhiChangeMins_buf);
+            dPhiChangeMaxs = alpaka::getPtrNative(segmentsbuf.dPhiChangeMaxs_buf);
+            innerLowerModuleIndices = alpaka::getPtrNative(segmentsbuf.innerLowerModuleIndices_buf);
+            outerLowerModuleIndices = alpaka::getPtrNative(segmentsbuf.outerLowerModuleIndices_buf);
+            seedIdx = alpaka::getPtrNative(segmentsbuf.seedIdx_buf);
+            mdIndices = alpaka::getPtrNative(segmentsbuf.mdIndices_buf);
+            nMemoryLocations = alpaka::getPtrNative(segmentsbuf.nMemoryLocations_buf);
+            innerMiniDoubletAnchorHitIndices = alpaka::getPtrNative(segmentsbuf.innerMiniDoubletAnchorHitIndices_buf);
+            outerMiniDoubletAnchorHitIndices = alpaka::getPtrNative(segmentsbuf.outerMiniDoubletAnchorHitIndices_buf);
+            charge = alpaka::getPtrNative(segmentsbuf.charge_buf);
+            superbin = alpaka::getPtrNative(segmentsbuf.superbin_buf);
+            nSegments = alpaka::getPtrNative(segmentsbuf.nSegments_buf);
+            totOccupancySegments = alpaka::getPtrNative(segmentsbuf.totOccupancySegments_buf);
+            pLSHitsIdxs = alpaka::getPtrNative(segmentsbuf.pLSHitsIdxs_buf);
+            pixelType = alpaka::getPtrNative(segmentsbuf.pixelType_buf);
+            isQuad = alpaka::getPtrNative(segmentsbuf.isQuad_buf);
+            isDup = alpaka::getPtrNative(segmentsbuf.isDup_buf);
+            partOfPT5 = alpaka::getPtrNative(segmentsbuf.partOfPT5_buf);
+            ptIn = alpaka::getPtrNative(segmentsbuf.ptIn_buf);
+            ptErr = alpaka::getPtrNative(segmentsbuf.ptErr_buf);
+            px = alpaka::getPtrNative(segmentsbuf.px_buf);
+            py = alpaka::getPtrNative(segmentsbuf.py_buf);
+            pz = alpaka::getPtrNative(segmentsbuf.pz_buf);
+            etaErr = alpaka::getPtrNative(segmentsbuf.etaErr_buf);
+            eta = alpaka::getPtrNative(segmentsbuf.eta_buf);
+            phi = alpaka::getPtrNative(segmentsbuf.phi_buf);
+            score = alpaka::getPtrNative(segmentsbuf.score_buf);
+            circleCenterX = alpaka::getPtrNative(segmentsbuf.circleCenterX_buf);
+            circleCenterY = alpaka::getPtrNative(segmentsbuf.circleCenterY_buf);
+            circleRadius = alpaka::getPtrNative(segmentsbuf.circleRadius_buf);
+        }
+    };
+
+    template<typename TAcc>
+    struct segmentsBuffer : segments
+    {
+        Buf<TAcc, FPX> dPhis_buf;
+        Buf<TAcc, FPX> dPhiMins_buf;
+        Buf<TAcc, FPX> dPhiMaxs_buf;
+        Buf<TAcc, FPX> dPhiChanges_buf;
+        Buf<TAcc, FPX> dPhiChangeMins_buf;
+        Buf<TAcc, FPX> dPhiChangeMaxs_buf;
+        Buf<TAcc, uint16_t> innerLowerModuleIndices_buf;
+        Buf<TAcc, uint16_t> outerLowerModuleIndices_buf;
+        Buf<TAcc, unsigned int> seedIdx_buf;
+        Buf<TAcc, unsigned int> mdIndices_buf;
+        Buf<TAcc, unsigned int> nMemoryLocations_buf;
+        Buf<TAcc, unsigned int> innerMiniDoubletAnchorHitIndices_buf;
+        Buf<TAcc, unsigned int> outerMiniDoubletAnchorHitIndices_buf;
+        Buf<TAcc, int> charge_buf;
+        Buf<TAcc, int> superbin_buf;
+        Buf<TAcc, int> nSegments_buf;
+        Buf<TAcc, int> totOccupancySegments_buf;
+        Buf<TAcc, uint4> pLSHitsIdxs_buf;
+        Buf<TAcc, int8_t> pixelType_buf;
+        Buf<TAcc, char> isQuad_buf;
+        Buf<TAcc, bool> isDup_buf;
+        Buf<TAcc, bool> partOfPT5_buf;
+        Buf<TAcc, float> ptIn_buf;
+        Buf<TAcc, float> ptErr_buf;
+        Buf<TAcc, float> px_buf;
+        Buf<TAcc, float> py_buf;
+        Buf<TAcc, float> pz_buf;
+        Buf<TAcc, float> etaErr_buf;
+        Buf<TAcc, float> eta_buf;
+        Buf<TAcc, float> phi_buf;
+        Buf<TAcc, float> score_buf;
+        Buf<TAcc, float> circleCenterX_buf;
+        Buf<TAcc, float> circleCenterY_buf;
+        Buf<TAcc, float> circleRadius_buf;
+
         template<typename TQueue, typename TDevAcc>
-        segments(unsigned int nMemoryLocationsIn,
-                    uint16_t nLowerModules,
-                    unsigned int maxPixelSegments,
-                    TDevAcc const & devAccIn,
-                    TQueue& queue) :
+        segmentsBuffer(unsigned int nMemoryLocationsIn,
+                        uint16_t nLowerModules,
+                        unsigned int maxPixelSegments,
+                        TDevAcc const & devAccIn,
+                        TQueue& queue) :
             dPhis_buf(allocBufWrapper<FPX>(devAccIn, nMemoryLocationsIn)),
             dPhiMins_buf(allocBufWrapper<FPX>(devAccIn, nMemoryLocationsIn)),
             dPhiMaxs_buf(allocBufWrapper<FPX>(devAccIn, nMemoryLocationsIn)),
@@ -142,49 +166,6 @@ namespace SDL
             circleCenterY_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
             circleRadius_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments))
         {
-            dPhis = alpaka::getPtrNative(dPhis_buf);
-            dPhiMins = alpaka::getPtrNative(dPhiMins_buf);
-            dPhiMaxs = alpaka::getPtrNative(dPhiMaxs_buf);
-            dPhiChanges = alpaka::getPtrNative(dPhiChanges_buf);
-            dPhiChangeMins = alpaka::getPtrNative(dPhiChangeMins_buf);
-            dPhiChangeMaxs = alpaka::getPtrNative(dPhiChangeMaxs_buf);
-
-            innerLowerModuleIndices = alpaka::getPtrNative(innerLowerModuleIndices_buf);
-            outerLowerModuleIndices = alpaka::getPtrNative(outerLowerModuleIndices_buf);
-
-            seedIdx = alpaka::getPtrNative(seedIdx_buf);
-            mdIndices = alpaka::getPtrNative(mdIndices_buf);
-            nMemoryLocations = alpaka::getPtrNative(nMemoryLocations_buf);
-            innerMiniDoubletAnchorHitIndices = alpaka::getPtrNative(innerMiniDoubletAnchorHitIndices_buf);
-            outerMiniDoubletAnchorHitIndices = alpaka::getPtrNative(outerMiniDoubletAnchorHitIndices_buf);
-
-            charge = alpaka::getPtrNative(charge_buf);
-            superbin = alpaka::getPtrNative(superbin_buf);
-            nSegments = alpaka::getPtrNative(nSegments_buf);
-            totOccupancySegments = alpaka::getPtrNative(totOccupancySegments_buf);
-
-            pLSHitsIdxs = alpaka::getPtrNative(pLSHitsIdxs_buf);
-
-            pixelType = alpaka::getPtrNative(pixelType_buf);
-
-            isQuad = alpaka::getPtrNative(isQuad_buf);
-
-            isDup = alpaka::getPtrNative(isDup_buf);
-            partOfPT5 = alpaka::getPtrNative(partOfPT5_buf);
-
-            ptIn = alpaka::getPtrNative(ptIn_buf);
-            ptErr = alpaka::getPtrNative(ptErr_buf);
-            px = alpaka::getPtrNative(px_buf);
-            py = alpaka::getPtrNative(py_buf);
-            pz = alpaka::getPtrNative(pz_buf);
-            etaErr = alpaka::getPtrNative(etaErr_buf);
-            eta = alpaka::getPtrNative(eta_buf);
-            phi = alpaka::getPtrNative(phi_buf);
-            score = alpaka::getPtrNative(score_buf);
-            circleCenterX = alpaka::getPtrNative(circleCenterX_buf);
-            circleCenterY = alpaka::getPtrNative(circleCenterY_buf);
-            circleRadius = alpaka::getPtrNative(circleRadius_buf);
-
             alpaka::memset(queue, nSegments_buf, 0u, nLowerModules + 1);
             alpaka::memset(queue, totOccupancySegments_buf, 0u, nLowerModules + 1);
             alpaka::memset(queue, partOfPT5_buf, 0u, maxPixelSegments);
@@ -450,8 +431,7 @@ namespace SDL
         dAlphaThresholdValues[2] = dAlpha_Bfield + alpaka::math::sqrt(acc, dAlpha_res * dAlpha_res + sdMuls * sdMuls);
     };
 
-    template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addSegmentToMemory(struct SDL::segments<TAcc>& segmentsInGPU, unsigned int lowerMDIndex, unsigned int upperMDIndex, uint16_t innerLowerModuleIndex, uint16_t outerLowerModuleIndex, unsigned int innerMDAnchorHitIndex, unsigned int outerMDAnchorHitIndex, float& dPhi, float& dPhiMin, float& dPhiMax, float& dPhiChange, float& dPhiChangeMin, float& dPhiChangeMax, unsigned int idx)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addSegmentToMemory(struct SDL::segments& segmentsInGPU, unsigned int lowerMDIndex, unsigned int upperMDIndex, uint16_t innerLowerModuleIndex, uint16_t outerLowerModuleIndex, unsigned int innerMDAnchorHitIndex, unsigned int outerMDAnchorHitIndex, float& dPhi, float& dPhiMin, float& dPhiMax, float& dPhiChange, float& dPhiChangeMin, float& dPhiChangeMax, unsigned int idx)
     {
         //idx will be computed in the kernel, which is the index into which the 
         //segment will be written
@@ -473,7 +453,7 @@ namespace SDL
     }
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelSegmentToMemory(TAcc const & acc, struct SDL::segments<TAcc>& segmentsInGPU, struct miniDoublets& mdsInGPU, unsigned int innerMDIndex, unsigned int outerMDIndex, uint16_t pixelModuleIndex, unsigned int hitIdxs[4], unsigned int innerAnchorHitIndex, unsigned int outerAnchorHitIndex, float dPhiChange, unsigned int idx, unsigned int pixelSegmentArrayIndex, float score)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelSegmentToMemory(TAcc const & acc, struct SDL::segments& segmentsInGPU, struct miniDoublets& mdsInGPU, unsigned int innerMDIndex, unsigned int outerMDIndex, uint16_t pixelModuleIndex, unsigned int hitIdxs[4], unsigned int innerAnchorHitIndex, unsigned int outerAnchorHitIndex, float dPhiChange, unsigned int idx, unsigned int pixelSegmentArrayIndex, float score)
     {
         segmentsInGPU.mdIndices[idx * 2] = innerMDIndex;
         segmentsInGPU.mdIndices[idx * 2 + 1] = outerMDIndex;
@@ -704,7 +684,7 @@ namespace SDL
                 TAcc const & acc,
                 struct SDL::modules& modulesInGPU,
                 struct SDL::miniDoublets& mdsInGPU,
-                struct SDL::segments<TAcc>& segmentsInGPU,
+                struct SDL::segments& segmentsInGPU,
                 struct SDL::objectRanges& rangesInGPU) const
         {
             using Dim = alpaka::Dim<TAcc>;
@@ -856,7 +836,7 @@ namespace SDL
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
                 struct modules& modulesInGPU,
-                struct SDL::segments<TAcc>& segmentsInGPU,
+                struct SDL::segments& segmentsInGPU,
                 struct objectRanges& rangesInGPU) const
         {
             using Dim = alpaka::Dim<TAcc>;
@@ -890,9 +870,9 @@ namespace SDL
             TAcc const & acc,
             struct SDL::modules& modulesInGPU,
             struct SDL::objectRanges& rangesInGPU,
-            struct SDL::hits<TAcc>& hitsInGPU,
+            struct SDL::hits& hitsInGPU,
             struct SDL::miniDoublets& mdsInGPU,
-            struct SDL::segments<TAcc>& segmentsInGPU,
+            struct SDL::segments& segmentsInGPU,
             unsigned int* hitIndices0,
             unsigned int* hitIndices1,
             unsigned int* hitIndices2,
diff --git a/SDL/TrackCandidate.cuh b/SDL/TrackCandidate.cuh
index b4564106..d81a570d 100644
--- a/SDL/TrackCandidate.cuh
+++ b/SDL/TrackCandidate.cuh
@@ -81,8 +81,7 @@ namespace SDL
         trackCandidatesInGPU.radius[trackCandidateIndex]  = __F2H(radius);
     };
 
-    template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE int checkPixelHits(unsigned int ix, unsigned int jx, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, struct SDL::hits<TAcc>& hitsInGPU)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE int checkPixelHits(unsigned int ix, unsigned int jx, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct SDL::hits& hitsInGPU)
     {
         int phits1[4] = {-1,-1,-1,-1};
         int phits2[4] = {-1,-1,-1,-1};
@@ -128,7 +127,7 @@ namespace SDL
                 struct SDL::modules& modulesInGPU,
                 struct SDL::objectRanges& rangesInGPU,
                 struct SDL::pixelTriplets& pixelTripletsInGPU,
-                struct SDL::segments<TAcc>& segmentsInGPU,
+                struct SDL::segments& segmentsInGPU,
                 struct SDL::pixelQuintuplets& pixelQuintupletsInGPU) const
         {
             using Dim = alpaka::Dim<TAcc>;
@@ -243,9 +242,9 @@ namespace SDL
                 struct SDL::objectRanges& rangesInGPU,
                 struct SDL::pixelTriplets& pixelTripletsInGPU,
                 struct SDL::trackCandidates& trackCandidatesInGPU,
-                struct SDL::segments<TAcc>& segmentsInGPU,
+                struct SDL::segments& segmentsInGPU,
                 struct SDL::miniDoublets& mdsInGPU,
-                struct SDL::hits<TAcc>& hitsInGPU,
+                struct SDL::hits& hitsInGPU,
                 struct SDL::quintuplets& quintupletsInGPU) const
         {
             using Dim = alpaka::Dim<TAcc>;
@@ -328,7 +327,7 @@ namespace SDL
                 uint16_t nLowerModules,
                 struct SDL::pixelTriplets& pixelTripletsInGPU,
                 struct SDL::trackCandidates& trackCandidatesInGPU,
-                struct SDL::segments<TAcc>& segmentsInGPU,
+                struct SDL::segments& segmentsInGPU,
                 struct SDL::objectRanges& rangesInGPU) const
         {
             using Dim = alpaka::Dim<TAcc>;
@@ -399,7 +398,7 @@ namespace SDL
                 TAcc const & acc,
                 uint16_t nLowerModules,
                 struct SDL::trackCandidates& trackCandidatesInGPU,
-                struct SDL::segments<TAcc>& segmentsInGPU) const
+                struct SDL::segments& segmentsInGPU) const
         {
             using Dim = alpaka::Dim<TAcc>;
             using Idx = alpaka::Idx<TAcc>;
@@ -429,7 +428,7 @@ namespace SDL
                 uint16_t nLowerModules,
                 struct SDL::pixelQuintuplets& pixelQuintupletsInGPU,
                 struct SDL::trackCandidates& trackCandidatesInGPU,
-                struct SDL::segments<TAcc>& segmentsInGPU,
+                struct SDL::segments& segmentsInGPU,
                 struct SDL::objectRanges& rangesInGPU) const
         {
             using Dim = alpaka::Dim<TAcc>;
diff --git a/SDL/Triplet.cuh b/SDL/Triplet.cuh
index 1b1d1063..8a9fc96f 100644
--- a/SDL/Triplet.cuh
+++ b/SDL/Triplet.cuh
@@ -59,11 +59,9 @@ namespace SDL
     void createTripletsInExplicitMemory(struct triplets& tripletsInGPU, unsigned int maxTriplets, uint16_t nLowerModules,cudaStream_t stream);
 
 #ifdef CUT_VALUE_DEBUG
-    template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float&zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ, unsigned int& tripletIndex)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float&zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ, unsigned int& tripletIndex)
 #else
-    template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& betaIn, float& betaOut, float& pt_beta, unsigned int& tripletIndex)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& betaIn, float& betaOut, float& pt_beta, unsigned int& tripletIndex)
 #endif
     {
         tripletsInGPU.segmentIndices[tripletIndex * 2] = innerSegmentIndex;
@@ -110,7 +108,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passRZConstraint(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex) 
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passRZConstraint(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex) 
     {
         //get the rt and z
         const float& r1 = mdsInGPU.anchorRt[firstMDIndex];
@@ -191,7 +189,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut)
     {
         bool pass = true;
         bool isPSIn = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS);
@@ -250,7 +248,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut)
     {
         bool pass = true;
         //unsigned int outerInnerLowerModuleIndex = middleLowerModuleIndex;
@@ -329,7 +327,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut)
     {
         bool pass = true;
         bool isPSIn = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS);
@@ -409,7 +407,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraint(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraint(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut)
     {
         short innerInnerLowerModuleSubdet = modulesInGPU.subdets[innerInnerLowerModuleIndex];
         short middleLowerModuleSubdet = modulesInGPU.subdets[middleLowerModuleIndex];
@@ -490,7 +488,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoBBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoBBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut)
     {
         bool pass = true;
 
@@ -686,7 +684,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoBBEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoBBEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
     {
         bool pass = true;
         bool isPS_InLo = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS);
@@ -901,7 +899,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoEEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoEEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
     {
         bool pass = true;
 
@@ -1110,7 +1108,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
     {
         bool pass = false;
 
@@ -1175,7 +1173,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletConstraintsAndAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments<TAcc>& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float &zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletConstraintsAndAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float &zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
     {
         bool pass = true;
 
@@ -1202,7 +1200,7 @@ namespace SDL
                 TAcc const & acc,
                 struct SDL::modules& modulesInGPU,
                 struct SDL::miniDoublets& mdsInGPU,
-                struct SDL::segments<TAcc>& segmentsInGPU,
+                struct SDL::segments& segmentsInGPU,
                 struct SDL::triplets& tripletsInGPU,
                 struct SDL::objectRanges& rangesInGPU,
                 uint16_t *index_gpu,
@@ -1277,7 +1275,7 @@ namespace SDL
                 TAcc const & acc,
                 struct modules& modulesInGPU,
                 struct objectRanges& rangesInGPU,
-                struct SDL::segments<TAcc>& segmentsInGPU) const
+                struct SDL::segments& segmentsInGPU) const
         {
             using Dim = alpaka::Dim<TAcc>;
             using Idx = alpaka::Idx<TAcc>;
diff --git a/code/core/AccessHelper.cc b/code/core/AccessHelper.cc
index df6caf6a..3ab470fd 100644
--- a/code/core/AccessHelper.cc
+++ b/code/core/AccessHelper.cc
@@ -7,7 +7,7 @@
 //____________________________________________________________________________________________
 std::tuple<std::vector<unsigned int>, std::vector<unsigned int>> convertHitsToHitIdxsAndHitTypes(SDL::Event* event, std::vector<unsigned int> hits)
 {
-    SDL::hits<alpaka::DevCpu>& hitsInGPU = *(event->getHits());
+    SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = *(event->getHits());
     std::vector<unsigned int> hitidxs;
     std::vector<unsigned int> hittypes;
     for (auto& hit : hits)
@@ -28,7 +28,7 @@ std::tuple<std::vector<unsigned int>, std::vector<unsigned int>> convertHitsToHi
 //____________________________________________________________________________________________
 std::vector<unsigned int> getPixelHitsFrompLS(SDL::Event* event, unsigned int pLS)
 {
-    SDL::segments<alpaka::DevCpu>& segments_ = *(event->getSegments());
+    SDL::segmentsBuffer<alpaka::DevCpu>& segments_ = *(event->getSegments());
     SDL::miniDoublets& miniDoublets_ = *(event->getMiniDoublets());
     SDL::objectRanges& rangesInGPU = (*event->getRanges());
     SDL::modules& modulesInGPU = (*event->getModules());
@@ -48,7 +48,7 @@ std::vector<unsigned int> getPixelHitsFrompLS(SDL::Event* event, unsigned int pL
 //____________________________________________________________________________________________
 std::vector<unsigned int> getPixelHitIdxsFrompLS(SDL::Event* event, unsigned int pLS)
 {
-    SDL::hits<alpaka::DevCpu>& hitsInGPU = *(event->getHits());
+    SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = *(event->getHits());
     std::vector<unsigned int> hits = getPixelHitsFrompLS(event, pLS);
     std::vector<unsigned int> hitidxs;
     for (auto& hit : hits)
@@ -96,7 +96,7 @@ std::tuple<std::vector<unsigned int>, std::vector<unsigned int>> getHitIdxsAndHi
 //____________________________________________________________________________________________
 std::vector<unsigned int> getMDsFromLS(SDL::Event* event, unsigned int LS)
 {
-    SDL::segments<alpaka::DevCpu>& segments_ = *(event->getSegments());
+    SDL::segmentsBuffer<alpaka::DevCpu>& segments_ = *(event->getSegments());
     unsigned int MD_1 = segments_.mdIndices[2 * LS];
     unsigned int MD_2 = segments_.mdIndices[2 * LS + 1];
     return {MD_1, MD_2};
@@ -203,7 +203,7 @@ std::vector<unsigned int> getHitsFromT5(SDL::Event* event, unsigned int T5)
 //____________________________________________________________________________________________
 std::vector<unsigned int> getHitIdxsFromT5(SDL::Event* event, unsigned int T5)
 {
-    SDL::hits<alpaka::DevCpu>& hitsInGPU = *(event->getHits());
+    SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = *(event->getHits());
     std::vector<unsigned int> hits = getHitsFromT5(event, T5);
     std::vector<unsigned int> hitidxs;
     for (auto& hit : hits)
@@ -215,7 +215,7 @@ std::vector<unsigned int> getModuleIdxsFromT5(SDL::Event* event, unsigned int T5
 {
     std::vector<unsigned int> hits = getHitsFromT5(event, T5);
     std::vector<unsigned int> module_idxs;
-    SDL::hits<alpaka::DevCpu>& hitsInGPU = *(event->getHits());
+    SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = *(event->getHits());
     for(auto &hitIdx:hits)
     {
         module_idxs.push_back(hitsInGPU.moduleIndices[hitIdx]);
@@ -297,7 +297,7 @@ std::vector<unsigned int> getHitsFrompT3(SDL::Event* event, unsigned int pT3)
 //____________________________________________________________________________________________
 std::vector<unsigned int> getHitIdxsFrompT3(SDL::Event* event, unsigned int pT3)
 {
-    SDL::hits<alpaka::DevCpu>& hitsInGPU = *(event->getHits());
+    SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = *(event->getHits());
     std::vector<unsigned int> hits = getHitsFrompT3(event, pT3);
     std::vector<unsigned int> hitidxs;
     for (auto& hit : hits)
@@ -309,7 +309,7 @@ std::vector<unsigned int> getModuleIdxsFrompT3(SDL::Event* event, unsigned int p
 {
     std::vector<unsigned int> hits = getOuterTrackerHitsFrompT3(event, pT3);
     std::vector<unsigned int> module_idxs;
-    SDL::hits<alpaka::DevCpu>& hitsInGPU = *(event->getHits());
+    SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = *(event->getHits());
     for(auto &hitIdx:hits)
     {
         module_idxs.push_back(hitsInGPU.moduleIndices[hitIdx]);
@@ -405,7 +405,7 @@ std::vector<unsigned int> getHitsFrompT5(SDL::Event* event, unsigned int pT5)
 //____________________________________________________________________________________________
 std::vector<unsigned int> getHitIdxsFrompT5(SDL::Event* event, unsigned int pT5)
 {
-    SDL::hits<alpaka::DevCpu>& hitsInGPU = *(event->getHits());
+    SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = *(event->getHits());
     std::vector<unsigned int> hits = getHitsFrompT5(event, pT5);
     std::vector<unsigned int> hitidxs;
     for (auto& hit : hits)
@@ -418,7 +418,7 @@ std::vector<unsigned int> getModuleIdxsFrompT5(SDL::Event* event, unsigned int p
 {
     std::vector<unsigned int> hits = getOuterTrackerHitsFrompT5(event, pT5);
     std::vector<unsigned int> module_idxs;
-    SDL::hits<alpaka::DevCpu>& hitsInGPU = *(event->getHits());
+    SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = *(event->getHits());
     for(auto &hitIdx:hits)
     {
         module_idxs.push_back(hitsInGPU.moduleIndices[hitIdx]);
diff --git a/code/core/write_sdl_ntuple.cc b/code/core/write_sdl_ntuple.cc
index cce8b6f1..43d7831f 100644
--- a/code/core/write_sdl_ntuple.cc
+++ b/code/core/write_sdl_ntuple.cc
@@ -307,7 +307,7 @@ void setPixelQuintupletOutputBranches(SDL::Event* event)
     // ============ pT5 =============
     SDL::pixelQuintuplets& pixelQuintupletsInGPU = (*event->getPixelQuintuplets());
     SDL::quintuplets& quintupletsInGPU = (*event->getQuintuplets());
-    SDL::segments<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
+    SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::modules& modulesInGPU = (*event->getModules());
     int n_accepted_simtrk = ana.tx->getBranch<vector<int>>("sim_TC_matched").size();
 
@@ -476,8 +476,8 @@ void setPixelTripletOutputBranches(SDL::Event* event)
     SDL::pixelTriplets& pixelTripletsInGPU = (*event->getPixelTriplets());
     SDL::triplets& tripletsInGPU = *(event->getTriplets());
     SDL::modules& modulesInGPU = *(event->getModules());
-    SDL::segments<alpaka::DevCpu>& segmentsInGPU = *(event->getSegments());
-    SDL::hits<alpaka::DevCpu>& hitsInGPU = *(event->getHits());
+    SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = *(event->getSegments());
+    SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = *(event->getHits());
     int n_accepted_simtrk = ana.tx->getBranch<vector<int>>("sim_TC_matched").size();
 
     unsigned int nPixelTriplets = *pixelTripletsInGPU.nPixelTriplets;
@@ -559,9 +559,9 @@ void setPixelTripletOutputBranches(SDL::Event* event)
 void setGnnNtupleBranches(SDL::Event* event)
 {
     // Get relevant information
-    SDL::segments<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
+    SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
-    SDL::hits<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
+    SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
     SDL::modules& modulesInGPU = (*event->getModules());
     SDL::objectRanges& rangesInGPU = (*event->getRanges());
     SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates());
@@ -717,7 +717,7 @@ void setGnnNtupleMiniDoublet(SDL::Event* event, unsigned int MD)
 {
     // Get relevant information
     SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
-    SDL::hits<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
+    SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
 
     // Get the hit indices
     unsigned int hit0 = miniDoubletsInGPU.anchorHitIndices[MD];
@@ -821,8 +821,8 @@ std::tuple<float, float, float, vector<unsigned int>, vector<unsigned int>> pars
     // Get relevant information
     SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates());
     SDL::triplets& tripletsInGPU = (*event->getTriplets());
-    SDL::segments<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
-    SDL::hits<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
+    SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
+    SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
 
     //
     // pictorial representation of a pT5
@@ -959,8 +959,8 @@ std::tuple<float, float, float, vector<unsigned int>, vector<unsigned int>> pars
     // Get relevant information
     SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates());
     SDL::triplets& tripletsInGPU = (*event->getTriplets());
-    SDL::segments<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
-    SDL::hits<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
+    SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
+    SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
 
     //
     // pictorial representation of a pT3
@@ -1006,7 +1006,7 @@ std::tuple<float, float, float, vector<unsigned int>, vector<unsigned int>> pars
 {
     SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates());
     SDL::triplets& tripletsInGPU = (*event->getTriplets());
-    SDL::hits<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
+    SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
     unsigned int T5 = trackCandidatesInGPU.directObjectIndices[idx];
     std::vector<unsigned int> T3s = getT3sFromT5(event, T5);
     std::vector<unsigned int> hits = getHitsFromT5(event, T5);
@@ -1059,7 +1059,7 @@ std::tuple<float, float, float, vector<unsigned int>, vector<unsigned int>> pars
 std::tuple<float, float, float, vector<unsigned int>, vector<unsigned int>> parsepLS(SDL::Event* event, unsigned int idx)
 {
     SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates());
-    SDL::segments<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
+    SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
 
     // Getting pLS index
     unsigned int pLS = trackCandidatesInGPU.directObjectIndices[idx];
@@ -1151,7 +1151,7 @@ void printAllObjects(SDL::Event* event)
 void printMDs(SDL::Event* event)
 {
     SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
-    SDL::hits<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
+    SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
     SDL::modules& modulesInGPU = (*event->getModules());
     SDL::objectRanges& rangesInGPU = (*event->getRanges());
 
@@ -1173,9 +1173,9 @@ void printMDs(SDL::Event* event)
 //________________________________________________________________________________________________________________________________
 void printLSs(SDL::Event* event)
 {
-    SDL::segments<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
+    SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
-    SDL::hits<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
+    SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
     SDL::modules& modulesInGPU = (*event->getModules());
     SDL::objectRanges& rangesInGPU = (*event->getRanges());
 
@@ -1206,9 +1206,9 @@ void printLSs(SDL::Event* event)
 //________________________________________________________________________________________________________________________________
 void printpLSs(SDL::Event* event)
 {
-    SDL::segments<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
+    SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
-    SDL::hits<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
+    SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
     SDL::modules& modulesInGPU = (*event->getModules());
     SDL::objectRanges& rangesInGPU = (*event->getRanges());
 
@@ -1237,9 +1237,9 @@ void printpLSs(SDL::Event* event)
 void printT3s(SDL::Event* event)
 {
     SDL::triplets& tripletsInGPU = (*event->getTriplets());
-    SDL::segments<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
+    SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
-    SDL::hits<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
+    SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
     SDL::modules& modulesInGPU = (*event->getModules());
     int nTriplets = 0;
     for (unsigned int i = 0; i <  *(modulesInGPU.nLowerModules); ++i)
@@ -1280,7 +1280,7 @@ void debugPrintOutlierMultiplicities(SDL::Event* event)
 {
     SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates());
     SDL::triplets& tripletsInGPU = (*event->getTriplets());
-    SDL::segments<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
+    SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
     SDL::modules& modulesInGPU = (*event->getModules());
     SDL::objectRanges& rangesInGPU = (*event->getRanges());

From d42ebb047a71ac73bd17749a87eddcc5651159b5 Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Wed, 7 Jun 2023 19:11:34 -0400
Subject: [PATCH 17/44] Move objectranges to Alpaka memory

---
 SDL/Event.cu                  |  70 +++++++----------
 SDL/Event.cuh                 |   5 +-
 SDL/Module.cu                 | 140 ----------------------------------
 SDL/Module.cuh                | 116 +++++++++++++++++++++++++++-
 code/core/AccessHelper.cc     |   6 +-
 code/core/write_sdl_ntuple.cc |  14 ++--
 6 files changed, 153 insertions(+), 198 deletions(-)

diff --git a/SDL/Event.cu b/SDL/Event.cu
index 01036c57..3d7d0805 100644
--- a/SDL/Event.cu
+++ b/SDL/Event.cu
@@ -60,7 +60,6 @@ SDL::Event::Event(cudaStream_t estream, bool verbose): queue(alpaka::getDevByIdx
 SDL::Event::~Event()
 {
 #ifdef CACHE_ALLOC
-    if(rangesInGPU){rangesInGPU->freeMemoryCache();}
     if(mdsInGPU){mdsInGPU->freeMemoryCache();}
     if(tripletsInGPU){tripletsInGPU->freeMemoryCache();}
     if(quintupletsInGPU){quintupletsInGPU->freeMemoryCache();}
@@ -68,7 +67,6 @@ SDL::Event::~Event()
     if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemoryCache();}
     if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemoryCache();}
 #else
-    if(rangesInGPU){rangesInGPU->freeMemory();}
     if(mdsInGPU){mdsInGPU->freeMemory(stream);}
     if(tripletsInGPU){tripletsInGPU->freeMemory(stream);}
     if(quintupletsInGPU){quintupletsInGPU->freeMemory(stream);}
@@ -76,7 +74,7 @@ SDL::Event::~Event()
     if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemory(stream);}
     if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemory(stream);}
 #endif
-    if(rangesInGPU != nullptr){cms::cuda::free_host(rangesInGPU);}
+    if(rangesInGPU != nullptr){delete rangesInGPU; delete rangesBuffers;}
     if(mdsInGPU != nullptr){cms::cuda::free_host(mdsInGPU);}
     if(segmentsInGPU != nullptr){delete segmentsInGPU; delete segmentsBuffers;}
     if(tripletsInGPU!= nullptr){cms::cuda::free_host(tripletsInGPU);}
@@ -92,7 +90,6 @@ SDL::Event::~Event()
     }
     if(rangesInCPU != nullptr)
     {
-        delete[] rangesInCPU->quintupletModuleIndices;
         delete rangesInCPU;
     }
 
@@ -240,14 +237,12 @@ void SDL::Event::resetEvent()
 #ifdef CACHE_ALLOC
     if(mdsInGPU){mdsInGPU->freeMemoryCache();}
     if(quintupletsInGPU){quintupletsInGPU->freeMemoryCache();}
-    if(rangesInGPU){rangesInGPU->freeMemoryCache();}
     if(tripletsInGPU){tripletsInGPU->freeMemoryCache();}
     if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemoryCache();}
     if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemoryCache();}
     if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemoryCache();}
 #else
     if(quintupletsInGPU){quintupletsInGPU->freeMemory(stream);}
-    if(rangesInGPU){rangesInGPU->freeMemory();}
     if(mdsInGPU){mdsInGPU->freeMemory(stream);}
     if(tripletsInGPU){tripletsInGPU->freeMemory(stream);}
     if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemory(stream);}
@@ -277,7 +272,7 @@ void SDL::Event::resetEvent()
       hitsInGPU = nullptr;}
     if(mdsInGPU){cms::cuda::free_host(mdsInGPU);
       mdsInGPU = nullptr;}
-    if(rangesInGPU){cms::cuda::free_host(rangesInGPU);
+    if(rangesInGPU){delete rangesInGPU; delete rangesBuffers;
       rangesInGPU = nullptr;}
     if(segmentsInGPU){delete segmentsInGPU; delete segmentsBuffers;
       segmentsInGPU = nullptr;}
@@ -299,8 +294,6 @@ void SDL::Event::resetEvent()
     }
     if(rangesInCPU != nullptr)
     {
-        delete[] rangesInCPU->hitRanges;
-        delete[] rangesInCPU->quintupletModuleIndices;
         delete rangesInCPU;
         rangesInCPU = nullptr;
     }
@@ -440,7 +433,6 @@ void SDL::initModules(const char* moduleMetaDataFilePath)
         loadModulesFromFile(*modulesInGPU,nModules,nLowerModules, *pixelMapping, default_stream, moduleMetaDataFilePath);
         cudaStreamSynchronize(default_stream);
     }
-    //resetObjectRanges(*modulesInGPU,nModules, default_stream);
 }
 
 void SDL::cleanModules()
@@ -450,11 +442,6 @@ void SDL::cleanModules()
     cudaFreeHost(pixelMapping);
 }
 
-void SDL::Event::resetObjectsInModule()
-{
-    resetObjectRanges(*rangesInGPU,nModules,stream);
-}
-
 void SDL::Event::addHitToEvent(std::vector<float> x, std::vector<float> y, std::vector<float> z, std::vector<unsigned int> detId, std::vector<unsigned int> idxInNtuple)
 {
     // Use the actual number of hits instead of a max.
@@ -477,9 +464,22 @@ void SDL::Event::addHitToEvent(std::vector<float> x, std::vector<float> y, std::
 
     if (rangesInGPU == nullptr)
     {
-        rangesInGPU = (SDL::objectRanges*)cms::cuda::allocate_host(sizeof(SDL::objectRanges), stream);
-        createRangesInExplicitMemory(*rangesInGPU, nModules, stream, nLowerModules);
-        resetObjectsInModule();
+        rangesInGPU = new SDL::objectRanges();
+        rangesBuffers = new SDL::objectRangesBuffer<Acc>(nModules, nLowerModules, devAcc, queue);
+        rangesInGPU->setData(*rangesBuffers);
+    }
+
+    unsigned int hostValue;
+
+    // Copy from device to host
+    cudaError_t err = cudaMemcpy(&hostValue, &rangesInGPU->hitRangesnUpper[0], sizeof(int8_t), cudaMemcpyDeviceToHost);
+
+    // Check for errors
+    if (err != cudaSuccess) {
+        printf("cudaMemcpy failed with error: %s\n", cudaGetErrorString(err));
+    } else {
+        // Print the value
+        printf("The value is: %u\n", hostValue);
     }
 
     // Copy the host arrays to the GPU.
@@ -754,10 +754,8 @@ void SDL::Event::createMiniDoublets()
     if(mdsInGPU == nullptr)
     {
         mdsInGPU = (SDL::miniDoublets*)cms::cuda::allocate_host(sizeof(SDL::miniDoublets), stream);
-
         //FIXME: Add memory locations for pixel MDs
         createMDsInExplicitMemory(*mdsInGPU, nTotalMDs, nLowerModules, N_MAX_PIXEL_MD_PER_MODULES, stream);
-
     }
     cudaStreamSynchronize(stream);
 
@@ -1254,14 +1252,6 @@ void SDL::Event::createPixelTriplets()
 void SDL::Event::createQuintuplets()
 {
     uint16_t nEligibleT5Modules = 0;
-
-#ifdef CACHE_ALLOC
-    rangesInGPU->indicesOfEligibleT5Modules = (uint16_t*)cms::cuda::allocate_device(dev, nLowerModules * sizeof(uint16_t), stream);
-#else
-    cudaMalloc(&(rangesInGPU->indicesOfEligibleT5Modules), nLowerModules * sizeof(uint16_t));
-#endif
-    cudaMemsetAsync(rangesInGPU->quintupletModuleIndices, -1, sizeof(int) * (nLowerModules),stream);
-    cudaStreamSynchronize(stream);
     unsigned int nTotalQuintuplets;
 
     Vec const threadsPerBlockCreateQuints(static_cast<Idx>(1), static_cast<Idx>(1), static_cast<Idx>(1024));
@@ -1308,7 +1298,6 @@ void SDL::Event::createQuintuplets()
         nEligibleT5Modules));
 
     alpaka::enqueue(queue, createQuintupletsInGPUv2Task);
-    alpaka::wait(queue);
 
     Vec const threadsPerBlockDupQuint(static_cast<Idx>(1), static_cast<Idx>(16), static_cast<Idx>(16));
     Vec const blocksPerGridDupQuint(static_cast<Idx>(MAX_BLOCKS), static_cast<Idx>(1), static_cast<Idx>(1));
@@ -1859,22 +1848,19 @@ SDL::hitsBuffer<alpaka::DevCpu>* SDL::Event::getHitsInCMSSW()
     return hitsInCPU;
 }
 
-SDL::objectRanges* SDL::Event::getRanges()
+SDL::objectRangesBuffer<alpaka::DevCpu>* SDL::Event::getRanges()
 {
     if(rangesInCPU == nullptr)
     {
-        rangesInCPU = new SDL::objectRanges;
-        rangesInCPU->hitRanges = new int[2*nModules];
-        rangesInCPU->quintupletModuleIndices = new int[nLowerModules];
-        cudaMemcpyAsync(rangesInCPU->hitRanges, hitsBuffers->hitRanges, 2*nModules * sizeof(int), cudaMemcpyDeviceToHost,stream);
-        rangesInCPU->miniDoubletModuleIndices = new int[nLowerModules+1];
-        rangesInCPU->segmentModuleIndices = new int[nLowerModules + 1];
-        rangesInCPU->tripletModuleIndices = new int[nLowerModules];
-        cudaMemcpyAsync(rangesInCPU->quintupletModuleIndices, rangesInGPU->quintupletModuleIndices, nLowerModules * sizeof(int), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(rangesInCPU->miniDoubletModuleIndices, rangesInGPU->miniDoubletModuleIndices, (nLowerModules + 1) * sizeof(int), cudaMemcpyDeviceToHost, stream);
-        cudaMemcpyAsync(rangesInCPU->segmentModuleIndices, rangesInGPU->segmentModuleIndices, (nLowerModules + 1) * sizeof(int), cudaMemcpyDeviceToHost, stream);
-        cudaMemcpyAsync(rangesInCPU->tripletModuleIndices, rangesInGPU->tripletModuleIndices, nLowerModules * sizeof(int), cudaMemcpyDeviceToHost, stream);
-        cudaStreamSynchronize(stream);
+        rangesInCPU = new SDL::objectRangesBuffer<alpaka::DevCpu>(nModules, nLowerModules, devHost, queue);
+        rangesInCPU->setData(*rangesInCPU);
+
+        alpaka::memcpy(queue, rangesInCPU->hitRanges_buf, rangesBuffers->hitRanges_buf, 2 * nModules);
+        alpaka::memcpy(queue, rangesInCPU->quintupletModuleIndices_buf, rangesBuffers->quintupletModuleIndices_buf, nLowerModules);
+        alpaka::memcpy(queue, rangesInCPU->miniDoubletModuleIndices_buf, rangesBuffers->miniDoubletModuleIndices_buf, nLowerModules + 1);
+        alpaka::memcpy(queue, rangesInCPU->segmentModuleIndices_buf, rangesBuffers->segmentModuleIndices_buf, nLowerModules + 1);
+        alpaka::memcpy(queue, rangesInCPU->tripletModuleIndices_buf, rangesBuffers->tripletModuleIndices_buf, nLowerModules);
+        alpaka::wait(queue);
     }
     return rangesInCPU;
 }
diff --git a/SDL/Event.cuh b/SDL/Event.cuh
index 0a24a210..8a84a449 100644
--- a/SDL/Event.cuh
+++ b/SDL/Event.cuh
@@ -40,6 +40,7 @@ namespace SDL
         int dev;
         int nTotalSegments;
         struct objectRanges* rangesInGPU;
+        struct objectRangesBuffer<Acc>* rangesBuffers;
         struct hits* hitsInGPU;
         struct hitsBuffer<Acc>* hitsBuffers;
         struct miniDoublets* mdsInGPU;
@@ -52,7 +53,7 @@ namespace SDL
         struct pixelQuintuplets* pixelQuintupletsInGPU;
 
         //CPU interface stuff
-        objectRanges* rangesInCPU;
+        objectRangesBuffer<alpaka::DevCpu>* rangesInCPU;
         hitsBuffer<alpaka::DevCpu>* hitsInCPU;
         miniDoublets* mdsInCPU;
         segmentsBuffer<alpaka::DevCpu>* segmentsInCPU;
@@ -131,7 +132,7 @@ namespace SDL
         unsigned int getNumberOfExtendedTracks();
         unsigned int getNumberOfT3T3ExtendedTracks();
 
-        objectRanges* getRanges();
+        objectRangesBuffer<alpaka::DevCpu>* getRanges();
         hitsBuffer<alpaka::DevCpu>* getHits();
         hitsBuffer<alpaka::DevCpu>* getHitsInCMSSW();
         miniDoublets* getMiniDoublets();
diff --git a/SDL/Module.cu b/SDL/Module.cu
index e26eb899..01c0b162 100644
--- a/SDL/Module.cu
+++ b/SDL/Module.cu
@@ -7,70 +7,6 @@ std::map <unsigned int, float> *SDL::module_z;
 std::map <unsigned int, unsigned int> *SDL::module_type; // 23 : Ph2PSP, 24 : Ph2PSS, 25 : Ph2SS
 // https://github.com/cms-sw/cmssw/blob/5e809e8e0a625578aa265dc4b128a93830cb5429/Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h#L29
 
-void SDL::createRangesInExplicitMemory(struct objectRanges& rangesInGPU,unsigned int nModules,cudaStream_t stream, unsigned int nLowerModules)
-{
-    /* modules stucture object will be created in Event.cu*/
-#ifdef CACHE_ALLOC
-    int dev;
-    cudaGetDevice(&dev);
-    rangesInGPU.hitRanges = (int*)cms::cuda::allocate_device(dev,nModules * 2 * sizeof(int),stream);
-    rangesInGPU.hitRangesLower = (int*)cms::cuda::allocate_device(dev,nModules * sizeof(int),stream);
-    rangesInGPU.hitRangesUpper = (int*)cms::cuda::allocate_device(dev,nModules * sizeof(int),stream);
-    rangesInGPU.hitRangesnLower = (int8_t*)cms::cuda::allocate_device(dev,nModules * sizeof(int8_t),stream);
-    rangesInGPU.hitRangesnUpper = (int8_t*)cms::cuda::allocate_device(dev,nModules * sizeof(int8_t),stream);
-    rangesInGPU.mdRanges = (int*)cms::cuda::allocate_device(dev,nModules * 2 * sizeof(int),stream);
-    rangesInGPU.segmentRanges = (int*)cms::cuda::allocate_device(dev,nModules * 2 * sizeof(int),stream);
-    rangesInGPU.trackletRanges = (int*)cms::cuda::allocate_device(dev,nModules * 2 * sizeof(int),stream);
-    rangesInGPU.tripletRanges = (int*)cms::cuda::allocate_device(dev,nModules * 2 * sizeof(int),stream);
-    rangesInGPU.trackCandidateRanges = (int*)cms::cuda::allocate_device(dev,nModules * 2 * sizeof(int),stream);
-    rangesInGPU.quintupletRanges = (int*)cms::cuda::allocate_device(dev,nModules * 2 * sizeof(int),stream);
-    rangesInGPU.nEligibleT5Modules = (uint16_t*)cms::cuda::allocate_device(dev,sizeof(unsigned int),stream);
-
-    rangesInGPU.quintupletModuleIndices = (int*)cms::cuda::allocate_device(dev,nLowerModules * sizeof(int),stream);
-    rangesInGPU.quintupletModuleOccupancy = (int*)cms::cuda::allocate_device(dev,nLowerModules * sizeof(int),stream);
-    rangesInGPU.miniDoubletModuleIndices = (int*)cms::cuda::allocate_device(dev, (nLowerModules + 1) * sizeof(int), stream);
-    rangesInGPU.miniDoubletModuleOccupancy = (int*)cms::cuda::allocate_device(dev, (nLowerModules + 1) * sizeof(int), stream);
-    rangesInGPU.segmentModuleIndices = (int*)cms::cuda::allocate_device(dev, (nLowerModules + 1) * sizeof(int), stream);
-    rangesInGPU.segmentModuleOccupancy = (int*)cms::cuda::allocate_device(dev, (nLowerModules + 1) * sizeof(int), stream);
-    rangesInGPU.tripletModuleIndices = (int*)cms::cuda::allocate_device(dev, nLowerModules * sizeof(int), stream);
-    rangesInGPU.tripletModuleOccupancy = (int*)cms::cuda::allocate_device(dev, nLowerModules * sizeof(int), stream);
-
-    rangesInGPU.device_nTotalMDs = (unsigned int*)cms::cuda::allocate_device(dev, sizeof(unsigned int), stream);
-    rangesInGPU.device_nTotalSegs = (unsigned int*)cms::cuda::allocate_device(dev, sizeof(unsigned int), stream);
-    rangesInGPU.device_nTotalTrips = (unsigned int*)cms::cuda::allocate_device(dev, sizeof(unsigned int), stream);
-    rangesInGPU.device_nTotalQuints = (unsigned int*)cms::cuda::allocate_device(dev, sizeof(unsigned int), stream);
-
-#else
-    cudaMalloc(&rangesInGPU.hitRanges,nModules * 2 * sizeof(int));
-    cudaMalloc(&rangesInGPU.hitRangesLower,nModules  * sizeof(int));
-    cudaMalloc(&rangesInGPU.hitRangesUpper,nModules  * sizeof(int));
-    cudaMalloc(&rangesInGPU.hitRangesnLower,nModules  * sizeof(int8_t));
-    cudaMalloc(&rangesInGPU.hitRangesnUpper,nModules  * sizeof(int8_t));
-    cudaMalloc(&rangesInGPU.mdRanges,nModules * 2 * sizeof(int));
-    cudaMalloc(&rangesInGPU.segmentRanges,nModules * 2 * sizeof(int));
-    cudaMalloc(&rangesInGPU.trackletRanges,nModules * 2 * sizeof(int));
-    cudaMalloc(&rangesInGPU.tripletRanges,nModules * 2 * sizeof(int));
-    cudaMalloc(&rangesInGPU.trackCandidateRanges, nModules * 2 * sizeof(int));
-    cudaMalloc(&rangesInGPU.quintupletRanges, nModules * 2 * sizeof(int));
-    cudaMalloc(&rangesInGPU.nEligibleT5Modules, sizeof(uint16_t));
-    cudaMalloc(&rangesInGPU.quintupletModuleIndices, nLowerModules * sizeof(int));
-    cudaMalloc(&rangesInGPU.quintupletModuleOccupancy, nLowerModules * sizeof(int));
-
-    cudaMalloc(&rangesInGPU.miniDoubletModuleIndices, (nLowerModules + 1) * sizeof(int));
-    cudaMalloc(&rangesInGPU.miniDoubletModuleOccupancy, (nLowerModules + 1) * sizeof(int));
-    cudaMalloc(&rangesInGPU.segmentModuleIndices, (nLowerModules + 1) * sizeof(int));
-    cudaMalloc(&rangesInGPU.segmentModuleOccupancy, (nLowerModules + 1) * sizeof(int));
-    cudaMalloc(&rangesInGPU.tripletModuleIndices, nLowerModules * sizeof(int));
-    cudaMalloc(&rangesInGPU.tripletModuleOccupancy, nLowerModules * sizeof(int));
-    
-    cudaMalloc(&rangesInGPU.device_nTotalMDs, sizeof(unsigned int));
-    cudaMalloc(&rangesInGPU.device_nTotalSegs, sizeof(unsigned int));
-    cudaMalloc(&rangesInGPU.device_nTotalTrips, sizeof(unsigned int));
-    cudaMalloc(&rangesInGPU.device_nTotalQuints, sizeof(unsigned int));
-
-#endif
-}
-
 void SDL::createModulesInExplicitMemory(struct modules& modulesInGPU,unsigned int nModules,cudaStream_t stream)
 {
     /* modules stucture object will be created in Event.cu*/
@@ -103,66 +39,6 @@ void SDL::createModulesInExplicitMemory(struct modules& modulesInGPU,unsigned in
     cudaStreamSynchronize(stream);
 }
 
-void SDL::objectRanges::freeMemoryCache()
-{
-    int dev;
-    cudaGetDevice(&dev);
-    cms::cuda::free_device(dev,hitRanges);
-    cms::cuda::free_device(dev,mdRanges);
-    cms::cuda::free_device(dev,segmentRanges);
-    cms::cuda::free_device(dev,trackletRanges);
-    cms::cuda::free_device(dev,tripletRanges);
-    cms::cuda::free_device(dev,trackCandidateRanges);
-    cms::cuda::free_device(dev,quintupletRanges);
-    cms::cuda::free_device(dev,nEligibleT5Modules);
-    cms::cuda::free_device(dev, indicesOfEligibleT5Modules);
-    cms::cuda::free_device(dev,quintupletModuleIndices);
-    cms::cuda::free_device(dev,quintupletModuleOccupancy);
-    cms::cuda::free_device(dev, hitRangesLower);
-    cms::cuda::free_device(dev, hitRangesUpper);
-    cms::cuda::free_device(dev, hitRangesnLower);
-    cms::cuda::free_device(dev, hitRangesnUpper);
-    cms::cuda::free_device(dev, miniDoubletModuleIndices);
-    cms::cuda::free_device(dev, miniDoubletModuleOccupancy);
-    cms::cuda::free_device(dev, segmentModuleIndices);
-    cms::cuda::free_device(dev, segmentModuleOccupancy);
-    cms::cuda::free_device(dev, tripletModuleIndices);
-    cms::cuda::free_device(dev, tripletModuleOccupancy);
-    cms::cuda::free_device(dev, device_nTotalMDs);
-    cms::cuda::free_device(dev, device_nTotalSegs);
-    cms::cuda::free_device(dev, device_nTotalTrips);
-    cms::cuda::free_device(dev, device_nTotalQuints);
-}
-
-void SDL::objectRanges::freeMemory()
-{
-    cudaFree(hitRanges);
-    cudaFree(hitRangesLower);
-    cudaFree(hitRangesUpper);
-    cudaFree(hitRangesnLower);
-    cudaFree(hitRangesnUpper);
-    cudaFree(mdRanges);
-    cudaFree(segmentRanges);
-    cudaFree(trackletRanges);
-    cudaFree(tripletRanges);
-    cudaFree(trackCandidateRanges);
-    cudaFree(quintupletRanges);
-    cudaFree(nEligibleT5Modules);
-    cudaFree(indicesOfEligibleT5Modules);
-    cudaFree(quintupletModuleIndices);
-    cudaFree(quintupletModuleOccupancy);
-    cudaFree(miniDoubletModuleIndices);
-    cudaFree(miniDoubletModuleOccupancy);
-    cudaFree(segmentModuleIndices);
-    cudaFree(segmentModuleOccupancy);
-    cudaFree(tripletModuleIndices);
-    cudaFree(tripletModuleOccupancy);
-    cudaFree(device_nTotalMDs);
-    cudaFree(device_nTotalSegs);
-    cudaFree(device_nTotalTrips);
-    cudaFree(device_nTotalQuints);
-}
-
 void SDL::freeModules(struct modules& modulesInGPU, struct pixelMap& pixelMapping)
 {
     cudaFree(modulesInGPU.detIds);
@@ -716,19 +592,3 @@ unsigned int SDL::modules::parsePartnerModuleId(unsigned int detId, bool isLower
 {
     return isLowerx ? (isInvertedx ? detId - 1 : detId + 1) : (isInvertedx ? detId + 1 : detId - 1);
 }
-
-void SDL::resetObjectRanges(struct objectRanges& rangesInGPU, unsigned int nModules,cudaStream_t stream)
-{
-    cudaMemsetAsync(rangesInGPU.hitRanges, -1,nModules*2*sizeof(int),stream);
-    cudaMemsetAsync(rangesInGPU.hitRangesLower, -1,nModules*sizeof(int),stream);
-    cudaMemsetAsync(rangesInGPU.hitRangesUpper, -1,nModules*sizeof(int),stream);
-    cudaMemsetAsync(rangesInGPU.hitRangesnLower, -1,nModules*sizeof(int8_t),stream);
-    cudaMemsetAsync(rangesInGPU.hitRangesnUpper, -1,nModules*sizeof(int8_t),stream);
-    cudaMemsetAsync(rangesInGPU.mdRanges, -1,nModules*2*sizeof(int),stream);
-    cudaMemsetAsync(rangesInGPU.segmentRanges, -1,nModules*2*sizeof(int),stream);
-    cudaMemsetAsync(rangesInGPU.trackletRanges, -1,nModules*2*sizeof(int),stream);
-    cudaMemsetAsync(rangesInGPU.tripletRanges, -1,nModules*2*sizeof(int),stream);
-    cudaMemsetAsync(rangesInGPU.trackCandidateRanges, -1,nModules*2*sizeof(int),stream);
-    cudaMemsetAsync(rangesInGPU.quintupletRanges, -1, nModules*2*sizeof(int),stream);
-    cudaStreamSynchronize(stream);
-}
diff --git a/SDL/Module.cuh b/SDL/Module.cuh
index 6e48abaf..d4e1457f 100644
--- a/SDL/Module.cuh
+++ b/SDL/Module.cuh
@@ -73,9 +73,118 @@ namespace SDL
         unsigned int *device_nTotalSegs;
         unsigned int *device_nTotalTrips;
         unsigned int *device_nTotalQuints;
-    
-        void freeMemoryCache();
-        void freeMemory();
+
+        template<typename TBuff>
+        void setData(TBuff& objectRangesbuf)
+        {
+            hitRanges = alpaka::getPtrNative(objectRangesbuf.hitRanges_buf);
+            hitRangesLower = alpaka::getPtrNative(objectRangesbuf.hitRangesLower_buf);
+            hitRangesUpper = alpaka::getPtrNative(objectRangesbuf.hitRangesUpper_buf);
+            hitRangesnLower = alpaka::getPtrNative(objectRangesbuf.hitRangesnLower_buf);
+            hitRangesnUpper = alpaka::getPtrNative(objectRangesbuf.hitRangesnUpper_buf);
+            mdRanges = alpaka::getPtrNative(objectRangesbuf.mdRanges_buf);
+            segmentRanges = alpaka::getPtrNative(objectRangesbuf.segmentRanges_buf);
+            trackletRanges = alpaka::getPtrNative(objectRangesbuf.trackletRanges_buf);
+            tripletRanges = alpaka::getPtrNative(objectRangesbuf.tripletRanges_buf);
+            trackCandidateRanges = alpaka::getPtrNative(objectRangesbuf.trackCandidateRanges_buf);
+            quintupletRanges = alpaka::getPtrNative(objectRangesbuf.quintupletRanges_buf);
+
+            nEligibleT5Modules = alpaka::getPtrNative(objectRangesbuf.nEligibleT5Modules_buf);
+            indicesOfEligibleT5Modules = alpaka::getPtrNative(objectRangesbuf.indicesOfEligibleT5Modules_buf);
+
+            quintupletModuleIndices = alpaka::getPtrNative(objectRangesbuf.quintupletModuleIndices_buf);
+            quintupletModuleOccupancy = alpaka::getPtrNative(objectRangesbuf.quintupletModuleOccupancy_buf);
+            miniDoubletModuleIndices = alpaka::getPtrNative(objectRangesbuf.miniDoubletModuleIndices_buf);
+            miniDoubletModuleOccupancy = alpaka::getPtrNative(objectRangesbuf.miniDoubletModuleOccupancy_buf);
+            segmentModuleIndices = alpaka::getPtrNative(objectRangesbuf.segmentModuleIndices_buf);
+            segmentModuleOccupancy = alpaka::getPtrNative(objectRangesbuf.segmentModuleOccupancy_buf);
+            tripletModuleIndices = alpaka::getPtrNative(objectRangesbuf.tripletModuleIndices_buf);
+            tripletModuleOccupancy = alpaka::getPtrNative(objectRangesbuf.tripletModuleOccupancy_buf);
+
+            device_nTotalMDs = alpaka::getPtrNative(objectRangesbuf.device_nTotalMDs_buf);
+            device_nTotalSegs = alpaka::getPtrNative(objectRangesbuf.device_nTotalSegs_buf);
+            device_nTotalTrips = alpaka::getPtrNative(objectRangesbuf.device_nTotalTrips_buf);
+            device_nTotalQuints = alpaka::getPtrNative(objectRangesbuf.device_nTotalQuints_buf);
+        }
+    };
+
+    template<typename TAcc>
+    struct objectRangesBuffer : objectRanges
+    {
+        Buf<TAcc, int> hitRanges_buf;
+        Buf<TAcc, int> hitRangesLower_buf;
+        Buf<TAcc, int> hitRangesUpper_buf;
+        Buf<TAcc, int8_t> hitRangesnLower_buf;
+        Buf<TAcc, int8_t> hitRangesnUpper_buf;
+        Buf<TAcc, int> mdRanges_buf;
+        Buf<TAcc, int> segmentRanges_buf;
+        Buf<TAcc, int> trackletRanges_buf;
+        Buf<TAcc, int> tripletRanges_buf;
+        Buf<TAcc, int> trackCandidateRanges_buf;
+        Buf<TAcc, int> quintupletRanges_buf;
+
+        Buf<TAcc, uint16_t> nEligibleT5Modules_buf;
+        Buf<TAcc, uint16_t> indicesOfEligibleT5Modules_buf;
+
+        Buf<TAcc, int> quintupletModuleIndices_buf;
+        Buf<TAcc, int> quintupletModuleOccupancy_buf;
+        Buf<TAcc, int> miniDoubletModuleIndices_buf;
+        Buf<TAcc, int> miniDoubletModuleOccupancy_buf;
+        Buf<TAcc, int> segmentModuleIndices_buf;
+        Buf<TAcc, int> segmentModuleOccupancy_buf;
+        Buf<TAcc, int> tripletModuleIndices_buf;
+        Buf<TAcc, int> tripletModuleOccupancy_buf;
+
+        Buf<TAcc, unsigned int> device_nTotalMDs_buf;
+        Buf<TAcc, unsigned int> device_nTotalSegs_buf;
+        Buf<TAcc, unsigned int> device_nTotalTrips_buf;
+        Buf<TAcc, unsigned int> device_nTotalQuints_buf;
+
+        template<typename TQueue, typename TDevAcc>
+        objectRangesBuffer(unsigned int nModules,
+                           unsigned int nLowerModules,
+                           TDevAcc const & devAccIn,
+                           TQueue& queue) :
+            hitRanges_buf(allocBufWrapper<int>(devAccIn, nModules*2)),
+            hitRangesLower_buf(allocBufWrapper<int>(devAccIn, nModules)),
+            hitRangesUpper_buf(allocBufWrapper<int>(devAccIn, nModules)),
+            hitRangesnLower_buf(allocBufWrapper<int8_t>(devAccIn, nModules)),
+            hitRangesnUpper_buf(allocBufWrapper<int8_t>(devAccIn, nModules)),
+            mdRanges_buf(allocBufWrapper<int>(devAccIn, nModules*2)),
+            segmentRanges_buf(allocBufWrapper<int>(devAccIn, nModules*2)),
+            trackletRanges_buf(allocBufWrapper<int>(devAccIn, nModules*2)),
+            tripletRanges_buf(allocBufWrapper<int>(devAccIn, nModules*2)),
+            trackCandidateRanges_buf(allocBufWrapper<int>(devAccIn, nModules*2)),
+            quintupletRanges_buf(allocBufWrapper<int>(devAccIn, nModules*2)),
+            nEligibleT5Modules_buf(allocBufWrapper<uint16_t>(devAccIn, 1)),
+            indicesOfEligibleT5Modules_buf(allocBufWrapper<uint16_t>(devAccIn, nLowerModules)),
+            quintupletModuleIndices_buf(allocBufWrapper<int>(devAccIn, nLowerModules)),
+            quintupletModuleOccupancy_buf(allocBufWrapper<int>(devAccIn, nLowerModules)),
+            miniDoubletModuleIndices_buf(allocBufWrapper<int>(devAccIn, nLowerModules+1)),
+            miniDoubletModuleOccupancy_buf(allocBufWrapper<int>(devAccIn, nLowerModules+1)),
+            segmentModuleIndices_buf(allocBufWrapper<int>(devAccIn, nLowerModules+1)),
+            segmentModuleOccupancy_buf(allocBufWrapper<int>(devAccIn, nLowerModules+1)),
+            tripletModuleIndices_buf(allocBufWrapper<int>(devAccIn, nLowerModules)),
+            tripletModuleOccupancy_buf(allocBufWrapper<int>(devAccIn, nLowerModules)),
+            device_nTotalMDs_buf(allocBufWrapper<unsigned int>(devAccIn, 1)),
+            device_nTotalSegs_buf(allocBufWrapper<unsigned int>(devAccIn, 1)),
+            device_nTotalTrips_buf(allocBufWrapper<unsigned int>(devAccIn, 1)),
+            device_nTotalQuints_buf(allocBufWrapper<unsigned int>(devAccIn, 1))
+        {
+            alpaka::memset(queue, hitRanges_buf, -1, nModules*2);
+            alpaka::memset(queue, hitRangesLower_buf, -1, nModules);
+            alpaka::memset(queue, hitRangesUpper_buf, -1, nModules);
+            alpaka::memset(queue, hitRangesnLower_buf, -1, nModules);
+            alpaka::memset(queue, hitRangesnUpper_buf, -1, nModules);
+            alpaka::memset(queue, mdRanges_buf, -1, nModules*2);
+            alpaka::memset(queue, segmentRanges_buf, -1, nModules*2);
+            alpaka::memset(queue, trackletRanges_buf, -1, nModules*2);
+            alpaka::memset(queue, tripletRanges_buf, -1, nModules*2);
+            alpaka::memset(queue, trackCandidateRanges_buf, -1, nModules*2);
+            alpaka::memset(queue, quintupletRanges_buf, -1, nModules*2);
+            alpaka::memset(queue, quintupletModuleIndices_buf, -1, nLowerModules);
+            alpaka::wait(queue);
+        }
     };
 
     struct modules
@@ -148,7 +257,6 @@ namespace SDL
     void fillMapArraysExplicit(struct modules& modulesInGPU, unsigned int nModules,cudaStream_t stream);
     void fillConnectedModuleArray(struct modules& modulesInGPU, unsigned int nModules);
     void setDerivedQuantities(unsigned int detId, unsigned short& layer, unsigned short& ring, unsigned short& rod, unsigned short& module, unsigned short& subdet, unsigned short& side, float m_x, float m_y, float m_z, float& eta, float& r);
-    void resetObjectRanges(struct objectRanges& rangesInGPU, unsigned int nModules,cudaStream_t stream);
     void createRangesInExplicitMemory(struct objectRanges& rangesInGPU,unsigned int nModules,cudaStream_t stream, unsigned int nLowerModules);
 }
 #endif
diff --git a/code/core/AccessHelper.cc b/code/core/AccessHelper.cc
index 3ab470fd..83e7f51d 100644
--- a/code/core/AccessHelper.cc
+++ b/code/core/AccessHelper.cc
@@ -30,7 +30,7 @@ std::vector<unsigned int> getPixelHitsFrompLS(SDL::Event* event, unsigned int pL
 {
     SDL::segmentsBuffer<alpaka::DevCpu>& segments_ = *(event->getSegments());
     SDL::miniDoublets& miniDoublets_ = *(event->getMiniDoublets());
-    SDL::objectRanges& rangesInGPU = (*event->getRanges());
+    SDL::objectRangesBuffer<alpaka::DevCpu>& rangesInGPU = (*event->getRanges());
     SDL::modules& modulesInGPU = (*event->getModules());
     const unsigned int pLS_offset = rangesInGPU.segmentModuleIndices[*(modulesInGPU.nLowerModules)];
     unsigned int MD_1 = segments_.mdIndices[2 * (pLS + pLS_offset)];
@@ -242,7 +242,7 @@ std::tuple<std::vector<unsigned int>, std::vector<unsigned int>> getHitIdxsAndHi
 unsigned int getPixelLSFrompT3(SDL::Event* event, unsigned int pT3)
 {
     SDL::pixelTriplets& pixelTriplets_ = *(event->getPixelTriplets());
-    SDL::objectRanges& rangesInGPU = (*event->getRanges());
+    SDL::objectRangesBuffer<alpaka::DevCpu>& rangesInGPU = (*event->getRanges());
     SDL::modules& modulesInGPU = (*event->getModules());
     const unsigned int pLS_offset = rangesInGPU.segmentModuleIndices[*(modulesInGPU.nLowerModules)];
     return pixelTriplets_.pixelSegmentIndices[pT3] - pLS_offset;
@@ -342,7 +342,7 @@ std::tuple<std::vector<unsigned int>, std::vector<unsigned int>> getHitIdxsAndHi
 unsigned int getPixelLSFrompT5(SDL::Event* event, unsigned int pT5)
 {
     SDL::pixelQuintuplets& pixelQuintuplets_ = *(event->getPixelQuintuplets());
-    SDL::objectRanges& rangesInGPU = (*event->getRanges());
+    SDL::objectRangesBuffer<alpaka::DevCpu>& rangesInGPU = (*event->getRanges());
     SDL::modules& modulesInGPU = (*event->getModules());
     const unsigned int pLS_offset = rangesInGPU.segmentModuleIndices[*(modulesInGPU.nLowerModules)];
     return pixelQuintuplets_.pixelIndices[pT5] - pLS_offset;
diff --git a/code/core/write_sdl_ntuple.cc b/code/core/write_sdl_ntuple.cc
index 43d7831f..15a8eb39 100644
--- a/code/core/write_sdl_ntuple.cc
+++ b/code/core/write_sdl_ntuple.cc
@@ -392,7 +392,7 @@ void setPixelQuintupletOutputBranches(SDL::Event* event)
 void setQuintupletOutputBranches(SDL::Event* event)
 {
     SDL::quintuplets& quintupletsInGPU = (*event->getQuintuplets());
-    SDL::objectRanges& rangesInGPU = (*event->getRanges());
+    SDL::objectRangesBuffer<alpaka::DevCpu>& rangesInGPU = (*event->getRanges());
     SDL::modules& modulesInGPU = (*event->getModules());
     const float kRinv1GeVf = (2.99792458e-3 * 3.8);
     int n_accepted_simtrk = ana.tx->getBranch<vector<int>>("sim_TC_matched").size();
@@ -563,7 +563,7 @@ void setGnnNtupleBranches(SDL::Event* event)
     SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
     SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
     SDL::modules& modulesInGPU = (*event->getModules());
-    SDL::objectRanges& rangesInGPU = (*event->getRanges());
+    SDL::objectRangesBuffer<alpaka::DevCpu>& rangesInGPU = (*event->getRanges());
     SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates());
 
     std::set<unsigned int> mds_used_in_sg;
@@ -1107,7 +1107,7 @@ float computeRadiusFromThreeAnchorHits(float x1, float y1, float x2, float y2, f
 void printHitMultiplicities(SDL::Event* event)
 {
     SDL::modules& modulesInGPU = (*event->getModules());
-    SDL::objectRanges& rangesInGPU = (*event->getRanges());
+    SDL::objectRangesBuffer<alpaka::DevCpu>& rangesInGPU = (*event->getRanges());
 
     int nHits = 0;
     for (unsigned int idx = 0; idx <= *(modulesInGPU.nLowerModules); idx++) // "<=" because cheating to include pixel track candidate lower module
@@ -1153,7 +1153,7 @@ void printMDs(SDL::Event* event)
     SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
     SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
     SDL::modules& modulesInGPU = (*event->getModules());
-    SDL::objectRanges& rangesInGPU = (*event->getRanges());
+    SDL::objectRangesBuffer<alpaka::DevCpu>& rangesInGPU = (*event->getRanges());
 
     // Then obtain the lower module index
     for (unsigned int idx = 0; idx <= *(modulesInGPU.nLowerModules); ++idx)
@@ -1177,7 +1177,7 @@ void printLSs(SDL::Event* event)
     SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
     SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
     SDL::modules& modulesInGPU = (*event->getModules());
-    SDL::objectRanges& rangesInGPU = (*event->getRanges());
+    SDL::objectRangesBuffer<alpaka::DevCpu>& rangesInGPU = (*event->getRanges());
 
     int nSegments = 0;
     for (unsigned int i = 0; i <  *(modulesInGPU.nLowerModules); ++i)
@@ -1210,7 +1210,7 @@ void printpLSs(SDL::Event* event)
     SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
     SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
     SDL::modules& modulesInGPU = (*event->getModules());
-    SDL::objectRanges& rangesInGPU = (*event->getRanges());
+    SDL::objectRangesBuffer<alpaka::DevCpu>& rangesInGPU = (*event->getRanges());
 
     unsigned int i = *(modulesInGPU.nLowerModules);
     unsigned int idx = i;//modulesInGPU.lowerModuleIndices[i];
@@ -1283,7 +1283,7 @@ void debugPrintOutlierMultiplicities(SDL::Event* event)
     SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
     SDL::modules& modulesInGPU = (*event->getModules());
-    SDL::objectRanges& rangesInGPU = (*event->getRanges());
+    SDL::objectRangesBuffer<alpaka::DevCpu>& rangesInGPU = (*event->getRanges());
     //int nTrackCandidates = 0;
     for (unsigned int idx = 0; idx <= *(modulesInGPU.nLowerModules); ++idx)
     {

From a3f89d11353c4ee2fb3468bf6fe5499d34616f4f Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Wed, 7 Jun 2023 19:15:26 -0400
Subject: [PATCH 18/44] remove debug

---
 SDL/Event.cu | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/SDL/Event.cu b/SDL/Event.cu
index 3d7d0805..e6b4dd52 100644
--- a/SDL/Event.cu
+++ b/SDL/Event.cu
@@ -469,19 +469,6 @@ void SDL::Event::addHitToEvent(std::vector<float> x, std::vector<float> y, std::
         rangesInGPU->setData(*rangesBuffers);
     }
 
-    unsigned int hostValue;
-
-    // Copy from device to host
-    cudaError_t err = cudaMemcpy(&hostValue, &rangesInGPU->hitRangesnUpper[0], sizeof(int8_t), cudaMemcpyDeviceToHost);
-
-    // Check for errors
-    if (err != cudaSuccess) {
-        printf("cudaMemcpy failed with error: %s\n", cudaGetErrorString(err));
-    } else {
-        // Print the value
-        printf("The value is: %u\n", hostValue);
-    }
-
     // Copy the host arrays to the GPU.
     alpaka::memcpy(queue, hitsBuffers->xs_buf, x, nHits);
     alpaka::memcpy(queue, hitsBuffers->ys_buf, y, nHits);

From b6324bcdc1fe78db99d7a86fadf626aae3ee4287 Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Wed, 7 Jun 2023 23:16:23 -0400
Subject: [PATCH 19/44] Move triplets to Alpaka memory

---
 SDL/Event.cu                  | 130 +++++++------------------
 SDL/Event.cuh                 |   5 +-
 SDL/Hit.cuh                   |   6 +-
 SDL/MiniDoublet.cuh           |   1 -
 SDL/Quintuplet.cuh            |   1 -
 SDL/Segment.cuh               |   1 -
 SDL/Triplet.cu                | 176 ----------------------------------
 SDL/Triplet.cuh               | 123 +++++++++++++++++++++---
 code/core/AccessHelper.cc     |   2 +-
 code/core/write_sdl_ntuple.cc |  12 +--
 10 files changed, 162 insertions(+), 295 deletions(-)
 delete mode 100644 SDL/Triplet.cu

diff --git a/SDL/Event.cu b/SDL/Event.cu
index e6b4dd52..8139af57 100644
--- a/SDL/Event.cu
+++ b/SDL/Event.cu
@@ -11,7 +11,6 @@ SDL::Event::Event(cudaStream_t estream, bool verbose): queue(alpaka::getDevByIdx
     int driver;
     cudaRuntimeGetVersion(&version);
     cudaDriverGetVersion(&driver);
-    //printf("version: %d Driver %d\n",version, driver);
     stream = estream;
     addObjects = verbose;
     hitsInGPU = nullptr;
@@ -61,14 +60,12 @@ SDL::Event::~Event()
 {
 #ifdef CACHE_ALLOC
     if(mdsInGPU){mdsInGPU->freeMemoryCache();}
-    if(tripletsInGPU){tripletsInGPU->freeMemoryCache();}
     if(quintupletsInGPU){quintupletsInGPU->freeMemoryCache();}
     if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemoryCache();}
     if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemoryCache();}
     if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemoryCache();}
 #else
     if(mdsInGPU){mdsInGPU->freeMemory(stream);}
-    if(tripletsInGPU){tripletsInGPU->freeMemory(stream);}
     if(quintupletsInGPU){quintupletsInGPU->freeMemory(stream);}
     if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemory(stream);}
     if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemory(stream);}
@@ -77,7 +74,7 @@ SDL::Event::~Event()
     if(rangesInGPU != nullptr){delete rangesInGPU; delete rangesBuffers;}
     if(mdsInGPU != nullptr){cms::cuda::free_host(mdsInGPU);}
     if(segmentsInGPU != nullptr){delete segmentsInGPU; delete segmentsBuffers;}
-    if(tripletsInGPU!= nullptr){cms::cuda::free_host(tripletsInGPU);}
+    if(tripletsInGPU!= nullptr){delete tripletsInGPU; delete tripletsBuffers;}
     if(trackCandidatesInGPU!= nullptr){cms::cuda::free_host(trackCandidatesInGPU);}
     if(hitsInGPU!= nullptr){delete hitsInGPU; delete hitsBuffers;}
     if(pixelTripletsInGPU!= nullptr){cms::cuda::free_host(pixelTripletsInGPU);}
@@ -109,30 +106,6 @@ SDL::Event::~Event()
 
     if(tripletsInCPU != nullptr)
     {
-        delete[] tripletsInCPU->segmentIndices;
-        delete[] tripletsInCPU->nTriplets;
-        delete[] tripletsInCPU->totOccupancyTriplets;
-        delete[] tripletsInCPU->betaIn;
-        delete[] tripletsInCPU->betaOut;
-        delete[] tripletsInCPU->pt_beta;
-        delete[] tripletsInCPU->hitIndices;
-        delete[] tripletsInCPU->logicalLayers;
-        delete[] tripletsInCPU->lowerModuleIndices;
-        delete tripletsInCPU->nMemoryLocations;
-#ifdef CUT_VALUE_DEBUG
-        delete[] tripletsInCPU->zOut;
-        delete[] tripletsInCPU->zLo;
-        delete[] tripletsInCPU->zHi;
-        delete[] tripletsInCPU->zLoPointed;
-        delete[] tripletsInCPU->zHiPointed;
-        delete[] tripletsInCPU->sdlCut;
-        delete[] tripletsInCPU->betaInCut;
-        delete[] tripletsInCPU->betaOutCut;
-        delete[] tripletsInCPU->deltaBetaCut;
-        delete[] tripletsInCPU->rtLo;
-        delete[] tripletsInCPU->rtHi;
-        delete[] tripletsInCPU->kZ;
-#endif
         delete tripletsInCPU;
     }
     if(quintupletsInCPU != nullptr)
@@ -237,14 +210,12 @@ void SDL::Event::resetEvent()
 #ifdef CACHE_ALLOC
     if(mdsInGPU){mdsInGPU->freeMemoryCache();}
     if(quintupletsInGPU){quintupletsInGPU->freeMemoryCache();}
-    if(tripletsInGPU){tripletsInGPU->freeMemoryCache();}
     if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemoryCache();}
     if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemoryCache();}
     if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemoryCache();}
 #else
     if(quintupletsInGPU){quintupletsInGPU->freeMemory(stream);}
     if(mdsInGPU){mdsInGPU->freeMemory(stream);}
-    if(tripletsInGPU){tripletsInGPU->freeMemory(stream);}
     if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemory(stream);}
     if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemory(stream);}
     if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemory(stream);}
@@ -276,7 +247,7 @@ void SDL::Event::resetEvent()
       rangesInGPU = nullptr;}
     if(segmentsInGPU){delete segmentsInGPU; delete segmentsBuffers;
       segmentsInGPU = nullptr;}
-    if(tripletsInGPU){cms::cuda::free_host(tripletsInGPU);
+    if(tripletsInGPU){delete tripletsInGPU; delete tripletsBuffers;
       tripletsInGPU = nullptr;}
     if(quintupletsInGPU){cms::cuda::free_host(quintupletsInGPU);
       quintupletsInGPU = nullptr;}
@@ -312,15 +283,6 @@ void SDL::Event::resetEvent()
     }
     if(tripletsInCPU != nullptr)
     {
-        delete[] tripletsInCPU->segmentIndices;
-        delete[] tripletsInCPU->nTriplets;
-        delete[] tripletsInCPU->totOccupancyTriplets;
-        delete[] tripletsInCPU->betaIn;
-        delete[] tripletsInCPU->betaOut;
-        delete[] tripletsInCPU->pt_beta;
-        delete[] tripletsInCPU->logicalLayers;
-        delete[] tripletsInCPU->lowerModuleIndices;
-        delete[] tripletsInCPU->hitIndices;
         delete tripletsInCPU;
         tripletsInCPU = nullptr;
     }
@@ -865,7 +827,6 @@ void SDL::Event::createTriplets()
 {
     if(tripletsInGPU == nullptr)
     {
-        tripletsInGPU = (SDL::triplets*)cms::cuda::allocate_host(sizeof(SDL::triplets), stream);
         unsigned int maxTriplets;
 
         Vec const threadsPerBlockCreateTrip(static_cast<Idx>(1), static_cast<Idx>(1), static_cast<Idx>(1024));
@@ -886,11 +847,14 @@ void SDL::Event::createTriplets()
         cudaMemcpyAsync(&maxTriplets,rangesInGPU->device_nTotalTrips,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream);
         cudaStreamSynchronize(stream);
 
-        createTripletsInExplicitMemory(*tripletsInGPU, maxTriplets, nLowerModules,stream);
+        tripletsInGPU = new SDL::triplets();
+        tripletsBuffers = new SDL::tripletsBuffer<Acc>(maxTriplets, nLowerModules, devAcc, queue);
+        tripletsInGPU->setData(*tripletsBuffers);
 
         cudaMemcpyAsync(tripletsInGPU->nMemoryLocations, &maxTriplets, sizeof(unsigned int), cudaMemcpyHostToDevice, stream);
         cudaStreamSynchronize(stream);
     }
+
     //TODO:Move this also inside the ranges function
     uint16_t nonZeroModules=0;
     unsigned int max_InnerSeg=0;
@@ -1909,63 +1873,49 @@ SDL::segmentsBuffer<alpaka::DevCpu>* SDL::Event::getSegments()
     return segmentsInCPU;
 }
 
-SDL::triplets* SDL::Event::getTriplets()
+// Returns a host-side copy of the device triplet buffers. The copy is built on the first call
+// and cached in tripletsInCPU; this call blocks on the queue until all transfers have completed.
+SDL::tripletsBuffer<alpaka::DevCpu>* SDL::Event::getTriplets()
 {
     if(tripletsInCPU == nullptr)
     {
-        tripletsInCPU = new SDL::triplets;
-        tripletsInCPU->nMemoryLocations = new unsigned int;
-        cudaMemcpyAsync(tripletsInCPU->nMemoryLocations, tripletsInGPU->nMemoryLocations, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream);
-        cudaStreamSynchronize(stream);
+        // Get nMemoryLocations parameter to initialize host-based tripletsInCPU
+        auto nMemLocal_buf = allocBufWrapper<unsigned int>(devHost, 1);
+        alpaka::memcpy(queue, nMemLocal_buf, tripletsBuffers->nMemoryLocations_buf, 1);
+        alpaka::wait(queue);
+
+        unsigned int nMemLocal = *alpaka::getPtrNative(nMemLocal_buf);
+        tripletsInCPU = new SDL::tripletsBuffer<alpaka::DevCpu>(nMemLocal, nLowerModules, devHost, queue);
+        tripletsInCPU->setData(*tripletsInCPU);
 
-        tripletsInCPU->segmentIndices = new unsigned[2 * *(tripletsInCPU->nMemoryLocations)];
-        tripletsInCPU->nTriplets = new int[nLowerModules];
-        tripletsInCPU->betaIn  = new FPX[*(tripletsInCPU->nMemoryLocations)];
-        tripletsInCPU->betaOut = new FPX[*(tripletsInCPU->nMemoryLocations)];
-        tripletsInCPU->pt_beta = new FPX[*(tripletsInCPU->nMemoryLocations)];
-        tripletsInCPU->hitIndices = new unsigned int[6 * *(tripletsInCPU->nMemoryLocations)];
-        tripletsInCPU->logicalLayers = new uint8_t[3 * *(tripletsInCPU->nMemoryLocations)];
+        *alpaka::getPtrNative(tripletsInCPU->nMemoryLocations_buf) = nMemLocal;
 #ifdef CUT_VALUE_DEBUG
-        tripletsInCPU->zOut = new float[4 * *(tripletsInCPU->nMemoryLocations)];
-        tripletsInCPU->zLo = new float[*(tripletsInCPU->nMemoryLocations)];
-        tripletsInCPU->zHi = new float[*(tripletsInCPU->nMemoryLocations)];
-        tripletsInCPU->zLoPointed = new float[*(tripletsInCPU->nMemoryLocations)];
-        tripletsInCPU->zHiPointed = new float[*(tripletsInCPU->nMemoryLocations)];
-        tripletsInCPU->sdlCut = new float[*(tripletsInCPU->nMemoryLocations)];
-        tripletsInCPU->betaInCut = new float[*(tripletsInCPU->nMemoryLocations)];
-        tripletsInCPU->betaOutCut = new float[*(tripletsInCPU->nMemoryLocations)];
-        tripletsInCPU->deltaBetaCut = new float[*(tripletsInCPU->nMemoryLocations)];
-        tripletsInCPU->rtLo = new float[*(tripletsInCPU->nMemoryLocations)];
-        tripletsInCPU->rtHi = new float[*(tripletsInCPU->nMemoryLocations)];
-        tripletsInCPU->kZ = new float[*(tripletsInCPU->nMemoryLocations)];
-
-        tripletsInCPU->rtOut = tripletsInCPU->zOut + *(tripletsInCPU->nMemoryLocations);
-        tripletsInCPU->deltaPhiPos = tripletsInCPU->zOut + 2 * *(tripletsInCPU->nMemoryLocations);
-        tripletsInCPU->deltaPhi = tripletsInCPU->zOut + 3 * *(tripletsInCPU->nMemoryLocations);
-
-        cudaMemcpyAsync(tripletsInCPU->zOut, tripletsInGPU->zOut, 4 * * (tripletsInCPU->nMemoryLocations)* sizeof(unsigned int), cudaMemcpyDeviceToHost, stream);
-        cudaMemcpyAsync(tripletsInCPU->zLo, tripletsInGPU->zLo, * (tripletsInCPU->nMemoryLocations)* sizeof(unsigned int), cudaMemcpyDeviceToHost, stream);
-        cudaMemcpyAsync(tripletsInCPU->zHi, tripletsInGPU->zHi, * (tripletsInCPU->nMemoryLocations)* sizeof(unsigned int), cudaMemcpyDeviceToHost, stream);
-        cudaMemcpyAsync(tripletsInCPU->zLoPointed, tripletsInGPU->zLoPointed, 4 * * (tripletsInCPU->nMemoryLocations)* sizeof(unsigned int), cudaMemcpyDeviceToHost, stream);
-        cudaMemcpyAsync(tripletsInCPU->zHiPointed, tripletsInGPU->zHiPointed, * (tripletsInCPU->nMemoryLocations)* sizeof(unsigned int), cudaMemcpyDeviceToHost, stream);
-        cudaMemcpyAsync(tripletsInCPU->sdlCut, tripletsInGPU->sdlCut, *(tripletsInCPU->nMemoryLocations)* sizeof(unsigned int), cudaMemcpyDeviceToHost, stream);
-        cudaMemcpyAsync(tripletsInCPU->betaInCut, tripletsInGPU->betaInCut,  * (tripletsInCPU->nMemoryLocations)* sizeof(unsigned int), cudaMemcpyDeviceToHost, stream);
-        cudaMemcpyAsync(tripletsInCPU->betaOutCut, tripletsInGPU->betaOutCut,  * (tripletsInCPU->nMemoryLocations)* sizeof(unsigned int), cudaMemcpyDeviceToHost, stream);
-        cudaMemcpyAsync(tripletsInCPU->deltaBetaCut, tripletsInGPU->deltaBetaCut, *(tripletsInCPU->nMemoryLocations)*sizeof(unsigned int), cudaMemcpyDeviceToHost);
-        cudaMemcpyAsync(tripletsInCPU->rtLo, tripletsInGPU->rtLo,  * (tripletsInCPU->nMemoryLocations)* sizeof(unsigned int), cudaMemcpyDeviceToHost, stream);
-        cudaMemcpyAsync(tripletsInCPU->rtHi, tripletsInGPU->rtHi,  * (tripletsInCPU->nMemoryLocations)* sizeof(unsigned int), cudaMemcpyDeviceToHost, stream);
-        cudaMemcpyAsync(tripletsInCPU->kZ, tripletsInGPU->kZ,  * (tripletsInCPU->nMemoryLocations) * sizeof(unsigned int), cudaMemcpyDeviceToHost, stream);
+        // Each debug quantity now has its own maxTriplets-sized buffer, so every copy uses nMemLocal elements.
+        alpaka::memcpy(queue, tripletsInCPU->zOut_buf, tripletsBuffers->zOut_buf, nMemLocal);
+        alpaka::memcpy(queue, tripletsInCPU->rtOut_buf, tripletsBuffers->rtOut_buf, nMemLocal);
+        alpaka::memcpy(queue, tripletsInCPU->deltaPhiPos_buf, tripletsBuffers->deltaPhiPos_buf, nMemLocal);
+        alpaka::memcpy(queue, tripletsInCPU->deltaPhi_buf, tripletsBuffers->deltaPhi_buf, nMemLocal);
+        alpaka::memcpy(queue, tripletsInCPU->zLo_buf, tripletsBuffers->zLo_buf, nMemLocal);
+        alpaka::memcpy(queue, tripletsInCPU->zHi_buf, tripletsBuffers->zHi_buf, nMemLocal);
+        alpaka::memcpy(queue, tripletsInCPU->zLoPointed_buf, tripletsBuffers->zLoPointed_buf, nMemLocal);
+        alpaka::memcpy(queue, tripletsInCPU->zHiPointed_buf, tripletsBuffers->zHiPointed_buf, nMemLocal);
+        alpaka::memcpy(queue, tripletsInCPU->sdlCut_buf, tripletsBuffers->sdlCut_buf, nMemLocal);
+        alpaka::memcpy(queue, tripletsInCPU->betaInCut_buf, tripletsBuffers->betaInCut_buf, nMemLocal);
+        alpaka::memcpy(queue, tripletsInCPU->betaOutCut_buf, tripletsBuffers->betaOutCut_buf, nMemLocal);
+        alpaka::memcpy(queue, tripletsInCPU->deltaBetaCut_buf, tripletsBuffers->deltaBetaCut_buf, nMemLocal);
+        alpaka::memcpy(queue, tripletsInCPU->rtLo_buf, tripletsBuffers->rtLo_buf, nMemLocal);
+        alpaka::memcpy(queue, tripletsInCPU->rtHi_buf, tripletsBuffers->rtHi_buf, nMemLocal);
+        alpaka::memcpy(queue, tripletsInCPU->kZ_buf, tripletsBuffers->kZ_buf, nMemLocal);
 #endif
-        cudaMemcpyAsync(tripletsInCPU->hitIndices, tripletsInGPU->hitIndices, 6 * *(tripletsInCPU->nMemoryLocations) * sizeof(unsigned int), cudaMemcpyDeviceToHost, stream);
-        cudaMemcpyAsync(tripletsInCPU->logicalLayers, tripletsInGPU->logicalLayers, 3 * *(tripletsInCPU->nMemoryLocations) * sizeof(uint8_t), cudaMemcpyDeviceToHost, stream);
-        cudaMemcpyAsync(tripletsInCPU->segmentIndices, tripletsInGPU->segmentIndices, 2 * *(tripletsInCPU->nMemoryLocations) * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(tripletsInCPU->betaIn, tripletsInGPU->betaIn,   *(tripletsInCPU->nMemoryLocations) * sizeof(FPX), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(tripletsInCPU->betaOut, tripletsInGPU->betaOut, *(tripletsInCPU->nMemoryLocations) * sizeof(FPX), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(tripletsInCPU->pt_beta, tripletsInGPU->pt_beta, *(tripletsInCPU->nMemoryLocations) * sizeof(FPX), cudaMemcpyDeviceToHost,stream);
-        tripletsInCPU->totOccupancyTriplets = new int[nLowerModules];
-        cudaMemcpyAsync(tripletsInCPU->nTriplets, tripletsInGPU->nTriplets, nLowerModules * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(tripletsInCPU->totOccupancyTriplets, tripletsInGPU->totOccupancyTriplets, nLowerModules * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream);
-        cudaStreamSynchronize(stream);
+        alpaka::memcpy(queue, tripletsInCPU->hitIndices_buf, tripletsBuffers->hitIndices_buf, 6 * nMemLocal);
+        alpaka::memcpy(queue, tripletsInCPU->logicalLayers_buf, tripletsBuffers->logicalLayers_buf, 3 * nMemLocal);
+        alpaka::memcpy(queue, tripletsInCPU->segmentIndices_buf, tripletsBuffers->segmentIndices_buf, 2 * nMemLocal);
+        alpaka::memcpy(queue, tripletsInCPU->betaIn_buf, tripletsBuffers->betaIn_buf, nMemLocal);
+        alpaka::memcpy(queue, tripletsInCPU->betaOut_buf, tripletsBuffers->betaOut_buf, nMemLocal);
+        alpaka::memcpy(queue, tripletsInCPU->pt_beta_buf, tripletsBuffers->pt_beta_buf, nMemLocal);
+        alpaka::memcpy(queue, tripletsInCPU->nTriplets_buf, tripletsBuffers->nTriplets_buf, nLowerModules);
+        alpaka::memcpy(queue, tripletsInCPU->totOccupancyTriplets_buf, tripletsBuffers->totOccupancyTriplets_buf, nLowerModules);
+        alpaka::wait(queue);
     }
     return tripletsInCPU;
 }
diff --git a/SDL/Event.cuh b/SDL/Event.cuh
index 8a84a449..8d694888 100644
--- a/SDL/Event.cuh
+++ b/SDL/Event.cuh
@@ -47,6 +47,7 @@ namespace SDL
         struct segments* segmentsInGPU;
         struct segmentsBuffer<Acc>* segmentsBuffers;
         struct triplets* tripletsInGPU;
+        struct tripletsBuffer<Acc>* tripletsBuffers;
         struct quintuplets* quintupletsInGPU;
         struct trackCandidates* trackCandidatesInGPU;
         struct pixelTriplets* pixelTripletsInGPU;
@@ -57,7 +58,7 @@ namespace SDL
         hitsBuffer<alpaka::DevCpu>* hitsInCPU;
         miniDoublets* mdsInCPU;
         segmentsBuffer<alpaka::DevCpu>* segmentsInCPU;
-        triplets* tripletsInCPU;
+        tripletsBuffer<alpaka::DevCpu>* tripletsInCPU;
         trackCandidates* trackCandidatesInCPU;
         modules* modulesInCPU;
         modules* modulesInCPUFull;
@@ -137,7 +138,7 @@ namespace SDL
         hitsBuffer<alpaka::DevCpu>* getHitsInCMSSW();
         miniDoublets* getMiniDoublets();
         segmentsBuffer<alpaka::DevCpu>* getSegments() ;
-        triplets* getTriplets();
+        tripletsBuffer<alpaka::DevCpu>* getTriplets();
         quintuplets* getQuintuplets();
         trackCandidates* getTrackCandidates();
         trackCandidates* getTrackCandidatesInCMSSW();
diff --git a/SDL/Hit.cuh b/SDL/Hit.cuh
index 61a26cbd..8e69bc96 100644
--- a/SDL/Hit.cuh
+++ b/SDL/Hit.cuh
@@ -78,9 +78,9 @@ namespace SDL
 
         template<typename TQueue, typename TDevAcc>
         hitsBuffer(unsigned int nModules,
-                    unsigned int nMaxHits,
-                    TDevAcc const & devAccIn,
-                    TQueue& queue) :
+                   unsigned int nMaxHits,
+                   TDevAcc const & devAccIn,
+                   TQueue& queue) :
             nHits_buf(allocBufWrapper<unsigned int>(devAccIn, 1)),
             xs_buf(allocBufWrapper<float>(devAccIn, nMaxHits)),
             ys_buf(allocBufWrapper<float>(devAccIn, nMaxHits)),
diff --git a/SDL/MiniDoublet.cuh b/SDL/MiniDoublet.cuh
index 850f01d8..9f723e2d 100644
--- a/SDL/MiniDoublet.cuh
+++ b/SDL/MiniDoublet.cuh
@@ -3,7 +3,6 @@
 
 #include "Constants.cuh"
 #include "EndcapGeometry.cuh"
-#include "TiltedGeometry.h"
 #include "Module.cuh"
 #include "Hit.cuh"
 
diff --git a/SDL/Quintuplet.cuh b/SDL/Quintuplet.cuh
index dc7893a0..0f408a30 100644
--- a/SDL/Quintuplet.cuh
+++ b/SDL/Quintuplet.cuh
@@ -3,7 +3,6 @@
 
 #include "Constants.cuh"
 #include "EndcapGeometry.cuh"
-#include "TiltedGeometry.h"
 #include "Segment.cuh"
 #include "MiniDoublet.cuh"
 #include "Module.cuh"
diff --git a/SDL/Segment.cuh b/SDL/Segment.cuh
index 88f5718f..d6308cb0 100644
--- a/SDL/Segment.cuh
+++ b/SDL/Segment.cuh
@@ -3,7 +3,6 @@
 
 #include "Constants.cuh"
 #include "EndcapGeometry.cuh"
-#include "TiltedGeometry.h"
 #include "MiniDoublet.cuh"
 #include "Module.cuh"
 #include "Hit.cuh"
diff --git a/SDL/Triplet.cu b/SDL/Triplet.cu
deleted file mode 100644
index 218880e2..00000000
--- a/SDL/Triplet.cu
+++ /dev/null
@@ -1,176 +0,0 @@
-#include "Triplet.cuh"
-
-void SDL::createTripletsInExplicitMemory(struct triplets& tripletsInGPU, unsigned int maxTriplets, uint16_t nLowerModules, cudaStream_t stream)
-{
-#ifdef CACHE_ALLOC
-    int dev;
-    cudaGetDevice(&dev);
-    tripletsInGPU.segmentIndices = (unsigned int*)cms::cuda::allocate_device(dev,maxTriplets * sizeof(unsigned int) *2,stream);
-    tripletsInGPU.lowerModuleIndices = (uint16_t*)cms::cuda::allocate_device(dev,maxTriplets * sizeof(uint16_t) *3,stream);
-    tripletsInGPU.betaIn = (FPX*)cms::cuda::allocate_device(dev,maxTriplets * sizeof(FPX) *3,stream);
-    tripletsInGPU.nTriplets = (int*)cms::cuda::allocate_device(dev,nLowerModules * sizeof(int),stream);
-    tripletsInGPU.totOccupancyTriplets = (int*)cms::cuda::allocate_device(dev,nLowerModules * sizeof(int),stream);
-    tripletsInGPU.partOfPT5 = (bool*)cms::cuda::allocate_device(dev, maxTriplets * sizeof(bool), stream);
-    tripletsInGPU.partOfPT3 = (bool*)cms::cuda::allocate_device(dev, maxTriplets * sizeof(bool), stream);
-    tripletsInGPU.partOfT5 = (bool*)cms::cuda::allocate_device(dev, maxTriplets * sizeof(bool), stream);
-
-    tripletsInGPU.logicalLayers = (uint8_t*)cms::cuda::allocate_device(dev, maxTriplets * 3 * sizeof(uint8_t), stream);
-    tripletsInGPU.hitIndices = (unsigned int*)cms::cuda::allocate_device(dev, maxTriplets * 6 * sizeof(unsigned int), stream);
-    tripletsInGPU.nMemoryLocations = (unsigned int*)cms::cuda::allocate_device(dev, sizeof(unsigned int), stream);
-
-#ifdef CUT_VALUE_DEBUG
-    tripletsInGPU.zOut = (float*)cms::cuda::allocate_device(dev, maxTriplets * 4 * sizeof(float), stream);
-    tripletsInGPU.zLo = (float*)cms::cuda::allocate_device(dev, maxTriplets * sizeof(float), stream);
-    tripletsInGPU.zHi = (float*)cms::cuda::allocate_device(dev, maxTriplets * sizeof(float), stream);
-    tripletsInGPU.zLoPointed = (float*)cms::cuda::allocate_device(dev, maxTriplets * sizeof(float), stream);
-    tripletsInGPU.zHiPointed = (float*)cms::cuda::allocate_device(dev, maxTriplets * sizeof(float), stream);
-    tripletsInGPU.sdlCut = (float*)cms::cuda::allocate_device(dev, maxTriplets * sizeof(float), stream);
-    tripletsInGPU.betaInCut = (float*)cms::cuda::allocate_device(dev, maxTriplets * sizeof(float), stream);
-    tripletsInGPU.betaOutCut = (float*)cms::cuda::allocate_device(dev, maxTriplets * sizeof(float), stream);
-    tripletsInGPU.deltaBetaCut = (float*)cms::cuda::allocate_device(dev, maxTriplets * sizeof(float), stream);
-    tripletsInGPU.rtLo = (float*)cms::cuda::allocate_device(dev, maxTriplets * sizeof(float), stream);
-    tripletsInGPU.rtHi = (float*)cms::cuda::allocate_device(dev, maxTriplets * sizeof(float), stream);
-    tripletsInGPU.kZ = (float*)cms::cuda::allocate_device(dev, maxTriplets * sizeof(float), stream);
-    tripletsInGPU.rtOut = tripletsInGPU.zOut + maxTriplets;
-    tripletsInGPU.deltaPhiPos = tripletsInGPU.zOut + maxTriplets *2;
-    tripletsInGPU.deltaPhi = tripletsInGPU.zOut + maxTriplets *3;
-#endif
-
-#else
-    cudaMalloc(&tripletsInGPU.segmentIndices, /*5*/2 * maxTriplets * sizeof(unsigned int));
-    cudaMalloc(&tripletsInGPU.lowerModuleIndices, 3 * maxTriplets * sizeof(uint16_t));
-    cudaMalloc(&tripletsInGPU.betaIn, maxTriplets * 3 * sizeof(FPX));
-    cudaMalloc(&tripletsInGPU.nTriplets, nLowerModules * sizeof(int));
-    cudaMalloc(&tripletsInGPU.totOccupancyTriplets, nLowerModules * sizeof(int));
-    cudaMalloc(&tripletsInGPU.partOfPT5, maxTriplets * sizeof(bool));
-    cudaMalloc(&tripletsInGPU.partOfPT3, maxTriplets * sizeof(bool));
-    cudaMalloc(&tripletsInGPU.partOfT5, maxTriplets * sizeof(bool));
-
-    cudaMalloc(&tripletsInGPU.logicalLayers, maxTriplets * 3 * sizeof(uint8_t));
-    cudaMalloc(&tripletsInGPU.hitIndices, maxTriplets * 6 * sizeof(unsigned int));
-    cudaMalloc(&tripletsInGPU.nMemoryLocations, sizeof(unsigned int));
-
-#ifdef CUT_VALUE_DEBUG
-    cudaMalloc(&tripletsInGPU.zOut, maxTriplets * 4*sizeof(unsigned int));
-    cudaMalloc(&tripletsInGPU.zLo, maxTriplets * sizeof(float));
-    cudaMalloc(&tripletsInGPU.zHi, maxTriplets * sizeof(float));
-    cudaMalloc(&tripletsInGPU.zLoPointed, maxTriplets * sizeof(float));
-    cudaMalloc(&tripletsInGPU.zHiPointed, maxTriplets * sizeof(float));
-    cudaMalloc(&tripletsInGPU.sdlCut, maxTriplets * sizeof(float));
-    cudaMalloc(&tripletsInGPU.betaInCut, maxTriplets * sizeof(float));
-    cudaMalloc(&tripletsInGPU.betaOutCut, maxTriplets * sizeof(float));
-    cudaMalloc(&tripletsInGPU.deltaBetaCut, maxTriplets * sizeof(float));
-    cudaMalloc(&tripletsInGPU.rtLo, maxTriplets * sizeof(float));
-    cudaMalloc(&tripletsInGPU.rtHi, maxTriplets * sizeof(float));
-    cudaMalloc(&tripletsInGPU.kZ, maxTriplets * sizeof(float));
-
-    tripletsInGPU.rtOut = tripletsInGPU.zOut + maxTriplets;
-    tripletsInGPU.deltaPhiPos = tripletsInGPU.zOut + maxTriplets *2;
-    tripletsInGPU.deltaPhi = tripletsInGPU.zOut + maxTriplets *3;
-#endif
-
-#endif
-    cudaMemsetAsync(tripletsInGPU.nTriplets,0,nLowerModules * sizeof(int),stream);
-    cudaMemsetAsync(tripletsInGPU.totOccupancyTriplets,0,nLowerModules * sizeof(int),stream);
-    cudaMemsetAsync(tripletsInGPU.partOfPT5,0,maxTriplets * sizeof(bool),stream);
-    cudaMemsetAsync(tripletsInGPU.partOfPT3,0,maxTriplets * sizeof(bool),stream);
-    cudaMemsetAsync(tripletsInGPU.partOfT5,0,maxTriplets * sizeof(bool),stream);
-    
-    cudaStreamSynchronize(stream);
-
-    tripletsInGPU.betaOut = tripletsInGPU.betaIn + maxTriplets;
-    tripletsInGPU.pt_beta = tripletsInGPU.betaIn + maxTriplets * 2;
-}
-
-SDL::triplets::triplets()
-{
-    segmentIndices = nullptr;
-    lowerModuleIndices = nullptr;
-    betaIn = nullptr;
-    betaOut = nullptr;
-    pt_beta = nullptr;
-    logicalLayers = nullptr;
-    hitIndices = nullptr;
-#ifdef CUT_VALUE_DEBUG
-    zOut = nullptr;
-    rtOut = nullptr;
-    deltaPhiPos = nullptr;
-    deltaPhi = nullptr;
-    zLo = nullptr;
-    zHi = nullptr;
-    rtLo = nullptr;
-    rtHi = nullptr;
-    zLoPointed = nullptr;
-    zHiPointed = nullptr;
-    kZ = nullptr;
-    betaInCut = nullptr;
-    betaOutCut = nullptr;
-    deltaBetaCut = nullptr;
-    sdlCut = nullptr;
-#endif
-}
-
-SDL::triplets::~triplets()
-{
-}
-
-void SDL::triplets::freeMemoryCache()
-{
-    int dev;
-    cudaGetDevice(&dev);
-    cms::cuda::free_device(dev,segmentIndices);
-    cms::cuda::free_device(dev,lowerModuleIndices);
-    cms::cuda::free_device(dev,betaIn);
-    cms::cuda::free_device(dev,nTriplets);
-    cms::cuda::free_device(dev,totOccupancyTriplets);
-    cms::cuda::free_device(dev, partOfPT5);
-    cms::cuda::free_device(dev, partOfPT3);
-    cms::cuda::free_device(dev, partOfT5);
-    cms::cuda::free_device(dev, logicalLayers);
-    cms::cuda::free_device(dev, hitIndices);
-    cms::cuda::free_device(dev, nMemoryLocations);
-#ifdef CUT_VALUE_DEBUG
-    cms::cuda::free_device(dev, zOut);
-    cms::cuda::free_device(dev, zLo);
-    cms::cuda::free_device(dev, zHi);
-    cms::cuda::free_device(dev, zLoPointed);
-    cms::cuda::free_device(dev, zHiPointed);
-    cms::cuda::free_device(dev, sdlCut);
-    cms::cuda::free_device(dev, betaInCut);
-    cms::cuda::free_device(dev, betaOutCut);
-    cms::cuda::free_device(dev, deltaBetaCut);
-    cms::cuda::free_device(dev, rtLo);
-    cms::cuda::free_device(dev, rtHi);
-    cms::cuda::free_device(dev, kZ);
-#endif
-}
-
-void SDL::triplets::freeMemory(cudaStream_t stream)
-{
-    cudaFree(segmentIndices);
-    cudaFree(lowerModuleIndices);
-    cudaFree(nTriplets);
-    cudaFree(totOccupancyTriplets);
-    cudaFree(betaIn);
-    cudaFree(partOfPT5);
-    cudaFree(partOfPT3);
-    cudaFree(partOfT5);
-    cudaFree(logicalLayers);
-    cudaFree(hitIndices);
-    cudaFree(nMemoryLocations);
-#ifdef CUT_VALUE_DEBUG
-    cudaFree(zOut);
-    cudaFree(zLo);
-    cudaFree(zHi);
-    cudaFree(rtLo);
-    cudaFree(rtHi);
-    cudaFree(zLoPointed);
-    cudaFree(zHiPointed);
-    cudaFree(kZ);
-    cudaFree(betaInCut);
-    cudaFree(betaOutCut);
-    cudaFree(deltaBetaCut);
-    cudaFree(sdlCut);
-#endif
-    cudaStreamSynchronize(stream);
-}
\ No newline at end of file
diff --git a/SDL/Triplet.cuh b/SDL/Triplet.cuh
index 8a9fc96f..d5b5a81b 100644
--- a/SDL/Triplet.cuh
+++ b/SDL/Triplet.cuh
@@ -15,17 +15,13 @@ namespace SDL
         uint16_t* lowerModuleIndices; //3 of them now
         int* nTriplets;
         int* totOccupancyTriplets;
-
         unsigned int* nMemoryLocations;
-
         uint8_t* logicalLayers;
         unsigned int* hitIndices;
-        
         //delta beta = betaIn - betaOut
         FPX* betaIn;
         FPX* betaOut;
         FPX* pt_beta;
-
         bool* partOfPT5;
         bool* partOfT5;
         bool* partOfPT3;
@@ -34,10 +30,8 @@ namespace SDL
         //debug variables
         float* zOut;
         float* rtOut;
-
         float* deltaPhiPos;
         float* deltaPhi;
-
         float* zLo;
         float* zHi;
         float* zLoPointed;
@@ -50,13 +44,120 @@ namespace SDL
         float* rtHi;
         float* kZ;
 #endif
-        triplets();
-        ~triplets();
-        void freeMemory(cudaStream_t stream);
-        void freeMemoryCache();
+        template<typename TBuff>
+        void setData(TBuff& tripletsbuf) // Re-points every raw pointer of this struct at the native pointer of the matching *_buf member of tripletsbuf.
+        {
+            segmentIndices = alpaka::getPtrNative(tripletsbuf.segmentIndices_buf);
+            lowerModuleIndices = alpaka::getPtrNative(tripletsbuf.lowerModuleIndices_buf);
+            nTriplets = alpaka::getPtrNative(tripletsbuf.nTriplets_buf);
+            totOccupancyTriplets = alpaka::getPtrNative(tripletsbuf.totOccupancyTriplets_buf);
+            nMemoryLocations = alpaka::getPtrNative(tripletsbuf.nMemoryLocations_buf);
+            logicalLayers = alpaka::getPtrNative(tripletsbuf.logicalLayers_buf);
+            hitIndices = alpaka::getPtrNative(tripletsbuf.hitIndices_buf);
+            betaIn = alpaka::getPtrNative(tripletsbuf.betaIn_buf);
+            betaOut = alpaka::getPtrNative(tripletsbuf.betaOut_buf);
+            pt_beta = alpaka::getPtrNative(tripletsbuf.pt_beta_buf);
+            partOfPT5 = alpaka::getPtrNative(tripletsbuf.partOfPT5_buf);
+            partOfT5 = alpaka::getPtrNative(tripletsbuf.partOfT5_buf);
+            partOfPT3 = alpaka::getPtrNative(tripletsbuf.partOfPT3_buf);
+#ifdef CUT_VALUE_DEBUG
+            zOut = alpaka::getPtrNative(tripletsbuf.zOut_buf); // debug-only pointers: each one is bound to its own buffer
+            rtOut = alpaka::getPtrNative(tripletsbuf.rtOut_buf);
+            deltaPhiPos = alpaka::getPtrNative(tripletsbuf.deltaPhiPos_buf);
+            deltaPhi = alpaka::getPtrNative(tripletsbuf.deltaPhi_buf);
+            zLo = alpaka::getPtrNative(tripletsbuf.zLo_buf);
+            zHi = alpaka::getPtrNative(tripletsbuf.zHi_buf);
+            zLoPointed = alpaka::getPtrNative(tripletsbuf.zLoPointed_buf);
+            zHiPointed = alpaka::getPtrNative(tripletsbuf.zHiPointed_buf);
+            sdlCut = alpaka::getPtrNative(tripletsbuf.sdlCut_buf);
+            betaInCut = alpaka::getPtrNative(tripletsbuf.betaInCut_buf);
+            betaOutCut = alpaka::getPtrNative(tripletsbuf.betaOutCut_buf);
+            deltaBetaCut = alpaka::getPtrNative(tripletsbuf.deltaBetaCut_buf);
+            rtLo = alpaka::getPtrNative(tripletsbuf.rtLo_buf);
+            rtHi = alpaka::getPtrNative(tripletsbuf.rtHi_buf);
+            kZ = alpaka::getPtrNative(tripletsbuf.kZ_buf);
+#endif
+        }
     };
 
-    void createTripletsInExplicitMemory(struct triplets& tripletsInGPU, unsigned int maxTriplets, uint16_t nLowerModules,cudaStream_t stream);
+    template<typename TAcc>
+    struct tripletsBuffer : triplets // Holds one alpaka buffer per triplets field; derives from triplets so the raw pointers sit next to the buffers.
+    {
+        Buf<TAcc, unsigned int> segmentIndices_buf;
+        Buf<TAcc, uint16_t> lowerModuleIndices_buf;
+        Buf<TAcc, int> nTriplets_buf;
+        Buf<TAcc, int> totOccupancyTriplets_buf;
+        Buf<TAcc, unsigned int> nMemoryLocations_buf;
+        Buf<TAcc, uint8_t> logicalLayers_buf;
+        Buf<TAcc, unsigned int> hitIndices_buf;
+        Buf<TAcc, FPX> betaIn_buf;
+        Buf<TAcc, FPX> betaOut_buf;
+        Buf<TAcc, FPX> pt_beta_buf;
+        Buf<TAcc, bool> partOfPT5_buf;
+        Buf<TAcc, bool> partOfT5_buf;
+        Buf<TAcc, bool> partOfPT3_buf;
+
+#ifdef CUT_VALUE_DEBUG
+        Buf<TAcc, float> zOut_buf;
+        Buf<TAcc, float> rtOut_buf;
+        Buf<TAcc, float> deltaPhiPos_buf;
+        Buf<TAcc, float> deltaPhi_buf;
+        Buf<TAcc, float> zLo_buf;
+        Buf<TAcc, float> zHi_buf;
+        Buf<TAcc, float> zLoPointed_buf;
+        Buf<TAcc, float> zHiPointed_buf;
+        Buf<TAcc, float> sdlCut_buf;
+        Buf<TAcc, float> betaInCut_buf;
+        Buf<TAcc, float> betaOutCut_buf;
+        Buf<TAcc, float> deltaBetaCut_buf;
+        Buf<TAcc, float> rtLo_buf;
+        Buf<TAcc, float> rtHi_buf;
+        Buf<TAcc, float> kZ_buf;
+#endif
+
+        template<typename TQueue, typename TDevAcc>
+        tripletsBuffer(unsigned int maxTriplets, // Allocates every buffer on devAccIn; element counts are given per buffer in the initializer list below.
+                    unsigned int nLowerModules,
+                    TDevAcc const & devAccIn,
+                    TQueue& queue) :
+            segmentIndices_buf(allocBufWrapper<unsigned int>(devAccIn, 2 * maxTriplets)),
+            lowerModuleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, 3 * maxTriplets)),
+            nTriplets_buf(allocBufWrapper<int>(devAccIn, nLowerModules)),
+            totOccupancyTriplets_buf(allocBufWrapper<int>(devAccIn, nLowerModules)),
+            nMemoryLocations_buf(allocBufWrapper<unsigned int>(devAccIn, 1)),
+            logicalLayers_buf(allocBufWrapper<uint8_t>(devAccIn, maxTriplets * 3)),
+            hitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, maxTriplets * 6)),
+            betaIn_buf(allocBufWrapper<FPX>(devAccIn, maxTriplets)),
+            betaOut_buf(allocBufWrapper<FPX>(devAccIn, maxTriplets)),
+            pt_beta_buf(allocBufWrapper<FPX>(devAccIn, maxTriplets)),
+            partOfPT5_buf(allocBufWrapper<bool>(devAccIn, maxTriplets)),
+            partOfT5_buf(allocBufWrapper<bool>(devAccIn, maxTriplets)),
+            partOfPT3_buf(allocBufWrapper<bool>(devAccIn, maxTriplets))
+#ifdef CUT_VALUE_DEBUG
+            ,zOut_buf(allocBufWrapper<float>(devAccIn, maxTriplets)), // every debug buffer holds exactly maxTriplets floats
+            rtOut_buf(allocBufWrapper<float>(devAccIn, maxTriplets)),
+            deltaPhiPos_buf(allocBufWrapper<float>(devAccIn, maxTriplets)),
+            deltaPhi_buf(allocBufWrapper<float>(devAccIn, maxTriplets)),
+            zLo_buf(allocBufWrapper<float>(devAccIn, maxTriplets)),
+            zHi_buf(allocBufWrapper<float>(devAccIn, maxTriplets)),
+            zLoPointed_buf(allocBufWrapper<float>(devAccIn, maxTriplets)),
+            zHiPointed_buf(allocBufWrapper<float>(devAccIn, maxTriplets)),
+            sdlCut_buf(allocBufWrapper<float>(devAccIn, maxTriplets)),
+            betaInCut_buf(allocBufWrapper<float>(devAccIn, maxTriplets)),
+            betaOutCut_buf(allocBufWrapper<float>(devAccIn, maxTriplets)),
+            deltaBetaCut_buf(allocBufWrapper<float>(devAccIn, maxTriplets)),
+            rtLo_buf(allocBufWrapper<float>(devAccIn, maxTriplets)),
+            rtHi_buf(allocBufWrapper<float>(devAccIn, maxTriplets)),
+            kZ_buf(allocBufWrapper<float>(devAccIn, maxTriplets))
+#endif
+        { // Zero-fills of the per-module counters and the partOf* flags are enqueued on queue; no wait is issued in this constructor.
+            alpaka::memset(queue, nTriplets_buf, 0, nLowerModules);
+            alpaka::memset(queue, totOccupancyTriplets_buf, 0, nLowerModules);
+            alpaka::memset(queue, partOfPT5_buf, 0, maxTriplets);
+            alpaka::memset(queue, partOfT5_buf, 0, maxTriplets);
+            alpaka::memset(queue, partOfPT3_buf, 0, maxTriplets);
+        }
+    };
 
 #ifdef CUT_VALUE_DEBUG
     ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float&zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ, unsigned int& tripletIndex)
diff --git a/code/core/AccessHelper.cc b/code/core/AccessHelper.cc
index 83e7f51d..763200ea 100644
--- a/code/core/AccessHelper.cc
+++ b/code/core/AccessHelper.cc
@@ -124,7 +124,7 @@ std::tuple<std::vector<unsigned int>, std::vector<unsigned int>> getHitIdxsAndHi
 //____________________________________________________________________________________________
 std::vector<unsigned int> getLSsFromT3(SDL::Event* event, unsigned int T3)
 {
-    SDL::triplets& triplets_ = *(event->getTriplets());
+    SDL::tripletsBuffer<alpaka::DevCpu>& triplets_ = *(event->getTriplets());
     unsigned int LS_1 = triplets_.segmentIndices[2 * T3];
     unsigned int LS_2 = triplets_.segmentIndices[2 * T3 + 1];
     return {LS_1, LS_2};
diff --git a/code/core/write_sdl_ntuple.cc b/code/core/write_sdl_ntuple.cc
index 15a8eb39..e1284b34 100644
--- a/code/core/write_sdl_ntuple.cc
+++ b/code/core/write_sdl_ntuple.cc
@@ -474,7 +474,7 @@ void setQuintupletOutputBranches(SDL::Event* event)
 void setPixelTripletOutputBranches(SDL::Event* event)
 {
     SDL::pixelTriplets& pixelTripletsInGPU = (*event->getPixelTriplets());
-    SDL::triplets& tripletsInGPU = *(event->getTriplets());
+    SDL::tripletsBuffer<alpaka::DevCpu>& tripletsInGPU = *(event->getTriplets());
     SDL::modules& modulesInGPU = *(event->getModules());
     SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = *(event->getSegments());
     SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = *(event->getHits());
@@ -820,7 +820,7 @@ std::tuple<float, float, float, vector<unsigned int>, vector<unsigned int>> pars
 {
     // Get relevant information
     SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates());
-    SDL::triplets& tripletsInGPU = (*event->getTriplets());
+    SDL::tripletsBuffer<alpaka::DevCpu>& tripletsInGPU = (*event->getTriplets());
     SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
 
@@ -958,7 +958,7 @@ std::tuple<float, float, float, vector<unsigned int>, vector<unsigned int>> pars
 {
     // Get relevant information
     SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates());
-    SDL::triplets& tripletsInGPU = (*event->getTriplets());
+    SDL::tripletsBuffer<alpaka::DevCpu>& tripletsInGPU = (*event->getTriplets());
     SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
 
@@ -1005,7 +1005,7 @@ std::tuple<float, float, float, vector<unsigned int>, vector<unsigned int>> pars
 std::tuple<float, float, float, vector<unsigned int>, vector<unsigned int>> parseT5(SDL::Event* event, unsigned int idx)
 {
     SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates());
-    SDL::triplets& tripletsInGPU = (*event->getTriplets());
+    SDL::tripletsBuffer<alpaka::DevCpu>& tripletsInGPU = (*event->getTriplets());
     SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
     unsigned int T5 = trackCandidatesInGPU.directObjectIndices[idx];
     std::vector<unsigned int> T3s = getT3sFromT5(event, T5);
@@ -1236,7 +1236,7 @@ void printpLSs(SDL::Event* event)
 //________________________________________________________________________________________________________________________________
 void printT3s(SDL::Event* event)
 {
-    SDL::triplets& tripletsInGPU = (*event->getTriplets());
+    SDL::tripletsBuffer<alpaka::DevCpu>& tripletsInGPU = (*event->getTriplets());
     SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
     SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
@@ -1279,7 +1279,7 @@ void printT3s(SDL::Event* event)
 void debugPrintOutlierMultiplicities(SDL::Event* event)
 {
     SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates());
-    SDL::triplets& tripletsInGPU = (*event->getTriplets());
+    SDL::tripletsBuffer<alpaka::DevCpu>& tripletsInGPU = (*event->getTriplets());
     SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
     SDL::modules& modulesInGPU = (*event->getModules());

From 8901df6be13034975a37e76625fb9b4ed0fc62e5 Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Wed, 7 Jun 2023 23:44:06 -0400
Subject: [PATCH 20/44] formatting fixes

---
 SDL/PixelTriplet.cuh | 24 ++++++++++++------------
 SDL/Triplet.cuh      | 28 ++++++++++++++--------------
 2 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/SDL/PixelTriplet.cuh b/SDL/PixelTriplet.cuh
index 660aaeb7..422d8959 100644
--- a/SDL/PixelTriplet.cuh
+++ b/SDL/PixelTriplet.cuh
@@ -48,7 +48,7 @@ namespace SDL
 
     void createPixelTripletsInExplicitMemory(struct pixelTriplets& pixelTripletsinGPU, unsigned int maxPixelTriplets, cudaStream_t stream);
 
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, struct pixelTriplets& pixelTripletsInGPU, unsigned int pixelSegmentIndex, unsigned int tripletIndex, float pixelRadius, float tripletRadius, float centerX, float centerY, float rPhiChiSquared, float rPhiChiSquaredInwards, float rzChiSquared, unsigned int pixelTripletIndex, float pt, float eta, float phi, float eta_pix, float phi_pix,float score)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, struct triplets& tripletsInGPU, struct pixelTriplets& pixelTripletsInGPU, unsigned int pixelSegmentIndex, unsigned int tripletIndex, float pixelRadius, float tripletRadius, float centerX, float centerY, float rPhiChiSquared, float rPhiChiSquaredInwards, float rzChiSquared, unsigned int pixelTripletIndex, float pt, float eta, float phi, float eta_pix, float phi_pix,float score)
     {
         pixelTripletsInGPU.pixelSegmentIndices[pixelTripletIndex] = pixelSegmentIndex;
         pixelTripletsInGPU.tripletIndices[pixelTripletIndex] = tripletIndex;
@@ -130,7 +130,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTrackletDefaultAlgopT3(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& pixelLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTrackletDefaultAlgopT3(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& pixelLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
     {
         zLo = -999;
         zHi = -999;
@@ -663,7 +663,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTripletDefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& pixelSegmentIndex, unsigned int tripletIndex, float& pixelRadius, float& pixelRadiusError, float& tripletRadius, float& centerX, float& centerY, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, bool runChiSquaredCuts = true)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTripletDefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& pixelSegmentIndex, unsigned int tripletIndex, float& pixelRadius, float& pixelRadiusError, float& tripletRadius, float& centerX, float& centerY, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, bool runChiSquaredCuts = true)
     {
         bool pass = true;
 
@@ -768,7 +768,7 @@ namespace SDL
                 struct SDL::modules& modulesInGPU,
                 struct SDL::objectRanges& rangesInGPU,
                 struct SDL::miniDoublets& mdsInGPU,
-                struct SDL::segments& segmentsInGPU,
+                struct segments& segmentsInGPU,
                 struct SDL::triplets& tripletsInGPU,
                 struct SDL::pixelTriplets& pixelTripletsInGPU,
                 unsigned int* connectedPixelSize,
@@ -911,7 +911,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPBB(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int thirdMDIndex, unsigned int& fourthMDIndex, float& /*z_OutLo*/, float& /*rt_OutLo*/, float& dPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaOutCut, float& deltaBetaCut) // pixel to BB and BE segments
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPBB(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int thirdMDIndex, unsigned int& fourthMDIndex, float& /*z_OutLo*/, float& /*rt_OutLo*/, float& dPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaOutCut, float& deltaBetaCut) // pixel to BB and BE segments
     {
         bool pass = true;
 
@@ -1124,7 +1124,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPEE(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU ,struct SDL::segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& /*z_OutLo*/, float& /*rt_OutLo*/, float& deltaPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) // pixel to EE segments
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPEE(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU ,struct segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& /*z_OutLo*/, float& /*rt_OutLo*/, float& deltaPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) // pixel to EE segments
     {
         bool pass = true;
         bool isPS_OutLo = (modulesInGPU.moduleType[outerInnerLowerModuleIndex] == SDL::PS);
@@ -1385,7 +1385,7 @@ namespace SDL
 
     void createPixelQuintupletsInExplicitMemory(struct SDL::pixelQuintuplets& pixelQuintupletsInGPU, unsigned int maxPixelQuintuplets, cudaStream_t stream);
 
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelQuintupletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct quintuplets& quintupletsInGPU, struct pixelQuintuplets& pixelQuintupletsInGPU, unsigned int pixelIndex, unsigned int T5Index, unsigned int pixelQuintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float score, float eta, float phi, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelQuintupletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, struct quintuplets& quintupletsInGPU, struct pixelQuintuplets& pixelQuintupletsInGPU, unsigned int pixelIndex, unsigned int T5Index, unsigned int pixelQuintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float score, float eta, float phi, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY)
     {
         pixelQuintupletsInGPU.pixelIndices[pixelQuintupletIndex] = pixelIndex;
         pixelQuintupletsInGPU.T5Indices[pixelQuintupletIndex] = T5Index;
@@ -1966,7 +1966,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelQuintupletDefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, struct quintuplets& quintupletsInGPU, unsigned int& pixelSegmentIndex, unsigned int& quintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY, unsigned int pixelSegmentArrayIndex)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelQuintupletDefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, struct triplets& tripletsInGPU, struct quintuplets& quintupletsInGPU, unsigned int& pixelSegmentIndex, unsigned int& quintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY, unsigned int pixelSegmentArrayIndex)
     {
         bool pass = true;
 
@@ -2100,7 +2100,7 @@ namespace SDL
                 TAcc const & acc,
                 struct SDL::modules& modulesInGPU,
                 struct SDL::miniDoublets& mdsInGPU,
-                struct SDL::segments& segmentsInGPU,
+                struct segments& segmentsInGPU,
                 struct SDL::triplets& tripletsInGPU,
                 struct SDL::quintuplets& quintupletsInGPU,
                 struct SDL::pixelQuintuplets& pixelQuintupletsInGPU,
@@ -2226,7 +2226,7 @@ namespace SDL
     };
  
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgoPPBB(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int thirdMDIndex, unsigned int& fourthMDIndex, float& z_OutLo, float& rt_OutLo, float& dPhiPos, float& dPhi, float& betaIn,  float& betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaOutCut, float& deltaBetaCut) // pixel to BB and BE segments
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgoPPBB(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int thirdMDIndex, unsigned int& fourthMDIndex, float& z_OutLo, float& rt_OutLo, float& dPhiPos, float& dPhi, float& betaIn,  float& betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaOutCut, float& deltaBetaCut) // pixel to BB and BE segments
     {
         bool pass = true;
 
@@ -2433,7 +2433,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgoPPEE(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU ,struct SDL::segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& z_OutLo, float& rt_OutLo, float& deltaPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) // pixel to EE segments
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgoPPEE(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU ,struct segments& segmentsInGPU, uint16_t& pixelModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& z_OutLo, float& rt_OutLo, float& deltaPhiPos, float& dPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ) // pixel to EE segments
     {
         bool pass = true;
         bool isPS_OutLo = (modulesInGPU.moduleType[outerInnerLowerModuleIndex] == SDL::PS);
@@ -2648,7 +2648,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& pixelLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runpT5DefaultAlgo(TAcc const & acc, struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& pixelLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
     {
         zLo = -999;
         zHi = -999;
diff --git a/SDL/Triplet.cuh b/SDL/Triplet.cuh
index d5b5a81b..6df7b06a 100644
--- a/SDL/Triplet.cuh
+++ b/SDL/Triplet.cuh
@@ -160,9 +160,9 @@ namespace SDL
     };
 
 #ifdef CUT_VALUE_DEBUG
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float&zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ, unsigned int& tripletIndex)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float&zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ, unsigned int& tripletIndex)
 #else
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& betaIn, float& betaOut, float& pt_beta, unsigned int& tripletIndex)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, struct triplets& tripletsInGPU, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, float& betaIn, float& betaOut, float& pt_beta, unsigned int& tripletIndex)
 #endif
     {
         tripletsInGPU.segmentIndices[tripletIndex * 2] = innerSegmentIndex;
@@ -209,7 +209,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passRZConstraint(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex) 
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passRZConstraint(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex) 
     {
         //get the rt and z
         const float& r1 = mdsInGPU.anchorRt[firstMDIndex];
@@ -290,7 +290,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut)
     {
         bool pass = true;
         bool isPSIn = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS);
@@ -349,7 +349,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut)
     {
         bool pass = true;
         //unsigned int outerInnerLowerModuleIndex = middleLowerModuleIndex;
@@ -428,7 +428,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut)
     {
         bool pass = true;
         bool isPSIn = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS);
@@ -508,7 +508,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraint(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraint(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, float& zOut, float& rtOut)
     {
         short innerInnerLowerModuleSubdet = modulesInGPU.subdets[innerInnerLowerModuleIndex];
         short middleLowerModuleSubdet = modulesInGPU.subdets[middleLowerModuleIndex];
@@ -589,7 +589,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoBBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoBBBB(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut)
     {
         bool pass = true;
 
@@ -785,7 +785,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoBBEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoBBEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
     {
         bool pass = true;
         bool isPS_InLo = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS);
@@ -1000,7 +1000,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoEEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoEEEE(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& dPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& rtLo, float& rtHi, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
     {
         bool pass = true;
 
@@ -1209,7 +1209,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& innerOuterLowerModuleIndex, uint16_t& outerInnerLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, unsigned int& firstMDIndex, unsigned int& secondMDIndex, unsigned int& thirdMDIndex, unsigned int& fourthMDIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float&betaOut, float& pt_beta, float& zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
     {
         bool pass = false;
 
@@ -1274,7 +1274,7 @@ namespace SDL
     };
 
     template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletConstraintsAndAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct SDL::segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float &zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletConstraintsAndAlgo(TAcc const & acc, struct SDL::modules& modulesInGPU, struct SDL::miniDoublets& mdsInGPU, struct segments& segmentsInGPU, uint16_t& innerInnerLowerModuleIndex, uint16_t& middleLowerModuleIndex, uint16_t& outerOuterLowerModuleIndex, unsigned int& innerSegmentIndex, unsigned int& outerSegmentIndex, float& zOut, float& rtOut, float& deltaPhiPos, float& deltaPhi, float& betaIn, float& betaOut, float& pt_beta, float &zLo, float& zHi, float& rtLo, float& rtHi, float& zLoPointed, float& zHiPointed, float& sdlCut, float& betaInCut, float& betaOutCut, float& deltaBetaCut, float& kZ)
     {
         bool pass = true;
 
@@ -1301,7 +1301,7 @@ namespace SDL
                 TAcc const & acc,
                 struct SDL::modules& modulesInGPU,
                 struct SDL::miniDoublets& mdsInGPU,
-                struct SDL::segments& segmentsInGPU,
+                struct segments& segmentsInGPU,
                 struct SDL::triplets& tripletsInGPU,
                 struct SDL::objectRanges& rangesInGPU,
                 uint16_t *index_gpu,
@@ -1376,7 +1376,7 @@ namespace SDL
                 TAcc const & acc,
                 struct modules& modulesInGPU,
                 struct objectRanges& rangesInGPU,
-                struct SDL::segments& segmentsInGPU) const
+                struct segments& segmentsInGPU) const
         {
             using Dim = alpaka::Dim<TAcc>;
             using Idx = alpaka::Idx<TAcc>;

From 24a56cd64083886efd35b28ed5f4b8b7a6f15dbe Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Thu, 8 Jun 2023 10:36:00 -0400
Subject: [PATCH 21/44] move quintuplets to Alpaka memory

---
 SDL/Event.cu                  | 102 ++++++++----------------
 SDL/Event.cuh                 |   5 +-
 SDL/Quintuplet.cu             | 145 ----------------------------------
 SDL/Quintuplet.cuh            | 102 ++++++++++++++++++++++--
 code/core/AccessHelper.cc     |   2 +-
 code/core/write_sdl_ntuple.cc |   6 +-
 6 files changed, 135 insertions(+), 227 deletions(-)
 delete mode 100644 SDL/Quintuplet.cu

diff --git a/SDL/Event.cu b/SDL/Event.cu
index 8139af57..65774128 100644
--- a/SDL/Event.cu
+++ b/SDL/Event.cu
@@ -60,13 +60,11 @@ SDL::Event::~Event()
 {
 #ifdef CACHE_ALLOC
     if(mdsInGPU){mdsInGPU->freeMemoryCache();}
-    if(quintupletsInGPU){quintupletsInGPU->freeMemoryCache();}
     if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemoryCache();}
     if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemoryCache();}
     if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemoryCache();}
 #else
     if(mdsInGPU){mdsInGPU->freeMemory(stream);}
-    if(quintupletsInGPU){quintupletsInGPU->freeMemory(stream);}
     if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemory(stream);}
     if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemory(stream);}
     if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemory(stream);}
@@ -79,7 +77,7 @@ SDL::Event::~Event()
     if(hitsInGPU!= nullptr){delete hitsInGPU; delete hitsBuffers;}
     if(pixelTripletsInGPU!= nullptr){cms::cuda::free_host(pixelTripletsInGPU);}
     if(pixelQuintupletsInGPU!= nullptr){cms::cuda::free_host(pixelQuintupletsInGPU);}
-    if(quintupletsInGPU!= nullptr){cms::cuda::free_host(quintupletsInGPU);}
+    if(quintupletsInGPU!= nullptr){delete quintupletsInGPU; delete quintupletsBuffers;}
 
     if(hitsInCPU != nullptr)
     {
@@ -110,17 +108,6 @@ SDL::Event::~Event()
     }
     if(quintupletsInCPU != nullptr)
     {
-        delete[] quintupletsInCPU->tripletIndices;
-        delete[] quintupletsInCPU->nQuintuplets;
-        delete[] quintupletsInCPU->totOccupancyQuintuplets;
-        delete[] quintupletsInCPU->lowerModuleIndices;
-        delete[] quintupletsInCPU->innerRadius;
-        delete[] quintupletsInCPU->outerRadius;
-        delete[] quintupletsInCPU->regressionRadius;
-        delete[] quintupletsInCPU->bridgeRadius;
-        delete[] quintupletsInCPU->chiSquared;
-        delete[] quintupletsInCPU->rzChiSquared;
-        delete[] quintupletsInCPU->nonAnchorChiSquared;
         delete quintupletsInCPU;
     }
 
@@ -209,12 +196,10 @@ void SDL::Event::resetEvent()
 {
 #ifdef CACHE_ALLOC
     if(mdsInGPU){mdsInGPU->freeMemoryCache();}
-    if(quintupletsInGPU){quintupletsInGPU->freeMemoryCache();}
     if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemoryCache();}
     if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemoryCache();}
     if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemoryCache();}
 #else
-    if(quintupletsInGPU){quintupletsInGPU->freeMemory(stream);}
     if(mdsInGPU){mdsInGPU->freeMemory(stream);}
     if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemory(stream);}
     if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemory(stream);}
@@ -249,7 +234,7 @@ void SDL::Event::resetEvent()
       segmentsInGPU = nullptr;}
     if(tripletsInGPU){delete tripletsInGPU; delete tripletsBuffers;
       tripletsInGPU = nullptr;}
-    if(quintupletsInGPU){cms::cuda::free_host(quintupletsInGPU);
+    if(quintupletsInGPU){delete quintupletsInGPU; delete quintupletsBuffers;
       quintupletsInGPU = nullptr;}
     if(trackCandidatesInGPU){cms::cuda::free_host(trackCandidatesInGPU);
       trackCandidatesInGPU = nullptr;}
@@ -288,17 +273,6 @@ void SDL::Event::resetEvent()
     }
     if(quintupletsInCPU != nullptr)
     {
-        delete[] quintupletsInCPU->tripletIndices;
-        delete[] quintupletsInCPU->nQuintuplets;
-        delete[] quintupletsInCPU->totOccupancyQuintuplets;
-        delete[] quintupletsInCPU->lowerModuleIndices;
-        delete[] quintupletsInCPU->innerRadius;
-        delete[] quintupletsInCPU->outerRadius;
-        delete[] quintupletsInCPU->regressionRadius;
-        delete[] quintupletsInCPU->bridgeRadius;
-        delete[] quintupletsInCPU->chiSquared;
-        delete[] quintupletsInCPU->rzChiSquared;
-        delete[] quintupletsInCPU->nonAnchorChiSquared;
         delete quintupletsInCPU;
         quintupletsInCPU = nullptr;
     }
@@ -1226,8 +1200,10 @@ void SDL::Event::createQuintuplets()
 
     if(quintupletsInGPU == nullptr)
     {
-        quintupletsInGPU = (SDL::quintuplets*)cms::cuda::allocate_host(sizeof(SDL::quintuplets), stream);
-        createQuintupletsInExplicitMemory(*quintupletsInGPU, nTotalQuintuplets, nLowerModules, nEligibleT5Modules,stream);
+        quintupletsInGPU = new SDL::quintuplets();
+        quintupletsBuffers = new SDL::quintupletsBuffer<Acc>(nTotalQuintuplets, nLowerModules, devAcc, queue);
+        quintupletsInGPU->setData(*quintupletsBuffers);
+
         cudaMemcpyAsync(quintupletsInGPU->nMemoryLocations, &nTotalQuintuplets, sizeof(unsigned int), cudaMemcpyHostToDevice, stream);
         cudaStreamSynchronize(stream);
     }
@@ -1914,51 +1890,35 @@ SDL::tripletsBuffer<alpaka::DevCpu>* SDL::Event::getTriplets()
     return tripletsInCPU;
 }
 
-SDL::quintuplets* SDL::Event::getQuintuplets()
+SDL::quintupletsBuffer<alpaka::DevCpu>* SDL::Event::getQuintuplets()
 {
     if(quintupletsInCPU == nullptr)
     {
-        quintupletsInCPU = new SDL::quintuplets;
-        uint16_t nEligibleT5Modules;
-        cudaMemcpyAsync(&nEligibleT5Modules, rangesInGPU->nEligibleT5Modules, sizeof(uint16_t), cudaMemcpyDeviceToHost,stream);
-        cudaStreamSynchronize(stream);
-        unsigned int nMemoryLocations;
-        cudaMemcpyAsync(&nMemoryLocations, quintupletsInGPU->nMemoryLocations, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream);
-        cudaStreamSynchronize(stream);
-
-        quintupletsInCPU->nQuintuplets = new int[nLowerModules];
-        quintupletsInCPU->totOccupancyQuintuplets = new int[nLowerModules];
-        quintupletsInCPU->tripletIndices = new unsigned int[2 * nMemoryLocations];
-        quintupletsInCPU->lowerModuleIndices = new uint16_t[5 * nMemoryLocations];
-        quintupletsInCPU->innerRadius = new FPX[nMemoryLocations];
-        quintupletsInCPU->outerRadius = new FPX[nMemoryLocations];
-        quintupletsInCPU->bridgeRadius = new FPX[nMemoryLocations];
-
-        quintupletsInCPU->isDup = new bool[nMemoryLocations];
-        quintupletsInCPU->score_rphisum = new FPX[nMemoryLocations];
-        quintupletsInCPU->eta = new FPX[nMemoryLocations];
-        quintupletsInCPU->phi = new FPX[nMemoryLocations];
-
-        quintupletsInCPU->chiSquared = new float[nMemoryLocations];
-        quintupletsInCPU->nonAnchorChiSquared = new float[nMemoryLocations];
-        quintupletsInCPU->rzChiSquared = new float[nMemoryLocations];
-
-        cudaMemcpyAsync(quintupletsInCPU->nQuintuplets, quintupletsInGPU->nQuintuplets,  nLowerModules * sizeof(int), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(quintupletsInCPU->totOccupancyQuintuplets, quintupletsInGPU->totOccupancyQuintuplets,  nLowerModules * sizeof(int), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(quintupletsInCPU->tripletIndices, quintupletsInGPU->tripletIndices, 2 * nMemoryLocations * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(quintupletsInCPU->lowerModuleIndices, quintupletsInGPU->lowerModuleIndices, 5 * nMemoryLocations * sizeof(uint16_t), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(quintupletsInCPU->innerRadius, quintupletsInGPU->innerRadius, nMemoryLocations * sizeof(FPX), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(quintupletsInCPU->bridgeRadius, quintupletsInGPU->bridgeRadius, nMemoryLocations * sizeof(FPX), cudaMemcpyDeviceToHost, stream);
-        cudaMemcpyAsync(quintupletsInCPU->outerRadius, quintupletsInGPU->outerRadius, nMemoryLocations * sizeof(FPX), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(quintupletsInCPU->isDup, quintupletsInGPU->isDup, nMemoryLocations * sizeof(bool), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(quintupletsInCPU->score_rphisum, quintupletsInGPU->score_rphisum, nMemoryLocations * sizeof(FPX), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(quintupletsInCPU->eta, quintupletsInGPU->eta, nMemoryLocations * sizeof(FPX), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(quintupletsInCPU->phi, quintupletsInGPU->phi, nMemoryLocations * sizeof(FPX), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(quintupletsInCPU->chiSquared, quintupletsInGPU->chiSquared, nMemoryLocations * sizeof(float), cudaMemcpyDeviceToHost, stream);
-        cudaMemcpyAsync(quintupletsInCPU->rzChiSquared, quintupletsInGPU->rzChiSquared, nMemoryLocations * sizeof(float), cudaMemcpyDeviceToHost, stream);
-        cudaMemcpyAsync(quintupletsInCPU->nonAnchorChiSquared, quintupletsInGPU->nonAnchorChiSquared, nMemoryLocations * sizeof(float), cudaMemcpyDeviceToHost, stream);
+        // Get the nMemoryLocations parameter to initialize the host-based quintupletsInCPU
+        auto nMemLocal_buf = allocBufWrapper<unsigned int>(devHost, 1);
+        alpaka::memcpy(queue, nMemLocal_buf, quintupletsBuffers->nMemoryLocations_buf, 1);
+        alpaka::wait(queue);
 
-        cudaStreamSynchronize(stream);
+        unsigned int nMemLocal = *alpaka::getPtrNative(nMemLocal_buf);
+        quintupletsInCPU = new SDL::quintupletsBuffer<alpaka::DevCpu>(nMemLocal, nLowerModules, devHost, queue);
+        quintupletsInCPU->setData(*quintupletsInCPU);
+
+        *alpaka::getPtrNative(quintupletsInCPU->nMemoryLocations_buf) = nMemLocal;
+        alpaka::memcpy(queue, quintupletsInCPU->nQuintuplets_buf, quintupletsBuffers->nQuintuplets_buf, nLowerModules);
+        alpaka::memcpy(queue, quintupletsInCPU->totOccupancyQuintuplets_buf, quintupletsBuffers->totOccupancyQuintuplets_buf, nLowerModules);
+        alpaka::memcpy(queue, quintupletsInCPU->tripletIndices_buf, quintupletsBuffers->tripletIndices_buf, 2 * nMemLocal);
+        alpaka::memcpy(queue, quintupletsInCPU->lowerModuleIndices_buf, quintupletsBuffers->lowerModuleIndices_buf, 5 * nMemLocal);
+        alpaka::memcpy(queue, quintupletsInCPU->innerRadius_buf, quintupletsBuffers->innerRadius_buf, nMemLocal);
+        alpaka::memcpy(queue, quintupletsInCPU->bridgeRadius_buf, quintupletsBuffers->bridgeRadius_buf, nMemLocal);
+        alpaka::memcpy(queue, quintupletsInCPU->outerRadius_buf, quintupletsBuffers->outerRadius_buf, nMemLocal);
+        alpaka::memcpy(queue, quintupletsInCPU->isDup_buf, quintupletsBuffers->isDup_buf, nMemLocal);
+        alpaka::memcpy(queue, quintupletsInCPU->score_rphisum_buf, quintupletsBuffers->score_rphisum_buf, nMemLocal);
+        alpaka::memcpy(queue, quintupletsInCPU->eta_buf, quintupletsBuffers->eta_buf, nMemLocal);
+        alpaka::memcpy(queue, quintupletsInCPU->phi_buf, quintupletsBuffers->phi_buf, nMemLocal);
+        alpaka::memcpy(queue, quintupletsInCPU->chiSquared_buf, quintupletsBuffers->chiSquared_buf, nMemLocal);
+        alpaka::memcpy(queue, quintupletsInCPU->rzChiSquared_buf, quintupletsBuffers->rzChiSquared_buf, nMemLocal);
+        alpaka::memcpy(queue, quintupletsInCPU->nonAnchorChiSquared_buf, quintupletsBuffers->nonAnchorChiSquared_buf, nMemLocal);
+        alpaka::wait(queue);
     }
     return quintupletsInCPU;
 }
diff --git a/SDL/Event.cuh b/SDL/Event.cuh
index 8d694888..db173f3e 100644
--- a/SDL/Event.cuh
+++ b/SDL/Event.cuh
@@ -49,6 +49,7 @@ namespace SDL
         struct triplets* tripletsInGPU;
         struct tripletsBuffer<Acc>* tripletsBuffers;
         struct quintuplets* quintupletsInGPU;
+        struct quintupletsBuffer<Acc>* quintupletsBuffers;
         struct trackCandidates* trackCandidatesInGPU;
         struct pixelTriplets* pixelTripletsInGPU;
         struct pixelQuintuplets* pixelQuintupletsInGPU;
@@ -62,7 +63,7 @@ namespace SDL
         trackCandidates* trackCandidatesInCPU;
         modules* modulesInCPU;
         modules* modulesInCPUFull;
-        quintuplets* quintupletsInCPU;
+        quintupletsBuffer<alpaka::DevCpu>* quintupletsInCPU;
         pixelTriplets* pixelTripletsInCPU;
         pixelQuintuplets* pixelQuintupletsInCPU;
 
@@ -139,7 +140,7 @@ namespace SDL
         miniDoublets* getMiniDoublets();
         segmentsBuffer<alpaka::DevCpu>* getSegments() ;
         tripletsBuffer<alpaka::DevCpu>* getTriplets();
-        quintuplets* getQuintuplets();
+        quintupletsBuffer<alpaka::DevCpu>* getQuintuplets();
         trackCandidates* getTrackCandidates();
         trackCandidates* getTrackCandidatesInCMSSW();
         pixelTriplets* getPixelTriplets();
diff --git a/SDL/Quintuplet.cu b/SDL/Quintuplet.cu
deleted file mode 100644
index 2819cf6d..00000000
--- a/SDL/Quintuplet.cu
+++ /dev/null
@@ -1,145 +0,0 @@
-# include "Quintuplet.cuh"
-
-SDL::quintuplets::quintuplets()
-{
-    tripletIndices = nullptr;
-    lowerModuleIndices = nullptr;
-    nQuintuplets = nullptr;
-    totOccupancyQuintuplets = nullptr;
-    innerRadius = nullptr;
-    outerRadius = nullptr;
-    regressionRadius = nullptr;
-    isDup = nullptr;
-    TightCutFlag = nullptr;
-    partOfPT5 = nullptr;
-    pt = nullptr;
-    layer = nullptr;
-    regressionG = nullptr;
-    regressionF = nullptr;
-    logicalLayers = nullptr;
-    hitIndices = nullptr;
-    bridgeRadius = nullptr;
-    chiSquared = nullptr;
-    rzChiSquared = nullptr;
-    nonAnchorChiSquared = nullptr;
-}
-
-SDL::quintuplets::~quintuplets()
-{
-}
-
-void SDL::quintuplets::freeMemoryCache()
-{
-    int dev;
-    cudaGetDevice(&dev);
-    cms::cuda::free_device(dev, tripletIndices);
-    cms::cuda::free_device(dev, lowerModuleIndices);
-    cms::cuda::free_device(dev, nQuintuplets);
-    cms::cuda::free_device(dev, totOccupancyQuintuplets);
-    cms::cuda::free_device(dev, innerRadius);
-    cms::cuda::free_device(dev, outerRadius);
-    cms::cuda::free_device(dev, partOfPT5);
-    cms::cuda::free_device(dev, isDup);
-    cms::cuda::free_device(dev, TightCutFlag);
-    cms::cuda::free_device(dev, pt);
-    cms::cuda::free_device(dev, layer);
-    cms::cuda::free_device(dev, regressionG);
-    cms::cuda::free_device(dev, regressionF);
-    cms::cuda::free_device(dev, regressionRadius);
-    cms::cuda::free_device(dev, logicalLayers);
-    cms::cuda::free_device(dev, hitIndices);
-    cms::cuda::free_device(dev, nMemoryLocations);
-    cms::cuda::free_device(dev, bridgeRadius);
-    cms::cuda::free_device(dev, rzChiSquared);
-    cms::cuda::free_device(dev, chiSquared);
-    cms::cuda::free_device(dev, nonAnchorChiSquared);
-}
-
-void SDL::quintuplets::freeMemory(cudaStream_t stream)
-{
-    cudaFree(tripletIndices);
-    cudaFree(lowerModuleIndices);
-    cudaFree(nQuintuplets);
-    cudaFree(totOccupancyQuintuplets);
-    cudaFree(innerRadius);
-    cudaFree(outerRadius);
-    cudaFree(regressionRadius);
-    cudaFree(partOfPT5);
-    cudaFree(isDup);
-    cudaFree(TightCutFlag);
-    cudaFree(pt);
-    cudaFree(layer);
-    cudaFree(regressionG);
-    cudaFree(regressionF);
-    cudaFree(logicalLayers);
-    cudaFree(hitIndices);
-    cudaFree(nMemoryLocations);
-    cudaFree(bridgeRadius);
-    cudaFree(rzChiSquared);
-    cudaFree(chiSquared);
-    cudaFree(nonAnchorChiSquared);
-    cudaStreamSynchronize(stream);
-}
-
-void SDL::createQuintupletsInExplicitMemory(struct SDL::quintuplets& quintupletsInGPU, const unsigned int& nTotalQuintuplets, const uint16_t& nLowerModules, const uint16_t& nEligibleModules,cudaStream_t stream)
-{
-#ifdef CACHE_ALLOC
-    int dev;
-    cudaGetDevice(&dev);
-    quintupletsInGPU.tripletIndices = (unsigned int*)cms::cuda::allocate_device(dev, 2 * nTotalQuintuplets * sizeof(unsigned int), stream);
-    quintupletsInGPU.lowerModuleIndices = (uint16_t*)cms::cuda::allocate_device(dev, 5 * nTotalQuintuplets * sizeof(uint16_t), stream);
-    quintupletsInGPU.nQuintuplets = (int*)cms::cuda::allocate_device(dev, nLowerModules * sizeof(int), stream);
-    quintupletsInGPU.totOccupancyQuintuplets = (int*)cms::cuda::allocate_device(dev, nLowerModules * sizeof(int), stream);
-    quintupletsInGPU.innerRadius = (FPX*)cms::cuda::allocate_device(dev, nTotalQuintuplets * sizeof(FPX), stream);
-    quintupletsInGPU.outerRadius = (FPX*)cms::cuda::allocate_device(dev, nTotalQuintuplets * sizeof(FPX), stream);
-    quintupletsInGPU.bridgeRadius = (FPX*)cms::cuda::allocate_device(dev, nTotalQuintuplets * sizeof(float), stream);
-
-    quintupletsInGPU.pt = (FPX*)cms::cuda::allocate_device(dev, nTotalQuintuplets *4* sizeof(FPX), stream);
-    quintupletsInGPU.layer = (uint8_t*)cms::cuda::allocate_device(dev, nTotalQuintuplets * sizeof(uint8_t), stream);
-    quintupletsInGPU.isDup = (bool*)cms::cuda::allocate_device(dev, nTotalQuintuplets * sizeof(bool), stream);
-    quintupletsInGPU.TightCutFlag = (bool*)cms::cuda::allocate_device(dev, nTotalQuintuplets * sizeof(bool), stream);
-    quintupletsInGPU.partOfPT5 = (bool*)cms::cuda::allocate_device(dev, nTotalQuintuplets * sizeof(bool), stream);
-    quintupletsInGPU.regressionRadius = (float*)cms::cuda::allocate_device(dev, nTotalQuintuplets * sizeof(float), stream);
-    quintupletsInGPU.regressionG = (float*)cms::cuda::allocate_device(dev, nTotalQuintuplets * sizeof(float), stream);
-    quintupletsInGPU.regressionF = (float*)cms::cuda::allocate_device(dev, nTotalQuintuplets * sizeof(float), stream);
-    quintupletsInGPU.logicalLayers = (uint8_t*)cms::cuda::allocate_device(dev, nTotalQuintuplets * sizeof(uint8_t) * 5, stream);
-    quintupletsInGPU.hitIndices = (unsigned int*)cms::cuda::allocate_device(dev, nTotalQuintuplets * sizeof(unsigned int) * 10, stream);
-    quintupletsInGPU.nMemoryLocations = (unsigned int*)cms::cuda::allocate_device(dev, sizeof(unsigned int), stream);
-
-    quintupletsInGPU.rzChiSquared = (float*)cms::cuda::allocate_device(dev, nTotalQuintuplets * sizeof(float), stream);
-    quintupletsInGPU.chiSquared = (float*)cms::cuda::allocate_device(dev, nTotalQuintuplets * sizeof(float), stream);
-    quintupletsInGPU.nonAnchorChiSquared = (float*)cms::cuda::allocate_device(dev, nTotalQuintuplets * sizeof(float), stream);
-#else
-    cudaMalloc(&quintupletsInGPU.tripletIndices, 2 * nTotalQuintuplets * sizeof(unsigned int));
-    cudaMalloc(&quintupletsInGPU.lowerModuleIndices, 5 * nTotalQuintuplets * sizeof(uint16_t));
-    cudaMalloc(&quintupletsInGPU.nQuintuplets, nLowerModules * sizeof(int));
-    cudaMalloc(&quintupletsInGPU.totOccupancyQuintuplets, nLowerModules * sizeof(int));
-    cudaMalloc(&quintupletsInGPU.innerRadius, nTotalQuintuplets * sizeof(FPX));
-    cudaMalloc(&quintupletsInGPU.outerRadius, nTotalQuintuplets * sizeof(FPX));
-    cudaMalloc(&quintupletsInGPU.pt, nTotalQuintuplets *4* sizeof(FPX));
-    cudaMalloc(&quintupletsInGPU.isDup, nTotalQuintuplets * sizeof(bool));
-    cudaMalloc(&quintupletsInGPU.TightCutFlag, nTotalQuintuplets * sizeof(bool));
-    cudaMalloc(&quintupletsInGPU.partOfPT5, nTotalQuintuplets * sizeof(bool));
-    cudaMalloc(&quintupletsInGPU.layer, nTotalQuintuplets * sizeof(uint8_t));
-    cudaMalloc(&quintupletsInGPU.regressionRadius, nTotalQuintuplets * sizeof(float));
-    cudaMalloc(&quintupletsInGPU.regressionG, nTotalQuintuplets * sizeof(float));
-    cudaMalloc(&quintupletsInGPU.regressionF, nTotalQuintuplets * sizeof(float));
-    cudaMalloc(&quintupletsInGPU.logicalLayers, nTotalQuintuplets * 5 * sizeof(uint8_t));
-    cudaMalloc(&quintupletsInGPU.hitIndices, nTotalQuintuplets * 10 * sizeof(unsigned int));
-    cudaMalloc(&quintupletsInGPU.nMemoryLocations, sizeof(unsigned int));
-    cudaMalloc(&quintupletsInGPU.bridgeRadius, nTotalQuintuplets * sizeof(float));
-    cudaMalloc(&quintupletsInGPU.rzChiSquared, nTotalQuintuplets * sizeof(float));
-    cudaMalloc(&quintupletsInGPU.chiSquared, nTotalQuintuplets * sizeof(float));
-    cudaMalloc(&quintupletsInGPU.nonAnchorChiSquared, nTotalQuintuplets * sizeof(float));
-    cudaMalloc(&quintupletsInGPU.nMemoryLocations, sizeof(unsigned int));
-#endif
-    cudaMemsetAsync(quintupletsInGPU.nQuintuplets,0,nLowerModules * sizeof(int),stream);
-    cudaMemsetAsync(quintupletsInGPU.totOccupancyQuintuplets,0,nLowerModules * sizeof(int),stream);
-    cudaMemsetAsync(quintupletsInGPU.isDup,0,nTotalQuintuplets * sizeof(bool),stream);
-    cudaMemsetAsync(quintupletsInGPU.TightCutFlag,0,nTotalQuintuplets * sizeof(bool),stream);
-    cudaMemsetAsync(quintupletsInGPU.partOfPT5,0,nTotalQuintuplets * sizeof(bool),stream);
-    cudaStreamSynchronize(stream);
-    quintupletsInGPU.eta = quintupletsInGPU.pt + nTotalQuintuplets;
-    quintupletsInGPU.phi = quintupletsInGPU.pt + 2*nTotalQuintuplets;
-    quintupletsInGPU.score_rphisum = quintupletsInGPU.pt + 3*nTotalQuintuplets;
-}
\ No newline at end of file
diff --git a/SDL/Quintuplet.cuh b/SDL/Quintuplet.cuh
index 0f408a30..3a8aa7e6 100644
--- a/SDL/Quintuplet.cuh
+++ b/SDL/Quintuplet.cuh
@@ -41,13 +41,105 @@ namespace SDL
         float* chiSquared;
         float* nonAnchorChiSquared;
 
-        quintuplets();
-        ~quintuplets();
-        void freeMemory(cudaStream_t stream);
-        void freeMemoryCache();
+        template<typename TBuff> // TBuff must expose one <member>_buf buffer for every pointer assigned below.
+        void setData(TBuff& quintupletsbuf) // Points each raw member pointer at the native storage of the matching buffer in quintupletsbuf.
+        { // Only pointers are assigned here; no element data is copied.
+            tripletIndices = alpaka::getPtrNative(quintupletsbuf.tripletIndices_buf);
+            lowerModuleIndices = alpaka::getPtrNative(quintupletsbuf.lowerModuleIndices_buf);
+            nQuintuplets = alpaka::getPtrNative(quintupletsbuf.nQuintuplets_buf);
+            totOccupancyQuintuplets = alpaka::getPtrNative(quintupletsbuf.totOccupancyQuintuplets_buf);
+            nMemoryLocations = alpaka::getPtrNative(quintupletsbuf.nMemoryLocations_buf);
+            innerRadius = alpaka::getPtrNative(quintupletsbuf.innerRadius_buf);
+            bridgeRadius = alpaka::getPtrNative(quintupletsbuf.bridgeRadius_buf);
+            outerRadius = alpaka::getPtrNative(quintupletsbuf.outerRadius_buf);
+            pt = alpaka::getPtrNative(quintupletsbuf.pt_buf);
+            eta = alpaka::getPtrNative(quintupletsbuf.eta_buf);
+            phi = alpaka::getPtrNative(quintupletsbuf.phi_buf);
+            score_rphisum = alpaka::getPtrNative(quintupletsbuf.score_rphisum_buf);
+            layer = alpaka::getPtrNative(quintupletsbuf.layer_buf);
+            isDup = alpaka::getPtrNative(quintupletsbuf.isDup_buf);
+            TightCutFlag = alpaka::getPtrNative(quintupletsbuf.TightCutFlag_buf);
+            partOfPT5 = alpaka::getPtrNative(quintupletsbuf.partOfPT5_buf);
+            regressionRadius = alpaka::getPtrNative(quintupletsbuf.regressionRadius_buf);
+            regressionG = alpaka::getPtrNative(quintupletsbuf.regressionG_buf);
+            regressionF = alpaka::getPtrNative(quintupletsbuf.regressionF_buf);
+            logicalLayers = alpaka::getPtrNative(quintupletsbuf.logicalLayers_buf);
+            hitIndices = alpaka::getPtrNative(quintupletsbuf.hitIndices_buf);
+            rzChiSquared = alpaka::getPtrNative(quintupletsbuf.rzChiSquared_buf);
+            chiSquared = alpaka::getPtrNative(quintupletsbuf.chiSquared_buf);
+            nonAnchorChiSquared = alpaka::getPtrNative(quintupletsbuf.nonAnchorChiSquared_buf);
+        }
     };
 
-    void createQuintupletsInExplicitMemory(struct SDL::quintuplets& quintupletsInGPU, const unsigned int& maxQuintuplets, const uint16_t& nLowerModules, const uint16_t& nEligibleModules,cudaStream_t stream);
+    template<typename TAcc> // TAcc is forwarded as the first template argument of every Buf<> member.
+    struct quintupletsBuffer : quintuplets // Holds one buffer per pointer member that quintuplets::setData() assigns.
+    {
+        Buf<TAcc, unsigned int> tripletIndices_buf;
+        Buf<TAcc, uint16_t> lowerModuleIndices_buf;
+        Buf<TAcc, int> nQuintuplets_buf;
+        Buf<TAcc, int> totOccupancyQuintuplets_buf;
+        Buf<TAcc, unsigned int> nMemoryLocations_buf; // Single element; not written by the constructor below.
+
+        Buf<TAcc, FPX> innerRadius_buf;
+        Buf<TAcc, FPX> bridgeRadius_buf;
+        Buf<TAcc, FPX> outerRadius_buf;
+        Buf<TAcc, FPX> pt_buf;
+        Buf<TAcc, FPX> eta_buf;
+        Buf<TAcc, FPX> phi_buf;
+        Buf<TAcc, FPX> score_rphisum_buf;
+        Buf<TAcc, uint8_t> layer_buf;
+        Buf<TAcc, bool> isDup_buf;
+        Buf<TAcc, bool> TightCutFlag_buf;
+        Buf<TAcc, bool> partOfPT5_buf;
+
+        Buf<TAcc, float> regressionRadius_buf;
+        Buf<TAcc, float> regressionG_buf;
+        Buf<TAcc, float> regressionF_buf;
+
+        Buf<TAcc, uint8_t> logicalLayers_buf;
+        Buf<TAcc, unsigned int> hitIndices_buf;
+        Buf<TAcc, float> rzChiSquared_buf;
+        Buf<TAcc, float> chiSquared_buf;
+        Buf<TAcc, float> nonAnchorChiSquared_buf;
+
+        template<typename TQueue, typename TDevAcc> // Constructor: allocates every buffer on devAccIn, then zero-fills the counters and flag buffers via queue.
+        quintupletsBuffer(unsigned int nTotalQuintuplets,
+                          unsigned int nLowerModules,
+                          TDevAcc const & devAccIn,
+                          TQueue& queue) :
+            tripletIndices_buf(allocBufWrapper<unsigned int>(devAccIn, 2 * nTotalQuintuplets)), // 2 entries per quintuplet
+            lowerModuleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, 5 * nTotalQuintuplets)), // 5 entries per quintuplet
+            nQuintuplets_buf(allocBufWrapper<int>(devAccIn, nLowerModules)), // 1 entry per lower module
+            totOccupancyQuintuplets_buf(allocBufWrapper<int>(devAccIn, nLowerModules)), // 1 entry per lower module
+            nMemoryLocations_buf(allocBufWrapper<unsigned int>(devAccIn, 1)),
+            innerRadius_buf(allocBufWrapper<FPX>(devAccIn, nTotalQuintuplets)),
+            bridgeRadius_buf(allocBufWrapper<FPX>(devAccIn, nTotalQuintuplets)),
+            outerRadius_buf(allocBufWrapper<FPX>(devAccIn, nTotalQuintuplets)),
+            pt_buf(allocBufWrapper<FPX>(devAccIn, nTotalQuintuplets)),
+            eta_buf(allocBufWrapper<FPX>(devAccIn, nTotalQuintuplets)),
+            phi_buf(allocBufWrapper<FPX>(devAccIn, nTotalQuintuplets)),
+            score_rphisum_buf(allocBufWrapper<FPX>(devAccIn, nTotalQuintuplets)),
+            layer_buf(allocBufWrapper<uint8_t>(devAccIn, nTotalQuintuplets)),
+            isDup_buf(allocBufWrapper<bool>(devAccIn, nTotalQuintuplets)),
+            TightCutFlag_buf(allocBufWrapper<bool>(devAccIn, nTotalQuintuplets)),
+            partOfPT5_buf(allocBufWrapper<bool>(devAccIn, nTotalQuintuplets)),
+            regressionRadius_buf(allocBufWrapper<float>(devAccIn, nTotalQuintuplets)),
+            regressionG_buf(allocBufWrapper<float>(devAccIn, nTotalQuintuplets)),
+            regressionF_buf(allocBufWrapper<float>(devAccIn, nTotalQuintuplets)),
+            logicalLayers_buf(allocBufWrapper<uint8_t>(devAccIn, 5 * nTotalQuintuplets)), // 5 entries per quintuplet
+            hitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, 10 * nTotalQuintuplets)), // 10 entries per quintuplet
+            rzChiSquared_buf(allocBufWrapper<float>(devAccIn, nTotalQuintuplets)),
+            chiSquared_buf(allocBufWrapper<float>(devAccIn, nTotalQuintuplets)),
+            nonAnchorChiSquared_buf(allocBufWrapper<float>(devAccIn, nTotalQuintuplets))
+        { // Only the five buffers below are memset; the remaining buffers are not cleared in this constructor.
+            alpaka::memset(queue, nQuintuplets_buf, 0, nLowerModules);
+            alpaka::memset(queue, totOccupancyQuintuplets_buf, 0, nLowerModules);
+            alpaka::memset(queue, isDup_buf, 0, nTotalQuintuplets);
+            alpaka::memset(queue, TightCutFlag_buf, 0, nTotalQuintuplets);
+            alpaka::memset(queue, partOfPT5_buf, 0, nTotalQuintuplets);
+            alpaka::wait(queue); // Wait for the queued memsets to finish before the constructor returns.
+        }
+    };
 
     ALPAKA_FN_ACC ALPAKA_FN_INLINE bool checkIntervalOverlap(const float& firstMin, const float& firstMax, const float& secondMin, const float& secondMax)
     {
diff --git a/code/core/AccessHelper.cc b/code/core/AccessHelper.cc
index 763200ea..5e95ed21 100644
--- a/code/core/AccessHelper.cc
+++ b/code/core/AccessHelper.cc
@@ -162,7 +162,7 @@ std::tuple<std::vector<unsigned int>, std::vector<unsigned int>> getHitIdxsAndHi
 //____________________________________________________________________________________________
 std::vector<unsigned int> getT3sFromT5(SDL::Event* event, unsigned int T5)
 {
-    SDL::quintuplets& quintuplets_ = *(event->getQuintuplets());
+    SDL::quintupletsBuffer<alpaka::DevCpu>& quintuplets_ = *(event->getQuintuplets());
     unsigned int T3_1 = quintuplets_.tripletIndices[2 * T5];
     unsigned int T3_2 = quintuplets_.tripletIndices[2 * T5 + 1];
     return {T3_1, T3_2};
diff --git a/code/core/write_sdl_ntuple.cc b/code/core/write_sdl_ntuple.cc
index e1284b34..abadd16b 100644
--- a/code/core/write_sdl_ntuple.cc
+++ b/code/core/write_sdl_ntuple.cc
@@ -306,7 +306,7 @@ void setPixelQuintupletOutputBranches(SDL::Event* event)
 {
     // ============ pT5 =============
     SDL::pixelQuintuplets& pixelQuintupletsInGPU = (*event->getPixelQuintuplets());
-    SDL::quintuplets& quintupletsInGPU = (*event->getQuintuplets());
+    SDL::quintupletsBuffer<alpaka::DevCpu>& quintupletsInGPU = (*event->getQuintuplets());
     SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::modules& modulesInGPU = (*event->getModules());
     int n_accepted_simtrk = ana.tx->getBranch<vector<int>>("sim_TC_matched").size();
@@ -391,7 +391,7 @@ void setPixelQuintupletOutputBranches(SDL::Event* event)
 //________________________________________________________________________________________________________________________________
 void setQuintupletOutputBranches(SDL::Event* event)
 {
-    SDL::quintuplets& quintupletsInGPU = (*event->getQuintuplets());
+    SDL::quintupletsBuffer<alpaka::DevCpu>& quintupletsInGPU = (*event->getQuintuplets());
     SDL::objectRangesBuffer<alpaka::DevCpu>& rangesInGPU = (*event->getRanges());
     SDL::modules& modulesInGPU = (*event->getModules());
     const float kRinv1GeVf = (2.99792458e-3 * 3.8);
@@ -409,7 +409,7 @@ void setQuintupletOutputBranches(SDL::Event* event)
             float pt = quintupletsInGPU.innerRadius[quintupletIndex] * kRinv1GeVf;
             float eta = __H2F(quintupletsInGPU.eta[quintupletIndex]);
             float phi = __H2F(quintupletsInGPU.phi[quintupletIndex]);
-            
+
             std::vector<unsigned int> hit_idx = getHitIdxsFromT5(event, quintupletIndex);
             std::vector<unsigned int> hit_type = getHitTypesFromT5(event, quintupletIndex);
             std::vector<unsigned int> module_idx = getModuleIdxsFromT5(event, quintupletIndex);

From dbadfd9a79beafef0c6fbc1ba0e469caaa45f540 Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Thu, 8 Jun 2023 12:51:51 -0400
Subject: [PATCH 22/44] move trackcans to Alpaka memory

---
 SDL/Event.cu                  |  95 +++++++++++---------------
 SDL/Event.cuh                 |   7 +-
 SDL/LST.cc                    |   2 +-
 SDL/TrackCandidate.cu         | 123 ----------------------------------
 SDL/TrackCandidate.cuh        |  79 ++++++++++++++++++++--
 code/core/AccessHelper.cc     |   4 +-
 code/core/write_sdl_ntuple.cc |  16 ++---
 7 files changed, 128 insertions(+), 198 deletions(-)
 delete mode 100644 SDL/TrackCandidate.cu

diff --git a/SDL/Event.cu b/SDL/Event.cu
index 65774128..5a325624 100644
--- a/SDL/Event.cu
+++ b/SDL/Event.cu
@@ -62,18 +62,16 @@ SDL::Event::~Event()
     if(mdsInGPU){mdsInGPU->freeMemoryCache();}
     if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemoryCache();}
     if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemoryCache();}
-    if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemoryCache();}
 #else
     if(mdsInGPU){mdsInGPU->freeMemory(stream);}
     if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemory(stream);}
     if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemory(stream);}
-    if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemory(stream);}
 #endif
     if(rangesInGPU != nullptr){delete rangesInGPU; delete rangesBuffers;}
     if(mdsInGPU != nullptr){cms::cuda::free_host(mdsInGPU);}
     if(segmentsInGPU != nullptr){delete segmentsInGPU; delete segmentsBuffers;}
     if(tripletsInGPU!= nullptr){delete tripletsInGPU; delete tripletsBuffers;}
-    if(trackCandidatesInGPU!= nullptr){cms::cuda::free_host(trackCandidatesInGPU);}
+    if(trackCandidatesInGPU!= nullptr){delete trackCandidatesInGPU; delete trackCandidatesBuffers;}
     if(hitsInGPU!= nullptr){delete hitsInGPU; delete hitsBuffers;}
     if(pixelTripletsInGPU!= nullptr){cms::cuda::free_host(pixelTripletsInGPU);}
     if(pixelQuintupletsInGPU!= nullptr){cms::cuda::free_host(pixelQuintupletsInGPU);}
@@ -141,11 +139,6 @@ SDL::Event::~Event()
 
     if(trackCandidatesInCPU != nullptr)
     {
-        delete[] trackCandidatesInCPU->objectIndices;
-        delete[] trackCandidatesInCPU->trackCandidateType;
-        delete[] trackCandidatesInCPU->nTrackCandidates;
-        delete[] trackCandidatesInCPU->hitIndices;
-        delete[] trackCandidatesInCPU->logicalLayers;
         delete trackCandidatesInCPU;
     }
 
@@ -198,12 +191,10 @@ void SDL::Event::resetEvent()
     if(mdsInGPU){mdsInGPU->freeMemoryCache();}
     if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemoryCache();}
     if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemoryCache();}
-    if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemoryCache();}
 #else
     if(mdsInGPU){mdsInGPU->freeMemory(stream);}
     if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemory(stream);}
     if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemory(stream);}
-    if(trackCandidatesInGPU){trackCandidatesInGPU->freeMemory(stream);}
 #endif
     //reset the arrays
     for(int i = 0; i<6; i++)
@@ -236,7 +227,7 @@ void SDL::Event::resetEvent()
       tripletsInGPU = nullptr;}
     if(quintupletsInGPU){delete quintupletsInGPU; delete quintupletsBuffers;
       quintupletsInGPU = nullptr;}
-    if(trackCandidatesInGPU){cms::cuda::free_host(trackCandidatesInGPU);
+    if(trackCandidatesInGPU){delete trackCandidatesInGPU; delete trackCandidatesBuffers;
       trackCandidatesInGPU = nullptr;}
     if(pixelTripletsInGPU){cms::cuda::free_host(pixelTripletsInGPU);
       pixelTripletsInGPU = nullptr;}
@@ -306,12 +297,6 @@ void SDL::Event::resetEvent()
     }
     if(trackCandidatesInCPU != nullptr)
     {
-        delete[] trackCandidatesInCPU->objectIndices;
-        delete[] trackCandidatesInCPU->trackCandidateType;
-        delete[] trackCandidatesInCPU->nTrackCandidates;
-        delete[] trackCandidatesInCPU->logicalLayers;
-        delete[] trackCandidatesInCPU->hitIndices;
-        delete[] trackCandidatesInCPU->lowerModuleIndices;
         delete trackCandidatesInCPU;
         trackCandidatesInCPU = nullptr;
     }
@@ -796,7 +781,6 @@ void SDL::Event::createSegmentsWithModuleMap()
     }
 }
 
-
 void SDL::Event::createTriplets()
 {
     if(tripletsInGPU == nullptr)
@@ -908,8 +892,9 @@ void SDL::Event::createTrackCandidates()
     cudaMemcpyAsync(&nEligibleModules,rangesInGPU->nEligibleT5Modules,sizeof(uint16_t),cudaMemcpyDeviceToHost,stream);
     if(trackCandidatesInGPU == nullptr)
     {
-        trackCandidatesInGPU = (SDL::trackCandidates*)cms::cuda::allocate_host(sizeof(SDL::trackCandidates), stream);
-        createTrackCandidatesInExplicitMemory(*trackCandidatesInGPU, N_MAX_TRACK_CANDIDATES + N_MAX_PIXEL_TRACK_CANDIDATES,stream);
+        trackCandidatesInGPU = new SDL::trackCandidates();
+        trackCandidatesBuffers = new SDL::trackCandidatesBuffer<Acc>(N_MAX_TRACK_CANDIDATES + N_MAX_PIXEL_TRACK_CANDIDATES, devAcc, queue);
+        trackCandidatesInGPU->setData(*trackCandidatesBuffers);
     }
 
     Vec const threadsPerBlock_crossCleanpT3(static_cast<Idx>(1), static_cast<Idx>(16), static_cast<Idx>(64));
@@ -1288,8 +1273,9 @@ void SDL::Event::createPixelQuintuplets()
     }
     if(trackCandidatesInGPU == nullptr)
     {
-        trackCandidatesInGPU = (SDL::trackCandidates*)cms::cuda::allocate_host(sizeof(SDL::trackCandidates), stream);
-        createTrackCandidatesInExplicitMemory(*trackCandidatesInGPU, N_MAX_TRACK_CANDIDATES + N_MAX_PIXEL_TRACK_CANDIDATES,stream);
+        trackCandidatesInGPU = new SDL::trackCandidates();
+        trackCandidatesBuffers = new SDL::trackCandidatesBuffer<Acc>(N_MAX_TRACK_CANDIDATES + N_MAX_PIXEL_TRACK_CANDIDATES, devAcc, queue);
+        trackCandidatesInGPU->setData(*trackCandidatesBuffers);
     } 
 
     unsigned int pixelModuleIndex;
@@ -1997,52 +1983,49 @@ SDL::pixelQuintuplets* SDL::Event::getPixelQuintuplets()
     return pixelQuintupletsInCPU;
 }
 
-SDL::trackCandidates* SDL::Event::getTrackCandidates()
+SDL::trackCandidatesBuffer<alpaka::DevCpu>* SDL::Event::getTrackCandidates()
 {
     if(trackCandidatesInCPU == nullptr)
     {
-        trackCandidatesInCPU = new SDL::trackCandidates;
-        trackCandidatesInCPU->nTrackCandidates = new int;
-        cudaMemcpyAsync(trackCandidatesInCPU->nTrackCandidates, trackCandidatesInGPU->nTrackCandidates, sizeof(int), cudaMemcpyDeviceToHost,stream);
-        cudaStreamSynchronize(stream);
-        int nTrackCandidates = *(trackCandidatesInCPU->nTrackCandidates);
-
-        trackCandidatesInCPU->directObjectIndices = new unsigned int[nTrackCandidates];
-        trackCandidatesInCPU->objectIndices = new unsigned int[2 * nTrackCandidates];
-        trackCandidatesInCPU->trackCandidateType = new short[nTrackCandidates];
-        trackCandidatesInCPU->hitIndices = new unsigned int[14 * nTrackCandidates];
-        trackCandidatesInCPU->pixelSeedIndex = new int[nTrackCandidates];
-        trackCandidatesInCPU->logicalLayers = new uint8_t[7 * nTrackCandidates];
-
-        cudaMemcpyAsync(trackCandidatesInCPU->hitIndices, trackCandidatesInGPU->hitIndices, 14 * nTrackCandidates * sizeof(unsigned int), cudaMemcpyDeviceToHost, stream);
-        cudaMemcpyAsync(trackCandidatesInCPU->pixelSeedIndex, trackCandidatesInGPU->pixelSeedIndex, nTrackCandidates * sizeof(int), cudaMemcpyDeviceToHost, stream);
-        cudaMemcpyAsync(trackCandidatesInCPU->logicalLayers, trackCandidatesInGPU->logicalLayers, 7 * nTrackCandidates * sizeof(uint8_t), cudaMemcpyDeviceToHost, stream);
-        cudaMemcpyAsync(trackCandidatesInCPU->directObjectIndices, trackCandidatesInGPU->directObjectIndices, nTrackCandidates * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream);                                                                                    
-        cudaMemcpyAsync(trackCandidatesInCPU->objectIndices, trackCandidatesInGPU->objectIndices, 2 * nTrackCandidates * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream);                                                                                    
-        cudaMemcpyAsync(trackCandidatesInCPU->trackCandidateType, trackCandidatesInGPU->trackCandidateType, nTrackCandidates * sizeof(short), cudaMemcpyDeviceToHost,stream);                                                                                                                
-        cudaStreamSynchronize(stream);
+        // First call only (later calls return the cached pointer): fetch the candidate count from trackCandidatesBuffers so that only the filled entries are copied to the host.
+        auto nTrackLocal_buf = allocBufWrapper<int>(devHost, 1);
+        alpaka::memcpy(queue, nTrackLocal_buf, trackCandidatesBuffers->nTrackCandidates_buf, 1);
+        alpaka::wait(queue);
+        // Read the count, then allocate the host buffers at full capacity (N_MAX_TRACK_CANDIDATES + N_MAX_PIXEL_TRACK_CANDIDATES).
+        int nTrackLocal = *alpaka::getPtrNative(nTrackLocal_buf);
+        trackCandidatesInCPU = new SDL::trackCandidatesBuffer<alpaka::DevCpu>(N_MAX_TRACK_CANDIDATES + N_MAX_PIXEL_TRACK_CANDIDATES, devHost, queue);
+        trackCandidatesInCPU->setData(*trackCandidatesInCPU); // point the inherited raw-pointer members at this object's own buffers
+        // Store the count on the host and copy the first nTrackLocal candidates of each array below (14 hit indices, 7 logical layers and 2 object indices per candidate).
+        *alpaka::getPtrNative(trackCandidatesInCPU->nTrackCandidates_buf) = nTrackLocal;
+        alpaka::memcpy(queue, trackCandidatesInCPU->hitIndices_buf, trackCandidatesBuffers->hitIndices_buf, 14 * nTrackLocal);
+        alpaka::memcpy(queue, trackCandidatesInCPU->pixelSeedIndex_buf, trackCandidatesBuffers->pixelSeedIndex_buf, nTrackLocal);
+        alpaka::memcpy(queue, trackCandidatesInCPU->logicalLayers_buf, trackCandidatesBuffers->logicalLayers_buf, 7 * nTrackLocal);
+        alpaka::memcpy(queue, trackCandidatesInCPU->directObjectIndices_buf, trackCandidatesBuffers->directObjectIndices_buf, nTrackLocal);
+        alpaka::memcpy(queue, trackCandidatesInCPU->objectIndices_buf, trackCandidatesBuffers->objectIndices_buf, 2 * nTrackLocal);
+        alpaka::memcpy(queue, trackCandidatesInCPU->trackCandidateType_buf, trackCandidatesBuffers->trackCandidateType_buf, nTrackLocal);
+        alpaka::wait(queue); // lowerModuleIndices, centerX/centerY/radius and the pT3/pT5/pLS/T5 counters are not copied by this getter
     }
     return trackCandidatesInCPU;
 }
 
-SDL::trackCandidates* SDL::Event::getTrackCandidatesInCMSSW()
+SDL::trackCandidatesBuffer<alpaka::DevCpu>* SDL::Event::getTrackCandidatesInCMSSW()
 {
     if(trackCandidatesInCPU == nullptr)
     {
-        trackCandidatesInCPU = new SDL::trackCandidates;
-        trackCandidatesInCPU->nTrackCandidates = new int;
-        cudaMemcpyAsync(trackCandidatesInCPU->nTrackCandidates, trackCandidatesInGPU->nTrackCandidates, sizeof(unsigned int), cudaMemcpyDeviceToHost,stream);
-        cudaStreamSynchronize(stream);
-        unsigned int nTrackCandidates = *(trackCandidatesInCPU->nTrackCandidates);
+        // First call only (later calls return the cached pointer): fetch the candidate count from trackCandidatesBuffers so that only the filled entries are copied to the host.
+        auto nTrackLocal_buf = allocBufWrapper<int>(devHost, 1);
+        alpaka::memcpy(queue, nTrackLocal_buf, trackCandidatesBuffers->nTrackCandidates_buf, 1);
+        alpaka::wait(queue);
 
-        trackCandidatesInCPU->trackCandidateType = new short[nTrackCandidates];
-        trackCandidatesInCPU->hitIndices = new unsigned int[14 * nTrackCandidates];
-        trackCandidatesInCPU->pixelSeedIndex = new int[nTrackCandidates];
+        int nTrackLocal = *alpaka::getPtrNative(nTrackLocal_buf);
+        trackCandidatesInCPU = new SDL::trackCandidatesBuffer<alpaka::DevCpu>(N_MAX_TRACK_CANDIDATES + N_MAX_PIXEL_TRACK_CANDIDATES, devHost, queue); // host buffers are allocated at full capacity
+        trackCandidatesInCPU->setData(*trackCandidatesInCPU); // point the inherited raw-pointer members at this object's own buffers
 
-        cudaMemcpyAsync(trackCandidatesInCPU->hitIndices, trackCandidatesInGPU->hitIndices, 14 * nTrackCandidates * sizeof(unsigned int), cudaMemcpyDeviceToHost, stream);
-        cudaMemcpyAsync(trackCandidatesInCPU->pixelSeedIndex, trackCandidatesInGPU->pixelSeedIndex, nTrackCandidates * sizeof(int), cudaMemcpyDeviceToHost, stream);
-        cudaMemcpyAsync(trackCandidatesInCPU->trackCandidateType, trackCandidatesInGPU->trackCandidateType, nTrackCandidates * sizeof(short), cudaMemcpyDeviceToHost,stream);
-        cudaStreamSynchronize(stream);
+        *alpaka::getPtrNative(trackCandidatesInCPU->nTrackCandidates_buf) = nTrackLocal; // store the count in the host buffer
+        alpaka::memcpy(queue, trackCandidatesInCPU->hitIndices_buf, trackCandidatesBuffers->hitIndices_buf, 14 * nTrackLocal);
+        alpaka::memcpy(queue, trackCandidatesInCPU->pixelSeedIndex_buf, trackCandidatesBuffers->pixelSeedIndex_buf, nTrackLocal);
+        alpaka::memcpy(queue, trackCandidatesInCPU->trackCandidateType_buf, trackCandidatesBuffers->trackCandidateType_buf, nTrackLocal);
+        alpaka::wait(queue); // NOTE(review): only hitIndices, pixelSeedIndex and trackCandidateType are copied, and this cache is shared with getTrackCandidates() - confirm callers never mix the two getters
     }
     return trackCandidatesInCPU;
 }
diff --git a/SDL/Event.cuh b/SDL/Event.cuh
index db173f3e..bbdd93f1 100644
--- a/SDL/Event.cuh
+++ b/SDL/Event.cuh
@@ -51,6 +51,7 @@ namespace SDL
         struct quintuplets* quintupletsInGPU;
         struct quintupletsBuffer<Acc>* quintupletsBuffers;
         struct trackCandidates* trackCandidatesInGPU;
+        struct trackCandidatesBuffer<Acc>* trackCandidatesBuffers;
         struct pixelTriplets* pixelTripletsInGPU;
         struct pixelQuintuplets* pixelQuintupletsInGPU;
 
@@ -60,7 +61,7 @@ namespace SDL
         miniDoublets* mdsInCPU;
         segmentsBuffer<alpaka::DevCpu>* segmentsInCPU;
         tripletsBuffer<alpaka::DevCpu>* tripletsInCPU;
-        trackCandidates* trackCandidatesInCPU;
+        trackCandidatesBuffer<alpaka::DevCpu>* trackCandidatesInCPU;
         modules* modulesInCPU;
         modules* modulesInCPUFull;
         quintupletsBuffer<alpaka::DevCpu>* quintupletsInCPU;
@@ -141,8 +142,8 @@ namespace SDL
         segmentsBuffer<alpaka::DevCpu>* getSegments() ;
         tripletsBuffer<alpaka::DevCpu>* getTriplets();
         quintupletsBuffer<alpaka::DevCpu>* getQuintuplets();
-        trackCandidates* getTrackCandidates();
-        trackCandidates* getTrackCandidatesInCMSSW();
+        trackCandidatesBuffer<alpaka::DevCpu>* getTrackCandidates();
+        trackCandidatesBuffer<alpaka::DevCpu>* getTrackCandidatesInCMSSW();
         pixelTriplets* getPixelTriplets();
         modules* getModules();
         modules* getFullModules();
diff --git a/SDL/LST.cc b/SDL/LST.cc
index a46a6167..9f9930c3 100644
--- a/SDL/LST.cc
+++ b/SDL/LST.cc
@@ -404,7 +404,7 @@ void SDL::LST::getOutput(SDL::Event& event) {
     std::vector<short> tc_trackCandidateType_;
 
     SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event.getHitsInCMSSW());
-    SDL::trackCandidates& trackCandidatesInGPU = (*event.getTrackCandidatesInCMSSW());
+    SDL::trackCandidatesBuffer<alpaka::DevCpu>& trackCandidatesInGPU = (*event.getTrackCandidatesInCMSSW());
 
     unsigned int nTrackCandidates = *trackCandidatesInGPU.nTrackCandidates;
     for (unsigned int idx = 0; idx < nTrackCandidates; idx++) {
diff --git a/SDL/TrackCandidate.cu b/SDL/TrackCandidate.cu
deleted file mode 100644
index 7853de30..00000000
--- a/SDL/TrackCandidate.cu
+++ /dev/null
@@ -1,123 +0,0 @@
-#include "TrackCandidate.cuh"
-
-void SDL::createTrackCandidatesInExplicitMemory(struct trackCandidates& trackCandidatesInGPU, unsigned int maxTrackCandidates,cudaStream_t stream)
-{
-#ifdef CACHE_ALLOC
-    int dev;
-    cudaGetDevice(&dev);
-    trackCandidatesInGPU.trackCandidateType = (short*)cms::cuda::allocate_device(dev,maxTrackCandidates * sizeof(short),stream);
-    trackCandidatesInGPU.directObjectIndices = (unsigned int*)cms::cuda::allocate_device(dev,maxTrackCandidates * sizeof(unsigned int),stream);
-    trackCandidatesInGPU.objectIndices = (unsigned int*)cms::cuda::allocate_device(dev,maxTrackCandidates * 2*sizeof(unsigned int),stream);
-    trackCandidatesInGPU.nTrackCandidates= (int*)cms::cuda::allocate_device(dev, sizeof(int),stream);
-    trackCandidatesInGPU.nTrackCandidatespT3= (int*)cms::cuda::allocate_device(dev, sizeof(int),stream);
-    trackCandidatesInGPU.nTrackCandidatesT5= (int*)cms::cuda::allocate_device(dev, sizeof(int),stream);
-    trackCandidatesInGPU.nTrackCandidatespT5= (int*)cms::cuda::allocate_device(dev, sizeof(int),stream);
-    trackCandidatesInGPU.nTrackCandidatespLS= (int*)cms::cuda::allocate_device(dev, sizeof(int),stream);
-
-    trackCandidatesInGPU.logicalLayers = (uint8_t*)cms::cuda::allocate_device(dev, 7 * maxTrackCandidates * sizeof(uint8_t), stream);
-    trackCandidatesInGPU.lowerModuleIndices = (uint16_t*)cms::cuda::allocate_device(dev, 7 * maxTrackCandidates * sizeof(uint16_t), stream);
-    trackCandidatesInGPU.hitIndices = (unsigned int*)cms::cuda::allocate_device(dev, 14 * maxTrackCandidates * sizeof(unsigned int), stream);
-    trackCandidatesInGPU.pixelSeedIndex = (int*)cms::cuda::allocate_device(dev, maxTrackCandidates * sizeof(int), stream);
-    trackCandidatesInGPU.centerX = (FPX*)cms::cuda::allocate_device(dev, maxTrackCandidates * sizeof(FPX), stream);
-    trackCandidatesInGPU.centerY = (FPX*)cms::cuda::allocate_device(dev, maxTrackCandidates * sizeof(FPX), stream);
-    trackCandidatesInGPU.radius  = (FPX*)cms::cuda::allocate_device(dev, maxTrackCandidates * sizeof(FPX), stream);
-
-#else
-    cudaMalloc(&trackCandidatesInGPU.trackCandidateType, maxTrackCandidates * sizeof(short));
-    cudaMalloc(&trackCandidatesInGPU.directObjectIndices, maxTrackCandidates * sizeof(unsigned int));
-    cudaMalloc(&trackCandidatesInGPU.objectIndices, 2 * maxTrackCandidates * sizeof(unsigned int));
-    cudaMalloc(&trackCandidatesInGPU.nTrackCandidates, sizeof(int));
-    cudaMalloc(&trackCandidatesInGPU.nTrackCandidatespT3, sizeof(int));
-    cudaMalloc(&trackCandidatesInGPU.nTrackCandidatesT5, sizeof(int));
-    cudaMalloc(&trackCandidatesInGPU.nTrackCandidatespT5, sizeof(int));
-    cudaMalloc(&trackCandidatesInGPU.nTrackCandidatespLS, sizeof(int));
-
-    cudaMalloc(&trackCandidatesInGPU.logicalLayers, 7 * maxTrackCandidates * sizeof(uint8_t));
-    cudaMalloc(&trackCandidatesInGPU.lowerModuleIndices, 7 * maxTrackCandidates * sizeof(uint16_t));
-    cudaMalloc(&trackCandidatesInGPU.hitIndices, 14 * maxTrackCandidates * sizeof(unsigned int));
-    cudaMalloc(&trackCandidatesInGPU.pixelSeedIndex, maxTrackCandidates * sizeof(int));
-    cudaMalloc(&trackCandidatesInGPU.centerX, maxTrackCandidates * sizeof(FPX));
-    cudaMalloc(&trackCandidatesInGPU.centerY, maxTrackCandidates * sizeof(FPX));
-    cudaMalloc(&trackCandidatesInGPU.radius , maxTrackCandidates * sizeof(FPX));
-#endif
-    cudaMemsetAsync(trackCandidatesInGPU.nTrackCandidates,0, sizeof(int), stream);
-    cudaMemsetAsync(trackCandidatesInGPU.nTrackCandidatesT5,0, sizeof(int), stream);
-    cudaMemsetAsync(trackCandidatesInGPU.nTrackCandidatespT3,0, sizeof(int), stream);
-    cudaMemsetAsync(trackCandidatesInGPU.nTrackCandidatespT5,0, sizeof(int), stream);
-    cudaMemsetAsync(trackCandidatesInGPU.nTrackCandidatespLS,0, sizeof(int), stream);
-    cudaMemsetAsync(trackCandidatesInGPU.logicalLayers, 0, 7 * maxTrackCandidates * sizeof(uint8_t), stream);
-    cudaMemsetAsync(trackCandidatesInGPU.lowerModuleIndices, 0, 7 * maxTrackCandidates * sizeof(uint16_t), stream);
-    cudaMemsetAsync(trackCandidatesInGPU.hitIndices, 0, 14 * maxTrackCandidates * sizeof(unsigned int), stream);
-    cudaMemsetAsync(trackCandidatesInGPU.pixelSeedIndex, 0, maxTrackCandidates * sizeof(int), stream);
-    cudaStreamSynchronize(stream);
-}
-
-SDL::trackCandidates::trackCandidates()
-{
-    trackCandidateType = nullptr;
-    directObjectIndices = nullptr;
-    objectIndices = nullptr;
-    nTrackCandidates = nullptr;
-    nTrackCandidatesT5 = nullptr;
-    nTrackCandidatespT3 = nullptr;
-    nTrackCandidatespT5 = nullptr;
-    nTrackCandidatespLS = nullptr;
-
-    logicalLayers = nullptr;
-    hitIndices = nullptr;
-    pixelSeedIndex = nullptr;
-    lowerModuleIndices = nullptr;
-    centerX = nullptr;
-    centerY = nullptr;
-    radius = nullptr;
-}
-
-SDL::trackCandidates::~trackCandidates()
-{
-}
-
-void SDL::trackCandidates::freeMemoryCache()
-{
-    int dev;
-    cudaGetDevice(&dev);
-    //FIXME
-    //cudaFree(trackCandidateType);
-    cms::cuda::free_device(dev,directObjectIndices);
-    cms::cuda::free_device(dev,objectIndices);
-    cms::cuda::free_device(dev,trackCandidateType);
-    cms::cuda::free_device(dev,nTrackCandidates);
-    cms::cuda::free_device(dev,nTrackCandidatespT3);
-    cms::cuda::free_device(dev,nTrackCandidatesT5);
-    cms::cuda::free_device(dev,nTrackCandidatespT5);
-    cms::cuda::free_device(dev,nTrackCandidatespLS);
-
-    cms::cuda::free_device(dev, logicalLayers);
-    cms::cuda::free_device(dev, hitIndices);
-    cms::cuda::free_device(dev, pixelSeedIndex);
-    cms::cuda::free_device(dev, lowerModuleIndices);
-    cms::cuda::free_device(dev, centerX);
-    cms::cuda::free_device(dev, centerY);
-    cms::cuda::free_device(dev, radius);
-}
-
-void SDL::trackCandidates::freeMemory(cudaStream_t stream)
-{
-    cudaFree(trackCandidateType);
-    cudaFree(directObjectIndices);
-    cudaFree(objectIndices);
-    cudaFree(nTrackCandidates);
-    cudaFree(nTrackCandidatespT3);
-    cudaFree(nTrackCandidatesT5);
-    cudaFree(nTrackCandidatespT5);
-    cudaFree(nTrackCandidatespLS);
-
-    cudaFree(logicalLayers);
-    cudaFree(hitIndices);
-    cudaFree(pixelSeedIndex);
-    cudaFree(lowerModuleIndices);
-    cudaFree(centerX);
-    cudaFree(centerY);
-    cudaFree(radius);
-    
-    cudaStreamSynchronize(stream);
-}
diff --git a/SDL/TrackCandidate.cuh b/SDL/TrackCandidate.cuh
index d81a570d..c11ae247 100644
--- a/SDL/TrackCandidate.cuh
+++ b/SDL/TrackCandidate.cuh
@@ -32,13 +32,82 @@ namespace SDL
         FPX* centerY;
         FPX* radius;
 
-        trackCandidates();
-        ~trackCandidates();
-        void freeMemory(cudaStream_t stream);
-        void freeMemoryCache();
+        template<typename TBuff> // TBuff must expose the *_buf members read below (e.g. trackCandidatesBuffer<TAcc>)
+        void setData(TBuff& trackCandidatesbuf) // sets every raw-pointer member to the native pointer of the matching buffer; only pointers are stored, no data is copied
+        {
+            trackCandidateType = alpaka::getPtrNative(trackCandidatesbuf.trackCandidateType_buf);
+            directObjectIndices = alpaka::getPtrNative(trackCandidatesbuf.directObjectIndices_buf);
+            objectIndices = alpaka::getPtrNative(trackCandidatesbuf.objectIndices_buf);
+            nTrackCandidates = alpaka::getPtrNative(trackCandidatesbuf.nTrackCandidates_buf);
+            nTrackCandidatespT3 = alpaka::getPtrNative(trackCandidatesbuf.nTrackCandidatespT3_buf);
+            nTrackCandidatespT5 = alpaka::getPtrNative(trackCandidatesbuf.nTrackCandidatespT5_buf);
+            nTrackCandidatespLS = alpaka::getPtrNative(trackCandidatesbuf.nTrackCandidatespLS_buf);
+            nTrackCandidatesT5 = alpaka::getPtrNative(trackCandidatesbuf.nTrackCandidatesT5_buf);
+            // Logical-layer, hit-index, pixel-seed and lower-module index arrays.
+            logicalLayers = alpaka::getPtrNative(trackCandidatesbuf.logicalLayers_buf);
+            hitIndices = alpaka::getPtrNative(trackCandidatesbuf.hitIndices_buf);
+            pixelSeedIndex = alpaka::getPtrNative(trackCandidatesbuf.pixelSeedIndex_buf);
+            lowerModuleIndices = alpaka::getPtrNative(trackCandidatesbuf.lowerModuleIndices_buf);
+            // centerX / centerY / radius arrays (FPX elements).
+            centerX = alpaka::getPtrNative(trackCandidatesbuf.centerX_buf);
+            centerY = alpaka::getPtrNative(trackCandidatesbuf.centerY_buf);
+            radius = alpaka::getPtrNative(trackCandidatesbuf.radius_buf);
+        }
     };
 
-    void createTrackCandidatesInExplicitMemory(struct trackCandidates& trackCandidatesInGPU, unsigned int maxTrackCandidates,cudaStream_t stream);
+    template<typename TAcc> // TAcc is forwarded to every Buf<TAcc, T> member below
+    struct trackCandidatesBuffer : trackCandidates // holds one alpaka buffer per array pointer declared in trackCandidates
+    {
+        Buf<TAcc, short> trackCandidateType_buf;
+        Buf<TAcc, unsigned int> directObjectIndices_buf;
+        Buf<TAcc, unsigned int> objectIndices_buf;
+        Buf<TAcc, int> nTrackCandidates_buf; // this and the four counters below are allocated with a single element each
+        Buf<TAcc, int> nTrackCandidatespT3_buf;
+        Buf<TAcc, int> nTrackCandidatespT5_buf;
+        Buf<TAcc, int> nTrackCandidatespLS_buf;
+        Buf<TAcc, int> nTrackCandidatesT5_buf;
+        // Index arrays: 7 entries per candidate for logical layers and lower module indices, 14 for hit indices, 1 for the pixel seed index.
+        Buf<TAcc, uint8_t> logicalLayers_buf;
+        Buf<TAcc, unsigned int> hitIndices_buf;
+        Buf<TAcc, int> pixelSeedIndex_buf;
+        Buf<TAcc, uint16_t> lowerModuleIndices_buf;
+        // One FPX value per candidate for each of centerX, centerY and radius.
+        Buf<TAcc, FPX> centerX_buf;
+        Buf<TAcc, FPX> centerY_buf;
+        Buf<TAcc, FPX> radius_buf;
+        // Allocates every buffer on devAccIn sized for maxTrackCandidates candidates, zeroes the counters and index arrays, then waits on queue. It does not call setData(); callers do that.
+        template<typename TQueue, typename TDevAcc>
+        trackCandidatesBuffer(unsigned int maxTrackCandidates,
+                            TDevAcc const & devAccIn,
+                            TQueue& queue) :
+            trackCandidateType_buf(allocBufWrapper<short>(devAccIn, maxTrackCandidates)),
+            directObjectIndices_buf(allocBufWrapper<unsigned int>(devAccIn, maxTrackCandidates)),
+            objectIndices_buf(allocBufWrapper<unsigned int>(devAccIn, 2 * maxTrackCandidates)),
+            nTrackCandidates_buf(allocBufWrapper<int>(devAccIn, 1)),
+            nTrackCandidatespT3_buf(allocBufWrapper<int>(devAccIn, 1)),
+            nTrackCandidatespT5_buf(allocBufWrapper<int>(devAccIn, 1)),
+            nTrackCandidatespLS_buf(allocBufWrapper<int>(devAccIn, 1)),
+            nTrackCandidatesT5_buf(allocBufWrapper<int>(devAccIn, 1)),
+            logicalLayers_buf(allocBufWrapper<uint8_t>(devAccIn, 7 * maxTrackCandidates)),
+            hitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, 14 * maxTrackCandidates)),
+            pixelSeedIndex_buf(allocBufWrapper<int>(devAccIn, maxTrackCandidates)),
+            lowerModuleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, 7 * maxTrackCandidates)),
+            centerX_buf(allocBufWrapper<FPX>(devAccIn, maxTrackCandidates)),
+            centerY_buf(allocBufWrapper<FPX>(devAccIn, maxTrackCandidates)),
+            radius_buf(allocBufWrapper<FPX>(devAccIn, maxTrackCandidates))
+        {
+            alpaka::memset(queue, nTrackCandidates_buf, 0, 1); // zero the five counters
+            alpaka::memset(queue, nTrackCandidatesT5_buf, 0, 1);
+            alpaka::memset(queue, nTrackCandidatespT3_buf, 0, 1);
+            alpaka::memset(queue, nTrackCandidatespT5_buf, 0, 1);
+            alpaka::memset(queue, nTrackCandidatespLS_buf, 0, 1);
+            alpaka::memset(queue, logicalLayers_buf, 0, 7 * maxTrackCandidates); // zero the index arrays over their full allocated extent
+            alpaka::memset(queue, lowerModuleIndices_buf, 0, 7 * maxTrackCandidates);
+            alpaka::memset(queue, hitIndices_buf, 0, 14 * maxTrackCandidates);
+            alpaka::memset(queue, pixelSeedIndex_buf, 0, maxTrackCandidates);
+            alpaka::wait(queue); // trackCandidateType, directObjectIndices, objectIndices, centerX, centerY and radius are not zeroed here
+        }
+    };
 
     ALPAKA_FN_ACC ALPAKA_FN_INLINE void addpLSTrackCandidateToMemory(struct trackCandidates& trackCandidatesInGPU, unsigned int trackletIndex, unsigned int trackCandidateIndex, uint4 hitIndices, int pixelSeedIndex)
     {
diff --git a/code/core/AccessHelper.cc b/code/core/AccessHelper.cc
index 5e95ed21..bf6025db 100644
--- a/code/core/AccessHelper.cc
+++ b/code/core/AccessHelper.cc
@@ -452,7 +452,7 @@ std::tuple<std::vector<unsigned int>, std::vector<unsigned int>> getHitIdxsAndHi
 std::vector<unsigned int> getLSsFromTC(SDL::Event* event, unsigned int TC)
 {
     // Get the type of the track candidate
-    SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates());
+    SDL::trackCandidatesBuffer<alpaka::DevCpu>& trackCandidatesInGPU = (*event->getTrackCandidates());
     short type = trackCandidatesInGPU.trackCandidateType[TC];
     unsigned int objidx = trackCandidatesInGPU.directObjectIndices[TC];
     switch (type)
@@ -468,7 +468,7 @@ std::vector<unsigned int> getLSsFromTC(SDL::Event* event, unsigned int TC)
 std::tuple<std::vector<unsigned int>, std::vector<unsigned int>> getHitIdxsAndHitTypesFromTC(SDL::Event* event, unsigned TC)
 {
     // Get the type of the track candidate
-    SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates());
+    SDL::trackCandidatesBuffer<alpaka::DevCpu>& trackCandidatesInGPU = (*event->getTrackCandidates());
     short type = trackCandidatesInGPU.trackCandidateType[TC];
     unsigned int objidx = trackCandidatesInGPU.directObjectIndices[TC];
     switch (type)
diff --git a/code/core/write_sdl_ntuple.cc b/code/core/write_sdl_ntuple.cc
index abadd16b..78abd3d2 100644
--- a/code/core/write_sdl_ntuple.cc
+++ b/code/core/write_sdl_ntuple.cc
@@ -232,7 +232,7 @@ void setOutputBranches(SDL::Event* event)
     std::vector<std::vector<int>> tc_matched_simIdx;
 
     // ============ Track candidates =============
-    SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates());
+    SDL::trackCandidatesBuffer<alpaka::DevCpu>& trackCandidatesInGPU = (*event->getTrackCandidates());
     unsigned int nTrackCandidates = *trackCandidatesInGPU.nTrackCandidates;
     for (unsigned int idx = 0; idx < nTrackCandidates; idx++)
     {
@@ -564,7 +564,7 @@ void setGnnNtupleBranches(SDL::Event* event)
     SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
     SDL::modules& modulesInGPU = (*event->getModules());
     SDL::objectRangesBuffer<alpaka::DevCpu>& rangesInGPU = (*event->getRanges());
-    SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates());
+    SDL::trackCandidatesBuffer<alpaka::DevCpu>& trackCandidatesInGPU = (*event->getTrackCandidates());
 
     std::set<unsigned int> mds_used_in_sg;
     std::map<unsigned int, unsigned int> md_index_map;
@@ -785,7 +785,7 @@ void setGnnNtupleMiniDoublet(SDL::Event* event, unsigned int MD)
 std::tuple<int, float, float, float, int, vector<int>> parseTrackCandidate(SDL::Event* event, unsigned int idx)
 {
     // Get the type of the track candidate
-    SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates());
+    SDL::trackCandidatesBuffer<alpaka::DevCpu>& trackCandidatesInGPU = (*event->getTrackCandidates());
     short type = trackCandidatesInGPU.trackCandidateType[idx];
 
     enum
@@ -819,7 +819,7 @@ std::tuple<int, float, float, float, int, vector<int>> parseTrackCandidate(SDL::
 std::tuple<float, float, float, vector<unsigned int>, vector<unsigned int>> parsepT5(SDL::Event* event, unsigned int idx)
 {
     // Get relevant information
-    SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates());
+    SDL::trackCandidatesBuffer<alpaka::DevCpu>& trackCandidatesInGPU = (*event->getTrackCandidates());
     SDL::tripletsBuffer<alpaka::DevCpu>& tripletsInGPU = (*event->getTriplets());
     SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
@@ -957,7 +957,7 @@ std::tuple<float, float, float, vector<unsigned int>, vector<unsigned int>> pars
 std::tuple<float, float, float, vector<unsigned int>, vector<unsigned int>> parsepT3(SDL::Event* event, unsigned int idx)
 {
     // Get relevant information
-    SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates());
+    SDL::trackCandidatesBuffer<alpaka::DevCpu>& trackCandidatesInGPU = (*event->getTrackCandidates());
     SDL::tripletsBuffer<alpaka::DevCpu>& tripletsInGPU = (*event->getTriplets());
     SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
@@ -1004,7 +1004,7 @@ std::tuple<float, float, float, vector<unsigned int>, vector<unsigned int>> pars
 //________________________________________________________________________________________________________________________________
 std::tuple<float, float, float, vector<unsigned int>, vector<unsigned int>> parseT5(SDL::Event* event, unsigned int idx)
 {
-    SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates());
+    SDL::trackCandidatesBuffer<alpaka::DevCpu>& trackCandidatesInGPU = (*event->getTrackCandidates());
     SDL::tripletsBuffer<alpaka::DevCpu>& tripletsInGPU = (*event->getTriplets());
     SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
     unsigned int T5 = trackCandidatesInGPU.directObjectIndices[idx];
@@ -1058,7 +1058,7 @@ std::tuple<float, float, float, vector<unsigned int>, vector<unsigned int>> pars
 //________________________________________________________________________________________________________________________________
 std::tuple<float, float, float, vector<unsigned int>, vector<unsigned int>> parsepLS(SDL::Event* event, unsigned int idx)
 {
-    SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates());
+    SDL::trackCandidatesBuffer<alpaka::DevCpu>& trackCandidatesInGPU = (*event->getTrackCandidates());
     SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
 
     // Getting pLS index
@@ -1278,7 +1278,7 @@ void printT3s(SDL::Event* event)
 //________________________________________________________________________________________________________________________________
 void debugPrintOutlierMultiplicities(SDL::Event* event)
 {
-    SDL::trackCandidates& trackCandidatesInGPU = (*event->getTrackCandidates());
+    SDL::trackCandidatesBuffer<alpaka::DevCpu>& trackCandidatesInGPU = (*event->getTrackCandidates());
     SDL::tripletsBuffer<alpaka::DevCpu>& tripletsInGPU = (*event->getTriplets());
     SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());

From 337b7b7168c4d796d19a7cf315b2127418a47ed8 Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Thu, 8 Jun 2023 15:47:01 -0400
Subject: [PATCH 23/44] move minidoublets to Alpaka memory

---
 SDL/Event.cu                  |  79 ++++++-------------
 SDL/Event.cuh                 |   7 +-
 SDL/MiniDoublet.cu            | 139 ---------------------------------
 SDL/MiniDoublet.cuh           | 143 ++++++++++++++++++++++++++++++----
 code/core/AccessHelper.cc     |   4 +-
 code/core/write_sdl_ntuple.cc |  16 ++--
 6 files changed, 169 insertions(+), 219 deletions(-)
 delete mode 100644 SDL/MiniDoublet.cu

diff --git a/SDL/Event.cu b/SDL/Event.cu
index 5a325624..21bfc6cc 100644
--- a/SDL/Event.cu
+++ b/SDL/Event.cu
@@ -59,16 +59,14 @@ SDL::Event::Event(cudaStream_t estream, bool verbose): queue(alpaka::getDevByIdx
 SDL::Event::~Event()
 {
 #ifdef CACHE_ALLOC
-    if(mdsInGPU){mdsInGPU->freeMemoryCache();}
     if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemoryCache();}
     if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemoryCache();}
 #else
-    if(mdsInGPU){mdsInGPU->freeMemory(stream);}
     if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemory(stream);}
     if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemory(stream);}
 #endif
     if(rangesInGPU != nullptr){delete rangesInGPU; delete rangesBuffers;}
-    if(mdsInGPU != nullptr){cms::cuda::free_host(mdsInGPU);}
+    if(mdsInGPU != nullptr){delete mdsInGPU; delete miniDoubletsBuffers;}
     if(segmentsInGPU != nullptr){delete segmentsInGPU; delete segmentsBuffers;}
     if(tripletsInGPU!= nullptr){delete tripletsInGPU; delete tripletsBuffers;}
     if(trackCandidatesInGPU!= nullptr){delete trackCandidatesInGPU; delete trackCandidatesBuffers;}
@@ -85,21 +83,14 @@ SDL::Event::~Event()
     {
         delete rangesInCPU;
     }
-
     if(mdsInCPU != nullptr)
     {
-        delete[] mdsInCPU->anchorHitIndices;
-        delete[] mdsInCPU->nMDs;
-        delete mdsInCPU->nMemoryLocations;
-        delete[] mdsInCPU->totOccupancyMDs;
         delete mdsInCPU;
     }
-
     if(segmentsInCPU != nullptr)
     {
         delete segmentsInCPU;
     }
-
     if(tripletsInCPU != nullptr)
     {
         delete tripletsInCPU;
@@ -108,7 +99,6 @@ SDL::Event::~Event()
     {
         delete quintupletsInCPU;
     }
-
     if(pixelTripletsInCPU != nullptr)
     {
         delete[] pixelTripletsInCPU->tripletIndices;
@@ -122,7 +112,6 @@ SDL::Event::~Event()
         delete[] pixelTripletsInCPU->rPhiChiSquaredInwards;
         delete pixelTripletsInCPU;
     }
-
     if(pixelQuintupletsInCPU != nullptr)
     {
         delete[] pixelQuintupletsInCPU->pixelIndices;
@@ -136,12 +125,10 @@ SDL::Event::~Event()
         delete[] pixelQuintupletsInCPU->rPhiChiSquaredInwards;
         delete pixelQuintupletsInCPU;
     }
-
     if(trackCandidatesInCPU != nullptr)
     {
         delete trackCandidatesInCPU;
     }
-
     if(modulesInCPU != nullptr)
     {
         delete[] modulesInCPU->nLowerModules;
@@ -177,8 +164,6 @@ SDL::Event::~Event()
         delete[] modulesInCPUFull->r;
         delete[] modulesInCPUFull->isInverted;
         delete[] modulesInCPUFull->isLower;
-
-
         delete[] modulesInCPUFull->moduleType;
         delete[] modulesInCPUFull->moduleLayerType;
         delete[] modulesInCPUFull;
@@ -188,11 +173,9 @@ SDL::Event::~Event()
 void SDL::Event::resetEvent()
 {
 #ifdef CACHE_ALLOC
-    if(mdsInGPU){mdsInGPU->freeMemoryCache();}
     if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemoryCache();}
     if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemoryCache();}
 #else
-    if(mdsInGPU){mdsInGPU->freeMemory(stream);}
     if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemory(stream);}
     if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemory(stream);}
 #endif
@@ -217,7 +200,7 @@ void SDL::Event::resetEvent()
     }
     if(hitsInGPU){delete hitsInGPU; delete hitsBuffers;
       hitsInGPU = nullptr;}
-    if(mdsInGPU){cms::cuda::free_host(mdsInGPU);
+    if(mdsInGPU){delete mdsInGPU; delete miniDoubletsBuffers;
       mdsInGPU = nullptr;}
     if(rangesInGPU){delete rangesInGPU; delete rangesBuffers;
       rangesInGPU = nullptr;}
@@ -246,9 +229,6 @@ void SDL::Event::resetEvent()
     }
     if(mdsInCPU != nullptr)
     {
-        delete[] mdsInCPU->anchorHitIndices;
-        delete[] mdsInCPU->nMDs;
-        delete[] mdsInCPU->totOccupancyMDs;
         delete mdsInCPU;
         mdsInCPU = nullptr;
     }
@@ -445,7 +425,6 @@ void SDL::Event::addPixelSegmentToEvent(std::vector<unsigned int> hitIndices0,st
 
     if(mdsInGPU == nullptr)
     {
-        mdsInGPU = (SDL::miniDoublets*)cms::cuda::allocate_host(sizeof(SDL::miniDoublets), stream);
         unsigned int nTotalMDs;
         cudaMemsetAsync(&rangesInGPU->miniDoubletModuleOccupancy[nLowerModules],N_MAX_PIXEL_MD_PER_MODULES, sizeof(unsigned int),stream);
 
@@ -465,9 +444,11 @@ void SDL::Event::addPixelSegmentToEvent(std::vector<unsigned int> hitIndices0,st
 
         cudaMemcpyAsync(&nTotalMDs,rangesInGPU->device_nTotalMDs,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream);
         cudaStreamSynchronize(stream);
-        nTotalMDs+= N_MAX_PIXEL_MD_PER_MODULES;
+        nTotalMDs += N_MAX_PIXEL_MD_PER_MODULES;
 
-        createMDsInExplicitMemory(*mdsInGPU, nTotalMDs, nLowerModules, N_MAX_PIXEL_MD_PER_MODULES,stream);
+        mdsInGPU = new SDL::miniDoublets();
+        miniDoubletsBuffers = new SDL::miniDoubletsBuffer<Acc>(nTotalMDs, nLowerModules, N_MAX_PIXEL_MD_PER_MODULES, devAcc, queue);
+        mdsInGPU->setData(*miniDoubletsBuffers);
 
         cudaMemcpyAsync(mdsInGPU->nMemoryLocations, &nTotalMDs, sizeof(unsigned int), cudaMemcpyHostToDevice, stream);
         cudaStreamSynchronize(stream);
@@ -661,11 +642,10 @@ void SDL::Event::createMiniDoublets()
 
     if(mdsInGPU == nullptr)
     {
-        mdsInGPU = (SDL::miniDoublets*)cms::cuda::allocate_host(sizeof(SDL::miniDoublets), stream);
-        //FIXME: Add memory locations for pixel MDs
-        createMDsInExplicitMemory(*mdsInGPU, nTotalMDs, nLowerModules, N_MAX_PIXEL_MD_PER_MODULES, stream);
+        mdsInGPU = new SDL::miniDoublets();
+        miniDoubletsBuffers = new SDL::miniDoubletsBuffer<Acc>(nTotalMDs, nLowerModules, N_MAX_PIXEL_MD_PER_MODULES, devAcc, queue);
+        mdsInGPU->setData(*miniDoubletsBuffers);
     }
-    cudaStreamSynchronize(stream);
 
     int maxThreadsPerModule=0;
     int* module_hitRanges;
@@ -912,7 +892,6 @@ void SDL::Event::createTrackCandidates()
         *pixelQuintupletsInGPU));
 
     alpaka::enqueue(queue, crossCleanpT3Task);
-    alpaka::wait(queue);
 
     //adding objects
     Vec const threadsPerBlock_addpT3asTrackCandidatesInGPU(static_cast<Idx>(1), static_cast<Idx>(1), static_cast<Idx>(512));
@@ -930,7 +909,6 @@ void SDL::Event::createTrackCandidates()
         *rangesInGPU));
 
     alpaka::enqueue(queue, addpT3asTrackCandidatesInGPUTask);
-    alpaka::wait(queue);
 
     Vec const threadsPerBlockRemoveDupQuints(static_cast<Idx>(1), static_cast<Idx>(16), static_cast<Idx>(32));
     Vec const blocksPerGridRemoveDupQuints(static_cast<Idx>(1), static_cast<Idx>(max(nEligibleModules/16,1)), static_cast<Idx>(max(nEligibleModules/32,1)));
@@ -944,7 +922,6 @@ void SDL::Event::createTrackCandidates()
         *rangesInGPU));
 
     alpaka::enqueue(queue, removeDupQuintupletsInGPUBeforeTCTask);
-    alpaka::wait(queue);
 
     Vec const threadsPerBlock_crossCleanT5(static_cast<Idx>(32), static_cast<Idx>(1), static_cast<Idx>(32));
     Vec const blocksPerGrid_crossCleanT5(static_cast<Idx>((13296/32) + 1), static_cast<Idx>(1), static_cast<Idx>(MAX_BLOCKS));
@@ -961,7 +938,6 @@ void SDL::Event::createTrackCandidates()
         *rangesInGPU));
 
     alpaka::enqueue(queue, crossCleanT5Task);
-    alpaka::wait(queue);
 
     Vec const threadsPerBlock_addT5asTrackCandidateInGPU(static_cast<Idx>(1), static_cast<Idx>(8), static_cast<Idx>(128));
     Vec const blocksPerGrid_addT5asTrackCandidateInGPU(static_cast<Idx>(1), static_cast<Idx>(8), static_cast<Idx>(10));
@@ -977,7 +953,6 @@ void SDL::Event::createTrackCandidates()
         *rangesInGPU));
 
     alpaka::enqueue(queue, addT5asTrackCandidateInGPUTask);
-    alpaka::wait(queue);
 
     Vec const threadsPerBlockCheckHitspLS(static_cast<Idx>(1), static_cast<Idx>(16), static_cast<Idx>(16));
     Vec const blocksPerGridCheckHitspLS(static_cast<Idx>(1), static_cast<Idx>(MAX_BLOCKS*4), static_cast<Idx>(MAX_BLOCKS/4));
@@ -992,7 +967,6 @@ void SDL::Event::createTrackCandidates()
         true));
 
     alpaka::enqueue(queue, checkHitspLSTask);
-    alpaka::wait(queue);
 
     Vec const threadsPerBlock_crossCleanpLS(static_cast<Idx>(1), static_cast<Idx>(16), static_cast<Idx>(32));
     Vec const blocksPerGrid_crossCleanpLS(static_cast<Idx>(1), static_cast<Idx>(4), static_cast<Idx>(20));
@@ -1012,7 +986,6 @@ void SDL::Event::createTrackCandidates()
         *quintupletsInGPU));
 
     alpaka::enqueue(queue, crossCleanpLSTask);
-    alpaka::wait(queue);
 
     Vec const threadsPerBlock_addpLSasTrackCandidateInGPU(static_cast<Idx>(1), static_cast<Idx>(1), static_cast<Idx>(384));
     Vec const blocksPerGrid_addpLSasTrackCandidateInGPU(static_cast<Idx>(1), static_cast<Idx>(1), static_cast<Idx>(MAX_BLOCKS));
@@ -1778,28 +1751,26 @@ SDL::objectRangesBuffer<alpaka::DevCpu>* SDL::Event::getRanges()
     return rangesInCPU;
 }
 
-SDL::miniDoublets* SDL::Event::getMiniDoublets()
+SDL::miniDoubletsBuffer<alpaka::DevCpu>* SDL::Event::getMiniDoublets() // builds the host-side copy on first call, then returns the cached mdsInCPU
 {
     if(mdsInCPU == nullptr)
     {
-        mdsInCPU = new SDL::miniDoublets;
-        mdsInCPU->nMDs = new int[nLowerModules+1];
+        // Fetch nMemoryLocations from miniDoubletsBuffers first: it is the array length passed to the host-side mdsInCPU constructor
+        auto nMemLocal_buf = allocBufWrapper<unsigned int>(devHost, 1);
+        alpaka::memcpy(queue, nMemLocal_buf, miniDoubletsBuffers->nMemoryLocations_buf, 1);
+        alpaka::wait(queue); // the copied value is dereferenced on the host in the next statement
 
-        //compute memory locations
-        mdsInCPU->nMemoryLocations = new unsigned int;
-        cudaMemcpyAsync(mdsInCPU->nMemoryLocations, mdsInGPU->nMemoryLocations, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream);
-        cudaStreamSynchronize(stream);
-        mdsInCPU->totOccupancyMDs = new int[nLowerModules+1];
-
-        mdsInCPU->anchorHitIndices = new unsigned int[*(mdsInCPU->nMemoryLocations)];
-        mdsInCPU->outerHitIndices = new unsigned int[*(mdsInCPU->nMemoryLocations)];
-        mdsInCPU->dphichanges = new float[*(mdsInCPU->nMemoryLocations)];
-        cudaMemcpyAsync(mdsInCPU->anchorHitIndices, mdsInGPU->anchorHitIndices, *(mdsInCPU->nMemoryLocations) * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(mdsInCPU->outerHitIndices, mdsInGPU->outerHitIndices, *(mdsInCPU->nMemoryLocations) * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(mdsInCPU->dphichanges, mdsInGPU->dphichanges, *(mdsInCPU->nMemoryLocations) * sizeof(float), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(mdsInCPU->nMDs, mdsInGPU->nMDs, (nLowerModules+1) * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(mdsInCPU->totOccupancyMDs, mdsInGPU->totOccupancyMDs, (nLowerModules+1) * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream);
-        cudaStreamSynchronize(stream);
+        unsigned int nMemLocal = *alpaka::getPtrNative(nMemLocal_buf);
+        mdsInCPU = new SDL::miniDoubletsBuffer<alpaka::DevCpu>(nMemLocal, nLowerModules, N_MAX_PIXEL_MD_PER_MODULES, devHost, queue);
+        mdsInCPU->setData(*mdsInCPU); // point the inherited raw pointers at this object's own buffers
+        // Only the five arrays copied below are transferred; the other host buffers are allocated but never filled here.
+        *alpaka::getPtrNative(mdsInCPU->nMemoryLocations_buf) = nMemLocal;
+        alpaka::memcpy(queue, mdsInCPU->anchorHitIndices_buf, miniDoubletsBuffers->anchorHitIndices_buf, nMemLocal);
+        alpaka::memcpy(queue, mdsInCPU->outerHitIndices_buf, miniDoubletsBuffers->outerHitIndices_buf, nMemLocal);
+        alpaka::memcpy(queue, mdsInCPU->dphichanges_buf, miniDoubletsBuffers->dphichanges_buf, nMemLocal);
+        alpaka::memcpy(queue, mdsInCPU->nMDs_buf, miniDoubletsBuffers->nMDs_buf, (nLowerModules+1));
+        alpaka::memcpy(queue, mdsInCPU->totOccupancyMDs_buf, miniDoubletsBuffers->totOccupancyMDs_buf, (nLowerModules+1));
+        alpaka::wait(queue); // wait for the queued copies before mdsInCPU is returned to the caller
     }
     return mdsInCPU;
 }
diff --git a/SDL/Event.cuh b/SDL/Event.cuh
index bbdd93f1..31d3da3a 100644
--- a/SDL/Event.cuh
+++ b/SDL/Event.cuh
@@ -36,7 +36,7 @@ namespace SDL
         std::array<unsigned int, 6> n_quintuplets_by_layer_barrel_;
         std::array<unsigned int, 5> n_quintuplets_by_layer_endcap_;
 
-        //CUDA stuff
+        //Device stuff
         int dev;
         int nTotalSegments;
         struct objectRanges* rangesInGPU;
@@ -44,6 +44,7 @@ namespace SDL
         struct hits* hitsInGPU;
         struct hitsBuffer<Acc>* hitsBuffers;
         struct miniDoublets* mdsInGPU;
+        struct miniDoubletsBuffer<Acc>* miniDoubletsBuffers;
         struct segments* segmentsInGPU;
         struct segmentsBuffer<Acc>* segmentsBuffers;
         struct triplets* tripletsInGPU;
@@ -58,7 +59,7 @@ namespace SDL
         //CPU interface stuff
         objectRangesBuffer<alpaka::DevCpu>* rangesInCPU;
         hitsBuffer<alpaka::DevCpu>* hitsInCPU;
-        miniDoublets* mdsInCPU;
+        miniDoubletsBuffer<alpaka::DevCpu>* mdsInCPU;
         segmentsBuffer<alpaka::DevCpu>* segmentsInCPU;
         tripletsBuffer<alpaka::DevCpu>* tripletsInCPU;
         trackCandidatesBuffer<alpaka::DevCpu>* trackCandidatesInCPU;
@@ -138,7 +139,7 @@ namespace SDL
         objectRangesBuffer<alpaka::DevCpu>* getRanges();
         hitsBuffer<alpaka::DevCpu>* getHits();
         hitsBuffer<alpaka::DevCpu>* getHitsInCMSSW();
-        miniDoublets* getMiniDoublets();
+        miniDoubletsBuffer<alpaka::DevCpu>* getMiniDoublets();
         segmentsBuffer<alpaka::DevCpu>* getSegments() ;
         tripletsBuffer<alpaka::DevCpu>* getTriplets();
         quintupletsBuffer<alpaka::DevCpu>* getQuintuplets();
diff --git a/SDL/MiniDoublet.cu b/SDL/MiniDoublet.cu
deleted file mode 100644
index 3fd6d23a..00000000
--- a/SDL/MiniDoublet.cu
+++ /dev/null
@@ -1,139 +0,0 @@
-#include "MiniDoublet.cuh"
-
-//FIXME:Add memory locations for the pixel MDs here!
-void SDL::createMDsInExplicitMemory(struct miniDoublets& mdsInGPU, unsigned int nMemoryLocations, uint16_t nLowerModules, unsigned int maxPixelMDs,cudaStream_t stream)
-{
-#ifdef CACHE_ALLOC
-    int dev;
-    cudaGetDevice(&dev);
-    mdsInGPU.anchorHitIndices = (unsigned int*)cms::cuda::allocate_device(dev,nMemoryLocations * 2 * sizeof(unsigned int), stream);
-    mdsInGPU.moduleIndices = (uint16_t*)cms::cuda::allocate_device(dev, nMemoryLocations * sizeof(uint16_t), stream);
-    mdsInGPU.dphichanges = (float*)cms::cuda::allocate_device(dev,nMemoryLocations*9*sizeof(float),stream);
-    mdsInGPU.nMDs = (int*)cms::cuda::allocate_device(dev, (nLowerModules + 1) *sizeof(int),stream);
-    mdsInGPU.totOccupancyMDs = (int*)cms::cuda::allocate_device(dev, (nLowerModules + 1) *sizeof(int),stream);
-    mdsInGPU.anchorX = (float*)cms::cuda::allocate_device(dev, nMemoryLocations * 6 * sizeof(float), stream);
-    mdsInGPU.anchorHighEdgeX = (float*)cms::cuda::allocate_device(dev, nMemoryLocations * 4 * sizeof(float), stream);
-    mdsInGPU.outerX = (float*)cms::cuda::allocate_device(dev, nMemoryLocations * 6 * sizeof(float), stream);
-    mdsInGPU.outerHighEdgeX = (float*)cms::cuda::allocate_device(dev, nMemoryLocations * 4 * sizeof(float), stream);
-    mdsInGPU.nMemoryLocations = (unsigned int*)cms::cuda::allocate_device(dev, sizeof(unsigned int), stream);
-#else
-    cudaMalloc(&mdsInGPU.anchorHitIndices, nMemoryLocations * 2 * sizeof(unsigned int));
-    cudaMalloc(&mdsInGPU.moduleIndices, nMemoryLocations * sizeof(uint16_t));
-    cudaMalloc(&mdsInGPU.dphichanges, nMemoryLocations *9* sizeof(float));
-    cudaMalloc(&mdsInGPU.nMDs, (nLowerModules + 1) * sizeof(int)); 
-    cudaMalloc(&mdsInGPU.totOccupancyMDs, (nLowerModules + 1) * sizeof(int)); 
-    cudaMalloc(&mdsInGPU.anchorX, nMemoryLocations * 6 * sizeof(float));
-    cudaMalloc(&mdsInGPU.anchorHighEdgeX, nMemoryLocations * 4 * sizeof(float));
-    cudaMalloc(&mdsInGPU.outerX, nMemoryLocations * 6 * sizeof(float));
-    cudaMalloc(&mdsInGPU.outerHighEdgeX, nMemoryLocations * 4 * sizeof(float));
-    cudaMalloc(&mdsInGPU.nMemoryLocations, sizeof(unsigned int));
-#endif
-    cudaMemsetAsync(mdsInGPU.nMDs,0, (nLowerModules + 1) *sizeof(int),stream);
-    cudaMemsetAsync(mdsInGPU.totOccupancyMDs,0, (nLowerModules + 1) *sizeof(int),stream);
-    cudaStreamSynchronize(stream);
-
-    mdsInGPU.outerHitIndices = mdsInGPU.anchorHitIndices + nMemoryLocations;
-    mdsInGPU.dzs  = mdsInGPU.dphichanges + nMemoryLocations;
-    mdsInGPU.dphis  = mdsInGPU.dphichanges + 2*nMemoryLocations;
-    mdsInGPU.shiftedXs  = mdsInGPU.dphichanges + 3*nMemoryLocations;
-    mdsInGPU.shiftedYs  = mdsInGPU.dphichanges + 4*nMemoryLocations;
-    mdsInGPU.shiftedZs  = mdsInGPU.dphichanges + 5*nMemoryLocations;
-    mdsInGPU.noShiftedDzs  = mdsInGPU.dphichanges + 6*nMemoryLocations;
-    mdsInGPU.noShiftedDphis  = mdsInGPU.dphichanges + 7*nMemoryLocations;
-    mdsInGPU.noShiftedDphiChanges  = mdsInGPU.dphichanges + 8*nMemoryLocations;
-
-    mdsInGPU.anchorY = mdsInGPU.anchorX + nMemoryLocations;
-    mdsInGPU.anchorZ = mdsInGPU.anchorX + 2 * nMemoryLocations;
-    mdsInGPU.anchorRt = mdsInGPU.anchorX + 3 * nMemoryLocations;
-    mdsInGPU.anchorPhi = mdsInGPU.anchorX + 4 * nMemoryLocations;
-    mdsInGPU.anchorEta = mdsInGPU.anchorX + 5 * nMemoryLocations;
-
-    mdsInGPU.anchorHighEdgeY = mdsInGPU.anchorHighEdgeX + nMemoryLocations;
-    mdsInGPU.anchorLowEdgeX = mdsInGPU.anchorHighEdgeX + 2 * nMemoryLocations;
-    mdsInGPU.anchorLowEdgeY = mdsInGPU.anchorHighEdgeX + 3 * nMemoryLocations;
-
-    mdsInGPU.outerY = mdsInGPU.outerX + nMemoryLocations;
-    mdsInGPU.outerZ = mdsInGPU.outerX + 2 * nMemoryLocations;
-    mdsInGPU.outerRt = mdsInGPU.outerX + 3 * nMemoryLocations;
-    mdsInGPU.outerPhi = mdsInGPU.outerX + 4 * nMemoryLocations;
-    mdsInGPU.outerEta = mdsInGPU.outerX + 5 * nMemoryLocations;
-
-    mdsInGPU.outerHighEdgeY = mdsInGPU.outerHighEdgeX + nMemoryLocations;
-    mdsInGPU.outerLowEdgeX = mdsInGPU.outerHighEdgeX + 2 * nMemoryLocations;
-    mdsInGPU.outerLowEdgeY = mdsInGPU.outerHighEdgeX + 3 * nMemoryLocations;
-}
-
-SDL::miniDoublets::miniDoublets()
-{
-    anchorHitIndices = nullptr;
-    outerHitIndices = nullptr;
-    moduleIndices = nullptr;
-    nMDs = nullptr;
-    totOccupancyMDs = nullptr;
-    dphichanges = nullptr;
-
-    dzs = nullptr;
-    dphis = nullptr;
-
-    shiftedXs = nullptr;
-    shiftedYs = nullptr;
-    shiftedZs = nullptr;
-    noShiftedDzs = nullptr;
-    noShiftedDphis = nullptr;
-    noShiftedDphiChanges = nullptr;
-    
-    anchorX = nullptr;
-    anchorY = nullptr;
-    anchorZ = nullptr;
-    anchorRt = nullptr;
-    anchorPhi = nullptr;
-    anchorEta = nullptr;
-    anchorHighEdgeX = nullptr;
-    anchorHighEdgeY = nullptr;
-    anchorLowEdgeX = nullptr;
-    anchorLowEdgeY = nullptr;
-    outerX = nullptr;
-    outerY = nullptr;
-    outerZ = nullptr;
-    outerRt = nullptr;
-    outerPhi = nullptr;
-    outerEta = nullptr;
-    outerHighEdgeX = nullptr;
-    outerHighEdgeY = nullptr;
-    outerLowEdgeX = nullptr;
-    outerLowEdgeY = nullptr;
-}
-
-SDL::miniDoublets::~miniDoublets()
-{
-}
-
-void SDL::miniDoublets::freeMemoryCache()
-{
-    int dev;
-    cudaGetDevice(&dev);
-    cms::cuda::free_device(dev,anchorHitIndices);
-    cms::cuda::free_device(dev, moduleIndices);
-    cms::cuda::free_device(dev,dphichanges);
-    cms::cuda::free_device(dev,nMDs);
-    cms::cuda::free_device(dev,totOccupancyMDs);
-    cms::cuda::free_device(dev, anchorX);
-    cms::cuda::free_device(dev, anchorHighEdgeX);
-    cms::cuda::free_device(dev, outerX);
-    cms::cuda::free_device(dev, outerHighEdgeX);
-    cms::cuda::free_device(dev, nMemoryLocations);
-}
-
-void SDL::miniDoublets::freeMemory(cudaStream_t stream)
-{
-    cudaFree(anchorHitIndices);
-    cudaFree(moduleIndices);
-    cudaFree(nMDs);
-    cudaFree(totOccupancyMDs);
-    cudaFree(dphichanges);
-    cudaFree(anchorX);
-    cudaFree(anchorHighEdgeX);
-    cudaFree(outerX);
-    cudaFree(outerHighEdgeX);
-    cudaFree(nMemoryLocations);
-}
diff --git a/SDL/MiniDoublet.cuh b/SDL/MiniDoublet.cuh
index 9f723e2d..7b80cb28 100644
--- a/SDL/MiniDoublet.cuh
+++ b/SDL/MiniDoublet.cuh
@@ -29,7 +29,6 @@ namespace SDL
         float* noShiftedDphis; //if shifted module
         float* noShiftedDphiChanges; //if shifted module
 
-        //hit stuff
         float* anchorX;
         float* anchorY;
         float* anchorZ;
@@ -52,20 +51,138 @@ namespace SDL
         float* outerLowEdgeX;
         float* outerLowEdgeY;
 
-#ifdef CUT_VALUE_DEBUG
-        //CUT VALUES
-        float* dzCuts;
-        float* drtCuts;
-        float* drts;
-        float* miniCuts;
-#endif
-        miniDoublets();
-        ~miniDoublets();
-      	void freeMemory(cudaStream_t stream);
-      	void freeMemoryCache();
+        template<typename TBuf> // TBuf must expose a <field>_buf member for every pointer assigned below
+        void setData(TBuf& mdsbuf) // sets each raw field pointer to the native pointer of the matching buffer in mdsbuf; no data is copied
+        {
+            nMemoryLocations = alpaka::getPtrNative(mdsbuf.nMemoryLocations_buf);
+            anchorHitIndices = alpaka::getPtrNative(mdsbuf.anchorHitIndices_buf);
+            outerHitIndices = alpaka::getPtrNative(mdsbuf.outerHitIndices_buf);
+            moduleIndices = alpaka::getPtrNative(mdsbuf.moduleIndices_buf);
+            nMDs = alpaka::getPtrNative(mdsbuf.nMDs_buf);
+            totOccupancyMDs = alpaka::getPtrNative(mdsbuf.totOccupancyMDs_buf);
+            dphichanges = alpaka::getPtrNative(mdsbuf.dphichanges_buf);
+            dzs = alpaka::getPtrNative(mdsbuf.dzs_buf);
+            dphis = alpaka::getPtrNative(mdsbuf.dphis_buf);
+            shiftedXs = alpaka::getPtrNative(mdsbuf.shiftedXs_buf);
+            shiftedYs = alpaka::getPtrNative(mdsbuf.shiftedYs_buf);
+            shiftedZs = alpaka::getPtrNative(mdsbuf.shiftedZs_buf);
+            noShiftedDzs = alpaka::getPtrNative(mdsbuf.noShiftedDzs_buf);
+            noShiftedDphis = alpaka::getPtrNative(mdsbuf.noShiftedDphis_buf);
+            noShiftedDphiChanges = alpaka::getPtrNative(mdsbuf.noShiftedDphiChanges_buf);
+            anchorX = alpaka::getPtrNative(mdsbuf.anchorX_buf);
+            anchorY = alpaka::getPtrNative(mdsbuf.anchorY_buf);
+            anchorZ = alpaka::getPtrNative(mdsbuf.anchorZ_buf);
+            anchorRt = alpaka::getPtrNative(mdsbuf.anchorRt_buf);
+            anchorPhi = alpaka::getPtrNative(mdsbuf.anchorPhi_buf);
+            anchorEta = alpaka::getPtrNative(mdsbuf.anchorEta_buf);
+            anchorHighEdgeX = alpaka::getPtrNative(mdsbuf.anchorHighEdgeX_buf);
+            anchorHighEdgeY = alpaka::getPtrNative(mdsbuf.anchorHighEdgeY_buf);
+            anchorLowEdgeX = alpaka::getPtrNative(mdsbuf.anchorLowEdgeX_buf);
+            anchorLowEdgeY = alpaka::getPtrNative(mdsbuf.anchorLowEdgeY_buf);
+            outerX = alpaka::getPtrNative(mdsbuf.outerX_buf);
+            outerY = alpaka::getPtrNative(mdsbuf.outerY_buf);
+            outerZ = alpaka::getPtrNative(mdsbuf.outerZ_buf);
+            outerRt = alpaka::getPtrNative(mdsbuf.outerRt_buf);
+            outerPhi = alpaka::getPtrNative(mdsbuf.outerPhi_buf);
+            outerEta = alpaka::getPtrNative(mdsbuf.outerEta_buf);
+            outerHighEdgeX = alpaka::getPtrNative(mdsbuf.outerHighEdgeX_buf);
+            outerHighEdgeY = alpaka::getPtrNative(mdsbuf.outerHighEdgeY_buf);
+            outerLowEdgeX = alpaka::getPtrNative(mdsbuf.outerLowEdgeX_buf);
+            outerLowEdgeY = alpaka::getPtrNative(mdsbuf.outerLowEdgeY_buf);
+        }
     };
 
-    void createMDsInExplicitMemory(struct miniDoublets& mdsInGPU, unsigned int maxMDs,uint16_t nLowerModules, unsigned int maxPixelMDs,cudaStream_t stream);
+    template<typename TAcc> // TAcc is forwarded unchanged to Buf<TAcc, T> for every member
+    struct miniDoubletsBuffer : miniDoublets // one alpaka buffer per miniDoublets field; the constructor does not call setData()
+    {
+        Buf<TAcc, unsigned int> nMemoryLocations_buf;
+        // Every buffer below holds nMemoryLocations entries, except nMDs_buf/totOccupancyMDs_buf (nLowerModules + 1 entries).
+        Buf<TAcc, unsigned int> anchorHitIndices_buf;
+        Buf<TAcc, unsigned int> outerHitIndices_buf;
+        Buf<TAcc, uint16_t> moduleIndices_buf;
+        Buf<TAcc, int> nMDs_buf;
+        Buf<TAcc, int> totOccupancyMDs_buf;
+        Buf<TAcc, float> dphichanges_buf;
+
+        Buf<TAcc, float> dzs_buf;
+        Buf<TAcc, float> dphis_buf;
+
+        Buf<TAcc, float> shiftedXs_buf;
+        Buf<TAcc, float> shiftedYs_buf;
+        Buf<TAcc, float> shiftedZs_buf;
+        Buf<TAcc, float> noShiftedDzs_buf;
+        Buf<TAcc, float> noShiftedDphis_buf;
+        Buf<TAcc, float> noShiftedDphiChanges_buf;
+
+        Buf<TAcc, float> anchorX_buf;
+        Buf<TAcc, float> anchorY_buf;
+        Buf<TAcc, float> anchorZ_buf;
+        Buf<TAcc, float> anchorRt_buf;
+        Buf<TAcc, float> anchorPhi_buf;
+        Buf<TAcc, float> anchorEta_buf;
+        Buf<TAcc, float> anchorHighEdgeX_buf;
+        Buf<TAcc, float> anchorHighEdgeY_buf;
+        Buf<TAcc, float> anchorLowEdgeX_buf;
+        Buf<TAcc, float> anchorLowEdgeY_buf;
+
+        Buf<TAcc, float> outerX_buf;
+        Buf<TAcc, float> outerY_buf;
+        Buf<TAcc, float> outerZ_buf;
+        Buf<TAcc, float> outerRt_buf;
+        Buf<TAcc, float> outerPhi_buf;
+        Buf<TAcc, float> outerEta_buf;
+        Buf<TAcc, float> outerHighEdgeX_buf;
+        Buf<TAcc, float> outerHighEdgeY_buf;
+        Buf<TAcc, float> outerLowEdgeX_buf;
+        Buf<TAcc, float> outerLowEdgeY_buf;
+        // Allocates every buffer through allocBufWrapper(devAccIn, count), then zeroes the two per-module counters on queue.
+        template<typename TQueue, typename TDevAcc>
+        miniDoubletsBuffer(unsigned int nMemoryLocations, // entry count of each per-mini-doublet array
+                           uint16_t nLowerModules, // the per-module counters get nLowerModules + 1 entries
+                           unsigned int maxPixelMDs, // not used by this constructor
+                           TDevAcc const & devAccIn, // handed to allocBufWrapper for every buffer
+                           TQueue& queue) : // used for the memsets below and waited on before returning
+            nMemoryLocations_buf(allocBufWrapper<unsigned int>(devAccIn, 1)),
+            anchorHitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, nMemoryLocations)),
+            outerHitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, nMemoryLocations)),
+            moduleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, nMemoryLocations)),
+            nMDs_buf(allocBufWrapper<int>(devAccIn, nLowerModules+1)),
+            totOccupancyMDs_buf(allocBufWrapper<int>(devAccIn, nLowerModules+1)),
+            dphichanges_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
+            dzs_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
+            dphis_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
+            shiftedXs_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
+            shiftedYs_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
+            shiftedZs_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
+            noShiftedDzs_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
+            noShiftedDphis_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
+            noShiftedDphiChanges_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
+            anchorX_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
+            anchorY_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
+            anchorZ_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
+            anchorRt_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
+            anchorPhi_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
+            anchorEta_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
+            anchorHighEdgeX_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
+            anchorHighEdgeY_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
+            anchorLowEdgeX_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
+            anchorLowEdgeY_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
+            outerX_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
+            outerY_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
+            outerZ_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
+            outerRt_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
+            outerPhi_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
+            outerEta_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
+            outerHighEdgeX_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)), // each edge array is its own buffer now, so it is sized like the anchor edge arrays
+            outerHighEdgeY_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
+            outerLowEdgeX_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
+            outerLowEdgeY_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations))
+        {
+            alpaka::memset(queue, nMDs_buf, 0, nLowerModules+1); // counters start at zero
+            alpaka::memset(queue, totOccupancyMDs_buf, 0, nLowerModules+1);
+            alpaka::wait(queue); // the memsets have completed by the time the constructor returns
+        }
+    };
 
     ALPAKA_FN_ACC ALPAKA_FN_INLINE void addMDToMemory(struct miniDoublets& mdsInGPU, struct SDL::hits& hitsInGPU, struct modules& modulesInGPU, unsigned int lowerHitIdx, unsigned int upperHitIdx, uint16_t& lowerModuleIdx, float dz, float dPhi, float dPhiChange, float shiftedX, float shiftedY, float shiftedZ, float noShiftedDz, float noShiftedDphi, float noShiftedDPhiChange, unsigned int idx)
     {
diff --git a/code/core/AccessHelper.cc b/code/core/AccessHelper.cc
index bf6025db..60984428 100644
--- a/code/core/AccessHelper.cc
+++ b/code/core/AccessHelper.cc
@@ -29,7 +29,7 @@ std::tuple<std::vector<unsigned int>, std::vector<unsigned int>> convertHitsToHi
 std::vector<unsigned int> getPixelHitsFrompLS(SDL::Event* event, unsigned int pLS)
 {
     SDL::segmentsBuffer<alpaka::DevCpu>& segments_ = *(event->getSegments());
-    SDL::miniDoublets& miniDoublets_ = *(event->getMiniDoublets());
+    SDL::miniDoubletsBuffer<alpaka::DevCpu>& miniDoublets_ = *(event->getMiniDoublets());
     SDL::objectRangesBuffer<alpaka::DevCpu>& rangesInGPU = (*event->getRanges());
     SDL::modules& modulesInGPU = (*event->getModules());
     const unsigned int pLS_offset = rangesInGPU.segmentModuleIndices[*(modulesInGPU.nLowerModules)];
@@ -77,7 +77,7 @@ std::tuple<std::vector<unsigned int>, std::vector<unsigned int>> getHitIdxsAndHi
 //____________________________________________________________________________________________
 std::vector<unsigned int> getHitsFromMD(SDL::Event* event, unsigned int MD)
 {
-    SDL::miniDoublets& miniDoublets_ = *(event->getMiniDoublets());
+    SDL::miniDoubletsBuffer<alpaka::DevCpu>& miniDoublets_ = *(event->getMiniDoublets());
     unsigned int hit_1 = miniDoublets_.anchorHitIndices[MD];
     unsigned int hit_2 = miniDoublets_.outerHitIndices [MD];
     return {hit_1, hit_2};
diff --git a/code/core/write_sdl_ntuple.cc b/code/core/write_sdl_ntuple.cc
index 78abd3d2..3a580c15 100644
--- a/code/core/write_sdl_ntuple.cc
+++ b/code/core/write_sdl_ntuple.cc
@@ -560,7 +560,7 @@ void setGnnNtupleBranches(SDL::Event* event)
 {
     // Get relevant information
     SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
-    SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
+    SDL::miniDoubletsBuffer<alpaka::DevCpu>& miniDoubletsInGPU = (*event->getMiniDoublets());
     SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
     SDL::modules& modulesInGPU = (*event->getModules());
     SDL::objectRangesBuffer<alpaka::DevCpu>& rangesInGPU = (*event->getRanges());
@@ -716,7 +716,7 @@ void setGnnNtupleBranches(SDL::Event* event)
 void setGnnNtupleMiniDoublet(SDL::Event* event, unsigned int MD)
 {
     // Get relevant information
-    SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
+    SDL::miniDoubletsBuffer<alpaka::DevCpu>& miniDoubletsInGPU = (*event->getMiniDoublets());
     SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
 
     // Get the hit indices
@@ -1121,7 +1121,7 @@ void printHitMultiplicities(SDL::Event* event)
 //________________________________________________________________________________________________________________________________
 void printMiniDoubletMultiplicities(SDL::Event* event)
 {
-    SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
+    SDL::miniDoubletsBuffer<alpaka::DevCpu>& miniDoubletsInGPU = (*event->getMiniDoublets());
     SDL::modules& modulesInGPU = (*event->getModules());
 
     int nMiniDoublets = 0;
@@ -1150,7 +1150,7 @@ void printAllObjects(SDL::Event* event)
 //________________________________________________________________________________________________________________________________
 void printMDs(SDL::Event* event)
 {
-    SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
+    SDL::miniDoubletsBuffer<alpaka::DevCpu>& miniDoubletsInGPU = (*event->getMiniDoublets());
     SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
     SDL::modules& modulesInGPU = (*event->getModules());
     SDL::objectRangesBuffer<alpaka::DevCpu>& rangesInGPU = (*event->getRanges());
@@ -1174,7 +1174,7 @@ void printMDs(SDL::Event* event)
 void printLSs(SDL::Event* event)
 {
     SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
-    SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
+    SDL::miniDoubletsBuffer<alpaka::DevCpu>& miniDoubletsInGPU = (*event->getMiniDoublets());
     SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
     SDL::modules& modulesInGPU = (*event->getModules());
     SDL::objectRangesBuffer<alpaka::DevCpu>& rangesInGPU = (*event->getRanges());
@@ -1207,7 +1207,7 @@ void printLSs(SDL::Event* event)
 void printpLSs(SDL::Event* event)
 {
     SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
-    SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
+    SDL::miniDoubletsBuffer<alpaka::DevCpu>& miniDoubletsInGPU = (*event->getMiniDoublets());
     SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
     SDL::modules& modulesInGPU = (*event->getModules());
     SDL::objectRangesBuffer<alpaka::DevCpu>& rangesInGPU = (*event->getRanges());
@@ -1238,7 +1238,7 @@ void printT3s(SDL::Event* event)
 {
     SDL::tripletsBuffer<alpaka::DevCpu>& tripletsInGPU = (*event->getTriplets());
     SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
-    SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
+    SDL::miniDoubletsBuffer<alpaka::DevCpu>& miniDoubletsInGPU = (*event->getMiniDoublets());
     SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
     SDL::modules& modulesInGPU = (*event->getModules());
     int nTriplets = 0;
@@ -1281,7 +1281,7 @@ void debugPrintOutlierMultiplicities(SDL::Event* event)
     SDL::trackCandidatesBuffer<alpaka::DevCpu>& trackCandidatesInGPU = (*event->getTrackCandidates());
     SDL::tripletsBuffer<alpaka::DevCpu>& tripletsInGPU = (*event->getTriplets());
     SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
-    SDL::miniDoublets& miniDoubletsInGPU = (*event->getMiniDoublets());
+    SDL::miniDoubletsBuffer<alpaka::DevCpu>& miniDoubletsInGPU = (*event->getMiniDoublets());
     SDL::modules& modulesInGPU = (*event->getModules());
     SDL::objectRangesBuffer<alpaka::DevCpu>& rangesInGPU = (*event->getRanges());
     //int nTrackCandidates = 0;

From ac2853735eba7aabc614496fd094c0e613bc6f0a Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Thu, 8 Jun 2023 16:11:51 -0400
Subject: [PATCH 24/44] remove unused input to mds

---
 SDL/Event.cu        | 6 +++---
 SDL/MiniDoublet.cuh | 1 -
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/SDL/Event.cu b/SDL/Event.cu
index 21bfc6cc..80f4637b 100644
--- a/SDL/Event.cu
+++ b/SDL/Event.cu
@@ -447,7 +447,7 @@ void SDL::Event::addPixelSegmentToEvent(std::vector<unsigned int> hitIndices0,st
         nTotalMDs += N_MAX_PIXEL_MD_PER_MODULES;
 
         mdsInGPU = new SDL::miniDoublets();
-        miniDoubletsBuffers = new SDL::miniDoubletsBuffer<Acc>(nTotalMDs, nLowerModules, N_MAX_PIXEL_MD_PER_MODULES, devAcc, queue);
+        miniDoubletsBuffers = new SDL::miniDoubletsBuffer<Acc>(nTotalMDs, nLowerModules, devAcc, queue);
         mdsInGPU->setData(*miniDoubletsBuffers);
 
         cudaMemcpyAsync(mdsInGPU->nMemoryLocations, &nTotalMDs, sizeof(unsigned int), cudaMemcpyHostToDevice, stream);
@@ -643,7 +643,7 @@ void SDL::Event::createMiniDoublets()
     if(mdsInGPU == nullptr)
     {
         mdsInGPU = new SDL::miniDoublets();
-        miniDoubletsBuffers = new SDL::miniDoubletsBuffer<Acc>(nTotalMDs, nLowerModules, N_MAX_PIXEL_MD_PER_MODULES, devAcc, queue);
+        miniDoubletsBuffers = new SDL::miniDoubletsBuffer<Acc>(nTotalMDs, nLowerModules, devAcc, queue);
         mdsInGPU->setData(*miniDoubletsBuffers);
     }
 
@@ -1761,7 +1761,7 @@ SDL::miniDoubletsBuffer<alpaka::DevCpu>* SDL::Event::getMiniDoublets()
         alpaka::wait(queue);
 
         unsigned int nMemLocal = *alpaka::getPtrNative(nMemLocal_buf);
-        mdsInCPU = new SDL::miniDoubletsBuffer<alpaka::DevCpu>(nMemLocal, nLowerModules, N_MAX_PIXEL_MD_PER_MODULES, devHost, queue);
+        mdsInCPU = new SDL::miniDoubletsBuffer<alpaka::DevCpu>(nMemLocal, nLowerModules, devHost, queue);
         mdsInCPU->setData(*mdsInCPU);
 
         *alpaka::getPtrNative(mdsInCPU->nMemoryLocations_buf) = nMemLocal;
diff --git a/SDL/MiniDoublet.cuh b/SDL/MiniDoublet.cuh
index 7b80cb28..9a77fe58 100644
--- a/SDL/MiniDoublet.cuh
+++ b/SDL/MiniDoublet.cuh
@@ -139,7 +139,6 @@ namespace SDL
         template<typename TQueue, typename TDevAcc>
         miniDoubletsBuffer(unsigned int nMemoryLocations,
                            uint16_t nLowerModules,
-                           unsigned int maxPixelMDs,
                            TDevAcc const & devAccIn,
                            TQueue& queue) :
             nMemoryLocations_buf(allocBufWrapper<unsigned int>(devAccIn, 1)),

From 15341fdc7ddd377a3a13974d67ce18acaffd4c7a Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Thu, 8 Jun 2023 16:51:13 -0400
Subject: [PATCH 25/44] fix overallocation bug

---
 SDL/MiniDoublet.cuh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/SDL/MiniDoublet.cuh b/SDL/MiniDoublet.cuh
index 9a77fe58..a75dcfb0 100644
--- a/SDL/MiniDoublet.cuh
+++ b/SDL/MiniDoublet.cuh
@@ -172,10 +172,10 @@ namespace SDL
             outerRt_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
             outerPhi_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
             outerEta_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
-            outerHighEdgeX_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations * 4)),
-            outerHighEdgeY_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations * 4)),
-            outerLowEdgeX_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations * 4)),
-            outerLowEdgeY_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations * 4))
+            outerHighEdgeX_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
+            outerHighEdgeY_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
+            outerLowEdgeX_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
+            outerLowEdgeY_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations))
         {
             alpaka::memset(queue, nMDs_buf, 0, nLowerModules+1);
             alpaka::memset(queue, totOccupancyMDs_buf, 0, nLowerModules+1);

From 5bf92688e1362ec9dd4e16d4f76411bae073114f Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Thu, 8 Jun 2023 22:13:52 -0400
Subject: [PATCH 26/44] move pixels over to Alpaka memory

---
 SDL/Event.cu                  | 169 ++++++++----------------
 SDL/Event.cuh                 |  10 +-
 SDL/PixelTriplet.cu           | 242 ----------------------------------
 SDL/PixelTriplet.cuh          | 178 +++++++++++++++++++++++--
 code/core/AccessHelper.cc     |   6 +-
 code/core/write_sdl_ntuple.cc |   4 +-
 6 files changed, 227 insertions(+), 382 deletions(-)
 delete mode 100644 SDL/PixelTriplet.cu

diff --git a/SDL/Event.cu b/SDL/Event.cu
index 80f4637b..c9a5871d 100644
--- a/SDL/Event.cu
+++ b/SDL/Event.cu
@@ -58,21 +58,14 @@ SDL::Event::Event(cudaStream_t estream, bool verbose): queue(alpaka::getDevByIdx
 
 SDL::Event::~Event()
 {
-#ifdef CACHE_ALLOC
-    if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemoryCache();}
-    if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemoryCache();}
-#else
-    if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemory(stream);}
-    if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemory(stream);}
-#endif
     if(rangesInGPU != nullptr){delete rangesInGPU; delete rangesBuffers;}
     if(mdsInGPU != nullptr){delete mdsInGPU; delete miniDoubletsBuffers;}
     if(segmentsInGPU != nullptr){delete segmentsInGPU; delete segmentsBuffers;}
     if(tripletsInGPU!= nullptr){delete tripletsInGPU; delete tripletsBuffers;}
     if(trackCandidatesInGPU!= nullptr){delete trackCandidatesInGPU; delete trackCandidatesBuffers;}
     if(hitsInGPU!= nullptr){delete hitsInGPU; delete hitsBuffers;}
-    if(pixelTripletsInGPU!= nullptr){cms::cuda::free_host(pixelTripletsInGPU);}
-    if(pixelQuintupletsInGPU!= nullptr){cms::cuda::free_host(pixelQuintupletsInGPU);}
+    if(pixelTripletsInGPU!= nullptr){delete pixelTripletsInGPU; delete pixelTripletsBuffers;}
+    if(pixelQuintupletsInGPU!= nullptr){delete pixelQuintupletsInGPU; delete pixelQuintupletsBuffers;}
     if(quintupletsInGPU!= nullptr){delete quintupletsInGPU; delete quintupletsBuffers;}
 
     if(hitsInCPU != nullptr)
@@ -101,28 +94,10 @@ SDL::Event::~Event()
     }
     if(pixelTripletsInCPU != nullptr)
     {
-        delete[] pixelTripletsInCPU->tripletIndices;
-        delete[] pixelTripletsInCPU->pixelSegmentIndices;
-        delete[] pixelTripletsInCPU->pixelRadius;
-        delete[] pixelTripletsInCPU->tripletRadius;
-        delete pixelTripletsInCPU->nPixelTriplets;
-        delete pixelTripletsInCPU->totOccupancyPixelTriplets;
-        delete[] pixelTripletsInCPU->rzChiSquared;
-        delete[] pixelTripletsInCPU->rPhiChiSquared;
-        delete[] pixelTripletsInCPU->rPhiChiSquaredInwards;
         delete pixelTripletsInCPU;
     }
     if(pixelQuintupletsInCPU != nullptr)
     {
-        delete[] pixelQuintupletsInCPU->pixelIndices;
-        delete[] pixelQuintupletsInCPU->T5Indices;
-        delete[] pixelQuintupletsInCPU->isDup;
-        delete[] pixelQuintupletsInCPU->score;
-        delete pixelQuintupletsInCPU->nPixelQuintuplets;
-        delete pixelQuintupletsInCPU->totOccupancyPixelQuintuplets;
-        delete[] pixelQuintupletsInCPU->rzChiSquared;
-        delete[] pixelQuintupletsInCPU->rPhiChiSquared;
-        delete[] pixelQuintupletsInCPU->rPhiChiSquaredInwards;
         delete pixelQuintupletsInCPU;
     }
     if(trackCandidatesInCPU != nullptr)
@@ -172,13 +147,6 @@ SDL::Event::~Event()
 
 void SDL::Event::resetEvent()
 {
-#ifdef CACHE_ALLOC
-    if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemoryCache();}
-    if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemoryCache();}
-#else
-    if(pixelQuintupletsInGPU){pixelQuintupletsInGPU->freeMemory(stream);}
-    if(pixelTripletsInGPU){pixelTripletsInGPU->freeMemory(stream);}
-#endif
     //reset the arrays
     for(int i = 0; i<6; i++)
     {
@@ -212,9 +180,9 @@ void SDL::Event::resetEvent()
       quintupletsInGPU = nullptr;}
     if(trackCandidatesInGPU){delete trackCandidatesInGPU; delete trackCandidatesBuffers;
       trackCandidatesInGPU = nullptr;}
-    if(pixelTripletsInGPU){cms::cuda::free_host(pixelTripletsInGPU);
+    if(pixelTripletsInGPU){delete pixelTripletsInGPU; delete pixelTripletsBuffers;
       pixelTripletsInGPU = nullptr;}
-    if(pixelQuintupletsInGPU){cms::cuda::free_host(pixelQuintupletsInGPU);
+    if(pixelQuintupletsInGPU){delete pixelQuintupletsInGPU; delete pixelQuintupletsBuffers;
       pixelQuintupletsInGPU = nullptr;}
 
     if(hitsInCPU != nullptr)
@@ -249,29 +217,11 @@ void SDL::Event::resetEvent()
     }
     if(pixelTripletsInCPU != nullptr)
     {
-        delete[] pixelTripletsInCPU->tripletIndices;
-        delete[] pixelTripletsInCPU->pixelSegmentIndices;
-        delete[] pixelTripletsInCPU->pixelRadius;
-        delete[] pixelTripletsInCPU->tripletRadius;
-        delete pixelTripletsInCPU->nPixelTriplets;
-        delete pixelTripletsInCPU->totOccupancyPixelTriplets;
-        delete[] pixelTripletsInCPU->rzChiSquared;
-        delete[] pixelTripletsInCPU->rPhiChiSquared;
-        delete[] pixelTripletsInCPU->rPhiChiSquaredInwards;
         delete pixelTripletsInCPU;
         pixelTripletsInCPU = nullptr;
     }
     if(pixelQuintupletsInCPU != nullptr)
     {
-        delete[] pixelQuintupletsInCPU->pixelIndices;
-        delete[] pixelQuintupletsInCPU->T5Indices;
-        delete[] pixelQuintupletsInCPU->isDup;
-        delete[] pixelQuintupletsInCPU->score;
-        delete pixelQuintupletsInCPU->nPixelQuintuplets;
-        delete pixelQuintupletsInCPU->totOccupancyPixelQuintuplets;
-        delete[] pixelQuintupletsInCPU->rzChiSquared;
-        delete[] pixelQuintupletsInCPU->rPhiChiSquared;
-        delete[] pixelQuintupletsInCPU->rPhiChiSquaredInwards;
         delete pixelQuintupletsInCPU;
         pixelQuintupletsInCPU = nullptr;
     }
@@ -1007,11 +957,11 @@ void SDL::Event::createPixelTriplets()
 {
     if(pixelTripletsInGPU == nullptr)
     {
-        pixelTripletsInGPU = (SDL::pixelTriplets*)cms::cuda::allocate_host(sizeof(SDL::pixelTriplets), stream);
+        pixelTripletsInGPU = new SDL::pixelTriplets();
+        pixelTripletsBuffers = new SDL::pixelTripletsBuffer<Acc>(N_MAX_PIXEL_TRIPLETS, devAcc, queue);
+        pixelTripletsInGPU->setData(*pixelTripletsBuffers);
     }
 
-    createPixelTripletsInExplicitMemory(*pixelTripletsInGPU, N_MAX_PIXEL_TRIPLETS,stream);
-
     unsigned int pixelModuleIndex = nLowerModules;
     int* superbins;
     int8_t* pixelTypes;
@@ -1241,8 +1191,9 @@ void SDL::Event::createPixelQuintuplets()
 {
     if(pixelQuintupletsInGPU == nullptr)
     {
-        pixelQuintupletsInGPU = (SDL::pixelQuintuplets*)cms::cuda::allocate_host(sizeof(SDL::pixelQuintuplets), stream);
-        createPixelQuintupletsInExplicitMemory(*pixelQuintupletsInGPU, N_MAX_PIXEL_QUINTUPLETS,stream);
+        pixelQuintupletsInGPU = new SDL::pixelQuintuplets();
+        pixelQuintupletsBuffers = new SDL::pixelQuintupletsBuffer<Acc>(N_MAX_PIXEL_QUINTUPLETS, devAcc, queue);
+        pixelQuintupletsInGPU->setData(*pixelQuintupletsBuffers);
     }
     if(trackCandidatesInGPU == nullptr)
     {
@@ -1880,76 +1831,60 @@ SDL::quintupletsBuffer<alpaka::DevCpu>* SDL::Event::getQuintuplets()
     return quintupletsInCPU;
 }
 
-SDL::pixelTriplets* SDL::Event::getPixelTriplets()
+SDL::pixelTripletsBuffer<alpaka::DevCpu>* SDL::Event::getPixelTriplets()
 {
     if(pixelTripletsInCPU == nullptr)
     {
-        pixelTripletsInCPU = new SDL::pixelTriplets;
+        // Get the nPixelTriplets count to initialize the host-based pixelTripletsInCPU
+        auto nPixelTriplets_buf = allocBufWrapper<int>(devHost, 1);
+        alpaka::memcpy(queue, nPixelTriplets_buf, pixelTripletsBuffers->nPixelTriplets_buf, 1);
+        alpaka::wait(queue);
 
-        pixelTripletsInCPU->nPixelTriplets = new int;
-        pixelTripletsInCPU->totOccupancyPixelTriplets = new int;
-        cudaMemcpyAsync(pixelTripletsInCPU->nPixelTriplets, pixelTripletsInGPU->nPixelTriplets, sizeof(int), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(pixelTripletsInCPU->totOccupancyPixelTriplets, pixelTripletsInGPU->totOccupancyPixelTriplets, sizeof(int), cudaMemcpyDeviceToHost,stream);
-        cudaStreamSynchronize(stream);
-        unsigned int nPixelTriplets = *(pixelTripletsInCPU->nPixelTriplets);
-        pixelTripletsInCPU->tripletIndices = new unsigned int[nPixelTriplets];
-        pixelTripletsInCPU->pixelSegmentIndices = new unsigned int[nPixelTriplets];
-        pixelTripletsInCPU->pixelRadius = new FPX[nPixelTriplets];
-        pixelTripletsInCPU->tripletRadius = new FPX[nPixelTriplets];
-        pixelTripletsInCPU->isDup = new bool[nPixelTriplets];
-        pixelTripletsInCPU->eta = new  FPX[nPixelTriplets];
-        pixelTripletsInCPU->phi = new  FPX[nPixelTriplets];
-        pixelTripletsInCPU->score =new FPX[nPixelTriplets];
-        pixelTripletsInCPU->rzChiSquared = new float[nPixelTriplets];
-        pixelTripletsInCPU->rPhiChiSquared = new float[nPixelTriplets];
-        pixelTripletsInCPU->rPhiChiSquaredInwards = new float[nPixelTriplets];
-
-        cudaMemcpyAsync(pixelTripletsInCPU->rzChiSquared, pixelTripletsInGPU->rzChiSquared, nPixelTriplets * sizeof(float), cudaMemcpyDeviceToHost, stream);
-        cudaMemcpyAsync(pixelTripletsInCPU->rPhiChiSquared, pixelTripletsInGPU->rPhiChiSquared, nPixelTriplets * sizeof(float), cudaMemcpyDeviceToHost, stream);
-        cudaMemcpyAsync(pixelTripletsInCPU->rPhiChiSquaredInwards, pixelTripletsInGPU->rPhiChiSquaredInwards, nPixelTriplets * sizeof(float), cudaMemcpyDeviceToHost, stream);
-
-        cudaMemcpyAsync(pixelTripletsInCPU->tripletIndices, pixelTripletsInGPU->tripletIndices, nPixelTriplets * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(pixelTripletsInCPU->pixelSegmentIndices, pixelTripletsInGPU->pixelSegmentIndices, nPixelTriplets * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(pixelTripletsInCPU->pixelRadius, pixelTripletsInGPU->pixelRadius, nPixelTriplets * sizeof(FPX), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(pixelTripletsInCPU->tripletRadius, pixelTripletsInGPU->tripletRadius, nPixelTriplets * sizeof(FPX), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(pixelTripletsInCPU->isDup, pixelTripletsInGPU->isDup, nPixelTriplets * sizeof(bool), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(pixelTripletsInCPU->eta, pixelTripletsInGPU->eta, nPixelTriplets * sizeof(FPX), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(pixelTripletsInCPU->phi, pixelTripletsInGPU->phi, nPixelTriplets * sizeof(FPX), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(pixelTripletsInCPU->score, pixelTripletsInGPU->score, nPixelTriplets * sizeof(FPX), cudaMemcpyDeviceToHost,stream);
-        cudaStreamSynchronize(stream);
+        int nPixelTriplets = *alpaka::getPtrNative(nPixelTriplets_buf);
+        pixelTripletsInCPU = new SDL::pixelTripletsBuffer<alpaka::DevCpu>(nPixelTriplets, devHost, queue);
+        pixelTripletsInCPU->setData(*pixelTripletsInCPU);
+
+        *alpaka::getPtrNative(pixelTripletsInCPU->nPixelTriplets_buf) = nPixelTriplets;
+        alpaka::memcpy(queue, pixelTripletsInCPU->totOccupancyPixelTriplets_buf, pixelTripletsBuffers->totOccupancyPixelTriplets_buf, 1);
+        alpaka::memcpy(queue, pixelTripletsInCPU->rzChiSquared_buf, pixelTripletsBuffers->rzChiSquared_buf, nPixelTriplets);
+        alpaka::memcpy(queue, pixelTripletsInCPU->rPhiChiSquared_buf, pixelTripletsBuffers->rPhiChiSquared_buf, nPixelTriplets);
+        alpaka::memcpy(queue, pixelTripletsInCPU->rPhiChiSquaredInwards_buf, pixelTripletsBuffers->rPhiChiSquaredInwards_buf, nPixelTriplets);
+        alpaka::memcpy(queue, pixelTripletsInCPU->tripletIndices_buf, pixelTripletsBuffers->tripletIndices_buf, nPixelTriplets);
+        alpaka::memcpy(queue, pixelTripletsInCPU->pixelSegmentIndices_buf, pixelTripletsBuffers->pixelSegmentIndices_buf, nPixelTriplets);
+        alpaka::memcpy(queue, pixelTripletsInCPU->pixelRadius_buf, pixelTripletsBuffers->pixelRadius_buf, nPixelTriplets);
+        alpaka::memcpy(queue, pixelTripletsInCPU->tripletRadius_buf, pixelTripletsBuffers->tripletRadius_buf, nPixelTriplets);
+        alpaka::memcpy(queue, pixelTripletsInCPU->isDup_buf, pixelTripletsBuffers->isDup_buf, nPixelTriplets);
+        alpaka::memcpy(queue, pixelTripletsInCPU->eta_buf, pixelTripletsBuffers->eta_buf, nPixelTriplets);
+        alpaka::memcpy(queue, pixelTripletsInCPU->phi_buf, pixelTripletsBuffers->phi_buf, nPixelTriplets);
+        alpaka::memcpy(queue, pixelTripletsInCPU->score_buf, pixelTripletsBuffers->score_buf, nPixelTriplets);
+        alpaka::wait(queue);
     }
     return pixelTripletsInCPU;
 }
 
-SDL::pixelQuintuplets* SDL::Event::getPixelQuintuplets()
+SDL::pixelQuintupletsBuffer<alpaka::DevCpu>* SDL::Event::getPixelQuintuplets()
 {
     if(pixelQuintupletsInCPU == nullptr)
     {
-        pixelQuintupletsInCPU = new SDL::pixelQuintuplets;
+        // Get the nPixelQuintuplets count to initialize the host-based pixelQuintupletsInCPU
+        auto nPixelQuintuplets_buf = allocBufWrapper<int>(devHost, 1);
+        alpaka::memcpy(queue, nPixelQuintuplets_buf, pixelQuintupletsBuffers->nPixelQuintuplets_buf, 1);
+        alpaka::wait(queue);
 
-        pixelQuintupletsInCPU->nPixelQuintuplets = new int;
-        pixelQuintupletsInCPU->totOccupancyPixelQuintuplets = new int;
-        cudaMemcpyAsync(pixelQuintupletsInCPU->nPixelQuintuplets, pixelQuintupletsInGPU->nPixelQuintuplets, sizeof(int), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(pixelQuintupletsInCPU->totOccupancyPixelQuintuplets, pixelQuintupletsInGPU->totOccupancyPixelQuintuplets, sizeof(int), cudaMemcpyDeviceToHost,stream);
-        cudaStreamSynchronize(stream);
-        int nPixelQuintuplets = *(pixelQuintupletsInCPU->nPixelQuintuplets);
-
-        pixelQuintupletsInCPU->pixelIndices = new unsigned int[nPixelQuintuplets];
-        pixelQuintupletsInCPU->T5Indices = new unsigned int[nPixelQuintuplets];
-        pixelQuintupletsInCPU->isDup = new bool[nPixelQuintuplets];
-        pixelQuintupletsInCPU->score = new FPX[nPixelQuintuplets];
-        pixelQuintupletsInCPU->rzChiSquared = new float[nPixelQuintuplets];
-        pixelQuintupletsInCPU->rPhiChiSquared = new float[nPixelQuintuplets];
-        pixelQuintupletsInCPU->rPhiChiSquaredInwards = new float[nPixelQuintuplets];
-
-        cudaMemcpyAsync(pixelQuintupletsInCPU->rzChiSquared, pixelQuintupletsInGPU->rzChiSquared, nPixelQuintuplets * sizeof(float), cudaMemcpyDeviceToHost, stream);
-        cudaMemcpyAsync(pixelQuintupletsInCPU->rPhiChiSquared, pixelQuintupletsInGPU->rPhiChiSquared, nPixelQuintuplets * sizeof(float), cudaMemcpyDeviceToHost, stream);
-        cudaMemcpyAsync(pixelQuintupletsInCPU->rPhiChiSquaredInwards, pixelQuintupletsInGPU->rPhiChiSquaredInwards, nPixelQuintuplets * sizeof(float), cudaMemcpyDeviceToHost, stream);
-        cudaMemcpyAsync(pixelQuintupletsInCPU->pixelIndices, pixelQuintupletsInGPU->pixelIndices, nPixelQuintuplets * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(pixelQuintupletsInCPU->T5Indices, pixelQuintupletsInGPU->T5Indices, nPixelQuintuplets * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(pixelQuintupletsInCPU->isDup, pixelQuintupletsInGPU->isDup, nPixelQuintuplets * sizeof(bool), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(pixelQuintupletsInCPU->score, pixelQuintupletsInGPU->score, nPixelQuintuplets * sizeof(FPX), cudaMemcpyDeviceToHost,stream);
-        cudaStreamSynchronize(stream);
+        int nPixelQuintuplets = *alpaka::getPtrNative(nPixelQuintuplets_buf);
+        pixelQuintupletsInCPU = new SDL::pixelQuintupletsBuffer<alpaka::DevCpu>(nPixelQuintuplets, devHost, queue);
+        pixelQuintupletsInCPU->setData(*pixelQuintupletsInCPU);
+
+        *alpaka::getPtrNative(pixelQuintupletsInCPU->nPixelQuintuplets_buf) = nPixelQuintuplets;
+        alpaka::memcpy(queue, pixelQuintupletsInCPU->totOccupancyPixelQuintuplets_buf, pixelQuintupletsBuffers->totOccupancyPixelQuintuplets_buf, 1);
+        alpaka::memcpy(queue, pixelQuintupletsInCPU->rzChiSquared_buf, pixelQuintupletsBuffers->rzChiSquared_buf, nPixelQuintuplets);
+        alpaka::memcpy(queue, pixelQuintupletsInCPU->rPhiChiSquared_buf, pixelQuintupletsBuffers->rPhiChiSquared_buf, nPixelQuintuplets);
+        alpaka::memcpy(queue, pixelQuintupletsInCPU->rPhiChiSquaredInwards_buf, pixelQuintupletsBuffers->rPhiChiSquaredInwards_buf, nPixelQuintuplets);
+        alpaka::memcpy(queue, pixelQuintupletsInCPU->pixelIndices_buf, pixelQuintupletsBuffers->pixelIndices_buf, nPixelQuintuplets);
+        alpaka::memcpy(queue, pixelQuintupletsInCPU->T5Indices_buf, pixelQuintupletsBuffers->T5Indices_buf, nPixelQuintuplets);
+        alpaka::memcpy(queue, pixelQuintupletsInCPU->isDup_buf, pixelQuintupletsBuffers->isDup_buf, nPixelQuintuplets);
+        alpaka::memcpy(queue, pixelQuintupletsInCPU->score_buf, pixelQuintupletsBuffers->score_buf, nPixelQuintuplets);
+        alpaka::wait(queue);
     }
     return pixelQuintupletsInCPU;
 }
diff --git a/SDL/Event.cuh b/SDL/Event.cuh
index 31d3da3a..b512b469 100644
--- a/SDL/Event.cuh
+++ b/SDL/Event.cuh
@@ -54,7 +54,9 @@ namespace SDL
         struct trackCandidates* trackCandidatesInGPU;
         struct trackCandidatesBuffer<Acc>* trackCandidatesBuffers;
         struct pixelTriplets* pixelTripletsInGPU;
+        struct pixelTripletsBuffer<Acc>* pixelTripletsBuffers;
         struct pixelQuintuplets* pixelQuintupletsInGPU;
+        struct pixelQuintupletsBuffer<Acc>* pixelQuintupletsBuffers;
 
         //CPU interface stuff
         objectRangesBuffer<alpaka::DevCpu>* rangesInCPU;
@@ -66,8 +68,8 @@ namespace SDL
         modules* modulesInCPU;
         modules* modulesInCPUFull;
         quintupletsBuffer<alpaka::DevCpu>* quintupletsInCPU;
-        pixelTriplets* pixelTripletsInCPU;
-        pixelQuintuplets* pixelQuintupletsInCPU;
+        pixelTripletsBuffer<alpaka::DevCpu>* pixelTripletsInCPU;
+        pixelQuintupletsBuffer<alpaka::DevCpu>* pixelQuintupletsInCPU;
 
         int* superbinCPU;
         int8_t* pixelTypeCPU;
@@ -145,10 +147,10 @@ namespace SDL
         quintupletsBuffer<alpaka::DevCpu>* getQuintuplets();
         trackCandidatesBuffer<alpaka::DevCpu>* getTrackCandidates();
         trackCandidatesBuffer<alpaka::DevCpu>* getTrackCandidatesInCMSSW();
-        pixelTriplets* getPixelTriplets();
+        pixelTripletsBuffer<alpaka::DevCpu>* getPixelTriplets();
+        pixelQuintupletsBuffer<alpaka::DevCpu>* getPixelQuintuplets();
         modules* getModules();
         modules* getFullModules();
-        pixelQuintuplets* getPixelQuintuplets();
     };
 
     //global stuff
diff --git a/SDL/PixelTriplet.cu b/SDL/PixelTriplet.cu
deleted file mode 100644
index dc0a2496..00000000
--- a/SDL/PixelTriplet.cu
+++ /dev/null
@@ -1,242 +0,0 @@
-#include "PixelTriplet.cuh"
-
-SDL::pixelTriplets::pixelTriplets()
-{
-    pixelSegmentIndices = nullptr;
-    tripletIndices = nullptr;
-    nPixelTriplets = nullptr;
-    totOccupancyPixelTriplets = nullptr;
-    pixelRadius = nullptr;
-    tripletRadius = nullptr;
-    pt = nullptr;
-    isDup = nullptr;
-    partOfPT5 = nullptr;
-    centerX = nullptr;
-    centerY = nullptr;
-    hitIndices = nullptr;
-    lowerModuleIndices = nullptr;
-    logicalLayers = nullptr;
-    rzChiSquared = nullptr;
-    rPhiChiSquared = nullptr;
-    rPhiChiSquaredInwards = nullptr;
-}
-
-void SDL::pixelTriplets::freeMemoryCache()
-{
-    int dev;
-    cudaGetDevice(&dev);
-    cms::cuda::free_device(dev,pixelSegmentIndices);
-    cms::cuda::free_device(dev,tripletIndices);
-    cms::cuda::free_device(dev,nPixelTriplets);
-    cms::cuda::free_device(dev,totOccupancyPixelTriplets);
-    cms::cuda::free_device(dev,pixelRadius);
-    cms::cuda::free_device(dev,tripletRadius);
-    cms::cuda::free_device(dev,pt);
-    cms::cuda::free_device(dev,isDup);
-    cms::cuda::free_device(dev,partOfPT5);
-    cms::cuda::free_device(dev, centerX);
-    cms::cuda::free_device(dev, centerY);
-    cms::cuda::free_device(dev, hitIndices);
-    cms::cuda::free_device(dev, logicalLayers);
-    cms::cuda::free_device(dev, lowerModuleIndices);
-    cms::cuda::free_device(dev, rPhiChiSquared);
-    cms::cuda::free_device(dev, rPhiChiSquaredInwards);
-    cms::cuda::free_device(dev, rzChiSquared);
-}
-
-void SDL::pixelTriplets::freeMemory(cudaStream_t stream)
-{
-    cudaFree(pixelSegmentIndices);
-    cudaFree(tripletIndices);
-    cudaFree(nPixelTriplets);
-    cudaFree(totOccupancyPixelTriplets);
-    cudaFree(pixelRadius);
-    cudaFree(tripletRadius);
-    cudaFree(pt);
-    cudaFree(isDup);
-    cudaFree(partOfPT5);
-    cudaFree(centerX);
-    cudaFree(centerY);
-    cudaFree(logicalLayers);
-    cudaFree(hitIndices);
-    cudaFree(lowerModuleIndices);
-    cudaFree(rPhiChiSquared);
-    cudaFree(rPhiChiSquaredInwards);
-    cudaFree(rzChiSquared);
-}
-
-SDL::pixelTriplets::~pixelTriplets()
-{
-}
-
-void SDL::createPixelTripletsInExplicitMemory(struct pixelTriplets& pixelTripletsInGPU, unsigned int maxPixelTriplets, cudaStream_t stream)
-{
-#ifdef CACHE_ALLOC
-    int dev;
-    cudaGetDevice(&dev);
-    pixelTripletsInGPU.pixelSegmentIndices       =(unsigned int*)cms::cuda::allocate_device(dev,maxPixelTriplets * sizeof(unsigned int),stream);
-    pixelTripletsInGPU.tripletIndices            =(unsigned int*)cms::cuda::allocate_device(dev,maxPixelTriplets * sizeof(unsigned int),stream);
-    pixelTripletsInGPU.nPixelTriplets            =(int*)cms::cuda::allocate_device(dev,sizeof(int),stream);
-    pixelTripletsInGPU.totOccupancyPixelTriplets =(int*)cms::cuda::allocate_device(dev,sizeof(int),stream);
-    pixelTripletsInGPU.pixelRadius               =(FPX*)cms::cuda::allocate_device(dev,maxPixelTriplets * sizeof(FPX),stream);
-    pixelTripletsInGPU.tripletRadius             =(FPX*)cms::cuda::allocate_device(dev,maxPixelTriplets * sizeof(FPX),stream);
-    pixelTripletsInGPU.pt                        =(FPX*)cms::cuda::allocate_device(dev,maxPixelTriplets * 6*sizeof(FPX),stream);
-    pixelTripletsInGPU.isDup                     =(bool*)cms::cuda::allocate_device(dev,maxPixelTriplets * sizeof(bool),stream);
-    pixelTripletsInGPU.partOfPT5                 =(bool*)cms::cuda::allocate_device(dev,maxPixelTriplets * sizeof(bool),stream);
-    pixelTripletsInGPU.centerX                   = (FPX*)cms::cuda::allocate_device(dev, maxPixelTriplets * sizeof(FPX), stream);
-    pixelTripletsInGPU.centerY                   = (FPX*)cms::cuda::allocate_device(dev, maxPixelTriplets * sizeof(FPX), stream);
-    pixelTripletsInGPU.lowerModuleIndices        = (uint16_t*)cms::cuda::allocate_device(dev, maxPixelTriplets * sizeof(uint16_t) * 5, stream);
-    pixelTripletsInGPU.hitIndices                = (unsigned int*)cms::cuda::allocate_device(dev, maxPixelTriplets * sizeof(unsigned int) * 10, stream);
-    pixelTripletsInGPU.logicalLayers             = (uint8_t*)cms::cuda::allocate_device(dev, maxPixelTriplets * sizeof(uint8_t) * 5, stream);
-
-    pixelTripletsInGPU.rPhiChiSquared = (float*)cms::cuda::allocate_device(dev, maxPixelTriplets * sizeof(float), stream);
-    pixelTripletsInGPU.rPhiChiSquaredInwards = (float*)cms::cuda::allocate_device(dev, maxPixelTriplets * sizeof(float), stream);
-    pixelTripletsInGPU.rzChiSquared = (float*)cms::cuda::allocate_device(dev, maxPixelTriplets * sizeof(float), stream);
-#else
-    cudaMalloc(&pixelTripletsInGPU.pixelSegmentIndices, maxPixelTriplets * sizeof(unsigned int));
-    cudaMalloc(&pixelTripletsInGPU.tripletIndices, maxPixelTriplets * sizeof(unsigned int));
-    cudaMalloc(&pixelTripletsInGPU.nPixelTriplets, sizeof(int));
-    cudaMalloc(&pixelTripletsInGPU.totOccupancyPixelTriplets, sizeof(int));
-    cudaMalloc(&pixelTripletsInGPU.pixelRadius, maxPixelTriplets * sizeof(FPX));
-    cudaMalloc(&pixelTripletsInGPU.tripletRadius, maxPixelTriplets * sizeof(FPX));
-    cudaMalloc(&pixelTripletsInGPU.pt, maxPixelTriplets * 6*sizeof(FPX));
-    cudaMalloc(&pixelTripletsInGPU.isDup, maxPixelTriplets * sizeof(bool));
-    cudaMalloc(&pixelTripletsInGPU.partOfPT5, maxPixelTriplets * sizeof(bool));
-    cudaMalloc(&pixelTripletsInGPU.centerX, maxPixelTriplets * sizeof(FPX));
-    cudaMalloc(&pixelTripletsInGPU.centerY, maxPixelTriplets * sizeof(FPX));
-    cudaMalloc(&pixelTripletsInGPU.logicalLayers, maxPixelTriplets * sizeof(uint8_t) * 5);
-    cudaMalloc(&pixelTripletsInGPU.hitIndices, maxPixelTriplets * sizeof(unsigned int) * 10);
-    cudaMalloc(&pixelTripletsInGPU.lowerModuleIndices, maxPixelTriplets * sizeof(uint16_t) * 5);
-    cudaMalloc(&pixelTripletsInGPU.rPhiChiSquared, maxPixelTriplets * sizeof(float));
-    cudaMalloc(&pixelTripletsInGPU.rPhiChiSquaredInwards, maxPixelTriplets * sizeof(float));
-    cudaMalloc(&pixelTripletsInGPU.rzChiSquared, maxPixelTriplets * sizeof(float));
-#endif
-    cudaMemsetAsync(pixelTripletsInGPU.nPixelTriplets, 0, sizeof(int),stream);
-    cudaMemsetAsync(pixelTripletsInGPU.totOccupancyPixelTriplets, 0, sizeof(int),stream);
-    cudaMemsetAsync(pixelTripletsInGPU.partOfPT5, 0, maxPixelTriplets*sizeof(bool),stream);
-    cudaStreamSynchronize(stream);
-
-    pixelTripletsInGPU.eta = pixelTripletsInGPU.pt + maxPixelTriplets;
-    pixelTripletsInGPU.phi = pixelTripletsInGPU.pt + maxPixelTriplets * 2;
-    pixelTripletsInGPU.eta_pix = pixelTripletsInGPU.pt + maxPixelTriplets *3;
-    pixelTripletsInGPU.phi_pix = pixelTripletsInGPU.pt + maxPixelTriplets * 4;
-    pixelTripletsInGPU.score = pixelTripletsInGPU.pt + maxPixelTriplets * 5;
-}
-
-SDL::pixelQuintuplets::pixelQuintuplets()
-{
-    pixelIndices = nullptr;
-    T5Indices = nullptr;
-    nPixelQuintuplets = nullptr;
-    totOccupancyPixelQuintuplets = nullptr;
-    isDup = nullptr;
-    score = nullptr;
-    pixelRadius = nullptr;
-    quintupletRadius = nullptr;
-    centerX = nullptr;
-    centerY = nullptr;
-    logicalLayers = nullptr;
-    hitIndices = nullptr;
-    lowerModuleIndices = nullptr;
-}
-
-SDL::pixelQuintuplets::~pixelQuintuplets()
-{
-}
-
-void SDL::pixelQuintuplets::freeMemoryCache()
-{
-    int dev;
-    cudaGetDevice(&dev);
-    cms::cuda::free_device(dev,pixelIndices);
-    cms::cuda::free_device(dev,T5Indices);
-    cms::cuda::free_device(dev,nPixelQuintuplets);
-    cms::cuda::free_device(dev,totOccupancyPixelQuintuplets);
-    cms::cuda::free_device(dev,isDup);
-    cms::cuda::free_device(dev,score);
-    cms::cuda::free_device(dev,eta);
-    cms::cuda::free_device(dev,phi);
-    cms::cuda::free_device(dev, hitIndices);
-    cms::cuda::free_device(dev, logicalLayers);
-    cms::cuda::free_device(dev, lowerModuleIndices);
-    cms::cuda::free_device(dev, centerX);
-    cms::cuda::free_device(dev, centerY);
-    cms::cuda::free_device(dev, pixelRadius);
-    cms::cuda::free_device(dev, quintupletRadius);
-    cms::cuda::free_device(dev, rzChiSquared);
-    cms::cuda::free_device(dev, rPhiChiSquared);
-    cms::cuda::free_device(dev, rPhiChiSquaredInwards);
-}
-
-void SDL::pixelQuintuplets::freeMemory(cudaStream_t stream)
-{
-    cudaFree(pixelIndices);
-    cudaFree(T5Indices);
-    cudaFree(nPixelQuintuplets);
-    cudaFree(totOccupancyPixelQuintuplets);
-    cudaFree(isDup);
-    cudaFree(score);
-    cudaFree(eta);
-    cudaFree(phi);
-
-    cudaFree(logicalLayers);
-    cudaFree(hitIndices);
-    cudaFree(lowerModuleIndices);
-    cudaFree(pixelRadius);
-    cudaFree(quintupletRadius);
-    cudaFree(centerX);
-    cudaFree(centerY);
-    cudaFree(rzChiSquared);
-    cudaFree(rPhiChiSquared);
-    cudaFree(rPhiChiSquaredInwards);
-    cudaStreamSynchronize(stream);
-}
-
-void SDL::createPixelQuintupletsInExplicitMemory(struct SDL::pixelQuintuplets& pixelQuintupletsInGPU, unsigned int maxPixelQuintuplets,cudaStream_t stream)
-{
-#ifdef CACHE_ALLOC
-    int dev;
-    cudaGetDevice(&dev);
-    pixelQuintupletsInGPU.pixelIndices        = (unsigned int*)cms::cuda::allocate_device(dev,maxPixelQuintuplets * sizeof(unsigned int),stream);
-    pixelQuintupletsInGPU.T5Indices           = (unsigned int*)cms::cuda::allocate_device(dev,maxPixelQuintuplets * sizeof(unsigned int),stream);
-    pixelQuintupletsInGPU.nPixelQuintuplets   = (int*)cms::cuda::allocate_device(dev,sizeof(int),stream);
-    pixelQuintupletsInGPU.totOccupancyPixelQuintuplets   = (int*)cms::cuda::allocate_device(dev,sizeof(unsigned int),stream);
-    pixelQuintupletsInGPU.isDup               = (bool*)cms::cuda::allocate_device(dev,maxPixelQuintuplets * sizeof(bool),stream);
-    pixelQuintupletsInGPU.score               = (FPX*)cms::cuda::allocate_device(dev,maxPixelQuintuplets * sizeof(FPX),stream);
-    pixelQuintupletsInGPU.eta                 = (FPX*)cms::cuda::allocate_device(dev,maxPixelQuintuplets * sizeof(FPX),stream);
-    pixelQuintupletsInGPU.phi                 = (FPX*)cms::cuda::allocate_device(dev,maxPixelQuintuplets * sizeof(FPX),stream);
-    pixelQuintupletsInGPU.hitIndices = (unsigned int*)cms::cuda::allocate_device(dev, maxPixelQuintuplets * 14 * sizeof(unsigned int), stream);
-    pixelQuintupletsInGPU.logicalLayers = (uint8_t*)cms::cuda::allocate_device(dev, maxPixelQuintuplets * 7 * sizeof(uint8_t), stream);
-    pixelQuintupletsInGPU.lowerModuleIndices = (uint16_t*)cms::cuda::allocate_device(dev, maxPixelQuintuplets * 7 * sizeof(uint16_t), stream);
-    pixelQuintupletsInGPU.centerX          = (FPX*)cms::cuda::allocate_device(dev, maxPixelQuintuplets * sizeof(FPX), stream);
-    pixelQuintupletsInGPU.centerY          = (FPX*)cms::cuda::allocate_device(dev, maxPixelQuintuplets * sizeof(FPX), stream);
-    pixelQuintupletsInGPU.pixelRadius      = (FPX*)cms::cuda::allocate_device(dev, maxPixelQuintuplets * sizeof(FPX), stream);
-    pixelQuintupletsInGPU.quintupletRadius = (FPX*)cms::cuda::allocate_device(dev, maxPixelQuintuplets * sizeof(FPX), stream);
-    pixelQuintupletsInGPU.rzChiSquared          = (float*)cms::cuda::allocate_device(dev, maxPixelQuintuplets * sizeof(float), stream);
-    pixelQuintupletsInGPU.rPhiChiSquared      = (float*)cms::cuda::allocate_device(dev, maxPixelQuintuplets * sizeof(float), stream);
-    pixelQuintupletsInGPU.rPhiChiSquaredInwards = (float*)cms::cuda::allocate_device(dev, maxPixelQuintuplets * sizeof(float), stream);
-#else
-    cudaMalloc(&pixelQuintupletsInGPU.pixelIndices, maxPixelQuintuplets * sizeof(unsigned int));
-    cudaMalloc(&pixelQuintupletsInGPU.T5Indices, maxPixelQuintuplets * sizeof(unsigned int));
-    cudaMalloc(&pixelQuintupletsInGPU.nPixelQuintuplets, sizeof(int));
-    cudaMalloc(&pixelQuintupletsInGPU.totOccupancyPixelQuintuplets, sizeof(int));
-    cudaMalloc(&pixelQuintupletsInGPU.isDup, maxPixelQuintuplets * sizeof(bool));
-    cudaMalloc(&pixelQuintupletsInGPU.score, maxPixelQuintuplets * sizeof(FPX));
-    cudaMalloc(&pixelQuintupletsInGPU.eta  , maxPixelQuintuplets * sizeof(FPX));
-    cudaMalloc(&pixelQuintupletsInGPU.phi  , maxPixelQuintuplets * sizeof(FPX));
-
-    cudaMalloc(&pixelQuintupletsInGPU.logicalLayers, maxPixelQuintuplets * 7 *sizeof(uint8_t));
-    cudaMalloc(&pixelQuintupletsInGPU.hitIndices, maxPixelQuintuplets * 14 * sizeof(unsigned int));
-    cudaMalloc(&pixelQuintupletsInGPU.lowerModuleIndices, maxPixelQuintuplets * 7 * sizeof(uint16_t));
-    cudaMalloc(&pixelQuintupletsInGPU.pixelRadius, maxPixelQuintuplets * sizeof(FPX));
-    cudaMalloc(&pixelQuintupletsInGPU.quintupletRadius, maxPixelQuintuplets * sizeof(FPX));
-    cudaMalloc(&pixelQuintupletsInGPU.centerX, maxPixelQuintuplets * sizeof(FPX));
-    cudaMalloc(&pixelQuintupletsInGPU.centerY, maxPixelQuintuplets * sizeof(FPX));
-    cudaMalloc(&pixelQuintupletsInGPU.rzChiSquared, maxPixelQuintuplets * sizeof(unsigned int));
-    cudaMalloc(&pixelQuintupletsInGPU.rPhiChiSquared, maxPixelQuintuplets * sizeof(unsigned int));
-    cudaMalloc(&pixelQuintupletsInGPU.rPhiChiSquaredInwards, maxPixelQuintuplets * sizeof(unsigned int));
-#endif
-    cudaMemsetAsync(pixelQuintupletsInGPU.nPixelQuintuplets, 0, sizeof(int),stream);
-    cudaMemsetAsync(pixelQuintupletsInGPU.totOccupancyPixelQuintuplets, 0, sizeof(int),stream);
-    cudaStreamSynchronize(stream);
-}
diff --git a/SDL/PixelTriplet.cuh b/SDL/PixelTriplet.cuh
index 422d8959..c40f8283 100644
--- a/SDL/PixelTriplet.cuh
+++ b/SDL/PixelTriplet.cuh
@@ -11,12 +11,13 @@
 
 namespace SDL
 {
-    struct pixelTriplets //one pixel segment, one outer tracker triplet!
+    // One pixel segment, one outer tracker triplet!
+    struct pixelTriplets
     {
         unsigned int* pixelSegmentIndices;         
         unsigned int* tripletIndices;
-        int* nPixelTriplets; //size 1
-        int* totOccupancyPixelTriplets; //size 1
+        int* nPixelTriplets;
+        int* totOccupancyPixelTriplets;
 
         float* pixelRadiusError;
         float* rPhiChiSquared;
@@ -40,13 +41,96 @@ namespace SDL
         FPX* centerX;
         FPX* centerY;
 
-        pixelTriplets();
-        ~pixelTriplets();
-        void freeMemory(cudaStream_t stream);
-        void freeMemoryCache();
+        template<typename TBuff> // TBuff must expose the *_buf members read below (pixelTripletsBuffer<TAcc> does).
+        void setData(TBuff& pixelTripletsBuffer) // Assigns each raw member pointer from alpaka::getPtrNative() of the same-named *_buf member.
+        { // NOTE(review): presumably these pointers are only usable while the source buffers are alive — confirm.
+            pixelSegmentIndices = alpaka::getPtrNative(pixelTripletsBuffer.pixelSegmentIndices_buf);
+            tripletIndices = alpaka::getPtrNative(pixelTripletsBuffer.tripletIndices_buf);
+            nPixelTriplets = alpaka::getPtrNative(pixelTripletsBuffer.nPixelTriplets_buf);
+            totOccupancyPixelTriplets = alpaka::getPtrNative(pixelTripletsBuffer.totOccupancyPixelTriplets_buf);
+            pixelRadius = alpaka::getPtrNative(pixelTripletsBuffer.pixelRadius_buf);
+            tripletRadius = alpaka::getPtrNative(pixelTripletsBuffer.tripletRadius_buf);
+            pt = alpaka::getPtrNative(pixelTripletsBuffer.pt_buf);
+            eta = alpaka::getPtrNative(pixelTripletsBuffer.eta_buf);
+            phi = alpaka::getPtrNative(pixelTripletsBuffer.phi_buf);
+            eta_pix = alpaka::getPtrNative(pixelTripletsBuffer.eta_pix_buf);
+            phi_pix = alpaka::getPtrNative(pixelTripletsBuffer.phi_pix_buf);
+            score = alpaka::getPtrNative(pixelTripletsBuffer.score_buf);
+            isDup = alpaka::getPtrNative(pixelTripletsBuffer.isDup_buf);
+            partOfPT5 = alpaka::getPtrNative(pixelTripletsBuffer.partOfPT5_buf);
+            logicalLayers = alpaka::getPtrNative(pixelTripletsBuffer.logicalLayers_buf);
+            hitIndices = alpaka::getPtrNative(pixelTripletsBuffer.hitIndices_buf);
+            lowerModuleIndices = alpaka::getPtrNative(pixelTripletsBuffer.lowerModuleIndices_buf);
+            centerX = alpaka::getPtrNative(pixelTripletsBuffer.centerX_buf);
+            centerY = alpaka::getPtrNative(pixelTripletsBuffer.centerY_buf);
+            pixelRadiusError = alpaka::getPtrNative(pixelTripletsBuffer.pixelRadiusError_buf);
+            rPhiChiSquared = alpaka::getPtrNative(pixelTripletsBuffer.rPhiChiSquared_buf);
+            rPhiChiSquaredInwards = alpaka::getPtrNative(pixelTripletsBuffer.rPhiChiSquaredInwards_buf);
+            rzChiSquared = alpaka::getPtrNative(pixelTripletsBuffer.rzChiSquared_buf);
+        }
     };
 
-    void createPixelTripletsInExplicitMemory(struct pixelTriplets& pixelTripletsinGPU, unsigned int maxPixelTriplets, cudaStream_t stream);
+    template<typename TAcc> // TAcc is forwarded to Buf<TAcc, T> for every member below.
+    struct pixelTripletsBuffer : pixelTriplets // Holds one Buf per pixelTriplets field; the raw-pointer fields are inherited.
+    {
+        Buf<TAcc, unsigned int> pixelSegmentIndices_buf;
+        Buf<TAcc, unsigned int> tripletIndices_buf;
+        Buf<TAcc, int> nPixelTriplets_buf; // allocated with 1 element
+        Buf<TAcc, int> totOccupancyPixelTriplets_buf; // allocated with 1 element
+        Buf<TAcc, FPX> pixelRadius_buf;
+        Buf<TAcc, FPX> tripletRadius_buf;
+        Buf<TAcc, FPX> pt_buf;
+        Buf<TAcc, FPX> eta_buf;
+        Buf<TAcc, FPX> phi_buf;
+        Buf<TAcc, FPX> eta_pix_buf;
+        Buf<TAcc, FPX> phi_pix_buf;
+        Buf<TAcc, FPX> score_buf;
+        Buf<TAcc, bool> isDup_buf;
+        Buf<TAcc, bool> partOfPT5_buf; // zero-filled by the constructor
+        Buf<TAcc, uint8_t> logicalLayers_buf; // allocated with maxPixelTriplets*5 elements
+        Buf<TAcc, unsigned int> hitIndices_buf; // allocated with maxPixelTriplets*10 elements
+        Buf<TAcc, uint16_t> lowerModuleIndices_buf; // allocated with maxPixelTriplets*5 elements
+        Buf<TAcc, FPX> centerX_buf;
+        Buf<TAcc, FPX> centerY_buf;
+        Buf<TAcc, float> pixelRadiusError_buf;
+        Buf<TAcc, float> rPhiChiSquared_buf;
+        Buf<TAcc, float> rPhiChiSquaredInwards_buf;
+        Buf<TAcc, float> rzChiSquared_buf;
+
+        template<typename TQueue, typename TDevAcc> // Allocates every buffer on devAccIn via allocBufWrapper, then zero-fills the two counters and partOfPT5 on queue.
+        pixelTripletsBuffer(unsigned int maxPixelTriplets,
+                            TDevAcc const & devAccIn,
+                            TQueue& queue) :
+            pixelSegmentIndices_buf(allocBufWrapper<unsigned int>(devAccIn, maxPixelTriplets)),
+            tripletIndices_buf(allocBufWrapper<unsigned int>(devAccIn, maxPixelTriplets)),
+            nPixelTriplets_buf(allocBufWrapper<int>(devAccIn, 1)),
+            totOccupancyPixelTriplets_buf(allocBufWrapper<int>(devAccIn, 1)),
+            pixelRadius_buf(allocBufWrapper<FPX>(devAccIn, maxPixelTriplets)),
+            tripletRadius_buf(allocBufWrapper<FPX>(devAccIn, maxPixelTriplets)),
+            pt_buf(allocBufWrapper<FPX>(devAccIn, maxPixelTriplets)),
+            eta_buf(allocBufWrapper<FPX>(devAccIn, maxPixelTriplets)),
+            phi_buf(allocBufWrapper<FPX>(devAccIn, maxPixelTriplets)),
+            eta_pix_buf(allocBufWrapper<FPX>(devAccIn, maxPixelTriplets)),
+            phi_pix_buf(allocBufWrapper<FPX>(devAccIn, maxPixelTriplets)),
+            score_buf(allocBufWrapper<FPX>(devAccIn, maxPixelTriplets)),
+            isDup_buf(allocBufWrapper<bool>(devAccIn, maxPixelTriplets)),
+            partOfPT5_buf(allocBufWrapper<bool>(devAccIn, maxPixelTriplets)),
+            logicalLayers_buf(allocBufWrapper<uint8_t>(devAccIn, maxPixelTriplets*5)),
+            hitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, maxPixelTriplets*10)),
+            lowerModuleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, maxPixelTriplets*5)),
+            centerX_buf(allocBufWrapper<FPX>(devAccIn, maxPixelTriplets)),
+            centerY_buf(allocBufWrapper<FPX>(devAccIn, maxPixelTriplets)),
+            pixelRadiusError_buf(allocBufWrapper<float>(devAccIn, maxPixelTriplets)),
+            rPhiChiSquared_buf(allocBufWrapper<float>(devAccIn, maxPixelTriplets)),
+            rPhiChiSquaredInwards_buf(allocBufWrapper<float>(devAccIn, maxPixelTriplets)),
+            rzChiSquared_buf(allocBufWrapper<float>(devAccIn, maxPixelTriplets))
+        { // NOTE(review): the inherited pixelTriplets pointers are not assigned here; setData() has to be called separately — confirm every caller does.
+            alpaka::memset(queue, nPixelTriplets_buf, 0, 1);
+            alpaka::memset(queue, totOccupancyPixelTriplets_buf, 0, 1);
+            alpaka::memset(queue, partOfPT5_buf, 0, maxPixelTriplets);
+            alpaka::wait(queue); // wait for the memsets enqueued above before returning
+        }
+    };
 
     ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelTripletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, struct triplets& tripletsInGPU, struct pixelTriplets& pixelTripletsInGPU, unsigned int pixelSegmentIndex, unsigned int tripletIndex, float pixelRadius, float tripletRadius, float centerX, float centerY, float rPhiChiSquared, float rPhiChiSquaredInwards, float rzChiSquared, unsigned int pixelTripletIndex, float pt, float eta, float phi, float eta_pix, float phi_pix,float score)
     {
@@ -1376,14 +1460,80 @@ namespace SDL
         float* rPhiChiSquared;
         float* rPhiChiSquaredInwards;
 
-        pixelQuintuplets();
-        ~pixelQuintuplets();
-        void freeMemory(cudaStream_t stream);
-        void freeMemoryCache();
-
+        template<typename TBuff> // TBuff must expose the *_buf members read below (pixelQuintupletsBuffer<TAcc> does).
+        void setData(TBuff& pixelQuintupletsBuffer) // Assigns each raw member pointer from alpaka::getPtrNative() of the same-named *_buf member.
+        { // NOTE(review): presumably these pointers are only usable while the source buffers are alive — confirm.
+            pixelIndices = alpaka::getPtrNative(pixelQuintupletsBuffer.pixelIndices_buf);
+            T5Indices = alpaka::getPtrNative(pixelQuintupletsBuffer.T5Indices_buf);
+            nPixelQuintuplets = alpaka::getPtrNative(pixelQuintupletsBuffer.nPixelQuintuplets_buf);
+            totOccupancyPixelQuintuplets = alpaka::getPtrNative(pixelQuintupletsBuffer.totOccupancyPixelQuintuplets_buf);
+            isDup = alpaka::getPtrNative(pixelQuintupletsBuffer.isDup_buf);
+            score = alpaka::getPtrNative(pixelQuintupletsBuffer.score_buf);
+            eta = alpaka::getPtrNative(pixelQuintupletsBuffer.eta_buf);
+            phi = alpaka::getPtrNative(pixelQuintupletsBuffer.phi_buf);
+            logicalLayers = alpaka::getPtrNative(pixelQuintupletsBuffer.logicalLayers_buf);
+            hitIndices = alpaka::getPtrNative(pixelQuintupletsBuffer.hitIndices_buf);
+            lowerModuleIndices = alpaka::getPtrNative(pixelQuintupletsBuffer.lowerModuleIndices_buf);
+            pixelRadius = alpaka::getPtrNative(pixelQuintupletsBuffer.pixelRadius_buf);
+            quintupletRadius = alpaka::getPtrNative(pixelQuintupletsBuffer.quintupletRadius_buf);
+            centerX = alpaka::getPtrNative(pixelQuintupletsBuffer.centerX_buf);
+            centerY = alpaka::getPtrNative(pixelQuintupletsBuffer.centerY_buf);
+            rzChiSquared = alpaka::getPtrNative(pixelQuintupletsBuffer.rzChiSquared_buf);
+            rPhiChiSquared = alpaka::getPtrNative(pixelQuintupletsBuffer.rPhiChiSquared_buf);
+            rPhiChiSquaredInwards = alpaka::getPtrNative(pixelQuintupletsBuffer.rPhiChiSquaredInwards_buf);
+        }
     };
 
-    void createPixelQuintupletsInExplicitMemory(struct SDL::pixelQuintuplets& pixelQuintupletsInGPU, unsigned int maxPixelQuintuplets, cudaStream_t stream);
+    template<typename TAcc> // TAcc is forwarded to Buf<TAcc, T> for every member below.
+    struct pixelQuintupletsBuffer : pixelQuintuplets // Holds one Buf per pixelQuintuplets field; the raw-pointer fields are inherited.
+    {
+        Buf<TAcc, unsigned int> pixelIndices_buf;
+        Buf<TAcc, unsigned int> T5Indices_buf;
+        Buf<TAcc, int> nPixelQuintuplets_buf; // allocated with 1 element
+        Buf<TAcc, int> totOccupancyPixelQuintuplets_buf; // allocated with 1 element
+        Buf<TAcc, bool> isDup_buf;
+        Buf<TAcc, FPX> score_buf;
+        Buf<TAcc, FPX> eta_buf;
+        Buf<TAcc, FPX> phi_buf;
+        Buf<TAcc, uint8_t> logicalLayers_buf; // allocated with maxPixelQuintuplets*7 elements
+        Buf<TAcc, unsigned int> hitIndices_buf; // allocated with maxPixelQuintuplets*14 elements
+        Buf<TAcc, uint16_t> lowerModuleIndices_buf; // allocated with maxPixelQuintuplets*7 elements
+        Buf<TAcc, FPX> pixelRadius_buf;
+        Buf<TAcc, FPX> quintupletRadius_buf;
+        Buf<TAcc, FPX> centerX_buf;
+        Buf<TAcc, FPX> centerY_buf;
+        Buf<TAcc, float> rzChiSquared_buf;
+        Buf<TAcc, float> rPhiChiSquared_buf;
+        Buf<TAcc, float> rPhiChiSquaredInwards_buf;
+
+        template<typename TQueue, typename TDevAcc> // Allocates every buffer on devAccIn via allocBufWrapper, then zero-fills the two counters on queue.
+        pixelQuintupletsBuffer(unsigned int maxPixelQuintuplets,
+                               TDevAcc const & devAccIn,
+                               TQueue& queue) :
+            pixelIndices_buf(allocBufWrapper<unsigned int>(devAccIn, maxPixelQuintuplets)),
+            T5Indices_buf(allocBufWrapper<unsigned int>(devAccIn, maxPixelQuintuplets)),
+            nPixelQuintuplets_buf(allocBufWrapper<int>(devAccIn, 1)),
+            totOccupancyPixelQuintuplets_buf(allocBufWrapper<int>(devAccIn, 1)),
+            isDup_buf(allocBufWrapper<bool>(devAccIn, maxPixelQuintuplets)),
+            score_buf(allocBufWrapper<FPX>(devAccIn, maxPixelQuintuplets)),
+            eta_buf(allocBufWrapper<FPX>(devAccIn, maxPixelQuintuplets)),
+            phi_buf(allocBufWrapper<FPX>(devAccIn, maxPixelQuintuplets)),
+            logicalLayers_buf(allocBufWrapper<uint8_t>(devAccIn, maxPixelQuintuplets*7)),
+            hitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, maxPixelQuintuplets*14)),
+            lowerModuleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, maxPixelQuintuplets*7)),
+            pixelRadius_buf(allocBufWrapper<FPX>(devAccIn, maxPixelQuintuplets)),
+            quintupletRadius_buf(allocBufWrapper<FPX>(devAccIn, maxPixelQuintuplets)),
+            centerX_buf(allocBufWrapper<FPX>(devAccIn, maxPixelQuintuplets)),
+            centerY_buf(allocBufWrapper<FPX>(devAccIn, maxPixelQuintuplets)),
+            rzChiSquared_buf(allocBufWrapper<float>(devAccIn, maxPixelQuintuplets)),
+            rPhiChiSquared_buf(allocBufWrapper<float>(devAccIn, maxPixelQuintuplets)),
+            rPhiChiSquaredInwards_buf(allocBufWrapper<float>(devAccIn, maxPixelQuintuplets))
+        { // NOTE(review): the inherited pixelQuintuplets pointers are not assigned here; setData() has to be called separately — confirm every caller does.
+            alpaka::memset(queue, nPixelQuintuplets_buf, 0, 1);
+            alpaka::memset(queue, totOccupancyPixelQuintuplets_buf, 0, 1);
+            alpaka::wait(queue); // wait for the memsets enqueued above before returning
+        }
+    };
 
     ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelQuintupletToMemory(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct segments& segmentsInGPU, struct quintuplets& quintupletsInGPU, struct pixelQuintuplets& pixelQuintupletsInGPU, unsigned int pixelIndex, unsigned int T5Index, unsigned int pixelQuintupletIndex, float& rzChiSquared, float& rPhiChiSquared, float& rPhiChiSquaredInwards, float score, float eta, float phi, float& pixelRadius, float& quintupletRadius, float& centerX, float& centerY)
     {
diff --git a/code/core/AccessHelper.cc b/code/core/AccessHelper.cc
index 60984428..f139982b 100644
--- a/code/core/AccessHelper.cc
+++ b/code/core/AccessHelper.cc
@@ -241,7 +241,7 @@ std::tuple<std::vector<unsigned int>, std::vector<unsigned int>> getHitIdxsAndHi
 //____________________________________________________________________________________________
 unsigned int getPixelLSFrompT3(SDL::Event* event, unsigned int pT3)
 {
-    SDL::pixelTriplets& pixelTriplets_ = *(event->getPixelTriplets());
+    SDL::pixelTripletsBuffer<alpaka::DevCpu>& pixelTriplets_ = *(event->getPixelTriplets());
     SDL::objectRangesBuffer<alpaka::DevCpu>& rangesInGPU = (*event->getRanges());
     SDL::modules& modulesInGPU = (*event->getModules());
     const unsigned int pLS_offset = rangesInGPU.segmentModuleIndices[*(modulesInGPU.nLowerModules)];
@@ -341,7 +341,7 @@ std::tuple<std::vector<unsigned int>, std::vector<unsigned int>> getHitIdxsAndHi
 //____________________________________________________________________________________________
 unsigned int getPixelLSFrompT5(SDL::Event* event, unsigned int pT5)
 {
-    SDL::pixelQuintuplets& pixelQuintuplets_ = *(event->getPixelQuintuplets());
+    SDL::pixelQuintupletsBuffer<alpaka::DevCpu>& pixelQuintuplets_ = *(event->getPixelQuintuplets());
     SDL::objectRangesBuffer<alpaka::DevCpu>& rangesInGPU = (*event->getRanges());
     SDL::modules& modulesInGPU = (*event->getModules());
     const unsigned int pLS_offset = rangesInGPU.segmentModuleIndices[*(modulesInGPU.nLowerModules)];
@@ -351,7 +351,7 @@ unsigned int getPixelLSFrompT5(SDL::Event* event, unsigned int pT5)
 //____________________________________________________________________________________________
 unsigned int getT5FrompT5(SDL::Event* event, unsigned int pT5)
 {
-    SDL::pixelQuintuplets& pixelQuintuplets_ = *(event->getPixelQuintuplets());
+    SDL::pixelQuintupletsBuffer<alpaka::DevCpu>& pixelQuintuplets_ = *(event->getPixelQuintuplets());
     return pixelQuintuplets_.T5Indices[pT5];
 }
 
diff --git a/code/core/write_sdl_ntuple.cc b/code/core/write_sdl_ntuple.cc
index 3a580c15..33f90068 100644
--- a/code/core/write_sdl_ntuple.cc
+++ b/code/core/write_sdl_ntuple.cc
@@ -305,7 +305,7 @@ void setOptionalOutputBranches(SDL::Event* event)
 void setPixelQuintupletOutputBranches(SDL::Event* event)
 {
     // ============ pT5 =============
-    SDL::pixelQuintuplets& pixelQuintupletsInGPU = (*event->getPixelQuintuplets());
+    SDL::pixelQuintupletsBuffer<alpaka::DevCpu>& pixelQuintupletsInGPU = (*event->getPixelQuintuplets());
     SDL::quintupletsBuffer<alpaka::DevCpu>& quintupletsInGPU = (*event->getQuintuplets());
     SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::modules& modulesInGPU = (*event->getModules());
@@ -473,7 +473,7 @@ void setQuintupletOutputBranches(SDL::Event* event)
 //________________________________________________________________________________________________________________________________
 void setPixelTripletOutputBranches(SDL::Event* event)
 {
-    SDL::pixelTriplets& pixelTripletsInGPU = (*event->getPixelTriplets());
+    SDL::pixelTripletsBuffer<alpaka::DevCpu>& pixelTripletsInGPU = (*event->getPixelTriplets());
     SDL::tripletsBuffer<alpaka::DevCpu>& tripletsInGPU = *(event->getTriplets());
     SDL::modules& modulesInGPU = *(event->getModules());
     SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = *(event->getSegments());

From edcd381311a2321094a3a578b866881e9adc1084 Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Sun, 11 Jun 2023 15:39:53 -0400
Subject: [PATCH 27/44] move pixelmap + various host/copies + general cleanup

---
 SDL/Constants.cuh      |   2 +
 SDL/EndcapGeometry.cuh |   2 +
 SDL/Event.cu           | 414 ++++++++++++++++++++---------------------
 SDL/Event.cuh          |   2 +-
 SDL/Kernels.cuh        |  11 +-
 SDL/MiniDoublet.cuh    |  69 +++----
 SDL/Module.cu          | 126 +++++--------
 SDL/Module.cuh         |  28 ++-
 SDL/PixelTriplet.cuh   |   2 +
 SDL/Quintuplet.cuh     |   3 +
 SDL/Segment.cuh        |   3 +
 SDL/TrackCandidate.cuh |   7 +
 SDL/Triplet.cuh        |   3 +
 13 files changed, 341 insertions(+), 331 deletions(-)

diff --git a/SDL/Constants.cuh b/SDL/Constants.cuh
index 66a874a7..f6adc16a 100644
--- a/SDL/Constants.cuh
+++ b/SDL/Constants.cuh
@@ -102,6 +102,8 @@ const unsigned int N_MAX_TRACK_CANDIDATE_EXTENSIONS = 200000;
 const unsigned int N_MAX_TRACK_EXTENSIONS_PER_TC = 30;
 const unsigned int N_MAX_T3T3_TRACK_EXTENSIONS = 40000;
 
+const unsigned int size_superbins = 45000;
+
 namespace SDL
 {
     //defining the constant host device variables right up here
diff --git a/SDL/EndcapGeometry.cuh b/SDL/EndcapGeometry.cuh
index 28364aed..4ad71b40 100644
--- a/SDL/EndcapGeometry.cuh
+++ b/SDL/EndcapGeometry.cuh
@@ -9,6 +9,8 @@
 #include <string>
 #include <vector>
 
+#include "Constants.cuh"
+
 namespace SDL
 {
     class EndcapGeometry
diff --git a/SDL/Event.cu b/SDL/Event.cu
index c9a5871d..27a96ac7 100644
--- a/SDL/Event.cu
+++ b/SDL/Event.cu
@@ -1,7 +1,7 @@
 #include "Event.cuh"
 
 struct SDL::modules* SDL::modulesInGPU = nullptr;
-struct SDL::pixelMap* SDL::pixelMapping = nullptr;
+std::unique_ptr<SDL::pixelMap> SDL::pixelMapping = std::make_unique<pixelMap>();
 uint16_t SDL::nModules;
 uint16_t SDL::nLowerModules;
 
@@ -279,18 +279,15 @@ void SDL::initModules(const char* moduleMetaDataFilePath)
     if(modulesInGPU == nullptr)
     {
         cudaMallocHost(&modulesInGPU, sizeof(struct SDL::modules));
-        cudaMallocHost(&pixelMapping, sizeof(struct SDL::pixelMap));
         //nModules gets filled here
         loadModulesFromFile(*modulesInGPU,nModules,nLowerModules, *pixelMapping, default_stream, moduleMetaDataFilePath);
-        cudaStreamSynchronize(default_stream);
     }
 }
 
 void SDL::cleanModules()
 {
-    freeModules(*modulesInGPU, *pixelMapping);
+    freeModules(*modulesInGPU);
     cudaFreeHost(modulesInGPU);
-    cudaFreeHost(pixelMapping);
 }
 
 void SDL::Event::addHitToEvent(std::vector<float> x, std::vector<float> y, std::vector<float> z, std::vector<unsigned int> detId, std::vector<unsigned int> idxInNtuple)
@@ -298,7 +295,7 @@ void SDL::Event::addHitToEvent(std::vector<float> x, std::vector<float> y, std::
     // Use the actual number of hits instead of a max.
     const int nHits = x.size();
 
-    // Needed for the memcpy to hitsInGPU below.
+    // Needed for the memcpy to hitsInGPU below. Will be replaced with a View.
     auto nHits_buf = allocBufWrapper<unsigned int>(devHost, 1);
     *alpaka::getPtrNative(nHits_buf) = nHits;
 
@@ -375,7 +372,6 @@ void SDL::Event::addPixelSegmentToEvent(std::vector<unsigned int> hitIndices0,st
 
     if(mdsInGPU == nullptr)
     {
-        unsigned int nTotalMDs;
         cudaMemsetAsync(&rangesInGPU->miniDoubletModuleOccupancy[nLowerModules],N_MAX_PIXEL_MD_PER_MODULES, sizeof(unsigned int),stream);
 
         Vec const threadsPerBlockCreateMD(static_cast<Idx>(1), static_cast<Idx>(1), static_cast<Idx>(1024));
@@ -392,6 +388,7 @@ void SDL::Event::addPixelSegmentToEvent(std::vector<unsigned int> hitIndices0,st
         alpaka::enqueue(queue, createMDArrayRangesGPUTask);
         alpaka::wait(queue);
 
+        unsigned int nTotalMDs;
         cudaMemcpyAsync(&nTotalMDs,rangesInGPU->device_nTotalMDs,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream);
         cudaStreamSynchronize(stream);
         nTotalMDs += N_MAX_PIXEL_MD_PER_MODULES;
@@ -527,6 +524,7 @@ void SDL::Event::addMiniDoubletsToEventExplicit()
 
         }
     }
+
     cms::cuda::free_host(nMDsCPU);
     cms::cuda::free_host(module_subdets);
     cms::cuda::free_host(module_layers);
@@ -561,6 +559,7 @@ void SDL::Event::addSegmentsToEventExplicit()
             }
         }
     }
+
     cms::cuda::free_host(nSegmentsCPU);
     cms::cuda::free_host(module_subdets);
     cms::cuda::free_host(module_layers);
@@ -568,8 +567,6 @@ void SDL::Event::addSegmentsToEventExplicit()
 
 void SDL::Event::createMiniDoublets()
 {
-    //hardcoded range numbers for this will come from studies!
-    unsigned int nTotalMDs;
     cudaMemsetAsync(&rangesInGPU->miniDoubletModuleOccupancy[nLowerModules],N_MAX_PIXEL_MD_PER_MODULES, sizeof(unsigned int),stream);
 
     Vec const threadsPerBlockCreateMD(static_cast<Idx>(1), static_cast<Idx>(1), static_cast<Idx>(1024));
@@ -586,9 +583,14 @@ void SDL::Event::createMiniDoublets()
     alpaka::enqueue(queue, createMDArrayRangesGPUTask);
     alpaka::wait(queue);
 
-    cudaMemcpyAsync(&nTotalMDs,rangesInGPU->device_nTotalMDs,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream);
-    cudaStreamSynchronize(stream);
-    nTotalMDs+=N_MAX_PIXEL_MD_PER_MODULES;
+    auto nTotalMDs_buf = allocBufWrapper<unsigned int>(devHost, 1);
+
+    alpaka::memcpy(queue, nTotalMDs_buf, rangesBuffers->device_nTotalMDs_buf, 1);
+    alpaka::wait(queue);
+
+    unsigned int nTotalMDs = *alpaka::getPtrNative(nTotalMDs_buf);
+
+    nTotalMDs += N_MAX_PIXEL_MD_PER_MODULES;
 
     if(mdsInGPU == nullptr)
     {
@@ -597,38 +599,6 @@ void SDL::Event::createMiniDoublets()
         mdsInGPU->setData(*miniDoubletsBuffers);
     }
 
-    int maxThreadsPerModule=0;
-    int* module_hitRanges;
-    module_hitRanges = (int*)cms::cuda::allocate_host(nModules* 2*sizeof(int), stream);
-    cudaMemcpyAsync(module_hitRanges,hitsInGPU->hitRanges,nModules*2*sizeof(int),cudaMemcpyDeviceToHost,stream);
-    bool* module_isLower;
-    module_isLower = (bool*)cms::cuda::allocate_host(nModules*sizeof(bool), stream);
-    cudaMemcpyAsync(module_isLower,modulesInGPU->isLower,nModules*sizeof(bool),cudaMemcpyDeviceToHost,stream);
-    bool* module_isInverted;
-    module_isInverted = (bool*)cms::cuda::allocate_host(nModules*sizeof(bool), stream);
-    cudaMemcpyAsync(module_isInverted,modulesInGPU->isInverted,nModules*sizeof(bool),cudaMemcpyDeviceToHost,stream);
-    int* module_partnerModuleIndices;
-    module_partnerModuleIndices = (int*)cms::cuda::allocate_host(nLowerModules * sizeof(unsigned int), stream);
-    cudaMemcpyAsync(module_partnerModuleIndices, modulesInGPU->partnerModuleIndices, nLowerModules * sizeof(unsigned int), cudaMemcpyDeviceToHost, stream);
-    cudaStreamSynchronize(stream);
-
-    for (uint16_t lowerModuleIndex=0; lowerModuleIndex<nLowerModules; lowerModuleIndex++) 
-    {
-        uint16_t upperModuleIndex = module_partnerModuleIndices[lowerModuleIndex];
-        int lowerHitRanges = module_hitRanges[lowerModuleIndex*2];
-        int upperHitRanges = module_hitRanges[upperModuleIndex*2];
-        if(lowerHitRanges!=-1 && upperHitRanges!=-1) 
-        {
-            int nLowerHits = module_hitRanges[lowerModuleIndex * 2 + 1] - lowerHitRanges + 1;
-            int nUpperHits = module_hitRanges[upperModuleIndex * 2 + 1] - upperHitRanges + 1;
-            maxThreadsPerModule = maxThreadsPerModule > (nLowerHits*nUpperHits) ? maxThreadsPerModule : nLowerHits*nUpperHits;
-        }
-    }
-    cms::cuda::free_host(module_hitRanges);
-    cms::cuda::free_host(module_partnerModuleIndices);
-    cms::cuda::free_host(module_isLower);
-    cms::cuda::free_host(module_isInverted);
-
     Vec const threadsPerBlockCreateMDInGPU(static_cast<Idx>(1), static_cast<Idx>(16), static_cast<Idx>(32));
     Vec const blocksPerGridCreateMDInGPU(static_cast<Idx>(1), static_cast<Idx>(MAX_BLOCKS), static_cast<Idx>(1));
     WorkDiv const createMiniDoubletsInGPUv2_workDiv(blocksPerGridCreateMDInGPU, threadsPerBlockCreateMDInGPU, elementsPerThread);
@@ -715,8 +685,6 @@ void SDL::Event::createTriplets()
 {
     if(tripletsInGPU == nullptr)
     {
-        unsigned int maxTriplets;
-
         Vec const threadsPerBlockCreateTrip(static_cast<Idx>(1), static_cast<Idx>(1), static_cast<Idx>(1024));
         Vec const blocksPerGridCreateTrip(static_cast<Idx>(1), static_cast<Idx>(1), static_cast<Idx>(1));
         WorkDiv const createTripletArrayRanges_workDiv(blocksPerGridCreateTrip, threadsPerBlockCreateTrip, elementsPerThread);
@@ -732,33 +700,43 @@ void SDL::Event::createTriplets()
         alpaka::enqueue(queue, createTripletArrayRangesTask);
         alpaka::wait(queue);
 
-        cudaMemcpyAsync(&maxTriplets,rangesInGPU->device_nTotalTrips,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream);
-        cudaStreamSynchronize(stream);
+        // TODO: Why are we pulling this back down only to put it back on the device in a new struct?
+        auto maxTriplets_buf = allocBufWrapper<unsigned int>(devHost, 1);
+
+        alpaka::memcpy(queue, maxTriplets_buf, rangesBuffers->device_nTotalTrips_buf, 1);
+        alpaka::wait(queue);
 
         tripletsInGPU = new SDL::triplets();
-        tripletsBuffers = new SDL::tripletsBuffer<Acc>(maxTriplets, nLowerModules, devAcc, queue);
+        tripletsBuffers = new SDL::tripletsBuffer<Acc>(*alpaka::getPtrNative(maxTriplets_buf), nLowerModules, devAcc, queue);
         tripletsInGPU->setData(*tripletsBuffers);
 
-        cudaMemcpyAsync(tripletsInGPU->nMemoryLocations, &maxTriplets, sizeof(unsigned int), cudaMemcpyHostToDevice, stream);
-        cudaStreamSynchronize(stream);
+        alpaka::memcpy(queue, tripletsBuffers->nMemoryLocations_buf, maxTriplets_buf, 1);
+        alpaka::wait(queue);
     }
 
-    //TODO:Move this also inside the ranges function
-    uint16_t nonZeroModules=0;
-    unsigned int max_InnerSeg=0;
-    uint16_t *index = (uint16_t*)malloc(nLowerModules*sizeof(unsigned int));
-    uint16_t *index_gpu;
-    index_gpu = (uint16_t*)cms::cuda::allocate_device(dev, nLowerModules*sizeof(uint16_t), stream);
-    unsigned int *nSegments = (unsigned int*)malloc(nLowerModules*sizeof(unsigned int));
-    cudaMemcpyAsync((void *)nSegments, segmentsInGPU->nSegments, nLowerModules*sizeof(unsigned int), cudaMemcpyDeviceToHost,stream);
-    cudaStreamSynchronize(stream);
+    uint16_t nonZeroModules = 0;
+    unsigned int max_InnerSeg = 0;
+    
+    // Allocate host index
+    auto index_buf = allocBufWrapper<uint16_t>(devHost, nLowerModules);
+    uint16_t *index = alpaka::getPtrNative(index_buf);
+    
+    // Allocate device index
+    auto index_gpu_buf = allocBufWrapper<uint16_t>(devAcc, nLowerModules);
+    
+    // Allocate and copy nSegments from device to host
+    auto nSegments_buf = allocBufWrapper<int>(devHost, nLowerModules);
+    alpaka::memcpy(queue, nSegments_buf, segmentsBuffers->nSegments_buf, nLowerModules);
+    alpaka::wait(queue);
 
+    int *nSegments = alpaka::getPtrNative(nSegments_buf);
+    
     uint16_t* module_nConnectedModules;
     module_nConnectedModules = (uint16_t*)cms::cuda::allocate_host(nLowerModules* sizeof(uint16_t), stream);
     cudaMemcpyAsync(module_nConnectedModules,modulesInGPU->nConnectedModules,nLowerModules*sizeof(uint16_t),cudaMemcpyDeviceToHost,stream);
     cudaStreamSynchronize(stream);
 
-    for (uint16_t innerLowerModuleIndex = 0; innerLowerModuleIndex <nLowerModules; innerLowerModuleIndex++)
+    for (uint16_t innerLowerModuleIndex = 0; innerLowerModuleIndex < nLowerModules; innerLowerModuleIndex++)
     {
         uint16_t nConnectedModules = module_nConnectedModules[innerLowerModuleIndex];
         unsigned int nInnerSegments = nSegments[innerLowerModuleIndex];
@@ -769,9 +747,12 @@ void SDL::Event::createTriplets()
         }
         max_InnerSeg = max(max_InnerSeg, nInnerSegments);
     }
+
+    // Copy index from host to device
+    alpaka::memcpy(queue, index_gpu_buf, index_buf, nonZeroModules);
+    alpaka::wait(queue);
+
     cms::cuda::free_host(module_nConnectedModules);
-    cudaMemcpyAsync(index_gpu, index, nonZeroModules*sizeof(uint16_t), cudaMemcpyHostToDevice,stream);
-    cudaStreamSynchronize(stream);
 
     Vec const threadsPerBlockCreateTrip(static_cast<Idx>(1), static_cast<Idx>(16), static_cast<Idx>(16));
     Vec const blocksPerGridCreateTrip(static_cast<Idx>(MAX_BLOCKS), static_cast<Idx>(1), static_cast<Idx>(1));
@@ -786,7 +767,7 @@ void SDL::Event::createTriplets()
         *segmentsInGPU,
         *tripletsInGPU,
         *rangesInGPU,
-        index_gpu,
+        alpaka::getPtrNative(index_gpu_buf),
         nonZeroModules));
 
     alpaka::enqueue(queue, createTripletsInGPUv2Task);
@@ -806,10 +787,6 @@ void SDL::Event::createTriplets()
     alpaka::enqueue(queue, addTripletRangesToEventExplicitTask);
     alpaka::wait(queue);
 
-    free(nSegments);
-    free(index);
-    cms::cuda::free_device(dev, index_gpu);
-
     if(addObjects)
     {
         addTripletsToEventExplicit();
@@ -818,8 +795,6 @@ void SDL::Event::createTriplets()
 
 void SDL::Event::createTrackCandidates()
 {
-    uint16_t nEligibleModules;
-    cudaMemcpyAsync(&nEligibleModules,rangesInGPU->nEligibleT5Modules,sizeof(uint16_t),cudaMemcpyDeviceToHost,stream);
     if(trackCandidatesInGPU == nullptr)
     {
         trackCandidatesInGPU = new SDL::trackCandidates();
@@ -827,6 +802,11 @@ void SDL::Event::createTrackCandidates()
         trackCandidatesInGPU->setData(*trackCandidatesBuffers);
     }
 
+    // Pull nEligibleT5Modules from the device. Wait for the enqueued copy before dereferencing the host buffer.
+    auto nEligibleModules_buf = allocBufWrapper<uint16_t>(devHost, 1);
+    alpaka::memcpy(queue, nEligibleModules_buf, rangesBuffers->nEligibleT5Modules_buf, 1);
+    alpaka::wait(queue);
+    uint16_t nEligibleModules = *alpaka::getPtrNative(nEligibleModules_buf);
     Vec const threadsPerBlock_crossCleanpT3(static_cast<Idx>(1), static_cast<Idx>(16), static_cast<Idx>(64));
     Vec const blocksPerGrid_crossCleanpT3(static_cast<Idx>(1), static_cast<Idx>(4), static_cast<Idx>(20));
     WorkDiv const crossCleanpT3_workDiv(blocksPerGrid_crossCleanpT3, blocksPerGrid_crossCleanpT3, elementsPerThread);
@@ -843,7 +823,6 @@ void SDL::Event::createTrackCandidates()
 
     alpaka::enqueue(queue, crossCleanpT3Task);
 
-    //adding objects
     Vec const threadsPerBlock_addpT3asTrackCandidatesInGPU(static_cast<Idx>(1), static_cast<Idx>(1), static_cast<Idx>(512));
     Vec const blocksPerGrid_addpT3asTrackCandidatesInGPU(static_cast<Idx>(1), static_cast<Idx>(1), static_cast<Idx>(1));
     WorkDiv const addpT3asTrackCandidatesInGPU_workDiv(blocksPerGrid_addpT3asTrackCandidatesInGPU, threadsPerBlock_addpT3asTrackCandidatesInGPU, elementsPerThread);
@@ -962,30 +941,27 @@ void SDL::Event::createPixelTriplets()
         pixelTripletsInGPU->setData(*pixelTripletsBuffers);
     }
 
-    unsigned int pixelModuleIndex = nLowerModules;
-    int* superbins;
-    int8_t* pixelTypes;
-    unsigned int *nTriplets;
-    unsigned int nInnerSegments = 0;
-    cudaMemcpyAsync(&nInnerSegments, &(segmentsInGPU->nSegments[pixelModuleIndex]), sizeof(int), cudaMemcpyDeviceToHost,stream);
-    nTriplets = (unsigned int*)cms::cuda::allocate_host(nLowerModules * sizeof(unsigned int), stream);
-    cudaMemcpyAsync(nTriplets, tripletsInGPU->nTriplets, nLowerModules * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream);
-    superbins = (int*)cms::cuda::allocate_host(N_MAX_PIXEL_SEGMENTS_PER_MODULE*sizeof(int), stream);
-    pixelTypes = (int8_t*)cms::cuda::allocate_host(N_MAX_PIXEL_SEGMENTS_PER_MODULE*sizeof(int8_t), stream);
-
-    cudaMemcpyAsync(superbins,segmentsInGPU->superbin,N_MAX_PIXEL_SEGMENTS_PER_MODULE*sizeof(int),cudaMemcpyDeviceToHost,stream);
-    cudaMemcpyAsync(pixelTypes,segmentsInGPU->pixelType,N_MAX_PIXEL_SEGMENTS_PER_MODULE*sizeof(int8_t),cudaMemcpyDeviceToHost,stream);
-
-    unsigned int* connectedPixelSize_host;
-    unsigned int* connectedPixelIndex_host;
-    connectedPixelSize_host = (unsigned int*)cms::cuda::allocate_host(nInnerSegments* sizeof(unsigned int), stream);
-    connectedPixelIndex_host = (unsigned int*)cms::cuda::allocate_host(nInnerSegments* sizeof(unsigned int), stream);
-    unsigned int* connectedPixelSize_dev;
-    unsigned int* connectedPixelIndex_dev;
-    connectedPixelSize_dev = (unsigned int*)cms::cuda::allocate_device(dev, nInnerSegments*sizeof(unsigned int), stream);
-    connectedPixelIndex_dev = (unsigned int*)cms::cuda::allocate_device(dev, nInnerSegments*sizeof(unsigned int), stream);
+    unsigned int nInnerSegments = 0; // defined fallback (as before this patch) if the unchecked copy below fails
+    cudaMemcpyAsync(&nInnerSegments, &(segmentsInGPU->nSegments[nLowerModules]), sizeof(unsigned int), cudaMemcpyDeviceToHost,stream);
+
+    auto superbins_buf = allocBufWrapper<int>(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
+    auto pixelTypes_buf = allocBufWrapper<int8_t>(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
+
+    alpaka::memcpy(queue, superbins_buf, segmentsBuffers->superbin_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
+    alpaka::memcpy(queue, pixelTypes_buf, segmentsBuffers->pixelType_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
+    alpaka::wait(queue);
+
+    auto connectedPixelSize_host_buf = allocBufWrapper<unsigned int>(devHost, nInnerSegments);
+    auto connectedPixelIndex_host_buf = allocBufWrapper<unsigned int>(devHost, nInnerSegments);
+    auto connectedPixelSize_dev_buf = allocBufWrapper<unsigned int>(devAcc, nInnerSegments);
+    auto connectedPixelIndex_dev_buf = allocBufWrapper<unsigned int>(devAcc, nInnerSegments);
+
+    int* superbins = alpaka::getPtrNative(superbins_buf);
+    int8_t* pixelTypes = alpaka::getPtrNative(pixelTypes_buf);
+    unsigned int* connectedPixelSize_host = alpaka::getPtrNative(connectedPixelSize_host_buf);
+    unsigned int* connectedPixelIndex_host = alpaka::getPtrNative(connectedPixelIndex_host_buf);
+    alpaka::wait(queue);
 
-    cudaStreamSynchronize(stream);
     int pixelIndexOffsetPos = pixelMapping->connectedPixelsIndex[44999] + pixelMapping->connectedPixelsSizes[44999];
     int pixelIndexOffsetNeg = pixelMapping->connectedPixelsIndexPos[44999] + pixelMapping->connectedPixelsSizes[44999] + pixelIndexOffsetPos;
 
@@ -993,8 +969,8 @@ void SDL::Event::createPixelTriplets()
     // the current selection still leaves a significant fraction of unmatchable pLSs
     for (unsigned int i = 0; i < nInnerSegments; i++)
     {// loop over # pLS
-        int8_t pixelType = pixelTypes[i];// get pixel type for this pLS
-        int superbin = superbins[i]; //get superbin for this pixel
+        int8_t pixelType = pixelTypes[i];// Get pixel type for this pLS
+        int superbin = superbins[i]; // Get superbin for this pixel
         if((superbin < 0) or (superbin >= 45000) or (pixelType > 2) or (pixelType < 0))
         {
             connectedPixelSize_host[i] = 0;
@@ -1002,37 +978,30 @@ void SDL::Event::createPixelTriplets()
             continue;
         }
 
-        if(pixelType ==0)
-        { // used pixel type to select correct size-index arrays
-            connectedPixelSize_host[i]  = pixelMapping->connectedPixelsSizes[superbin]; //number of connected modules to this pixel
+        // Used pixel type to select correct size-index arrays
+        if(pixelType == 0)
+        {
+            connectedPixelSize_host[i]  = pixelMapping->connectedPixelsSizes[superbin]; // number of connected modules to this pixel
             auto connectedIdxBase = pixelMapping->connectedPixelsIndex[superbin];
-            connectedPixelIndex_host[i] = connectedIdxBase;// index to get start of connected modules for this superbin in map
-            // printf("i %d out of nInnerSegments %d type %d superbin %d connectedPixelIndex %d connectedPixelSize %d\n",
-            //        i, nInnerSegments, pixelType, superbin, connectedPixelIndex_host[i], connectedPixelSize_host[i]);
+            connectedPixelIndex_host[i] = connectedIdxBase; // index to get start of connected modules for this superbin in map
         }
-        else if(pixelType ==1)
+        else if(pixelType == 1)
         {
-            connectedPixelSize_host[i] = pixelMapping->connectedPixelsSizesPos[superbin]; //number of pixel connected modules
+            connectedPixelSize_host[i] = pixelMapping->connectedPixelsSizesPos[superbin]; // number of pixel connected modules
             auto connectedIdxBase = pixelMapping->connectedPixelsIndexPos[superbin]+pixelIndexOffsetPos;
-            connectedPixelIndex_host[i] = connectedIdxBase;// index to get start of connected pixel modules
+            connectedPixelIndex_host[i] = connectedIdxBase; // index to get start of connected pixel modules
         }
-        else if(pixelType ==2)
+        else if(pixelType == 2)
         {
-            connectedPixelSize_host[i] = pixelMapping->connectedPixelsSizesNeg[superbin]; //number of pixel connected modules
+            connectedPixelSize_host[i] = pixelMapping->connectedPixelsSizesNeg[superbin]; // number of pixel connected modules
             auto connectedIdxBase = pixelMapping->connectedPixelsIndexNeg[superbin] + pixelIndexOffsetNeg;
-            connectedPixelIndex_host[i] = connectedIdxBase;// index to get start of connected pixel modules
+            connectedPixelIndex_host[i] = connectedIdxBase; // index to get start of connected pixel modules
         }
     }
 
-    cudaMemcpyAsync(connectedPixelSize_dev, connectedPixelSize_host, nInnerSegments*sizeof(unsigned int), cudaMemcpyHostToDevice,stream);
-    cudaMemcpyAsync(connectedPixelIndex_dev, connectedPixelIndex_host, nInnerSegments*sizeof(unsigned int), cudaMemcpyHostToDevice,stream);
-    cudaStreamSynchronize(stream);
-
-    cms::cuda::free_host(connectedPixelSize_host);
-    cms::cuda::free_host(connectedPixelIndex_host);
-    cms::cuda::free_host(superbins);
-    cms::cuda::free_host(pixelTypes);
-    cms::cuda::free_host(nTriplets);
+    alpaka::memcpy(queue, connectedPixelSize_dev_buf, connectedPixelSize_host_buf, nInnerSegments);
+    alpaka::memcpy(queue, connectedPixelIndex_dev_buf, connectedPixelIndex_host_buf, nInnerSegments);
+    alpaka::wait(queue);
 
     Vec const threadsPerBlock(static_cast<Idx>(1), static_cast<Idx>(4), static_cast<Idx>(32));
     Vec const blocksPerGrid(static_cast<Idx>(16 /* above median of connected modules*/), static_cast<Idx>(4096), static_cast<Idx>(1));
@@ -1048,21 +1017,20 @@ void SDL::Event::createPixelTriplets()
         *segmentsInGPU,
         *tripletsInGPU,
         *pixelTripletsInGPU,
-        connectedPixelSize_dev,
-        connectedPixelIndex_dev,
+        alpaka::getPtrNative(connectedPixelSize_dev_buf),
+        alpaka::getPtrNative(connectedPixelIndex_dev_buf),
         nInnerSegments));
 
     alpaka::enqueue(queue, createPixelTripletsInGPUFromMapv2Task);
     alpaka::wait(queue);
 
-    cms::cuda::free_device(dev, connectedPixelSize_dev);
-    cms::cuda::free_device(dev, connectedPixelIndex_dev);
-
 #ifdef Warnings
-    int nPixelTriplets;
-    cudaMemcpyAsync(&nPixelTriplets, pixelTripletsInGPU->nPixelTriplets,  sizeof(int), cudaMemcpyDeviceToHost,stream);
-    cudaStreamSynchronize(stream);
-    std::cout<<"number of pixel triplets = "<<nPixelTriplets<<std::endl;
+    auto nPixelTriplets_buf = allocBufWrapper<int>(devHost, 1);
+
+    alpaka::memcpy(queue, nPixelTriplets_buf, pixelTripletsBuffers->nPixelTriplets_buf, 1);
+    alpaka::wait(queue);
+
+    std::cout << "number of pixel triplets = " << *alpaka::getPtrNative(nPixelTriplets_buf) << std::endl;
 #endif
 
     //pT3s can be cleaned here because they're not used in making pT5s!
@@ -1084,9 +1052,6 @@ void SDL::Event::createPixelTriplets()
 
 void SDL::Event::createQuintuplets()
 {
-    uint16_t nEligibleT5Modules = 0;
-    unsigned int nTotalQuintuplets;
-
     Vec const threadsPerBlockCreateQuints(static_cast<Idx>(1), static_cast<Idx>(1), static_cast<Idx>(1024));
     Vec const blocksPerGridCreateQuints(static_cast<Idx>(1), static_cast<Idx>(1), static_cast<Idx>(1));
     WorkDiv const createEligibleModulesListForQuintupletsGPU_workDiv(blocksPerGridCreateQuints, threadsPerBlockCreateQuints, elementsPerThread);
@@ -1102,9 +1067,15 @@ void SDL::Event::createQuintuplets()
     alpaka::enqueue(queue, createEligibleModulesListForQuintupletsGPUTask);
     alpaka::wait(queue);
 
-    cudaMemcpyAsync(&nEligibleT5Modules,rangesInGPU->nEligibleT5Modules,sizeof(uint16_t),cudaMemcpyDeviceToHost,stream);
-    cudaMemcpyAsync(&nTotalQuintuplets,rangesInGPU->device_nTotalQuints,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream);
-    cudaStreamSynchronize(stream);
+    auto nEligibleT5Modules_buf = allocBufWrapper<uint16_t>(devHost, 1);
+    auto nTotalQuintuplets_buf = allocBufWrapper<unsigned int>(devHost, 1);
+
+    alpaka::memcpy(queue, nEligibleT5Modules_buf, rangesBuffers->nEligibleT5Modules_buf, 1);
+    alpaka::memcpy(queue, nTotalQuintuplets_buf, rangesBuffers->device_nTotalQuints_buf, 1);
+    alpaka::wait(queue);
+
+    uint16_t nEligibleT5Modules = *alpaka::getPtrNative(nEligibleT5Modules_buf);
+    unsigned int nTotalQuintuplets = *alpaka::getPtrNative(nTotalQuintuplets_buf);
 
     if(quintupletsInGPU == nullptr)
     {
@@ -1112,8 +1083,8 @@ void SDL::Event::createQuintuplets()
         quintupletsBuffers = new SDL::quintupletsBuffer<Acc>(nTotalQuintuplets, nLowerModules, devAcc, queue);
         quintupletsInGPU->setData(*quintupletsBuffers);
 
-        cudaMemcpyAsync(quintupletsInGPU->nMemoryLocations, &nTotalQuintuplets, sizeof(unsigned int), cudaMemcpyHostToDevice, stream);
-        cudaStreamSynchronize(stream);
+        alpaka::memcpy(queue, quintupletsBuffers->nMemoryLocations_buf, nTotalQuintuplets_buf, 1);
+        alpaka::wait(queue);
     }
 
     Vec const threadsPerBlockQuints(static_cast<Idx>(1), static_cast<Idx>(8), static_cast<Idx>(32));
@@ -1200,64 +1171,57 @@ void SDL::Event::createPixelQuintuplets()
         trackCandidatesInGPU = new SDL::trackCandidates();
         trackCandidatesBuffers = new SDL::trackCandidatesBuffer<Acc>(N_MAX_TRACK_CANDIDATES + N_MAX_PIXEL_TRACK_CANDIDATES, devAcc, queue);
         trackCandidatesInGPU->setData(*trackCandidatesBuffers);
-    } 
-
-    unsigned int pixelModuleIndex;
-    int* superbins;
-    int8_t* pixelTypes;
-    int *nQuintuplets;
+    }
 
-    unsigned int* connectedPixelSize_host;
-    unsigned int* connectedPixelIndex_host;
-    unsigned int* connectedPixelSize_dev;
-    unsigned int* connectedPixelIndex_dev;
+    unsigned int nInnerSegments = 0; // defined fallback (as before this patch) if the unchecked copy below fails
+    cudaMemcpyAsync(&nInnerSegments, &(segmentsInGPU->nSegments[nLowerModules]), sizeof(unsigned int), cudaMemcpyDeviceToHost,stream);
 
-    nQuintuplets = (int*)cms::cuda::allocate_host(nLowerModules * sizeof(int), stream);
-    cudaMemcpyAsync(nQuintuplets, quintupletsInGPU->nQuintuplets, nLowerModules * sizeof(int), cudaMemcpyDeviceToHost,stream);
+    auto superbins_buf = allocBufWrapper<int>(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
+    auto pixelTypes_buf = allocBufWrapper<int8_t>(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
 
-    superbins = (int*)cms::cuda::allocate_host(N_MAX_PIXEL_SEGMENTS_PER_MODULE*sizeof(int), stream);
-    pixelTypes = (int8_t*)cms::cuda::allocate_host(N_MAX_PIXEL_SEGMENTS_PER_MODULE*sizeof(int8_t), stream);
+    alpaka::memcpy(queue, superbins_buf, segmentsBuffers->superbin_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
+    alpaka::memcpy(queue, pixelTypes_buf, segmentsBuffers->pixelType_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
+    alpaka::wait(queue);
 
-    cudaMemcpyAsync(superbins,segmentsInGPU->superbin,N_MAX_PIXEL_SEGMENTS_PER_MODULE*sizeof(int),cudaMemcpyDeviceToHost,stream);
-    cudaMemcpyAsync(pixelTypes,segmentsInGPU->pixelType,N_MAX_PIXEL_SEGMENTS_PER_MODULE*sizeof(int8_t),cudaMemcpyDeviceToHost,stream);
+    auto connectedPixelSize_host_buf = allocBufWrapper<unsigned int>(devHost, nInnerSegments);
+    auto connectedPixelIndex_host_buf = allocBufWrapper<unsigned int>(devHost, nInnerSegments);
+    auto connectedPixelSize_dev_buf = allocBufWrapper<unsigned int>(devAcc, nInnerSegments);
+    auto connectedPixelIndex_dev_buf = allocBufWrapper<unsigned int>(devAcc, nInnerSegments);
 
-    cudaStreamSynchronize(stream);
-    pixelModuleIndex = nLowerModules;
-    unsigned int nInnerSegments = 0;
-    cudaMemcpyAsync(&nInnerSegments, &(segmentsInGPU->nSegments[pixelModuleIndex]), sizeof(unsigned int), cudaMemcpyDeviceToHost,stream);
-    connectedPixelSize_host = (unsigned int*)cms::cuda::allocate_host(nInnerSegments* sizeof(unsigned int), stream);
-    connectedPixelIndex_host = (unsigned int*)cms::cuda::allocate_host(nInnerSegments* sizeof(unsigned int), stream);
-    connectedPixelSize_dev = (unsigned int*)cms::cuda::allocate_device(dev,nInnerSegments* sizeof(unsigned int),stream);
-    connectedPixelIndex_dev = (unsigned int*)cms::cuda::allocate_device(dev,nInnerSegments* sizeof(unsigned int),stream);
-    cudaStreamSynchronize(stream);
+    int* superbins = alpaka::getPtrNative(superbins_buf);
+    int8_t* pixelTypes = alpaka::getPtrNative(pixelTypes_buf);
+    unsigned int* connectedPixelSize_host = alpaka::getPtrNative(connectedPixelSize_host_buf);
+    unsigned int* connectedPixelIndex_host = alpaka::getPtrNative(connectedPixelIndex_host_buf);
+    alpaka::wait(queue);
 
     int pixelIndexOffsetPos = pixelMapping->connectedPixelsIndex[44999] + pixelMapping->connectedPixelsSizes[44999];
     int pixelIndexOffsetNeg = pixelMapping->connectedPixelsIndexPos[44999] + pixelMapping->connectedPixelsSizes[44999] + pixelIndexOffsetPos;
 
+    // Loop over # pLS
     for (unsigned int i = 0; i < nInnerSegments; i++)
-    {// loop over # pLS
-        int8_t pixelType = pixelTypes[i];// get pixel type for this pLS
-        int superbin = superbins[i]; //get superbin for this pixel
+    {
+        int8_t pixelType = pixelTypes[i];// Get pixel type for this pLS
+        int superbin = superbins[i]; // Get superbin for this pixel
         if((superbin < 0) or (superbin >= 45000) or (pixelType > 2) or (pixelType < 0))
         {
             connectedPixelIndex_host[i] = 0;
             connectedPixelSize_host[i] = 0;
             continue;
         }
-
-        if(pixelType ==0)
-        { // used pixel type to select correct size-index arrays
+        // Used pixel type to select correct size-index arrays
+        if(pixelType == 0)
+        {
             connectedPixelSize_host[i]  = pixelMapping->connectedPixelsSizes[superbin]; //number of connected modules to this pixel
             unsigned int connectedIdxBase = pixelMapping->connectedPixelsIndex[superbin];
             connectedPixelIndex_host[i] = connectedIdxBase;
         }
-        else if(pixelType ==1)
+        else if(pixelType == 1)
         {
             connectedPixelSize_host[i] = pixelMapping->connectedPixelsSizesPos[superbin]; //number of pixel connected modules
             unsigned int connectedIdxBase = pixelMapping->connectedPixelsIndexPos[superbin]+pixelIndexOffsetPos;
             connectedPixelIndex_host[i] = connectedIdxBase;
         }
-        else if(pixelType ==2)
+        else if(pixelType == 2)
         {
             connectedPixelSize_host[i] = pixelMapping->connectedPixelsSizesNeg[superbin]; //number of pixel connected modules
             unsigned int connectedIdxBase = pixelMapping->connectedPixelsIndexNeg[superbin] + pixelIndexOffsetNeg;
@@ -1265,9 +1229,9 @@ void SDL::Event::createPixelQuintuplets()
         }
     }
 
-    cudaMemcpyAsync(connectedPixelSize_dev, connectedPixelSize_host, nInnerSegments*sizeof(unsigned int), cudaMemcpyHostToDevice,stream);
-    cudaMemcpyAsync(connectedPixelIndex_dev, connectedPixelIndex_host, nInnerSegments*sizeof(unsigned int), cudaMemcpyHostToDevice,stream);
-    cudaStreamSynchronize(stream);
+    alpaka::memcpy(queue, connectedPixelSize_dev_buf, connectedPixelSize_host_buf, nInnerSegments);
+    alpaka::memcpy(queue, connectedPixelIndex_dev_buf, connectedPixelIndex_host_buf, nInnerSegments);
+    alpaka::wait(queue);
 
     Vec const threadsPerBlockCreatePixQuints(static_cast<Idx>(1), static_cast<Idx>(16), static_cast<Idx>(16));
     Vec const blocksPerGridCreatePixQuints(static_cast<Idx>(16), static_cast<Idx>(MAX_BLOCKS), static_cast<Idx>(1));
@@ -1283,21 +1247,12 @@ void SDL::Event::createPixelQuintuplets()
         *tripletsInGPU,
         *quintupletsInGPU,
         *pixelQuintupletsInGPU,
-        connectedPixelSize_dev,
-        connectedPixelIndex_dev,
+        alpaka::getPtrNative(connectedPixelSize_dev_buf),
+        alpaka::getPtrNative(connectedPixelIndex_dev_buf),
         nInnerSegments,
         *rangesInGPU));
 
     alpaka::enqueue(queue, createPixelQuintupletsInGPUFromMapv2Task);
-    alpaka::wait(queue);
-
-    cms::cuda::free_host(superbins);
-    cms::cuda::free_host(pixelTypes);
-    cms::cuda::free_host(nQuintuplets);
-    cms::cuda::free_host(connectedPixelSize_host);
-    cms::cuda::free_host(connectedPixelIndex_host);
-    cms::cuda::free_device(dev, connectedPixelSize_dev);
-    cms::cuda::free_device(dev, connectedPixelIndex_dev);
 
     Vec const threadsPerBlockDupPix(static_cast<Idx>(1), static_cast<Idx>(16), static_cast<Idx>(16));
     Vec const blocksPerGridDupPix(static_cast<Idx>(1), static_cast<Idx>(MAX_BLOCKS), static_cast<Idx>(1));
@@ -1311,7 +1266,6 @@ void SDL::Event::createPixelQuintuplets()
         false));
 
     alpaka::enqueue(queue, removeDupPixelQuintupletsInGPUFromMapTask);
-    alpaka::wait(queue);
 
     Vec const threadsPerBlockAddpT5asTrackCan(static_cast<Idx>(1), static_cast<Idx>(1), static_cast<Idx>(256));
     Vec const blocksPerGridAddpT5asTrackCan(static_cast<Idx>(1), static_cast<Idx>(1), static_cast<Idx>(1));
@@ -1329,12 +1283,15 @@ void SDL::Event::createPixelQuintuplets()
 
     alpaka::enqueue(queue, addpT5asTrackCandidateInGPUTask);
     alpaka::wait(queue);
+
 #ifdef Warnings
-    int nPixelQuintuplets;
-    cudaMemcpyAsync(&nPixelQuintuplets, &(pixelQuintupletsInGPU->nPixelQuintuplets), sizeof(int), cudaMemcpyDeviceToHost,stream);
-    cudaStreamSynchronize(stream);
-    std::cout<<"number of pixel quintuplets = "<<nPixelQuintuplets<<std::endl;
-#endif   
+    auto nPixelQuintuplets_buf = allocBufWrapper<int>(devHost, 1);
+
+    alpaka::memcpy(queue, nPixelQuintuplets_buf, pixelQuintupletsBuffers->nPixelQuintuplets_buf, 1);
+    alpaka::wait(queue);
+
+    std::cout << "number of pixel quintuplets = " << *alpaka::getPtrNative(nPixelQuintuplets_buf) << std::endl;
+#endif
 }
 
 void SDL::Event::addQuintupletsToEventExplicit()
@@ -1541,17 +1498,25 @@ unsigned int SDL::Event::getNumberOfTripletsByLayerEndcap(unsigned int layer)
 
 int SDL::Event::getNumberOfPixelTriplets()
 {
-    int nPixelTriplets;
-    cudaMemcpyAsync(&nPixelTriplets, pixelTripletsInGPU->nPixelTriplets, sizeof(int), cudaMemcpyDeviceToHost,stream);
-    cudaStreamSynchronize(stream);
+    auto nPixelTriplets_buf = allocBufWrapper<int>(devHost, 1);
+
+    alpaka::memcpy(queue, nPixelTriplets_buf, pixelTripletsBuffers->nPixelTriplets_buf, 1);
+    alpaka::wait(queue);
+
+    int nPixelTriplets = *alpaka::getPtrNative(nPixelTriplets_buf);
+
     return nPixelTriplets;
 }
 
 int SDL::Event::getNumberOfPixelQuintuplets()
 {
-    int nPixelQuintuplets;
-    cudaMemcpyAsync(&nPixelQuintuplets, pixelQuintupletsInGPU->nPixelQuintuplets, sizeof(int), cudaMemcpyDeviceToHost,stream);
-    cudaStreamSynchronize(stream);
+    auto nPixelQuintuplets_buf = allocBufWrapper<int>(devHost, 1);
+
+    alpaka::memcpy(queue, nPixelQuintuplets_buf, pixelQuintupletsBuffers->nPixelQuintuplets_buf, 1);
+    alpaka::wait(queue);
+
+    int nPixelQuintuplets = *alpaka::getPtrNative(nPixelQuintuplets_buf);
+
     return nPixelQuintuplets;
 }
 
@@ -1589,57 +1554,78 @@ unsigned int SDL::Event::getNumberOfQuintupletsByLayerEndcap(unsigned int layer)
 }
 
 int SDL::Event::getNumberOfTrackCandidates()
-{    
-    int nTrackCandidates;
-    cudaMemcpyAsync(&nTrackCandidates, trackCandidatesInGPU->nTrackCandidates, sizeof(int), cudaMemcpyDeviceToHost,stream);
-    cudaStreamSynchronize(stream);
+{
+    auto nTrackCandidates_buf = allocBufWrapper<int>(devHost, 1);
+
+    alpaka::memcpy(queue, nTrackCandidates_buf, trackCandidatesBuffers->nTrackCandidates_buf, 1);
+    alpaka::wait(queue);
+
+    int nTrackCandidates = *alpaka::getPtrNative(nTrackCandidates_buf);
 
     return nTrackCandidates;
 }
 
 int SDL::Event::getNumberOfPT5TrackCandidates()
 {
-    int nTrackCandidatesPT5;
-    cudaMemcpyAsync(&nTrackCandidatesPT5, trackCandidatesInGPU->nTrackCandidatespT5, sizeof(int), cudaMemcpyDeviceToHost,stream);
-    cudaStreamSynchronize(stream);
+    auto nTrackCandidatesPT5_buf = allocBufWrapper<int>(devHost, 1);
+
+    alpaka::memcpy(queue, nTrackCandidatesPT5_buf, trackCandidatesBuffers->nTrackCandidatespT5_buf, 1);
+    alpaka::wait(queue);
+
+    int nTrackCandidatesPT5 = *alpaka::getPtrNative(nTrackCandidatesPT5_buf);
 
     return nTrackCandidatesPT5;
 }
 
 int SDL::Event::getNumberOfPT3TrackCandidates()
 {
-    int nTrackCandidatesPT3;
-    cudaMemcpyAsync(&nTrackCandidatesPT3, trackCandidatesInGPU->nTrackCandidatespT3, sizeof(int), cudaMemcpyDeviceToHost,stream);
-    cudaStreamSynchronize(stream);
+    auto nTrackCandidatesPT3_buf = allocBufWrapper<int>(devHost, 1);
+
+    alpaka::memcpy(queue, nTrackCandidatesPT3_buf, trackCandidatesBuffers->nTrackCandidatespT3_buf, 1);
+    alpaka::wait(queue);
+
+    int nTrackCandidatesPT3 = *alpaka::getPtrNative(nTrackCandidatesPT3_buf);
 
     return nTrackCandidatesPT3;
 }
 
 int SDL::Event::getNumberOfPLSTrackCandidates()
 {
-    unsigned int nTrackCandidatesPLS;
-    cudaMemcpyAsync(&nTrackCandidatesPLS, trackCandidatesInGPU->nTrackCandidatespLS, sizeof(int), cudaMemcpyDeviceToHost,stream);
-    cudaStreamSynchronize(stream);
+    auto nTrackCandidatesPLS_buf = allocBufWrapper<int>(devHost, 1);
+
+    alpaka::memcpy(queue, nTrackCandidatesPLS_buf, trackCandidatesBuffers->nTrackCandidatespLS_buf, 1);
+    alpaka::wait(queue);
+
+    unsigned int nTrackCandidatesPLS = *alpaka::getPtrNative(nTrackCandidatesPLS_buf);
 
     return nTrackCandidatesPLS;
 }
 
 int SDL::Event::getNumberOfPixelTrackCandidates()
 {
-    int nTrackCandidates;
-    int nTrackCandidatesT5;
-    cudaMemcpyAsync(&nTrackCandidates, trackCandidatesInGPU->nTrackCandidates, sizeof(int), cudaMemcpyDeviceToHost,stream);
-    cudaMemcpyAsync(&nTrackCandidatesT5, trackCandidatesInGPU->nTrackCandidatesT5, sizeof(int), cudaMemcpyDeviceToHost,stream);
-    cudaStreamSynchronize(stream);
+    auto nTrackCandidates_buf = allocBufWrapper<int>(devHost, 1);
+    auto nTrackCandidatesT5_buf = allocBufWrapper<int>(devHost, 1);
+
+    alpaka::memcpy(queue, nTrackCandidates_buf, trackCandidatesBuffers->nTrackCandidates_buf, 1);
+    alpaka::memcpy(queue, nTrackCandidatesT5_buf, trackCandidatesBuffers->nTrackCandidatesT5_buf, 1);
+    alpaka::wait(queue);
+
+    int nTrackCandidates = *alpaka::getPtrNative(nTrackCandidates_buf);
+    int nTrackCandidatesT5 = *alpaka::getPtrNative(nTrackCandidatesT5_buf);
 
     return nTrackCandidates - nTrackCandidatesT5;
 }
 
 int SDL::Event::getNumberOfT5TrackCandidates()
 {
-    int nTrackCandidatesT5;
-    cudaMemcpyAsync(&nTrackCandidatesT5, trackCandidatesInGPU->nTrackCandidatesT5, sizeof(int), cudaMemcpyDeviceToHost,stream);
-    return nTrackCandidatesT5; 
+    auto nTrackCandidatesT5_buf = allocBufWrapper<int>(devHost, 1);
+
+    alpaka::memcpy(queue, nTrackCandidatesT5_buf, trackCandidatesBuffers->nTrackCandidatesT5_buf, 1);
+    alpaka::wait(queue);
+
+    int nTrackCandidatesT5 = *alpaka::getPtrNative(nTrackCandidatesT5_buf);
+
+    return nTrackCandidatesT5;
 }
 
 SDL::hitsBuffer<alpaka::DevCpu>* SDL::Event::getHits() //std::shared_ptr should take care of garbage collection
diff --git a/SDL/Event.cuh b/SDL/Event.cuh
index b512b469..f5a671bf 100644
--- a/SDL/Event.cuh
+++ b/SDL/Event.cuh
@@ -161,6 +161,6 @@ namespace SDL
     void initModules(const char* moduleMetaDataFilePath="data/centroid.txt"); //read from file and init
     void cleanModules();
     void initModulesHost(); //read from file and init
-    extern struct pixelMap* pixelMapping;
+    extern std::unique_ptr<SDL::pixelMap> pixelMapping;
 }
 #endif
diff --git a/SDL/Kernels.cuh b/SDL/Kernels.cuh
index 8fd7d952..51ff3e95 100644
--- a/SDL/Kernels.cuh
+++ b/SDL/Kernels.cuh
@@ -182,15 +182,15 @@ namespace SDL
         int nMatched = 0;
         for (int i = 0; i < 6; i++)
         {
-            bool matched = false;
+            bool tmatched = false;
             for (int j = 0; j < 6; j++)
             {
                 if(hits1[i] == hits2[j])
                 {
-                    matched = true; break;
+                    tmatched = true; break;
                 }
             }
-            if(matched)
+            if(tmatched)
             {
                 nMatched++;
             }
@@ -202,6 +202,7 @@ namespace SDL
 
     struct removeDupQuintupletsInGPUAfterBuild
     {
+        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -272,6 +273,7 @@ namespace SDL
 
     struct removeDupQuintupletsInGPUBeforeTC
     {
+        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -357,6 +359,7 @@ namespace SDL
 
     struct removeDupPixelTripletsInGPUFromMap
     {
+        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -404,6 +407,7 @@ namespace SDL
 
     struct removeDupPixelQuintupletsInGPUFromMap
     {
+        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -448,6 +452,7 @@ namespace SDL
 
     struct checkHitspLS
     {
+        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
diff --git a/SDL/MiniDoublet.cuh b/SDL/MiniDoublet.cuh
index a75dcfb0..74897eb3 100644
--- a/SDL/MiniDoublet.cuh
+++ b/SDL/MiniDoublet.cuh
@@ -137,45 +137,45 @@ namespace SDL
         Buf<TAcc, float> outerLowEdgeY_buf;
 
         template<typename TQueue, typename TDevAcc>
-        miniDoubletsBuffer(unsigned int nMemoryLocations,
+        miniDoubletsBuffer(unsigned int nMemoryLoc,
                            uint16_t nLowerModules,
                            TDevAcc const & devAccIn,
                            TQueue& queue) :
             nMemoryLocations_buf(allocBufWrapper<unsigned int>(devAccIn, 1)),
-            anchorHitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, nMemoryLocations)),
-            outerHitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, nMemoryLocations)),
-            moduleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, nMemoryLocations)),
+            anchorHitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, nMemoryLoc)),
+            outerHitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, nMemoryLoc)),
+            moduleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, nMemoryLoc)),
             nMDs_buf(allocBufWrapper<int>(devAccIn, nLowerModules+1)),
             totOccupancyMDs_buf(allocBufWrapper<int>(devAccIn, nLowerModules+1)),
-            dphichanges_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
-            dzs_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
-            dphis_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
-            shiftedXs_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
-            shiftedYs_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
-            shiftedZs_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
-            noShiftedDzs_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
-            noShiftedDphis_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
-            noShiftedDphiChanges_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
-            anchorX_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
-            anchorY_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
-            anchorZ_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
-            anchorRt_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
-            anchorPhi_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
-            anchorEta_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
-            anchorHighEdgeX_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
-            anchorHighEdgeY_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
-            anchorLowEdgeX_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
-            anchorLowEdgeY_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
-            outerX_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
-            outerY_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
-            outerZ_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
-            outerRt_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
-            outerPhi_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
-            outerEta_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
-            outerHighEdgeX_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
-            outerHighEdgeY_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
-            outerLowEdgeX_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations)),
-            outerLowEdgeY_buf(allocBufWrapper<float>(devAccIn, nMemoryLocations))
+            dphichanges_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
+            dzs_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
+            dphis_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
+            shiftedXs_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
+            shiftedYs_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
+            shiftedZs_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
+            noShiftedDzs_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
+            noShiftedDphis_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
+            noShiftedDphiChanges_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
+            anchorX_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
+            anchorY_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
+            anchorZ_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
+            anchorRt_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
+            anchorPhi_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
+            anchorEta_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
+            anchorHighEdgeX_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
+            anchorHighEdgeY_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
+            anchorLowEdgeX_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
+            anchorLowEdgeY_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
+            outerX_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
+            outerY_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
+            outerZ_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
+            outerRt_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
+            outerPhi_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
+            outerEta_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
+            outerHighEdgeX_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
+            outerHighEdgeY_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
+            outerLowEdgeX_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
+            outerLowEdgeY_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc))
         {
             alpaka::memset(queue, nMDs_buf, 0, nLowerModules+1);
             alpaka::memset(queue, totOccupancyMDs_buf, 0, nLowerModules+1);
@@ -770,6 +770,7 @@ namespace SDL
 
     struct createMiniDoubletsInGPUv2
     {
+        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -843,6 +844,7 @@ namespace SDL
 
     struct createMDArrayRangesGPU
     {
+        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -912,6 +914,7 @@ namespace SDL
 
     struct addMiniDoubletRangesToEventExplicit
     {
+        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
diff --git a/SDL/Module.cu b/SDL/Module.cu
index 01c0b162..649995ba 100644
--- a/SDL/Module.cu
+++ b/SDL/Module.cu
@@ -39,7 +39,7 @@ void SDL::createModulesInExplicitMemory(struct modules& modulesInGPU,unsigned in
     cudaStreamSynchronize(stream);
 }
 
-void SDL::freeModules(struct modules& modulesInGPU, struct pixelMap& pixelMapping)
+void SDL::freeModules(struct modules& modulesInGPU)
 {
     cudaFree(modulesInGPU.detIds);
     cudaFree(modulesInGPU.moduleMap);
@@ -65,13 +65,6 @@ void SDL::freeModules(struct modules& modulesInGPU, struct pixelMap& pixelMappin
     cudaFree(modulesInGPU.moduleLayerType);
     cudaFree(modulesInGPU.connectedPixels);
     cudaFree(modulesInGPU.partnerModuleIndices);
-
-    cudaFreeHost(pixelMapping.connectedPixelsSizes);
-    cudaFreeHost(pixelMapping.connectedPixelsSizesPos);
-    cudaFreeHost(pixelMapping.connectedPixelsSizesNeg);
-    cudaFreeHost(pixelMapping.connectedPixelsIndex);
-    cudaFreeHost(pixelMapping.connectedPixelsIndexPos);
-    cudaFreeHost(pixelMapping.connectedPixelsIndexNeg);
 }
 
 void SDL::loadModulesFromFile(struct modules& modulesInGPU, uint16_t& nModules, uint16_t& nLowerModules, struct pixelMap& pixelMapping,cudaStream_t stream, const char* moduleMetaDataFilePath)
@@ -130,41 +123,43 @@ void SDL::loadModulesFromFile(struct modules& modulesInGPU, uint16_t& nModules,
     nModules = counter;
     //std::cout<<"Number of modules = "<<nModules<<std::endl;
     createModulesInExplicitMemory(modulesInGPU,nModules,stream);
-    unsigned int* host_detIds;
-    short* host_layers;
-    short* host_rings;
-    short* host_rods;
-    short* host_modules;
-    short* host_subdets;
-    short* host_sides;
-    float* host_eta;
-    float* host_r;
-    bool* host_isInverted;
-    bool* host_isLower;
-    bool* host_isAnchor;
-    ModuleType* host_moduleType;
-    ModuleLayerType* host_moduleLayerType;
-    float* host_slopes;
-    float* host_drdzs;
-    uint16_t* host_partnerModuleIndices;
-
-    host_detIds = (unsigned int*)cms::cuda::allocate_host(sizeof(unsigned int)*nModules, stream);
-    host_layers = (short*)cms::cuda::allocate_host(sizeof(short)*nModules, stream);
-    host_rings = (short*)cms::cuda::allocate_host(sizeof(short)*nModules, stream);
-    host_rods = (short*)cms::cuda::allocate_host(sizeof(short)*nModules, stream);
-    host_modules = (short*)cms::cuda::allocate_host(sizeof(short)*nModules, stream);
-    host_subdets = (short*)cms::cuda::allocate_host(sizeof(short)*nModules, stream);
-    host_sides = (short*)cms::cuda::allocate_host(sizeof(short)*nModules, stream);
-    host_eta = (float*)cms::cuda::allocate_host(sizeof(float)*nModules, stream);
-    host_r = (float*)cms::cuda::allocate_host(sizeof(float)*nModules, stream);
-    host_isInverted = (bool*)cms::cuda::allocate_host(sizeof(bool)*nModules, stream);
-    host_isLower = (bool*)cms::cuda::allocate_host(sizeof(bool)*nModules, stream);
-    host_isAnchor = (bool*)cms::cuda::allocate_host(sizeof(bool)*nModules, stream);
-    host_moduleType = (ModuleType*)cms::cuda::allocate_host(sizeof(ModuleType)*nModules, stream);
-    host_moduleLayerType = (ModuleLayerType*)cms::cuda::allocate_host(sizeof(ModuleLayerType)*nModules, stream);
-    host_slopes = (float*)cms::cuda::allocate_host(sizeof(float)*nModules, stream);
-    host_drdzs = (float*)cms::cuda::allocate_host(sizeof(float)*nModules, stream);
-    host_partnerModuleIndices = (uint16_t*)cms::cuda::allocate_host(sizeof(uint16_t) * nModules, stream);
+
+    auto detIds_buf = allocBufWrapper<unsigned int>(devHost, nModules);
+    auto layers_buf = allocBufWrapper<short>(devHost, nModules);
+    auto rings_buf = allocBufWrapper<short>(devHost, nModules);
+    auto rods_buf = allocBufWrapper<short>(devHost, nModules);
+    auto modules_buf = allocBufWrapper<short>(devHost, nModules);
+    auto subdets_buf = allocBufWrapper<short>(devHost, nModules);
+    auto sides_buf = allocBufWrapper<short>(devHost, nModules);
+    auto eta_buf = allocBufWrapper<float>(devHost, nModules);
+    auto r_buf = allocBufWrapper<float>(devHost, nModules);
+    auto isInverted_buf = allocBufWrapper<bool>(devHost, nModules);
+    auto isLower_buf = allocBufWrapper<bool>(devHost, nModules);
+    auto isAnchor_buf = allocBufWrapper<bool>(devHost, nModules);
+    auto moduleType_buf = allocBufWrapper<ModuleType>(devHost, nModules);
+    auto moduleLayerType_buf = allocBufWrapper<ModuleLayerType>(devHost, nModules);
+    auto slopes_buf = allocBufWrapper<float>(devHost, nModules);
+    auto drdzs_buf = allocBufWrapper<float>(devHost, nModules);
+    auto partnerModuleIndices_buf = allocBufWrapper<uint16_t>(devHost, nModules);
+
+    // Getting the underlying data pointers
+    unsigned int* host_detIds = alpaka::getPtrNative(detIds_buf);
+    short* host_layers = alpaka::getPtrNative(layers_buf);
+    short* host_rings = alpaka::getPtrNative(rings_buf);
+    short* host_rods = alpaka::getPtrNative(rods_buf);
+    short* host_modules = alpaka::getPtrNative(modules_buf);
+    short* host_subdets = alpaka::getPtrNative(subdets_buf);
+    short* host_sides = alpaka::getPtrNative(sides_buf);
+    float* host_eta = alpaka::getPtrNative(eta_buf);
+    float* host_r = alpaka::getPtrNative(r_buf);
+    bool* host_isInverted = alpaka::getPtrNative(isInverted_buf);
+    bool* host_isLower = alpaka::getPtrNative(isLower_buf);
+    bool* host_isAnchor = alpaka::getPtrNative(isAnchor_buf);
+    ModuleType* host_moduleType = alpaka::getPtrNative(moduleType_buf);
+    ModuleLayerType* host_moduleLayerType = alpaka::getPtrNative(moduleLayerType_buf);
+    float* host_slopes = alpaka::getPtrNative(slopes_buf);
+    float* host_drdzs = alpaka::getPtrNative(drdzs_buf);
+    uint16_t* host_partnerModuleIndices = alpaka::getPtrNative(partnerModuleIndices_buf);
     
     //reassign detIdToIndex indices here
     nLowerModules = (nModules - 1) / 2;
@@ -304,24 +299,6 @@ void SDL::loadModulesFromFile(struct modules& modulesInGPU, uint16_t& nModules,
     cudaMemcpyAsync(modulesInGPU.partnerModuleIndices, host_partnerModuleIndices, sizeof(uint16_t) * nModules, cudaMemcpyHostToDevice, stream);
     cudaStreamSynchronize(stream);
 
-    cms::cuda::free_host(host_detIds);
-    cms::cuda::free_host(host_layers);
-    cms::cuda::free_host(host_rings);
-    cms::cuda::free_host(host_rods);
-    cms::cuda::free_host(host_modules);
-    cms::cuda::free_host(host_subdets);
-    cms::cuda::free_host(host_sides);
-    cms::cuda::free_host(host_eta);
-    cms::cuda::free_host(host_r);
-    cms::cuda::free_host(host_isInverted);
-    cms::cuda::free_host(host_isLower);
-    cms::cuda::free_host(host_isAnchor);
-    cms::cuda::free_host(host_moduleType);
-    cms::cuda::free_host(host_moduleLayerType);
-    cms::cuda::free_host(host_slopes);
-    cms::cuda::free_host(host_drdzs);
-    cms::cuda::free_host(host_partnerModuleIndices);
-
     fillConnectedModuleArrayExplicit(modulesInGPU,nModules,stream);
     fillMapArraysExplicit(modulesInGPU, nModules, stream);
     fillPixelMap(modulesInGPU,pixelMapping,stream);
@@ -344,21 +321,14 @@ void SDL::fillConnectedModuleArray(struct modules& modulesInGPU, unsigned int nM
 
 void SDL::fillPixelMap(struct modules& modulesInGPU, struct pixelMap& pixelMapping,cudaStream_t stream)
 {
-    int size_superbins = 45000; //changed to 45000 to reduce memory useage on GPU
     std::vector<unsigned int> connectedModuleDetIds;
     std::vector<unsigned int> connectedModuleDetIds_pos;
     std::vector<unsigned int> connectedModuleDetIds_neg;
-    cudaMallocHost(&pixelMapping.connectedPixelsIndex,size_superbins * sizeof(unsigned int));
-    cudaMallocHost(&pixelMapping.connectedPixelsSizes,size_superbins * sizeof(unsigned int));
-    cudaMallocHost(&pixelMapping.connectedPixelsIndexPos,size_superbins * sizeof(unsigned int));
-    cudaMallocHost(&pixelMapping.connectedPixelsSizesPos,size_superbins * sizeof(unsigned int));
-    cudaMallocHost(&pixelMapping.connectedPixelsIndexNeg,size_superbins * sizeof(unsigned int));
-    cudaMallocHost(&pixelMapping.connectedPixelsSizesNeg,size_superbins * sizeof(unsigned int));
-
-    int totalSizes=0;
-    int totalSizes_pos=0;
-    int totalSizes_neg=0;
-    for(int isuperbin =0; isuperbin<size_superbins; isuperbin++)
+
+    int totalSizes = 0;
+    int totalSizes_pos = 0;
+    int totalSizes_neg = 0;
+    for(unsigned int isuperbin = 0; isuperbin < size_superbins; isuperbin++)
     {
         std::vector<unsigned int> connectedModuleDetIds_pLStoLayer1Subdet5 = SDL::moduleConnectionMap_pLStoLayer1Subdet5.getConnectedModuleDetIds(isuperbin+size_superbins);// index adjustment to get high values
         std::vector<unsigned int> connectedModuleDetIds_pLStoLayer2Subdet5 = SDL::moduleConnectionMap_pLStoLayer2Subdet5.getConnectedModuleDetIds(isuperbin+size_superbins);// from the high pt bins
@@ -375,7 +345,7 @@ void SDL::fillPixelMap(struct modules& modulesInGPU, struct pixelMap& pixelMappi
         connectedModuleDetIds.insert(connectedModuleDetIds.end(),connectedModuleDetIds_pLStoLayer3Subdet4.begin(),connectedModuleDetIds_pLStoLayer3Subdet4.end());
         connectedModuleDetIds.insert(connectedModuleDetIds.end(),connectedModuleDetIds_pLStoLayer4Subdet4.begin(),connectedModuleDetIds_pLStoLayer4Subdet4.end());
 
-        int sizes =0;
+        int sizes = 0;
         sizes += connectedModuleDetIds_pLStoLayer1Subdet5.size();
         sizes += connectedModuleDetIds_pLStoLayer2Subdet5.size();
         sizes += connectedModuleDetIds_pLStoLayer3Subdet5.size();
@@ -402,7 +372,7 @@ void SDL::fillPixelMap(struct modules& modulesInGPU, struct pixelMap& pixelMappi
         connectedModuleDetIds_pos.insert(connectedModuleDetIds_pos.end(),connectedModuleDetIds_pLStoLayer3Subdet4_pos.begin(),connectedModuleDetIds_pLStoLayer3Subdet4_pos.end());
         connectedModuleDetIds_pos.insert(connectedModuleDetIds_pos.end(),connectedModuleDetIds_pLStoLayer4Subdet4_pos.begin(),connectedModuleDetIds_pLStoLayer4Subdet4_pos.end());
 
-        int sizes_pos =0;
+        int sizes_pos = 0;
         sizes_pos += connectedModuleDetIds_pLStoLayer1Subdet5_pos.size();
         sizes_pos += connectedModuleDetIds_pLStoLayer2Subdet5_pos.size();
         sizes_pos += connectedModuleDetIds_pLStoLayer3Subdet5_pos.size();
@@ -446,15 +416,15 @@ void SDL::fillPixelMap(struct modules& modulesInGPU, struct pixelMap& pixelMappi
     connectedPixels = (unsigned int*)cms::cuda::allocate_host((totalSizes+totalSizes_pos+totalSizes_neg) * sizeof(unsigned int), stream);
     cudaMalloc(&modulesInGPU.connectedPixels,(totalSizes+totalSizes_pos+totalSizes_neg)* sizeof(unsigned int));
 
-    for(int icondet=0; icondet< totalSizes; icondet++)
+    for(int icondet = 0; icondet < totalSizes; icondet++)
     {
         connectedPixels[icondet] = (*detIdToIndex)[connectedModuleDetIds[icondet]];
     }
-    for(int icondet=0; icondet< totalSizes_pos; icondet++)
+    for(int icondet = 0; icondet < totalSizes_pos; icondet++)
     {
         connectedPixels[icondet+totalSizes] = (*detIdToIndex)[connectedModuleDetIds_pos[icondet]];
     }
-    for(int icondet=0; icondet< totalSizes_neg; icondet++)
+    for(int icondet = 0; icondet < totalSizes_neg; icondet++)
     {
         connectedPixels[icondet+totalSizes+totalSizes_pos] = (*detIdToIndex)[connectedModuleDetIds_neg[icondet]];
     }
diff --git a/SDL/Module.cuh b/SDL/Module.cuh
index d4e1457f..3967c764 100644
--- a/SDL/Module.cuh
+++ b/SDL/Module.cuh
@@ -230,8 +230,17 @@ namespace SDL
         unsigned int* connectedPixelsSizesNeg;
     };
 
+    // PixelMap is never allocated on the device.
+    // This is also not passed to any of the kernels, so we can combine the structs.
     struct pixelMap
     {
+        Buf<alpaka::DevCpu, unsigned int> connectedPixelsIndex_buf;
+        Buf<alpaka::DevCpu, unsigned int> connectedPixelsSizes_buf;
+        Buf<alpaka::DevCpu, unsigned int> connectedPixelsIndexPos_buf;
+        Buf<alpaka::DevCpu, unsigned int> connectedPixelsSizesPos_buf;
+        Buf<alpaka::DevCpu, unsigned int> connectedPixelsIndexNeg_buf;
+        Buf<alpaka::DevCpu, unsigned int> connectedPixelsSizesNeg_buf;
+
         unsigned int* connectedPixelsIndex;
         unsigned int* connectedPixelsSizes;
         unsigned int* connectedPixelsIndexPos;
@@ -239,8 +248,23 @@ namespace SDL
         unsigned int* connectedPixelsIndexNeg;
         unsigned int* connectedPixelsSizesNeg;
 
-        int* superbin;
         int* pixelType;
+
+        pixelMap(unsigned int sizef = size_superbins) :
+            connectedPixelsIndex_buf(allocBufWrapper<unsigned int>(devHost, sizef)),
+            connectedPixelsSizes_buf(allocBufWrapper<unsigned int>(devHost, sizef)),
+            connectedPixelsIndexPos_buf(allocBufWrapper<unsigned int>(devHost, sizef)),
+            connectedPixelsSizesPos_buf(allocBufWrapper<unsigned int>(devHost, sizef)),
+            connectedPixelsIndexNeg_buf(allocBufWrapper<unsigned int>(devHost, sizef)),
+            connectedPixelsSizesNeg_buf(allocBufWrapper<unsigned int>(devHost, sizef))
+        {
+            connectedPixelsIndex = alpaka::getPtrNative(connectedPixelsIndex_buf);
+            connectedPixelsSizes = alpaka::getPtrNative(connectedPixelsSizes_buf);
+            connectedPixelsIndexPos = alpaka::getPtrNative(connectedPixelsIndexPos_buf);
+            connectedPixelsSizesPos = alpaka::getPtrNative(connectedPixelsSizesPos_buf);
+            connectedPixelsIndexNeg = alpaka::getPtrNative(connectedPixelsIndexNeg_buf);
+            connectedPixelsSizesNeg = alpaka::getPtrNative(connectedPixelsSizesNeg_buf);
+        }
     };
 
     extern std::map <unsigned int, uint16_t>* detIdToIndex;
@@ -251,7 +275,7 @@ namespace SDL
 
     void loadModulesFromFile(struct modules& modulesInGPU, uint16_t& nModules,uint16_t& nLowerModules,struct pixelMap& pixelMapping,cudaStream_t stream, const char* moduleMetaDataFilePath="data/centroid.txt");
     void createModulesInExplicitMemory(struct modules& modulesInGPU,unsigned int nModules,cudaStream_t stream);
-    void freeModules(struct modules& modulesInGPU,struct pixelMap& pixelMapping);
+    void freeModules(struct modules& modulesInGPU);
     void fillPixelMap(struct modules& modulesInGPU,struct pixelMap& pixelMapping,cudaStream_t stream);
     void fillConnectedModuleArrayExplicit(struct modules& modulesInGPU, unsigned int nModules,cudaStream_t stream);
     void fillMapArraysExplicit(struct modules& modulesInGPU, unsigned int nModules,cudaStream_t stream);
diff --git a/SDL/PixelTriplet.cuh b/SDL/PixelTriplet.cuh
index c40f8283..033dea78 100644
--- a/SDL/PixelTriplet.cuh
+++ b/SDL/PixelTriplet.cuh
@@ -846,6 +846,7 @@ namespace SDL
 
     struct createPixelTripletsInGPUFromMapv2
     {
+        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -2245,6 +2246,7 @@ namespace SDL
 
     struct createPixelQuintupletsInGPUFromMapv2
     {
+        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
diff --git a/SDL/Quintuplet.cuh b/SDL/Quintuplet.cuh
index 3a8aa7e6..6bf87ca0 100644
--- a/SDL/Quintuplet.cuh
+++ b/SDL/Quintuplet.cuh
@@ -2164,6 +2164,7 @@ namespace SDL
 
     struct createQuintupletsInGPUv2
     {
+        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -2258,6 +2259,7 @@ namespace SDL
 
     struct createEligibleModulesListForQuintupletsGPU
     {
+        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -2333,6 +2335,7 @@ namespace SDL
 
     struct addQuintupletRangesToEventExplicit
     {
+        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
diff --git a/SDL/Segment.cuh b/SDL/Segment.cuh
index d6308cb0..c2f9aef5 100644
--- a/SDL/Segment.cuh
+++ b/SDL/Segment.cuh
@@ -678,6 +678,7 @@ namespace SDL
 
     struct createSegmentsInGPUv2
     {
+        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -754,6 +755,7 @@ namespace SDL
 
     struct createSegmentArrayRanges
     {
+        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -831,6 +833,7 @@ namespace SDL
 
     struct addSegmentRangesToEventExplicit
     {
+        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
diff --git a/SDL/TrackCandidate.cuh b/SDL/TrackCandidate.cuh
index c11ae247..9abaa754 100644
--- a/SDL/TrackCandidate.cuh
+++ b/SDL/TrackCandidate.cuh
@@ -190,6 +190,7 @@ namespace SDL
 
     struct crossCleanpT3
     {
+        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -238,6 +239,7 @@ namespace SDL
 
     struct crossCleanT5
     {
+        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -304,6 +306,7 @@ namespace SDL
     // This will eliminate the need for another kernel just for adding the pLS, because we can __syncthreads()
     struct crossCleanpLS
     {
+        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -390,6 +393,7 @@ namespace SDL
 
     struct addpT3asTrackCandidatesInGPU
     {
+        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -425,6 +429,7 @@ namespace SDL
 
     struct addT5asTrackCandidateInGPU
     {
+        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -462,6 +467,7 @@ namespace SDL
 
     struct addpLSasTrackCandidateInGPU
     {
+        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -491,6 +497,7 @@ namespace SDL
 
     struct addpT5asTrackCandidateInGPU
     {
+        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
diff --git a/SDL/Triplet.cuh b/SDL/Triplet.cuh
index 6df7b06a..045e2ee2 100644
--- a/SDL/Triplet.cuh
+++ b/SDL/Triplet.cuh
@@ -1296,6 +1296,7 @@ namespace SDL
 
     struct createTripletsInGPUv2
     {
+        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -1371,6 +1372,7 @@ namespace SDL
 
     struct createTripletArrayRanges
     {
+        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -1447,6 +1449,7 @@ namespace SDL
 
     struct addTripletRangesToEventExplicit
     {
+        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,

From 86f750b686548140038aad302886b25c087c5838 Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Tue, 13 Jun 2023 10:19:22 -0400
Subject: [PATCH 28/44] move endcap maps to Alpaka temp

---
 SDL/Constants.cuh      |  3 +++
 SDL/EndcapGeometry.cu  | 44 +++++++++++++++++++-----------------------
 SDL/EndcapGeometry.cuh | 10 +++++-----
 SDL/Event.cu           |  4 ++--
 bin/sdl.cc             |  3 ---
 5 files changed, 30 insertions(+), 34 deletions(-)

diff --git a/SDL/Constants.cuh b/SDL/Constants.cuh
index f6adc16a..3f5d0c3e 100644
--- a/SDL/Constants.cuh
+++ b/SDL/Constants.cuh
@@ -104,6 +104,9 @@ const unsigned int N_MAX_T3T3_TRACK_EXTENSIONS = 40000;
 
 const unsigned int size_superbins = 45000;
 
+// Temporary fix for endcap buffer allocation.
+const unsigned int endcap_size = 9105;
+
 namespace SDL
 {
     //defining the constant host device variables right up here
diff --git a/SDL/EndcapGeometry.cu b/SDL/EndcapGeometry.cu
index a2bf3d8e..44d31faa 100644
--- a/SDL/EndcapGeometry.cu
+++ b/SDL/EndcapGeometry.cu
@@ -2,11 +2,15 @@
 
 SDL::EndcapGeometry SDL::endcapGeometry;
 
-SDL::EndcapGeometry::EndcapGeometry()
+SDL::EndcapGeometry::EndcapGeometry(unsigned int sizef) :
+    geoMapDetId_buf(allocBufWrapper<unsigned int>(devAcc, sizef)),
+    geoMapPhi_buf(allocBufWrapper<float>(devAcc, sizef))
 {
 }
 
-SDL::EndcapGeometry::EndcapGeometry(std::string filename)
+SDL::EndcapGeometry::EndcapGeometry(std::string filename, unsigned int sizef) :
+    geoMapDetId_buf(allocBufWrapper<unsigned int>(devAcc, sizef)),
+    geoMapPhi_buf(allocBufWrapper<float>(devAcc, sizef))
 {
     load(filename);
 }
@@ -58,30 +62,23 @@ void SDL::EndcapGeometry::load(std::string filename)
         centroid_phis_[detid] = cr;
         centroid_zs_[detid] = cz;
     }
-    CreateGeoMapArraysExplicit();
+
     fillGeoMapArraysExplicit();
 }
 
-void SDL::freeEndCapMapMemory()
+void SDL::EndcapGeometry::fillGeoMapArraysExplicit()
 {
-    cudaFree(SDL::endcapGeometry.geoMapPhi);
-    cudaFree(SDL::endcapGeometry.geoMapDetId);
-}
+    QueueAcc queue(devAcc);
 
-void SDL::EndcapGeometry::CreateGeoMapArraysExplicit()
-{
     int phi_size = centroid_phis_.size();
-    cudaMalloc(&geoMapPhi, phi_size * sizeof(float));
-    cudaMalloc(&geoMapDetId, phi_size * sizeof(unsigned int));
-}
 
-void SDL::EndcapGeometry::fillGeoMapArraysExplicit()
-{
-    float* mapPhi;
-    unsigned int* mapDetId;
-    int phi_size = centroid_phis_.size();
-    cudaMallocHost(&mapPhi, phi_size * sizeof(float));
-    cudaMallocHost(&mapDetId, phi_size * sizeof(unsigned int));
+    // Allocate buffers on host
+    auto mapPhi_host_buf = allocBufWrapper<float>(devHost, phi_size);
+    auto mapDetId_host_buf = allocBufWrapper<unsigned int>(devHost, phi_size);
+
+    // Access the raw pointers of the buffers
+    float* mapPhi = alpaka::getPtrNative(mapPhi_host_buf);
+    unsigned int* mapDetId = alpaka::getPtrNative(mapDetId_host_buf);
 
     unsigned int counter = 0;
     for(auto it = centroid_phis_.begin(); it != centroid_phis_.end(); ++it)
@@ -95,11 +92,10 @@ void SDL::EndcapGeometry::fillGeoMapArraysExplicit()
 
     nEndCapMap = counter;
 
-    cudaMemcpy(geoMapPhi, mapPhi, phi_size*sizeof(float), cudaMemcpyHostToDevice);
-    cudaMemcpy(geoMapDetId, mapDetId, phi_size*sizeof(unsigned int), cudaMemcpyHostToDevice);
-
-    cudaFreeHost(mapPhi);
-    cudaFreeHost(mapDetId);
+    // Copy data from host to device buffers
+    alpaka::memcpy(queue, geoMapPhi_buf, mapPhi_host_buf, phi_size);
+    alpaka::memcpy(queue, geoMapDetId_buf, mapDetId_host_buf, phi_size);
+    alpaka::wait(queue);
 }
 
 float SDL::EndcapGeometry::getAverageR2(unsigned int detid)
diff --git a/SDL/EndcapGeometry.cuh b/SDL/EndcapGeometry.cuh
index 4ad71b40..f9f33c33 100644
--- a/SDL/EndcapGeometry.cuh
+++ b/SDL/EndcapGeometry.cuh
@@ -15,7 +15,6 @@ namespace SDL
 {
     class EndcapGeometry
     {
-
         private:
             std::map<unsigned int, float> avgr2s_;
             std::map<unsigned int, float> yls_; // lower hits
@@ -27,12 +26,13 @@ namespace SDL
             std::map<unsigned int, float> centroid_zs_; // centroid z
 
         public:
-            unsigned int* geoMapDetId;
-            float* geoMapPhi;
+            Buf<Acc, unsigned int> geoMapDetId_buf;
+            Buf<Acc, float> geoMapPhi_buf;
+
             unsigned int nEndCapMap;
 
-            EndcapGeometry();
-            EndcapGeometry(std::string filename);
+            EndcapGeometry(unsigned int sizef = endcap_size);
+            EndcapGeometry(std::string filename, unsigned int sizef = endcap_size);
             ~EndcapGeometry();
 
             void load(std::string);
diff --git a/SDL/Event.cu b/SDL/Event.cu
index 27a96ac7..3c8aa2c1 100644
--- a/SDL/Event.cu
+++ b/SDL/Event.cu
@@ -338,8 +338,8 @@ void SDL::Event::addHitToEvent(std::vector<float> x, std::vector<float> y, std::
         TwoS,
         nModules,
         SDL::endcapGeometry.nEndCapMap,
-        SDL::endcapGeometry.geoMapDetId,
-        SDL::endcapGeometry.geoMapPhi,
+        alpaka::getPtrNative(SDL::endcapGeometry.geoMapDetId_buf),
+        alpaka::getPtrNative(SDL::endcapGeometry.geoMapPhi_buf),
         *modulesInGPU,
         *hitsInGPU,
         nHits));
diff --git a/bin/sdl.cc b/bin/sdl.cc
index 3088e24b..beb15dc0 100644
--- a/bin/sdl.cc
+++ b/bin/sdl.cc
@@ -350,7 +350,6 @@ void run_sdl()
     // Looping input file
     while (ana.looper.nextEvent())
     {
-        // if (ana.looper.getCurrentEventIndex() ==49) {continue;}
         if (ana.verbose >= 1)
             std::cout << "PreLoading event number = " << ana.looper.getCurrentEventIndex() << std::endl;
 
@@ -390,7 +389,6 @@ void run_sdl()
 
         cudaStreamCreateWithFlags(&streams[s], cudaStreamNonBlocking);
         SDL::Event *event = new SDL::Event(streams[s],ana.verbose>=2);
-        ; //(streams[omp_get_thread_num()]);
         events.push_back(event);
     }
 
@@ -509,7 +507,6 @@ void run_sdl()
     printTimingInformation(timevec, full_elapsed, avg_elapsed);
 
     SDL::cleanModules();
-    SDL::freeEndCapMapMemory();
 
     if (ana.do_write_ntuple)
     {

From 6f2af678c4544280095af087ea54f797caff0261 Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Tue, 13 Jun 2023 21:37:49 -0400
Subject: [PATCH 29/44] move modules to Alpaka memory temp

---
 SDL/CachingDeviceAllocator.h  | 722 ----------------------------------
 SDL/CachingHostAllocator.h    | 661 -------------------------------
 SDL/CachingManagedAllocator.h | 662 -------------------------------
 SDL/Constants.cuh             |   8 +
 SDL/Event.cu                  | 424 +++++++-------------
 SDL/Event.cuh                 |  18 +-
 SDL/Module.cu                 | 559 +-------------------------
 SDL/Module.cuh                | 688 +++++++++++++++++++++++++++++---
 SDL/allocate.cc               |  66 ----
 SDL/allocate.h                |  21 -
 SDL/cudaCheck.h               |  61 ---
 SDL/deviceCount.h             |  18 -
 SDL/getCachingAllocator.h     |  75 ----
 bin/sdl.cc                    |   2 -
 code/core/AccessHelper.cc     |   6 +-
 code/core/write_sdl_ntuple.cc |  22 +-
 16 files changed, 790 insertions(+), 3223 deletions(-)
 delete mode 100644 SDL/CachingDeviceAllocator.h
 delete mode 100644 SDL/CachingHostAllocator.h
 delete mode 100644 SDL/CachingManagedAllocator.h
 delete mode 100644 SDL/allocate.cc
 delete mode 100644 SDL/allocate.h
 delete mode 100644 SDL/cudaCheck.h
 delete mode 100644 SDL/deviceCount.h
 delete mode 100644 SDL/getCachingAllocator.h

diff --git a/SDL/CachingDeviceAllocator.h b/SDL/CachingDeviceAllocator.h
deleted file mode 100644
index 666186f7..00000000
--- a/SDL/CachingDeviceAllocator.h
+++ /dev/null
@@ -1,722 +0,0 @@
-#ifndef HeterogenousCore_CUDAUtilities_src_CachingDeviceAllocator_h
-#define HeterogenousCore_CUDAUtilities_src_CachingDeviceAllocator_h
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * Forked to CMSSW by Matti Kortelainen
- */
-
-/******************************************************************************
- * Simple caching allocator for device memory allocations. The allocator is
- * thread-safe and capable of managing device allocations on multiple devices.
- ******************************************************************************/
-
-#include <cmath>
-#include <map>
-#include <set>
-
-#include <cub/util_debug.cuh>
-#include <cub/host/mutex.cuh>
-//#include </mnt/data1/dsr/cub/cub/util_debug.cuh>
-//#include </mnt/data1/dsr/cub/cub/host/mutex.cuh>
-
-/// CUB namespace
-namespace notcub {
-
-  /**
- * \addtogroup UtilMgmt
- * @{
- */
-
-  /******************************************************************************
- * CachingDeviceAllocator (host use)
- ******************************************************************************/
-
-  /**
- * \brief A simple caching allocator for device memory allocations.
- *
- * \par Overview
- * The allocator is thread-safe and stream-safe and is capable of managing cached
- * device allocations on multiple devices.  It behaves as follows:
- *
- * \par
- * - Allocations from the allocator are associated with an \p active_stream.  Once freed,
- *   the allocation becomes available immediately for reuse within the \p active_stream
- *   with which it was associated with during allocation, and it becomes available for
- *   reuse within other streams when all prior work submitted to \p active_stream has completed.
- * - Allocations are categorized and cached by bin size.  A new allocation request of
- *   a given size will only consider cached allocations within the corresponding bin.
- * - Bin limits progress geometrically in accordance with the growth factor
- *   \p bin_growth provided during construction.  Unused device allocations within
- *   a larger bin cache are not reused for allocation requests that categorize to
- *   smaller bin sizes.
- * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to
- *   (\p bin_growth ^ \p min_bin).
- * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest
- *   bin and are simply freed when they are deallocated instead of being returned
- *   to a bin-cache.
- * - %If the total storage of cached allocations on a given device will exceed
- *   \p max_cached_bytes, allocations for that device are simply freed when they are
- *   deallocated instead of being returned to their bin-cache.
- *
- * \par
- * For example, the default-constructed CachingDeviceAllocator is configured with:
- * - \p bin_growth          = 8
- * - \p min_bin             = 3
- * - \p max_bin             = 7
- * - \p max_cached_bytes    = 6MB - 1B
- *
- * \par
- * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB
- * and sets a maximum of 6,291,455 cached bytes per device
- *
- */
-  struct CachingDeviceAllocator {
-    //---------------------------------------------------------------------
-    // Constants
-    //---------------------------------------------------------------------
-
-    /// Out-of-bounds bin
-    static const unsigned int INVALID_BIN = (unsigned int)-1;
-
-    /// Invalid size
-    static const size_t INVALID_SIZE = (size_t)-1;
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS  // Do not document
-
-    /// Invalid device ordinal
-    static const int INVALID_DEVICE_ORDINAL = -1;
-
-    //---------------------------------------------------------------------
-    // Type definitions and helper types
-    //---------------------------------------------------------------------
-
-    /**
-     * Descriptor for device memory allocations
-     */
-    struct BlockDescriptor {
-      void *d_ptr;                     // Device pointer
-      size_t bytes;                    // Size of allocation in bytes
-      unsigned int bin;                // Bin enumeration
-      int device;                      // device ordinal
-      cudaStream_t associated_stream;  // Associated associated_stream
-      cudaEvent_t ready_event;  // Signal when associated stream has run to the point at which this block was freed
-
-      // Constructor (suitable for searching maps for a specific block, given its pointer and device)
-      BlockDescriptor(void *d_ptr1, int device1)
-          : d_ptr(d_ptr1), bytes(0), bin(INVALID_BIN), device(device1), associated_stream(nullptr), ready_event(nullptr) {}
-
-      // Constructor (suitable for searching maps for a range of suitable blocks, given a device)
-      BlockDescriptor(int device1)
-          : d_ptr(nullptr),
-            bytes(0),
-            bin(INVALID_BIN),
-            device(device1),
-            associated_stream(nullptr),
-            ready_event(nullptr) {}
-
-      // Comparison functor for comparing device pointers
-      static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) {
-        if (a.device == b.device)
-          return (a.d_ptr < b.d_ptr);
-        else
-          return (a.device < b.device);
-      }
-
-      // Comparison functor for comparing allocation sizes
-      static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) {
-        if (a.device == b.device)
-          return (a.bytes < b.bytes);
-        else
-          return (a.device < b.device);
-      }
-    };
-
-    /// BlockDescriptor comparator function interface
-    typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);
-
-    class TotalBytes {
-    public:
-      size_t free;
-      size_t live;
-      TotalBytes() { free = live = 0; }
-    };
-
-    /// Set type for cached blocks (ordered by size)
-    typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;
-
-    /// Set type for live blocks (ordered by ptr)
-    typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;
-
-    /// Map type of device ordinals to the number of cached bytes cached by each device
-    typedef std::map<int, TotalBytes> GpuCachedBytes;
-
-    //---------------------------------------------------------------------
-    // Utility functions
-    //---------------------------------------------------------------------
-
-    /**
-     * Integer pow function for unsigned base and exponent
-     */
-    static unsigned int IntPow(unsigned int base, unsigned int exp) {
-      unsigned int retval = 1;
-      while (exp > 0) {
-        if (exp & 1) {
-          retval = retval * base;  // multiply the result by the current base
-        }
-        base = base * base;  // square the base
-        exp = exp >> 1;      // divide the exponent in half
-      }
-      return retval;
-    }
-
-    /**
-     * Round up to the nearest power-of
-     */
-    void NearestPowerOf(unsigned int &power, size_t &rounded_bytes, unsigned int base, size_t value) {
-      power = 0;
-      rounded_bytes = 1;
-
-      if (value * base < value) {
-        // Overflow
-        power = sizeof(size_t) * 8;
-        rounded_bytes = size_t(0) - 1;
-        return;
-      }
-
-      while (rounded_bytes < value) {
-        rounded_bytes *= base;
-        power++;
-      }
-    }
-
-    //---------------------------------------------------------------------
-    // Fields
-    //---------------------------------------------------------------------
-
-    cub::Mutex mutex;  /// Mutex for thread-safety
-
-    unsigned int bin_growth;  /// Geometric growth factor for bin-sizes
-    unsigned int min_bin;     /// Minimum bin enumeration
-    unsigned int max_bin;     /// Maximum bin enumeration
-
-    size_t min_bin_bytes;     /// Minimum bin size
-    size_t max_bin_bytes;     /// Maximum bin size
-    size_t max_cached_bytes;  /// Maximum aggregate cached bytes per device
-
-    const bool
-        skip_cleanup;  /// Whether or not to skip a call to FreeAllCached() when destructor is called.  (The CUDA runtime may have already shut down for statically declared allocators)
-    bool debug;        /// Whether or not to print (de)allocation events to stdout
-
-    GpuCachedBytes cached_bytes;  /// Map of device ordinal to aggregate cached bytes on that device
-    CachedBlocks cached_blocks;   /// Set of cached device allocations available for reuse
-    BusyBlocks live_blocks;       /// Set of live device allocations currently in use
-
-#endif  // DOXYGEN_SHOULD_SKIP_THIS
-
-    //---------------------------------------------------------------------
-    // Methods
-    //---------------------------------------------------------------------
-
-    /**
-     * \brief Constructor.
-     */
-    CachingDeviceAllocator(
-        unsigned int bin_growthx,                 ///< Geometric growth factor for bin-sizes
-        unsigned int min_binx = 1,                ///< Minimum bin (default is bin_growth ^ 1)
-        unsigned int max_binx = INVALID_BIN,      ///< Maximum bin (default is no max bin)
-        size_t max_cached_bytesx = INVALID_SIZE,  ///< Maximum aggregate cached bytes per device (default is no limit)
-        bool skip_cleanupx =
-            false,  ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate)
-        bool debugx = false)  ///< Whether or not to print (de)allocation events to stdout (default is no stderr output)
-        : bin_growth(bin_growthx),
-          min_bin(min_binx),
-          max_bin(max_binx),
-          min_bin_bytes(IntPow(bin_growthx, min_binx)),
-          max_bin_bytes(IntPow(bin_growthx, max_binx)),
-          max_cached_bytes(max_cached_bytesx),
-          skip_cleanup(skip_cleanupx),
-          debug(debugx),
-          cached_blocks(BlockDescriptor::SizeCompare),
-          live_blocks(BlockDescriptor::PtrCompare) {}
-
-    /**
-     * \brief Default constructor.
-     *
-     * Configured with:
-     * \par
-     * - \p bin_growth          = 8
-     * - \p min_bin             = 3
-     * - \p max_bin             = 7
-     * - \p max_cached_bytes    = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes
-     *
-     * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and
-     * sets a maximum of 6,291,455 cached bytes per device
-     */
-    CachingDeviceAllocator(bool skip_cleanupx = false, bool debugx = false)
-        : bin_growth(8),
-          min_bin(3),
-          max_bin(7),
-          min_bin_bytes(IntPow(bin_growth, min_bin)),
-          max_bin_bytes(IntPow(bin_growth, max_bin)),
-          max_cached_bytes((max_bin_bytes * 3) - 1),
-          skip_cleanup(skip_cleanupx),
-          debug(debugx),
-          cached_blocks(BlockDescriptor::SizeCompare),
-          live_blocks(BlockDescriptor::PtrCompare) {}
-
-    /**
-     * \brief Sets the limit on the number bytes this allocator is allowed to cache per device.
-     *
-     * Changing the ceiling of cached bytes does not cause any allocations (in-use or
-     * cached-in-reserve) to be freed.  See \p FreeAllCached().
-     */
-    cudaError_t SetMaxCachedBytes(size_t max_cached_bytesx) {
-      // Lock
-      mutex.Lock();
-
-      if (debug)
-        _CubLog("Changing max_cached_bytes (%lld -> %lld)\n",
-                (long long)this->max_cached_bytes,
-                (long long)max_cached_bytesx);
-
-      this->max_cached_bytes = max_cached_bytesx;
-
-      // Unlock
-      mutex.Unlock();
-
-      return cudaSuccess;
-    }
-
-    /**
-     * \brief Provides a suitable allocation of device memory for the given size on the specified device.
-     *
-     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
-     * with which it was associated with during allocation, and it becomes available for reuse within other
-     * streams when all prior work submitted to \p active_stream has completed.
-     */
-    cudaError_t DeviceAllocate(
-        int device,                            ///< [in] Device on which to place the allocation
-        void **d_ptr,                          ///< [out] Reference to pointer to the allocation
-        size_t bytes,                          ///< [in] Minimum number of bytes for the allocation
-        cudaStream_t active_stream = nullptr)  ///< [in] The stream to be associated with this allocation
-    {
-      *d_ptr = nullptr;
-      int entrypoint_device = INVALID_DEVICE_ORDINAL;
-      cudaError_t error = cudaSuccess;
-
-      if (device == INVALID_DEVICE_ORDINAL) {
-        if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
-          return error;
-        device = entrypoint_device;
-      }
-
-      // Create a block descriptor for the requested allocation
-      bool found = false;
-      BlockDescriptor search_key(device);
-      search_key.associated_stream = active_stream;
-      NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);
-
-      if (search_key.bin > max_bin) {
-        // Bin is greater than our maximum bin: allocate the request
-        // exactly and give out-of-bounds bin.  It will not be cached
-        // for reuse when returned.
-        search_key.bin = INVALID_BIN;
-        search_key.bytes = bytes;
-      } else {
-        // Search for a suitable cached allocation: lock
-        mutex.Lock();
-
-        if (search_key.bin < min_bin) {
-          // Bin is less than minimum bin: round up
-          search_key.bin = min_bin;
-          search_key.bytes = min_bin_bytes;
-        }
-
-        // Iterate through the range of cached blocks on the same device in the same bin
-        CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
-        while ((block_itr != cached_blocks.end()) && (block_itr->device == device) &&
-               (block_itr->bin == search_key.bin)) {
-          // To prevent races with reusing blocks returned by the host but still
-          // in use by the device, only consider cached blocks that are
-          // either (from the active stream) or (from an idle stream)
-          if ((active_stream == block_itr->associated_stream) ||
-              (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) {
-            // Reuse existing cache block.  Insert into live blocks.
-            found = true;
-            search_key = *block_itr;
-            search_key.associated_stream = active_stream;
-            live_blocks.insert(search_key);
-
-            // Remove from free blocks
-            cached_bytes[device].free -= search_key.bytes;
-            cached_bytes[device].live += search_key.bytes;
-
-            if (debug)
-              // CMS: improved debug message
-              _CubLog(
-                  "\tDevice %d reused cached block at %p (%lld bytes) for stream %lld, event %lld (previously "
-                  "associated with stream %lld, event %lld).\n",
-                  device,
-                  search_key.d_ptr,
-                  (long long)search_key.bytes,
-                  (long long)search_key.associated_stream,
-                  (long long)search_key.ready_event,
-                  (long long)block_itr->associated_stream,
-                  (long long)block_itr->ready_event);
-
-            cached_blocks.erase(block_itr);
-
-            break;
-          }
-          block_itr++;
-        }
-
-        // Done searching: unlock
-        mutex.Unlock();
-      }
-
-      // Allocate the block if necessary
-      if (!found) {
-        // Set runtime's current device to specified device (entrypoint may not be set)
-        if (device != entrypoint_device) {
-          if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
-            return error;
-          if (CubDebug(error = cudaSetDevice(device)))
-            return error;
-        }
-
-        // Attempt to allocate
-        if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation) {
-          // The allocation attempt failed: free all cached blocks on device and retry
-          if (debug)
-            _CubLog(
-                "\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations",
-                device,
-                (long long)search_key.bytes,
-                (long long)search_key.associated_stream);
-
-          error = cudaSuccess;  // Reset the error we will return
-          cudaGetLastError();   // Reset CUDART's error
-
-          // Lock
-          mutex.Lock();
-
-          // Iterate the range of free blocks on the same device
-          BlockDescriptor free_key(device);
-          CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key);
-
-          while ((block_itr != cached_blocks.end()) && (block_itr->device == device)) {
-            // No need to worry about synchronization with the device: cudaFree is
-            // blocking and will synchronize across all kernels executing
-            // on the current device
-
-            // Free device memory and destroy stream event.
-            if (CubDebug(error = cudaFree(block_itr->d_ptr)))
-              break;
-            if (CubDebug(error = cudaEventDestroy(block_itr->ready_event)))
-              break;
-
-            // Reduce balance and erase entry
-            cached_bytes[device].free -= block_itr->bytes;
-
-            if (debug)
-              _CubLog(
-                  "\tDevice %d freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks "
-                  "(%lld bytes) outstanding.\n",
-                  device,
-                  (long long)block_itr->bytes,
-                  (long long)cached_blocks.size(),
-                  (long long)cached_bytes[device].free,
-                  (long long)live_blocks.size(),
-                  (long long)cached_bytes[device].live);
-
-            cached_blocks.erase(block_itr);
-
-            block_itr++;
-          }
-
-          // Unlock
-          mutex.Unlock();
-
-          // Return under error
-          if (error)
-            return error;
-
-          // Try to allocate again
-          if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)))
-            return error;
-        }
-
-        // Create ready event
-        if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
-          return error;
-
-        // Insert into live blocks
-        mutex.Lock();
-        live_blocks.insert(search_key);
-        cached_bytes[device].live += search_key.bytes;
-        mutex.Unlock();
-
-        if (debug)
-          // CMS: improved debug message
-          _CubLog(
-              "\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld, event %lld).\n",
-              device,
-              search_key.d_ptr,
-              (long long)search_key.bytes,
-              (long long)search_key.associated_stream,
-              (long long)search_key.ready_event);
-
-        // Attempt to revert back to previous device if necessary
-        if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) {
-          if (CubDebug(error = cudaSetDevice(entrypoint_device)))
-            return error;
-        }
-      }
-
-      // Copy device pointer to output parameter
-      *d_ptr = search_key.d_ptr;
-
-      if (debug)
-        _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
-                (long long)cached_blocks.size(),
-                (long long)cached_bytes[device].free,
-                (long long)live_blocks.size(),
-                (long long)cached_bytes[device].live);
-
-      return error;
-    }
-
-    /**
-     * \brief Provides a suitable allocation of device memory for the given size on the current device.
-     *
-     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
-     * with which it was associated with during allocation, and it becomes available for reuse within other
-     * streams when all prior work submitted to \p active_stream has completed.
-     */
-    cudaError_t DeviceAllocate(
-        void **d_ptr,                          ///< [out] Reference to pointer to the allocation
-        size_t bytes,                          ///< [in] Minimum number of bytes for the allocation
-        cudaStream_t active_stream = nullptr)  ///< [in] The stream to be associated with this allocation
-    {
-      return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream);
-    }
-
-    /**
-     * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator.
-     *
-     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
-     * with which it was associated with during allocation, and it becomes available for reuse within other
-     * streams when all prior work submitted to \p active_stream has completed.
-     */
-    cudaError_t DeviceFree(int device, void *d_ptr) {
-      int entrypoint_device = INVALID_DEVICE_ORDINAL;
-      cudaError_t error = cudaSuccess;
-
-      //if (device == INVALID_DEVICE_ORDINAL) {
-      //  if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
-      //    return error;
-      //  device = entrypoint_device;
-      //}
-      
-      // Lock
-      mutex.Lock();
-
-      // Find corresponding block descriptor
-      bool recached = false;
-      BlockDescriptor search_key(d_ptr, device);
-      BusyBlocks::iterator block_itr = live_blocks.find(search_key);
-     
-      if (block_itr != live_blocks.end()) {
-        // Remove from live blocks
-        search_key = *block_itr;
-        live_blocks.erase(block_itr);
-        cached_bytes[device].live -= search_key.bytes;
-
-        // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold
-        if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)) {
-          // Insert returned allocation into free blocks
-          recached = true;
-          cached_blocks.insert(search_key);
-          cached_bytes[device].free += search_key.bytes;
-
-          if (debug)
-            // CMS: improved debug message
-            _CubLog(
-                "\tDevice %d returned %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available "
-                "blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
-                device,
-                (long long)search_key.bytes,
-                d_ptr,
-                (long long)search_key.associated_stream,
-                (long long)search_key.ready_event,
-                (long long)cached_blocks.size(),
-                (long long)cached_bytes[device].free,
-                (long long)live_blocks.size(),
-                (long long)cached_bytes[device].live);
-        }
-      }
-
-      // First set to specified device (entrypoint may not be set)
-      if (device != entrypoint_device) {
-        if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
-          return error;
-        if (CubDebug(error = cudaSetDevice(device)))
-          return error;
-      }
-
-      if (recached) {
-        // Insert the ready event in the associated stream (must have current device set properly)
-        if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream)))
-          return error;
-      }
-
-      // Unlock
-      mutex.Unlock();
-
-      if (!recached) {
-        // Free the allocation from the runtime and cleanup the event.
-        if (CubDebug(error = cudaFree(d_ptr))){
-          return error;
-        }
-        if (CubDebug(error = cudaEventDestroy(search_key.ready_event))){
-          return error;
-        }
-
-        if (debug)
-          // CMS: improved debug message
-          _CubLog(
-              "\tDevice %d freed %lld bytes at %p from associated stream %lld, event %lld.\n\t\t  %lld available "
-              "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
-              device,
-              (long long)search_key.bytes,
-              d_ptr,
-              (long long)search_key.associated_stream,
-              (long long)search_key.ready_event,
-              (long long)cached_blocks.size(),
-              (long long)cached_bytes[device].free,
-              (long long)live_blocks.size(),
-              (long long)cached_bytes[device].live);
-      }
-
-      // Reset device
-      if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) {
-        if (CubDebug(error = cudaSetDevice(entrypoint_device)))
-          return error;
-      }
-
-      return error;
-    }
-
-    /**
-     * \brief Frees a live allocation of device memory on the current device, returning it to the allocator.
-     *
-     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
-     * with which it was associated with during allocation, and it becomes available for reuse within other
-     * streams when all prior work submitted to \p active_stream has completed.
-     */
-    cudaError_t DeviceFree(void *d_ptr) { return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr); }
-
-    /**
-     * \brief Frees all cached device allocations on all devices
-     */
-    cudaError_t FreeAllCached() {
-      cudaError_t error = cudaSuccess;
-      int entrypoint_device = INVALID_DEVICE_ORDINAL;
-      int current_device = INVALID_DEVICE_ORDINAL;
-
-      mutex.Lock();
-
-      while (!cached_blocks.empty()) {
-        // Get first block
-        CachedBlocks::iterator begin = cached_blocks.begin();
-
-        // Get entry-point device ordinal if necessary
-        if (entrypoint_device == INVALID_DEVICE_ORDINAL) {
-          if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
-            break;
-        }
-
-        // Set current device ordinal if necessary
-        if (begin->device != current_device) {
-          if (CubDebug(error = cudaSetDevice(begin->device)))
-            break;
-          current_device = begin->device;
-        }
-
-        // Free device memory
-        if (CubDebug(error = cudaFree(begin->d_ptr)))
-          break;
-        if (CubDebug(error = cudaEventDestroy(begin->ready_event)))
-          break;
-
-        // Reduce balance and erase entry
-        cached_bytes[current_device].free -= begin->bytes;
-
-        if (debug)
-          _CubLog(
-              "\tDevice %d freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld "
-              "bytes) outstanding.\n",
-              current_device,
-              (long long)begin->bytes,
-              (long long)cached_blocks.size(),
-              (long long)cached_bytes[current_device].free,
-              (long long)live_blocks.size(),
-              (long long)cached_bytes[current_device].live);
-
-        cached_blocks.erase(begin);
-      }
-
-      mutex.Unlock();
-
-      // Attempt to revert back to entry-point device if necessary
-      if (entrypoint_device != INVALID_DEVICE_ORDINAL) {
-        if (CubDebug(error = cudaSetDevice(entrypoint_device)))
-          return error;
-      }
-
-      return error;
-    }
-
-    /**
-     * \brief Destructor
-     */
-    virtual ~CachingDeviceAllocator() {
-      if (!skip_cleanup)
-        FreeAllCached();
-    }
-  };
-
-  /** @} */  // end group UtilMgmt
-
-}  // namespace notcub
-
-#endif
diff --git a/SDL/CachingHostAllocator.h b/SDL/CachingHostAllocator.h
deleted file mode 100644
index c5ad255b..00000000
--- a/SDL/CachingHostAllocator.h
+++ /dev/null
@@ -1,661 +0,0 @@
-#ifndef HeterogenousCore_CUDAUtilities_src_CachingHostAllocator_h
-#define HeterogenousCore_CUDAUtilities_src_CachingHostAllocator_h
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * Modified to cache pinned host allocations by Matti Kortelainen
- */
-
-/******************************************************************************
- * Simple caching allocator for pinned host memory allocations. The allocator is
- * thread-safe.
- ******************************************************************************/
-
-#include <cmath>
-#include <map>
-#include <set>
-
-#include <cub/util_debug.cuh>
-#include <cub/host/mutex.cuh>
-
-/// CUB namespace
-namespace notcub {
-
-  /**
- * \addtogroup UtilMgmt
- * @{
- */
-
-  /******************************************************************************
- * CachingHostAllocator (host use)
- ******************************************************************************/
-
-  /**
- * \brief A simple caching allocator pinned host memory allocations.
- *
- * \par Overview
- * The allocator is thread-safe.  It behaves as follows:
- *
- * I presume the CUDA stream-safeness is not useful as to read/write
- * from/to the pinned host memory one needs to synchronize anyway. The
- * difference wrt. device memory is that in the CPU all operations to
- * the device memory are scheduled via the CUDA stream, while for the
- * host memory one can perform operations directly.
- *
- * \par
- * - Allocations are categorized and cached by bin size.  A new allocation request of
- *   a given size will only consider cached allocations within the corresponding bin.
- * - Bin limits progress geometrically in accordance with the growth factor
- *   \p bin_growth provided during construction.  Unused host allocations within
- *   a larger bin cache are not reused for allocation requests that categorize to
- *   smaller bin sizes.
- * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to
- *   (\p bin_growth ^ \p min_bin).
- * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest
- *   bin and are simply freed when they are deallocated instead of being returned
- *   to a bin-cache.
- * - %If the total storage of cached allocations  will exceed
- *   \p max_cached_bytes, allocations are simply freed when they are
- *   deallocated instead of being returned to their bin-cache.
- *
- * \par
- * For example, the default-constructed CachingHostAllocator is configured with:
- * - \p bin_growth          = 8
- * - \p min_bin             = 3
- * - \p max_bin             = 7
- * - \p max_cached_bytes    = 6MB - 1B
- *
- * \par
- * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB
- * and sets a maximum of 6,291,455 cached bytes
- *
- */
-  struct CachingHostAllocator {
-    //---------------------------------------------------------------------
-    // Constants
-    //---------------------------------------------------------------------
-
-    /// Out-of-bounds bin
-    static const unsigned int INVALID_BIN = (unsigned int)-1;
-
-    /// Invalid size
-    static const size_t INVALID_SIZE = (size_t)-1;
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS  // Do not document
-
-    /// Invalid device ordinal
-    static const int INVALID_DEVICE_ORDINAL = -1;
-
-    //---------------------------------------------------------------------
-    // Type definitions and helper types
-    //---------------------------------------------------------------------
-
-    /**
-     * Descriptor for pinned host memory allocations
-     */
-    struct BlockDescriptor {
-      void *d_ptr;                     // Host pointer
-      size_t bytes;                    // Size of allocation in bytes
-      unsigned int bin;                // Bin enumeration
-      int device;                      // device ordinal
-      cudaStream_t associated_stream;  // Associated associated_stream
-      cudaEvent_t ready_event;  // Signal when associated stream has run to the point at which this block was freed
-
-      // Constructor (suitable for searching maps for a specific block, given its pointer)
-      BlockDescriptor(void *d_ptrx)
-          : d_ptr(d_ptrx),
-            bytes(0),
-            bin(INVALID_BIN),
-            device(INVALID_DEVICE_ORDINAL),
-            associated_stream(nullptr),
-            ready_event(nullptr) {}
-
-      // Constructor (suitable for searching maps for a range of suitable blocks)
-      BlockDescriptor()
-          : d_ptr(nullptr),
-            bytes(0),
-            bin(INVALID_BIN),
-            device(INVALID_DEVICE_ORDINAL),
-            associated_stream(nullptr),
-            ready_event(nullptr) {}
-
-      // Comparison functor for comparing host pointers
-      static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) { return (a.d_ptr < b.d_ptr); }
-
-      // Comparison functor for comparing allocation sizes
-      static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) { return (a.bytes < b.bytes); }
-    };
-
-    /// BlockDescriptor comparator function interface
-    typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);
-
-    class TotalBytes {
-    public:
-      size_t free;
-      size_t live;
-      TotalBytes() { free = live = 0; }
-    };
-
-    /// Set type for cached blocks (ordered by size)
-    typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;
-
-    /// Set type for live blocks (ordered by ptr)
-    typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;
-
-    //---------------------------------------------------------------------
-    // Utility functions
-    //---------------------------------------------------------------------
-
-    /**
-     * Integer pow function for unsigned base and exponent
-     */
-    static unsigned int IntPow(unsigned int base, unsigned int exp) {
-      unsigned int retval = 1;
-      while (exp > 0) {
-        if (exp & 1) {
-          retval = retval * base;  // multiply the result by the current base
-        }
-        base = base * base;  // square the base
-        exp = exp >> 1;      // divide the exponent in half
-      }
-      return retval;
-    }
-
-    /**
-     * Round up to the nearest power-of
-     */
-    void NearestPowerOf(unsigned int &power, size_t &rounded_bytes, unsigned int base, size_t value) {
-      power = 0;
-      rounded_bytes = 1;
-
-      if (value * base < value) {
-        // Overflow
-        power = sizeof(size_t) * 8;
-        rounded_bytes = size_t(0) - 1;
-        return;
-      }
-
-      while (rounded_bytes < value) {
-        rounded_bytes *= base;
-        power++;
-      }
-    }
-
-    //---------------------------------------------------------------------
-    // Fields
-    //---------------------------------------------------------------------
-
-    cub::Mutex mutex;  /// Mutex for thread-safety
-
-    unsigned int bin_growth;  /// Geometric growth factor for bin-sizes
-    unsigned int min_bin;     /// Minimum bin enumeration
-    unsigned int max_bin;     /// Maximum bin enumeration
-
-    size_t min_bin_bytes;     /// Minimum bin size
-    size_t max_bin_bytes;     /// Maximum bin size
-    size_t max_cached_bytes;  /// Maximum aggregate cached bytes
-
-    const bool
-        skip_cleanup;  /// Whether or not to skip a call to FreeAllCached() when destructor is called.  (The CUDA runtime may have already shut down for statically declared allocators)
-    bool debug;        /// Whether or not to print (de)allocation events to stdout
-
-    TotalBytes cached_bytes;     /// Aggregate cached bytes
-    CachedBlocks cached_blocks;  /// Set of cached pinned host allocations available for reuse
-    BusyBlocks live_blocks;      /// Set of live pinned host allocations currently in use
-
-#endif  // DOXYGEN_SHOULD_SKIP_THIS
-
-    //---------------------------------------------------------------------
-    // Methods
-    //---------------------------------------------------------------------
-
-    /**
-     * \brief Constructor.
-     */
-    CachingHostAllocator(
-        unsigned int bin_growthx,                 ///< Geometric growth factor for bin-sizes
-        unsigned int min_binx = 1,                ///< Minimum bin (default is bin_growth ^ 1)
-        unsigned int max_binx = INVALID_BIN,      ///< Maximum bin (default is no max bin)
-        size_t max_cached_bytesx = INVALID_SIZE,  ///< Maximum aggregate cached bytes (default is no limit)
-        bool skip_cleanupx =
-            false,  ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate)
-        bool debugx = false)  ///< Whether or not to print (de)allocation events to stdout (default is no stderr output)
-        : bin_growth(bin_growthx),
-          min_bin(min_binx),
-          max_bin(max_binx),
-          min_bin_bytes(IntPow(bin_growthx, min_binx)),
-          max_bin_bytes(IntPow(bin_growthx, max_binx)),
-          max_cached_bytes(max_cached_bytesx),
-          skip_cleanup(skip_cleanupx),
-          debug(debugx),
-          cached_blocks(BlockDescriptor::SizeCompare),
-          live_blocks(BlockDescriptor::PtrCompare) {}
-
-    /**
-     * \brief Default constructor.
-     *
-     * Configured with:
-     * \par
-     * - \p bin_growth          = 8
-     * - \p min_bin             = 3
-     * - \p max_bin             = 7
-     * - \p max_cached_bytes    = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes
-     *
-     * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and
-     * sets a maximum of 6,291,455 cached bytes
-     */
-    CachingHostAllocator(bool skip_cleanupx = false, bool debugx = false)
-        : bin_growth(8),
-          min_bin(3),
-          max_bin(7),
-          min_bin_bytes(IntPow(bin_growth, min_bin)),
-          max_bin_bytes(IntPow(bin_growth, max_bin)),
-          max_cached_bytes((max_bin_bytes * 3) - 1),
-          skip_cleanup(skip_cleanupx),
-          debug(debugx),
-          cached_blocks(BlockDescriptor::SizeCompare),
-          live_blocks(BlockDescriptor::PtrCompare) {}
-
-    /**
-     * \brief Sets the limit on the number bytes this allocator is allowed to cache
-     *
-     * Changing the ceiling of cached bytes does not cause any allocations (in-use or
-     * cached-in-reserve) to be freed.  See \p FreeAllCached().
-     */
-    void SetMaxCachedBytes(size_t max_cached_bytesx) {
-      // Lock
-      mutex.Lock();
-
-      if (debug)
-        _CubLog("Changing max_cached_bytes (%lld -> %lld)\n",
-                (long long)this->max_cached_bytes,
-                (long long)max_cached_bytesx);
-
-      this->max_cached_bytes = max_cached_bytesx;
-
-      // Unlock
-      mutex.Unlock();
-    }
-
-    /**
-     * \brief Provides a suitable allocation of pinned host memory for the given size.
-     *
-     * Once freed, the allocation becomes available immediately for reuse.
-     */
-    cudaError_t HostAllocate(
-        void **d_ptr,                          ///< [out] Reference to pointer to the allocation
-        size_t bytes,                          ///< [in] Minimum number of bytes for the allocation
-        cudaStream_t active_stream = nullptr)  ///< [in] The stream to be associated with this allocation
-    {
-      *d_ptr = nullptr;
-      int device = INVALID_DEVICE_ORDINAL;
-      cudaError_t error = cudaSuccess;
-
-      if (CubDebug(error = cudaGetDevice(&device)))
-        return error;
-
-      // Create a block descriptor for the requested allocation
-      bool found = false;
-      BlockDescriptor search_key;
-      search_key.device = device;
-      search_key.associated_stream = active_stream;
-      NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);
-
-      if (search_key.bin > max_bin) {
-        // Bin is greater than our maximum bin: allocate the request
-        // exactly and give out-of-bounds bin.  It will not be cached
-        // for reuse when returned.
-        search_key.bin = INVALID_BIN;
-        search_key.bytes = bytes;
-      } else {
-        // Search for a suitable cached allocation: lock
-        mutex.Lock();
-
-        if (search_key.bin < min_bin) {
-          // Bin is less than minimum bin: round up
-          search_key.bin = min_bin;
-          search_key.bytes = min_bin_bytes;
-        }
-
-        // Iterate through the range of cached blocks in the same bin
-        CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
-        while ((block_itr != cached_blocks.end()) && (block_itr->bin == search_key.bin)) {
-          // To prevent races with reusing blocks returned by the host but still
-          // in use for transfers, only consider cached blocks that are from an idle stream
-          if (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady) {
-            // Reuse existing cache block.  Insert into live blocks.
-            found = true;
-            search_key = *block_itr;
-            search_key.associated_stream = active_stream;
-            if (search_key.device != device) {
-              // If "associated" device changes, need to re-create the event on the right device
-              if (CubDebug(error = cudaSetDevice(search_key.device)))
-                return error;
-              if (CubDebug(error = cudaEventDestroy(search_key.ready_event)))
-                return error;
-              if (CubDebug(error = cudaSetDevice(device)))
-                return error;
-              if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
-                return error;
-              search_key.device = device;
-            }
-
-            live_blocks.insert(search_key);
-
-            // Remove from free blocks
-            cached_bytes.free -= search_key.bytes;
-            cached_bytes.live += search_key.bytes;
-
-            if (debug)
-              _CubLog(
-                  "\tHost reused cached block at %p (%lld bytes) for stream %lld, event %lld on device %lld "
-                  "(previously associated with stream %lld, event %lld).\n",
-                  search_key.d_ptr,
-                  (long long)search_key.bytes,
-                  (long long)search_key.associated_stream,
-                  (long long)search_key.ready_event,
-                  (long long)search_key.device,
-                  (long long)block_itr->associated_stream,
-                  (long long)block_itr->ready_event);
-
-            cached_blocks.erase(block_itr);
-
-            break;
-          }
-          block_itr++;
-        }
-
-        // Done searching: unlock
-        mutex.Unlock();
-      }
-
-      // Allocate the block if necessary
-      if (!found) {
-        // Attempt to allocate
-        // TODO: eventually support allocation flags
-        if (CubDebug(error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault)) ==
-            cudaErrorMemoryAllocation) {
-          // The allocation attempt failed: free all cached blocks on device and retry
-          if (debug)
-            _CubLog(
-                "\tHost failed to allocate %lld bytes for stream %lld on device %lld, retrying after freeing cached "
-                "allocations",
-                (long long)search_key.bytes,
-                (long long)search_key.associated_stream,
-                (long long)search_key.device);
-
-          error = cudaSuccess;  // Reset the error we will return
-          cudaGetLastError();   // Reset CUDART's error
-
-          // Lock
-          mutex.Lock();
-
-          // Iterate the range of free blocks
-          CachedBlocks::iterator block_itr = cached_blocks.begin();
-
-          while ((block_itr != cached_blocks.end())) {
-            // No need to worry about synchronization with the device: cudaFree is
-            // blocking and will synchronize across all kernels executing
-            // on the current device
-
-            // Free pinned host memory.
-            if (CubDebug(error = cudaFreeHost(block_itr->d_ptr)))
-              break;
-            if (CubDebug(error = cudaEventDestroy(block_itr->ready_event)))
-              break;
-
-            // Reduce balance and erase entry
-            cached_bytes.free -= block_itr->bytes;
-
-            if (debug)
-              _CubLog(
-                  "\tHost freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld "
-                  "bytes) outstanding.\n",
-                  (long long)block_itr->bytes,
-                  (long long)cached_blocks.size(),
-                  (long long)cached_bytes.free,
-                  (long long)live_blocks.size(),
-                  (long long)cached_bytes.live);
-
-            cached_blocks.erase(block_itr);
-
-            block_itr++;
-          }
-
-          // Unlock
-          mutex.Unlock();
-
-          // Return under error
-          if (error)
-            return error;
-
-          // Try to allocate again
-          if (CubDebug(error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault)))
-            return error;
-        }
-
-        // Create ready event
-        if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
-          return error;
-
-        // Insert into live blocks
-        mutex.Lock();
-        live_blocks.insert(search_key);
-        cached_bytes.live += search_key.bytes;
-        mutex.Unlock();
-
-        if (debug)
-          _CubLog(
-              "\tHost allocated new host block at %p (%lld bytes associated with stream %lld, event %lld on device "
-              "%lld).\n",
-              search_key.d_ptr,
-              (long long)search_key.bytes,
-              (long long)search_key.associated_stream,
-              (long long)search_key.ready_event,
-              (long long)search_key.device);
-      }
-
-      // Copy host pointer to output parameter
-      *d_ptr = search_key.d_ptr;
-
-      if (debug)
-        _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
-                (long long)cached_blocks.size(),
-                (long long)cached_bytes.free,
-                (long long)live_blocks.size(),
-                (long long)cached_bytes.live);
-
-      return error;
-    }
-
-    /**
-     * \brief Frees a live allocation of pinned host memory, returning it to the allocator.
-     *
-     * Once freed, the allocation becomes available immediately for reuse.
-     */
-    cudaError_t HostFree(void *d_ptr) {
-      int entrypoint_device = INVALID_DEVICE_ORDINAL;
-      cudaError_t error = cudaSuccess;
-
-      // Lock
-      mutex.Lock();
-
-      // Find corresponding block descriptor
-      bool recached = false;
-      BlockDescriptor search_key(d_ptr);
-      BusyBlocks::iterator block_itr = live_blocks.find(search_key);
-      if (block_itr != live_blocks.end()) {
-        // Remove from live blocks
-        search_key = *block_itr;
-        live_blocks.erase(block_itr);
-        cached_bytes.live -= search_key.bytes;
-
-        // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold
-        if ((search_key.bin != INVALID_BIN) && (cached_bytes.free + search_key.bytes <= max_cached_bytes)) {
-          // Insert returned allocation into free blocks
-          recached = true;
-          cached_blocks.insert(search_key);
-          cached_bytes.free += search_key.bytes;
-
-          if (debug)
-            _CubLog(
-                "\tHost returned %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld "
-                "available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
-                (long long)search_key.bytes,
-                (long long)search_key.associated_stream,
-                (long long)search_key.ready_event,
-                (long long)search_key.device,
-                (long long)cached_blocks.size(),
-                (long long)cached_bytes.free,
-                (long long)live_blocks.size(),
-                (long long)cached_bytes.live);
-        }
-      }
-
-      if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
-        return error;
-      if (entrypoint_device != search_key.device) {
-        if (CubDebug(error = cudaSetDevice(search_key.device)))
-          return error;
-      }
-
-      if (recached) {
-        // Insert the ready event in the associated stream (must have current device set properly)
-        if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream)))
-          return error;
-      }
-
-      // Unlock
-      mutex.Unlock();
-
-      if (!recached) {
-        // Free the allocation from the runtime and cleanup the event.
-        if (CubDebug(error = cudaFreeHost(d_ptr)))
-          return error;
-        if (CubDebug(error = cudaEventDestroy(search_key.ready_event)))
-          return error;
-
-        if (debug)
-          _CubLog(
-              "\tHost freed %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t  %lld available "
-              "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
-              (long long)search_key.bytes,
-              (long long)search_key.associated_stream,
-              (long long)search_key.ready_event,
-              (long long)search_key.device,
-              (long long)cached_blocks.size(),
-              (long long)cached_bytes.free,
-              (long long)live_blocks.size(),
-              (long long)cached_bytes.live);
-      }
-
-      // Reset device
-      if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != search_key.device)) {
-        if (CubDebug(error = cudaSetDevice(entrypoint_device)))
-          return error;
-      }
-
-      return error;
-    }
-
-    /**
-     * \brief Frees all cached pinned host allocations
-     */
-    cudaError_t FreeAllCached() {
-      cudaError_t error = cudaSuccess;
-      int entrypoint_device = INVALID_DEVICE_ORDINAL;
-      int current_device = INVALID_DEVICE_ORDINAL;
-
-      mutex.Lock();
-
-      while (!cached_blocks.empty()) {
-        // Get first block
-        CachedBlocks::iterator begin = cached_blocks.begin();
-
-        // Get entry-point device ordinal if necessary
-        if (entrypoint_device == INVALID_DEVICE_ORDINAL) {
-          if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
-            break;
-        }
-
-        // Set current device ordinal if necessary
-        if (begin->device != current_device) {
-          if (CubDebug(error = cudaSetDevice(begin->device)))
-            break;
-          current_device = begin->device;
-        }
-
-        // Free host memory
-        if (CubDebug(error = cudaFreeHost(begin->d_ptr)))
-          break;
-        if (CubDebug(error = cudaEventDestroy(begin->ready_event)))
-          break;
-
-        // Reduce balance and erase entry
-        cached_bytes.free -= begin->bytes;
-
-        if (debug)
-          _CubLog(
-              "\tHost freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld "
-              "bytes) outstanding.\n",
-              (long long)begin->bytes,
-              (long long)cached_blocks.size(),
-              (long long)cached_bytes.free,
-              (long long)live_blocks.size(),
-              (long long)cached_bytes.live);
-
-        cached_blocks.erase(begin);
-      }
-
-      mutex.Unlock();
-
-      // Attempt to revert back to entry-point device if necessary
-      if (entrypoint_device != INVALID_DEVICE_ORDINAL) {
-        if (CubDebug(error = cudaSetDevice(entrypoint_device)))
-          return error;
-      }
-
-      return error;
-    }
-
-    /**
-     * \brief Destructor
-     */
-    ~CachingHostAllocator() {
-      if (!skip_cleanup)
-        FreeAllCached();
-    }
-  };
-
-  /** @} */  // end group UtilMgmt
-
-}  // namespace notcub
-
-#endif
diff --git a/SDL/CachingManagedAllocator.h b/SDL/CachingManagedAllocator.h
deleted file mode 100644
index 6830be63..00000000
--- a/SDL/CachingManagedAllocator.h
+++ /dev/null
@@ -1,662 +0,0 @@
-#ifndef HeterogenousCore_CUDAUtilities_src_CachingManagedAllocator_h
-#define HeterogenousCore_CUDAUtilities_src_CachingManagedAllocator_h
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * Modified to cache managed memory allocations by Matti Kortelainen
- */
-
-/******************************************************************************
- * Simple caching allocator for managed memory allocations. The allocator is
- * thread-safe.
- ******************************************************************************/
-
-#include <cmath>
-#include <map>
-#include <set>
-
-#include <cub/util_debug.cuh>
-#include <cub/host/mutex.cuh>
-//#include </mnt/data1/dsr/cub/cub/util_debug.cuh>
-//#include </mnt/data1/dsr/cub/cub/host/mutex.cuh>
-
-/// CUB namespace
-namespace notcub {
-
-  /**
- * \addtogroup UtilMgmt
- * @{
- */
-
-  /******************************************************************************
- * CachingManagedAllocator (host use)
- ******************************************************************************/
-
-  /**
- * \brief A simple caching allocator managed memory allocations.
- *
- * \par Overview
- * The allocator is thread-safe.  It behaves as follows:
- *
- * I presume the CUDA stream-safeness is not useful as to read/write
- * from/to the managed memory one needs to synchronize anyway. The
- * difference wrt. device memory is that in the CPU all operations to
- * the device memory are scheduled via the CUDA stream, while for the
- * managed memory one can perform operations directly.
- *
- * \par
- * - Allocations are categorized and cached by bin size.  A new allocation request of
- *   a given size will only consider cached allocations within the corresponding bin.
- * - Bin limits progress geometrically in accordance with the growth factor
- *   \p bin_growth provided during construction.  Unused host allocations within
- *   a larger bin cache are not reused for allocation requests that categorize to
- *   smaller bin sizes.
- * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to
- *   (\p bin_growth ^ \p min_bin).
- * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest
- *   bin and are simply freed when they are deallocated instead of being returned
- *   to a bin-cache.
- * - %If the total storage of cached allocations  will exceed
- *   \p max_cached_bytes, allocations are simply freed when they are
- *   deallocated instead of being returned to their bin-cache.
- *
- * \par
- * For example, the default-constructed CachingHostAllocator is configured with:
- * - \p bin_growth          = 8
- * - \p min_bin             = 3
- * - \p max_bin             = 7
- * - \p max_cached_bytes    = 6MB - 1B
- *
- * \par
- * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB
- * and sets a maximum of 6,291,455 cached bytes
- *
- */
-  struct CachingManagedAllocator {
-    //---------------------------------------------------------------------
-    // Constants
-    //---------------------------------------------------------------------
-
-    /// Out-of-bounds bin
-    static const unsigned int INVALID_BIN = (unsigned int)-1;
-
-    /// Invalid size
-    static const size_t INVALID_SIZE = (size_t)-1;
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS  // Do not document
-
-    /// Invalid device ordinal
-    static const int INVALID_DEVICE_ORDINAL = -1;
-
-    //---------------------------------------------------------------------
-    // Type definitions and helper types
-    //---------------------------------------------------------------------
-
-    /**
-     * Descriptor for pinned managed memory allocations
-     */
-    struct BlockDescriptor {
-      void *d_ptr;                     // Managed pointer
-      size_t bytes;                    // Size of allocation in bytes
-      unsigned int bin;                // Bin enumeration
-      int device;                      // device ordinal
-      cudaStream_t associated_stream;  // Associated associated_stream
-      cudaEvent_t ready_event;  // Signal when associated stream has run to the point at which this block was freed
-
-      // Constructor (suitable for searching maps for a specific block, given its pointer)
-      BlockDescriptor(void *d_ptrx)
-          : d_ptr(d_ptrx),
-            bytes(0),
-            bin(INVALID_BIN),
-            device(INVALID_DEVICE_ORDINAL),
-            associated_stream(nullptr),
-            ready_event(nullptr) {}
-
-      // Constructor (suitable for searching maps for a range of suitable blocks)
-      BlockDescriptor()
-          : d_ptr(nullptr),
-            bytes(0),
-            bin(INVALID_BIN),
-            device(INVALID_DEVICE_ORDINAL),
-            associated_stream(nullptr),
-            ready_event(nullptr) {}
-
-      // Comparison functor for comparing managed pointers
-      static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) { return (a.d_ptr < b.d_ptr); }
-
-      // Comparison functor for comparing allocation sizes
-      static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) { return (a.bytes < b.bytes); }
-    };
-
-    /// BlockDescriptor comparator function interface
-    typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);
-
-    class TotalBytes {
-    public:
-      size_t free;
-      size_t live;
-      TotalBytes() { free = live = 0; }
-    };
-
-    /// Set type for cached blocks (ordered by size)
-    typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;
-
-    /// Set type for live blocks (ordered by ptr)
-    typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;
-
-    //---------------------------------------------------------------------
-    // Utility functions
-    //---------------------------------------------------------------------
-
-    /**
-     * Integer pow function for unsigned base and exponent
-     */
-    static unsigned int IntPow(unsigned int base, unsigned int exp) {
-      unsigned int retval = 1;
-      while (exp > 0) {
-        if (exp & 1) {
-          retval = retval * base;  // multiply the result by the current base
-        }
-        base = base * base;  // square the base
-        exp = exp >> 1;      // divide the exponent in half
-      }
-      return retval;
-    }
-
-    /**
-     * Round up to the nearest power-of
-     */
-    void NearestPowerOf(unsigned int &power, size_t &rounded_bytes, unsigned int base, size_t value) {
-      power = 0;
-      rounded_bytes = 1;
-
-      if (value * base < value) {
-        // Overflow
-        power = sizeof(size_t) * 8;
-        rounded_bytes = size_t(0) - 1;
-        return;
-      }
-
-      while (rounded_bytes < value) {
-        rounded_bytes *= base;
-        power++;
-      }
-    }
-
-    //---------------------------------------------------------------------
-    // Fields
-    //---------------------------------------------------------------------
-
-    cub::Mutex mutex;  /// Mutex for thread-safety
-
-    unsigned int bin_growth;  /// Geometric growth factor for bin-sizes
-    unsigned int min_bin;     /// Minimum bin enumeration
-    unsigned int max_bin;     /// Maximum bin enumeration
-
-    size_t min_bin_bytes;     /// Minimum bin size
-    size_t max_bin_bytes;     /// Maximum bin size
-    size_t max_cached_bytes;  /// Maximum aggregate cached bytes
-
-    const bool
-        skip_cleanup;  /// Whether or not to skip a call to FreeAllCached() when destructor is called.  (The CUDA runtime may have already shut down for statically declared allocators)
-    bool debug;        /// Whether or not to print (de)allocation events to stdout
-
-    TotalBytes cached_bytes;     /// Aggregate cached bytes
-    CachedBlocks cached_blocks;  /// Set of cached managed memory allocations available for reuse
-    BusyBlocks live_blocks;      /// Set of live managed memory allocations currently in use
-
-#endif  // DOXYGEN_SHOULD_SKIP_THIS
-
-    //---------------------------------------------------------------------
-    // Methods
-    //---------------------------------------------------------------------
-
-    /**
-     * \brief Constructor.
-     */
-    CachingManagedAllocator(
-        unsigned int bin_growthx,                 ///< Geometric growth factor for bin-sizes
-        unsigned int min_binx = 1,                ///< Minimum bin (default is bin_growth ^ 1)
-        unsigned int max_binx = INVALID_BIN,      ///< Maximum bin (default is no max bin)
-        size_t max_cached_bytesx = INVALID_SIZE,  ///< Maximum aggregate cached bytes (default is no limit)
-        bool skip_cleanupx =
-            false,  ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate)
-        bool debugx = false)  ///< Whether or not to print (de)allocation events to stdout (default is no stderr output)
-        : bin_growth(bin_growthx),
-          min_bin(min_binx),
-          max_bin(max_binx),
-          min_bin_bytes(IntPow(bin_growthx, min_binx)),
-          max_bin_bytes(IntPow(bin_growthx, max_binx)),
-          max_cached_bytes(max_cached_bytesx),
-          skip_cleanup(skip_cleanupx),
-          debug(debugx),
-          cached_blocks(BlockDescriptor::SizeCompare),
-          live_blocks(BlockDescriptor::PtrCompare) {}
-
-    /**
-     * \brief Default constructor.
-     *
-     * Configured with:
-     * \par
-     * - \p bin_growth          = 8
-     * - \p min_bin             = 3
-     * - \p max_bin             = 7
-     * - \p max_cached_bytes    = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes
-     *
-     * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and
-     * sets a maximum of 6,291,455 cached bytes
-     */
-    CachingManagedAllocator(bool skip_cleanupx = false, bool debugx = false)
-        : bin_growth(8),
-          min_bin(3),
-          max_bin(7),
-          min_bin_bytes(IntPow(bin_growth, min_bin)),
-          max_bin_bytes(IntPow(bin_growth, max_bin)),
-          max_cached_bytes((max_bin_bytes * 3) - 1),
-          skip_cleanup(skip_cleanupx),
-          debug(debugx),
-          cached_blocks(BlockDescriptor::SizeCompare),
-          live_blocks(BlockDescriptor::PtrCompare) {}
-
-    /**
-     * \brief Sets the limit on the number bytes this allocator is allowed to cache
-     *
-     * Changing the ceiling of cached bytes does not cause any allocations (in-use or
-     * cached-in-reserve) to be freed.  See \p FreeAllCached().
-     */
-    void SetMaxCachedBytes(size_t max_cached_bytesx) {
-      // Lock
-      mutex.Lock();
-
-      if (debug)
-        _CubLog("Changing max_cached_bytes (%lld -> %lld)\n",
-                (long long)this->max_cached_bytes,
-                (long long)max_cached_bytesx);
-
-      this->max_cached_bytes = max_cached_bytesx;
-
-      // Unlock
-      mutex.Unlock();
-    }
-
-    /**
-     * \brief Provides a suitable allocation of managed memory for the given size.
-     *
-     * Once freed, the allocation becomes available immediately for reuse.
-     */
-    cudaError_t ManagedAllocate(
-        void **d_ptr,                          ///< [out] Reference to pointer to the allocation
-        size_t bytes,                          ///< [in] Minimum number of bytes for the allocation
-        cudaStream_t active_stream = nullptr)  ///< [in] The stream to be associated with this allocation
-    {
-      *d_ptr = nullptr;
-      int device = INVALID_DEVICE_ORDINAL;
-      cudaError_t error = cudaSuccess;
-
-      if (CubDebug(error = cudaGetDevice(&device)))
-        return error;
-
-      // Create a block descriptor for the requested allocation
-      bool found = false;
-      BlockDescriptor search_key;
-      search_key.device = device;
-      search_key.associated_stream = active_stream;
-      NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);
-
-      if (search_key.bin > max_bin) {
-        // Bin is greater than our maximum bin: allocate the request
-        // exactly and give out-of-bounds bin.  It will not be cached
-        // for reuse when returned.
-        search_key.bin = INVALID_BIN;
-        search_key.bytes = bytes;
-      } else {
-        // Search for a suitable cached allocation: lock
-        mutex.Lock();
-
-        if (search_key.bin < min_bin) {
-          // Bin is less than minimum bin: round up
-          search_key.bin = min_bin;
-          search_key.bytes = min_bin_bytes;
-        }
-
-        // Iterate through the range of cached blocks in the same bin
-        CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
-        while ((block_itr != cached_blocks.end()) && (block_itr->bin == search_key.bin)) {
-          // To prevent races with reusing blocks returned by the host but still
-          // in use for transfers, only consider cached blocks that are from an idle stream
-          if (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady) {
-            // Reuse existing cache block.  Insert into live blocks.
-            found = true;
-            search_key = *block_itr;
-            search_key.associated_stream = active_stream;
-            if (search_key.device != device) {
-              // If "associated" device changes, need to re-create the event on the right device
-              if (CubDebug(error = cudaSetDevice(search_key.device)))
-                return error;
-              if (CubDebug(error = cudaEventDestroy(search_key.ready_event)))
-                return error;
-              if (CubDebug(error = cudaSetDevice(device)))
-                return error;
-              if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
-                return error;
-              search_key.device = device;
-            }
-
-            live_blocks.insert(search_key);
-
-            // Remove from free blocks
-            cached_bytes.free -= search_key.bytes;
-            cached_bytes.live += search_key.bytes;
-
-            if (debug)
-              _CubLog(
-                  "\tHost reused cached block at %p (%lld bytes) for stream %lld, event %lld on device %lld "
-                  "(previously associated with stream %lld, event %lld).\n",
-                  search_key.d_ptr,
-                  (long long)search_key.bytes,
-                  (long long)search_key.associated_stream,
-                  (long long)search_key.ready_event,
-                  (long long)search_key.device,
-                  (long long)block_itr->associated_stream,
-                  (long long)block_itr->ready_event);
-
-            cached_blocks.erase(block_itr);
-
-            break;
-          }
-          block_itr++;
-        }
-
-        // Done searching: unlock
-        mutex.Unlock();
-      }
-
-      // Allocate the block if necessary
-      if (!found) {
-        // Attempt to allocate
-        // TODO: eventually support allocation flags
-        if (CubDebug(error = cudaMallocManaged(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation) {
-          // The allocation attempt failed: free all cached blocks on device and retry
-          if (debug)
-            _CubLog(
-                "\tHost failed to allocate %lld bytes for stream %lld on device %lld, retrying after freeing cached "
-                "allocations",
-                (long long)search_key.bytes,
-                (long long)search_key.associated_stream,
-                (long long)search_key.device);
-
-          error = cudaSuccess;  // Reset the error we will return
-          cudaGetLastError();   // Reset CUDART's error
-
-          // Lock
-          mutex.Lock();
-
-          // Iterate the range of free blocks
-          CachedBlocks::iterator block_itr = cached_blocks.begin();
-
-          while ((block_itr != cached_blocks.end())) {
-            // No need to worry about synchronization with the device: cudaFree is
-            // blocking and will synchronize across all kernels executing
-            // on the current device
-
-            // Free managed memory.
-            if (CubDebug(error = cudaFree(block_itr->d_ptr)))
-              break;
-            if (CubDebug(error = cudaEventDestroy(block_itr->ready_event)))
-              break;
-
-            // Reduce balance and erase entry
-            cached_bytes.free -= block_itr->bytes;
-
-            if (debug)
-              _CubLog(
-                  "\tHost freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld "
-                  "bytes) outstanding.\n",
-                  (long long)block_itr->bytes,
-                  (long long)cached_blocks.size(),
-                  (long long)cached_bytes.free,
-                  (long long)live_blocks.size(),
-                  (long long)cached_bytes.live);
-
-            cached_blocks.erase(block_itr);
-
-            block_itr++;
-          }
-
-          // Unlock
-          mutex.Unlock();
-
-          // Return under error
-          if (error)
-            return error;
-
-          // Try to allocate again
-          if (CubDebug(error = cudaMallocManaged(&search_key.d_ptr, search_key.bytes)))
-            return error;
-        }
-
-        // Create ready event
-        if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
-          return error;
-
-        // Insert into live blocks
-        mutex.Lock();
-        live_blocks.insert(search_key);
-        cached_bytes.live += search_key.bytes;
-        mutex.Unlock();
-
-        if (debug)
-          _CubLog(
-              "\tHost allocated new host block at %p (%lld bytes associated with stream %lld, event %lld on device "
-              "%lld).\n",
-              search_key.d_ptr,
-              (long long)search_key.bytes,
-              (long long)search_key.associated_stream,
-              (long long)search_key.ready_event,
-              (long long)search_key.device);
-      }
-
-      // Copy host pointer to output parameter
-      *d_ptr = search_key.d_ptr;
-
-      if (debug)
-        _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
-                (long long)cached_blocks.size(),
-                (long long)cached_bytes.free,
-                (long long)live_blocks.size(),
-                (long long)cached_bytes.live);
-
-      return error;
-    }
-
-    /**
-     * \brief Frees a live allocation of managed memory, returning it to the allocator.
-     *
-     * Once freed, the allocation becomes available immediately for reuse.
-     */
-    cudaError_t ManagedFree(void *d_ptr) {
-      int entrypoint_device = INVALID_DEVICE_ORDINAL;
-      cudaError_t error = cudaSuccess;
-
-      // Lock
-      mutex.Lock();
-
-      // Find corresponding block descriptor
-      bool recached = false;
-      BlockDescriptor search_key(d_ptr);
-      BusyBlocks::iterator block_itr = live_blocks.find(search_key);
-      if (block_itr != live_blocks.end()) {
-        // Remove from live blocks
-        search_key = *block_itr;
-        live_blocks.erase(block_itr);
-        cached_bytes.live -= search_key.bytes;
-
-        // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold
-        if ((search_key.bin != INVALID_BIN) && (cached_bytes.free + search_key.bytes <= max_cached_bytes)) {
-          // Insert returned allocation into free blocks
-          recached = true;
-          cached_blocks.insert(search_key);
-          cached_bytes.free += search_key.bytes;
-
-          if (debug)
-            _CubLog(
-                "\tHost returned %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld "
-                "available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
-                (long long)search_key.bytes,
-                (long long)search_key.associated_stream,
-                (long long)search_key.ready_event,
-                (long long)search_key.device,
-                (long long)cached_blocks.size(),
-                (long long)cached_bytes.free,
-                (long long)live_blocks.size(),
-                (long long)cached_bytes.live);
-        }
-      }
-
-      if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
-        return error;
-      if (entrypoint_device != search_key.device) {
-        if (CubDebug(error = cudaSetDevice(search_key.device)))
-          return error;
-      }
-
-      if (recached) {
-        // Insert the ready event in the associated stream (must have current device set properly)
-        if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream)))
-          return error;
-      }
-
-      // Unlock
-      mutex.Unlock();
-
-      if (!recached) {
-        // Free the allocation from the runtime and cleanup the event.
-        if (CubDebug(error = cudaFree(d_ptr)))
-          return error;
-        if (CubDebug(error = cudaEventDestroy(search_key.ready_event)))
-          return error;
-
-        if (debug)
-          _CubLog(
-              "\tHost freed %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t  %lld available "
-              "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
-              (long long)search_key.bytes,
-              (long long)search_key.associated_stream,
-              (long long)search_key.ready_event,
-              (long long)search_key.device,
-              (long long)cached_blocks.size(),
-              (long long)cached_bytes.free,
-              (long long)live_blocks.size(),
-              (long long)cached_bytes.live);
-      }
-
-      // Reset device
-      if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != search_key.device)) {
-        if (CubDebug(error = cudaSetDevice(entrypoint_device)))
-          return error;
-      }
-
-      return error;
-    }
-
-    /**
-     * \brief Frees all cached managed memory allocations
-     */
-    cudaError_t FreeAllCached() {
-      cudaError_t error = cudaSuccess;
-      int entrypoint_device = INVALID_DEVICE_ORDINAL;
-      int current_device = INVALID_DEVICE_ORDINAL;
-
-      mutex.Lock();
-
-      while (!cached_blocks.empty()) {
-        // Get first block
-        CachedBlocks::iterator begin = cached_blocks.begin();
-
-        // Get entry-point device ordinal if necessary
-        if (entrypoint_device == INVALID_DEVICE_ORDINAL) {
-          if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
-            break;
-        }
-
-        // Set current device ordinal if necessary
-        if (begin->device != current_device) {
-          if (CubDebug(error = cudaSetDevice(begin->device)))
-            break;
-          current_device = begin->device;
-        }
-
-        // Free managed memory
-        if (CubDebug(error = cudaFree(begin->d_ptr)))
-          break;
-        if (CubDebug(error = cudaEventDestroy(begin->ready_event)))
-          break;
-
-        // Reduce balance and erase entry
-        cached_bytes.free -= begin->bytes;
-
-        if (debug)
-          _CubLog(
-              "\tHost freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld "
-              "bytes) outstanding.\n",
-              (long long)begin->bytes,
-              (long long)cached_blocks.size(),
-              (long long)cached_bytes.free,
-              (long long)live_blocks.size(),
-              (long long)cached_bytes.live);
-
-        cached_blocks.erase(begin);
-      }
-
-      mutex.Unlock();
-
-      // Attempt to revert back to entry-point device if necessary
-      if (entrypoint_device != INVALID_DEVICE_ORDINAL) {
-        if (CubDebug(error = cudaSetDevice(entrypoint_device)))
-          return error;
-      }
-
-      return error;
-    }
-
-    /**
-     * \brief Destructor
-     */
-    ~CachingManagedAllocator() {
-      if (!skip_cleanup)
-        FreeAllCached();
-    }
-  };
-
-  /** @} */  // end group UtilMgmt
-
-}  // namespace notcub
-
-#endif
diff --git a/SDL/Constants.cuh b/SDL/Constants.cuh
index 3f5d0c3e..49412d68 100644
--- a/SDL/Constants.cuh
+++ b/SDL/Constants.cuh
@@ -4,6 +4,10 @@
 #include <cuda_fp16.h>
 #include <alpaka/alpaka.hpp>
 
+// CUDA headers. Will be removed soon.
+#include <cuda.h>
+#include <cuda_runtime.h>
+
 #ifdef FP16_Base //This changes pT5 and pT3 and T3 completely. T5 for non regression parameters
 #define __F2H __float2half  
 #define __H2F __half2float  
@@ -107,6 +111,10 @@ const unsigned int size_superbins = 45000;
 // Temporary fix for endcap buffer allocation.
 const unsigned int endcap_size = 9105;
 
+// Temporary fix for module buffer allocation.
+const unsigned int modules_size = 26401;
+const unsigned int pix_tot = 1796504;
+
 namespace SDL
 {
     //defining the constant host device variables right up here
diff --git a/SDL/Event.cu b/SDL/Event.cu
index 3c8aa2c1..eab44436 100644
--- a/SDL/Event.cu
+++ b/SDL/Event.cu
@@ -1,16 +1,13 @@
 #include "Event.cuh"
 
-struct SDL::modules* SDL::modulesInGPU = nullptr;
-std::unique_ptr<SDL::pixelMap> SDL::pixelMapping = std::make_unique<pixelMap>();
+std::shared_ptr<SDL::modules> SDL::modulesInGPU = std::make_shared<modules>();
+std::shared_ptr<SDL::modulesBuffer<Acc>> SDL::modulesBuffers = std::make_shared<modulesBuffer<Acc>>(devAcc);
+std::shared_ptr<SDL::pixelMap> SDL::pixelMapping = std::make_shared<pixelMap>();
 uint16_t SDL::nModules;
 uint16_t SDL::nLowerModules;
 
 SDL::Event::Event(cudaStream_t estream, bool verbose): queue(alpaka::getDevByIdx<Acc>(0u))
 {
-    int version;
-    int driver;
-    cudaRuntimeGetVersion(&version);
-    cudaDriverGetVersion(&driver);
     stream = estream;
     addObjects = verbose;
     hitsInGPU = nullptr;
@@ -36,7 +33,7 @@ SDL::Event::Event(cudaStream_t estream, bool verbose): queue(alpaka::getDevByIdx
     pixelQuintupletsInCPU = nullptr;
 
     //reset the arrays
-    for(int i = 0; i<6; i++)
+    for(int i = 0; i < 6; i++)
     {
         n_hits_by_layer_barrel_[i] = 0;
         n_minidoublets_by_layer_barrel_[i] = 0;
@@ -44,7 +41,7 @@ SDL::Event::Event(cudaStream_t estream, bool verbose): queue(alpaka::getDevByIdx
         n_triplets_by_layer_barrel_[i] = 0;
         n_trackCandidates_by_layer_barrel_[i] = 0;
         n_quintuplets_by_layer_barrel_[i] = 0;
-        if(i<5)
+        if(i < 5)
         {
             n_hits_by_layer_endcap_[i] = 0;
             n_minidoublets_by_layer_endcap_[i] = 0;
@@ -56,99 +53,10 @@ SDL::Event::Event(cudaStream_t estream, bool verbose): queue(alpaka::getDevByIdx
     }
 }
 
-SDL::Event::~Event()
-{
-    if(rangesInGPU != nullptr){delete rangesInGPU; delete rangesBuffers;}
-    if(mdsInGPU != nullptr){delete mdsInGPU; delete miniDoubletsBuffers;}
-    if(segmentsInGPU != nullptr){delete segmentsInGPU; delete segmentsBuffers;}
-    if(tripletsInGPU!= nullptr){delete tripletsInGPU; delete tripletsBuffers;}
-    if(trackCandidatesInGPU!= nullptr){delete trackCandidatesInGPU; delete trackCandidatesBuffers;}
-    if(hitsInGPU!= nullptr){delete hitsInGPU; delete hitsBuffers;}
-    if(pixelTripletsInGPU!= nullptr){delete pixelTripletsInGPU; delete pixelTripletsBuffers;}
-    if(pixelQuintupletsInGPU!= nullptr){delete pixelQuintupletsInGPU; delete pixelQuintupletsBuffers;}
-    if(quintupletsInGPU!= nullptr){delete quintupletsInGPU; delete quintupletsBuffers;}
-
-    if(hitsInCPU != nullptr)
-    {
-        delete hitsInCPU;
-    }
-    if(rangesInCPU != nullptr)
-    {
-        delete rangesInCPU;
-    }
-    if(mdsInCPU != nullptr)
-    {
-        delete mdsInCPU;
-    }
-    if(segmentsInCPU != nullptr)
-    {
-        delete segmentsInCPU;
-    }
-    if(tripletsInCPU != nullptr)
-    {
-        delete tripletsInCPU;
-    }
-    if(quintupletsInCPU != nullptr)
-    {
-        delete quintupletsInCPU;
-    }
-    if(pixelTripletsInCPU != nullptr)
-    {
-        delete pixelTripletsInCPU;
-    }
-    if(pixelQuintupletsInCPU != nullptr)
-    {
-        delete pixelQuintupletsInCPU;
-    }
-    if(trackCandidatesInCPU != nullptr)
-    {
-        delete trackCandidatesInCPU;
-    }
-    if(modulesInCPU != nullptr)
-    {
-        delete[] modulesInCPU->nLowerModules;
-        delete[] modulesInCPU->nModules;
-        delete[] modulesInCPU->detIds;
-        delete[] modulesInCPU->isLower;
-        delete[] modulesInCPU->layers;
-        delete[] modulesInCPU->subdets;
-        delete[] modulesInCPU->rings;
-        delete[] modulesInCPU->rods;
-        delete[] modulesInCPU->modules;
-        delete[] modulesInCPU->sides;
-        delete[] modulesInCPU->eta;
-        delete[] modulesInCPU->r;
-        delete[] modulesInCPU;
-    }
-    if(modulesInCPUFull != nullptr)
-    {
-        delete[] modulesInCPUFull->detIds;
-        delete[] modulesInCPUFull->moduleMap;
-        delete[] modulesInCPUFull->nConnectedModules;
-        delete[] modulesInCPUFull->drdzs;
-        delete[] modulesInCPUFull->slopes;
-        delete[] modulesInCPUFull->nModules;
-        delete[] modulesInCPUFull->nLowerModules;
-        delete[] modulesInCPUFull->layers;
-        delete[] modulesInCPUFull->rings;
-        delete[] modulesInCPUFull->modules;
-        delete[] modulesInCPUFull->rods;
-        delete[] modulesInCPUFull->subdets;
-        delete[] modulesInCPUFull->sides;
-        delete[] modulesInCPUFull->eta;
-        delete[] modulesInCPUFull->r;
-        delete[] modulesInCPUFull->isInverted;
-        delete[] modulesInCPUFull->isLower;
-        delete[] modulesInCPUFull->moduleType;
-        delete[] modulesInCPUFull->moduleLayerType;
-        delete[] modulesInCPUFull;
-    }
-}
-
 void SDL::Event::resetEvent()
 {
     //reset the arrays
-    for(int i = 0; i<6; i++)
+    for(int i = 0; i < 6; i++)
     {
         n_hits_by_layer_barrel_[i] = 0;
         n_minidoublets_by_layer_barrel_[i] = 0;
@@ -156,7 +64,7 @@ void SDL::Event::resetEvent()
         n_triplets_by_layer_barrel_[i] = 0;
         n_trackCandidates_by_layer_barrel_[i] = 0;
         n_quintuplets_by_layer_barrel_[i] = 0;
-        if(i<5)
+        if(i < 5)
         {
             n_hits_by_layer_endcap_[i] = 0;
             n_minidoublets_by_layer_endcap_[i] = 0;
@@ -232,43 +140,12 @@ void SDL::Event::resetEvent()
     }
     if(modulesInCPU != nullptr)
     {
-        delete[] modulesInCPU->nLowerModules;
-        delete[] modulesInCPU->nModules;
-        delete[] modulesInCPU->detIds;
-        delete[] modulesInCPU->isLower;
-        delete[] modulesInCPU->layers;
-        delete[] modulesInCPU->subdets;
-        delete[] modulesInCPU->rings;
-        delete[] modulesInCPU->rods;
-        delete[] modulesInCPU->modules;
-        delete[] modulesInCPU->sides;
-        delete[] modulesInCPU->eta;
-        delete[] modulesInCPU->r;
-        delete[] modulesInCPU;
+        delete modulesInCPU;
         modulesInCPU = nullptr;
     }
     if(modulesInCPUFull != nullptr)
     {
-        delete[] modulesInCPUFull->detIds;
-        delete[] modulesInCPUFull->moduleMap;
-        delete[] modulesInCPUFull->nConnectedModules;
-        delete[] modulesInCPUFull->drdzs;
-        delete[] modulesInCPUFull->slopes;
-        delete[] modulesInCPUFull->nModules;
-        delete[] modulesInCPUFull->nLowerModules;
-        delete[] modulesInCPUFull->layers;
-        delete[] modulesInCPUFull->rings;
-        delete[] modulesInCPUFull->modules;
-        delete[] modulesInCPUFull->rods;
-        delete[] modulesInCPUFull->sides;
-        delete[] modulesInCPUFull->subdets;
-        delete[] modulesInCPUFull->eta;
-        delete[] modulesInCPUFull->r;
-        delete[] modulesInCPUFull->isInverted;
-        delete[] modulesInCPUFull->isLower;
-        delete[] modulesInCPUFull->moduleType;
-        delete[] modulesInCPUFull->moduleLayerType;
-        delete[] modulesInCPUFull;
+        delete modulesInCPUFull;
         modulesInCPUFull = nullptr;
     }
 }
@@ -276,18 +153,20 @@ void SDL::Event::resetEvent()
 void SDL::initModules(const char* moduleMetaDataFilePath)
 {
     cudaStream_t default_stream = 0;
-    if(modulesInGPU == nullptr)
-    {
-        cudaMallocHost(&modulesInGPU, sizeof(struct SDL::modules));
-        //nModules gets filled here
-        loadModulesFromFile(*modulesInGPU,nModules,nLowerModules, *pixelMapping, default_stream, moduleMetaDataFilePath);
-    }
-}
-
-void SDL::cleanModules()
-{
-    freeModules(*modulesInGPU);
-    cudaFreeHost(modulesInGPU);
+    QueueAcc queue(devAcc);
+
+    // Set the relevant data pointers.
+    modulesInGPU->setData(*modulesBuffers);
+
+    // nModules gets filled here
+    loadModulesFromFile(modulesInGPU.get(),
+                        modulesBuffers.get(),
+                        nModules,
+                        nLowerModules,
+                        *pixelMapping,
+                        default_stream,
+                        queue,
+                        moduleMetaDataFilePath);
 }
 
 void SDL::Event::addHitToEvent(std::vector<float> x, std::vector<float> y, std::vector<float> z, std::vector<unsigned int> detId, std::vector<unsigned int> idxInNtuple)
@@ -299,9 +178,6 @@ void SDL::Event::addHitToEvent(std::vector<float> x, std::vector<float> y, std::
     auto nHits_buf = allocBufWrapper<unsigned int>(devHost, 1);
     *alpaka::getPtrNative(nHits_buf) = nHits;
 
-    // Get current device for future use.
-    cudaGetDevice(&dev);
-
     // Initialize space on device/host for next event.
     if (hitsInGPU == nullptr)
     {
@@ -492,22 +368,23 @@ void SDL::Event::addPixelSegmentToEvent(std::vector<unsigned int> hitIndices0,st
 
 void SDL::Event::addMiniDoubletsToEventExplicit()
 {
-    unsigned int* nMDsCPU;
-    nMDsCPU = (unsigned int*)cms::cuda::allocate_host(nLowerModules * sizeof(unsigned int), stream);
-    cudaMemcpyAsync(nMDsCPU,mdsInGPU->nMDs,nLowerModules*sizeof(unsigned int),cudaMemcpyDeviceToHost,stream);
-    cudaStreamSynchronize(stream);
+    auto nMDsCPU_buf = allocBufWrapper<int>(devHost, nLowerModules);
+    alpaka::memcpy(queue, nMDsCPU_buf, miniDoubletsBuffers->nMDs_buf, nLowerModules);
 
-    short* module_subdets;
-    module_subdets = (short*)cms::cuda::allocate_host(nLowerModules* sizeof(short), stream);
-    cudaMemcpyAsync(module_subdets,modulesInGPU->subdets,nLowerModules*sizeof(short),cudaMemcpyDeviceToHost,stream);
-    short* module_layers;
-    module_layers = (short*)cms::cuda::allocate_host(nLowerModules * sizeof(short), stream);
-    cudaMemcpyAsync(module_layers,modulesInGPU->layers,nLowerModules*sizeof(short),cudaMemcpyDeviceToHost,stream);
-    int* module_hitRanges;
-    module_hitRanges = (int*)cms::cuda::allocate_host(nLowerModules* 2*sizeof(int), stream);
-    cudaMemcpyAsync(module_hitRanges,hitsInGPU->hitRanges,nLowerModules*2*sizeof(int),cudaMemcpyDeviceToHost,stream);
+    auto module_subdets_buf = allocBufWrapper<short>(devHost, nLowerModules);
+    alpaka::memcpy(queue, module_subdets_buf, modulesBuffers->subdets_buf, nLowerModules);
 
-    cudaStreamSynchronize(stream);
+    auto module_layers_buf = allocBufWrapper<short>(devHost, nLowerModules);
+    alpaka::memcpy(queue, module_layers_buf, modulesBuffers->layers_buf, nLowerModules);
+
+    auto module_hitRanges_buf = allocBufWrapper<int>(devHost, nLowerModules*2);
+    alpaka::memcpy(queue, module_hitRanges_buf, hitsBuffers->hitRanges_buf, nLowerModules*2);
+
+    alpaka::wait(queue);
+    int* nMDsCPU = alpaka::getPtrNative(nMDsCPU_buf);
+    short* module_subdets = alpaka::getPtrNative(module_subdets_buf);
+    short* module_layers = alpaka::getPtrNative(module_layers_buf);
+    int* module_hitRanges = alpaka::getPtrNative(module_hitRanges_buf);
 
     for(unsigned int i = 0; i<nLowerModules; i++)
     {
@@ -524,27 +401,24 @@ void SDL::Event::addMiniDoubletsToEventExplicit()
 
         }
     }
-
-    cms::cuda::free_host(nMDsCPU);
-    cms::cuda::free_host(module_subdets);
-    cms::cuda::free_host(module_layers);
-    cms::cuda::free_host(module_hitRanges);
 }
 
 void SDL::Event::addSegmentsToEventExplicit()
 {
-    unsigned int* nSegmentsCPU;
-    nSegmentsCPU = (unsigned int*)cms::cuda::allocate_host(nLowerModules * sizeof(unsigned int), stream);
-    cudaMemcpyAsync(nSegmentsCPU,segmentsInGPU->nSegments,nLowerModules*sizeof(unsigned int),cudaMemcpyDeviceToHost,stream);
+    auto nSegmentsCPU_buf = allocBufWrapper<int>(devHost, nLowerModules);
+    alpaka::memcpy(queue, nSegmentsCPU_buf, segmentsBuffers->nSegments_buf, nLowerModules);
 
-    short* module_subdets;
-    module_subdets = (short*)cms::cuda::allocate_host(nLowerModules* sizeof(short), stream);
-    cudaMemcpyAsync(module_subdets,modulesInGPU->subdets,nLowerModules*sizeof(short),cudaMemcpyDeviceToHost,stream);
-    short* module_layers;
-    module_layers = (short*)cms::cuda::allocate_host(nLowerModules * sizeof(short), stream);
-    cudaMemcpyAsync(module_layers,modulesInGPU->layers,nLowerModules*sizeof(short),cudaMemcpyDeviceToHost,stream);
+    auto module_subdets_buf = allocBufWrapper<short>(devHost, nLowerModules);
+    alpaka::memcpy(queue, module_subdets_buf, modulesBuffers->subdets_buf, nLowerModules);
+
+    auto module_layers_buf = allocBufWrapper<short>(devHost, nLowerModules);
+    alpaka::memcpy(queue, module_layers_buf, modulesBuffers->layers_buf, nLowerModules);
+
+    alpaka::wait(queue);
+    int* nSegmentsCPU = alpaka::getPtrNative(nSegmentsCPU_buf);
+    short* module_subdets = alpaka::getPtrNative(module_subdets_buf);
+    short* module_layers = alpaka::getPtrNative(module_layers_buf);
 
-    cudaStreamSynchronize(stream);
     for(unsigned int i = 0; i<nLowerModules; i++)
     {
         if(!(nSegmentsCPU[i] == 0))
@@ -559,10 +433,6 @@ void SDL::Event::addSegmentsToEventExplicit()
             }
         }
     }
-
-    cms::cuda::free_host(nSegmentsCPU);
-    cms::cuda::free_host(module_subdets);
-    cms::cuda::free_host(module_layers);
 }
 
 void SDL::Event::createMiniDoublets()
@@ -716,25 +586,27 @@ void SDL::Event::createTriplets()
 
     uint16_t nonZeroModules = 0;
     unsigned int max_InnerSeg = 0;
-    
+
     // Allocate host index
     auto index_buf = allocBufWrapper<uint16_t>(devHost, nLowerModules);
     uint16_t *index = alpaka::getPtrNative(index_buf);
-    
+
     // Allocate device index
     auto index_gpu_buf = allocBufWrapper<uint16_t>(devAcc, nLowerModules);
-    
+
     // Allocate and copy nSegments from device to host
     auto nSegments_buf = allocBufWrapper<int>(devHost, nLowerModules);
     alpaka::memcpy(queue, nSegments_buf, segmentsBuffers->nSegments_buf, nLowerModules);
     alpaka::wait(queue);
 
     int *nSegments = alpaka::getPtrNative(nSegments_buf);
-    
-    uint16_t* module_nConnectedModules;
-    module_nConnectedModules = (uint16_t*)cms::cuda::allocate_host(nLowerModules* sizeof(uint16_t), stream);
-    cudaMemcpyAsync(module_nConnectedModules,modulesInGPU->nConnectedModules,nLowerModules*sizeof(uint16_t),cudaMemcpyDeviceToHost,stream);
-    cudaStreamSynchronize(stream);
+
+    // Allocate and copy module_nConnectedModules from device to host
+    auto module_nConnectedModules_buf = allocBufWrapper<uint16_t>(devHost, nLowerModules);
+    alpaka::memcpy(queue, module_nConnectedModules_buf, modulesBuffers->nConnectedModules_buf, nLowerModules);
+    alpaka::wait(queue);
+
+    uint16_t* module_nConnectedModules = alpaka::getPtrNative(module_nConnectedModules_buf);
 
     for (uint16_t innerLowerModuleIndex = 0; innerLowerModuleIndex < nLowerModules; innerLowerModuleIndex++)
     {
@@ -752,8 +624,6 @@ void SDL::Event::createTriplets()
     alpaka::memcpy(queue, index_gpu_buf, index_buf, nonZeroModules);
     alpaka::wait(queue);
 
-    cms::cuda::free_host(module_nConnectedModules);
-
     Vec const threadsPerBlockCreateTrip(static_cast<Idx>(1), static_cast<Idx>(16), static_cast<Idx>(16));
     Vec const blocksPerGridCreateTrip(static_cast<Idx>(MAX_BLOCKS), static_cast<Idx>(1), static_cast<Idx>(1));
     WorkDiv const createTripletsInGPUv2_workDiv(blocksPerGridCreateTrip, threadsPerBlockCreateTrip, elementsPerThread);
@@ -1296,20 +1166,24 @@ void SDL::Event::createPixelQuintuplets()
 
 void SDL::Event::addQuintupletsToEventExplicit()
 {
-    unsigned int* nQuintupletsCPU;
-    nQuintupletsCPU = (unsigned int*)cms::cuda::allocate_host(nLowerModules * sizeof(unsigned int), stream);
-    cudaMemcpyAsync(nQuintupletsCPU,quintupletsInGPU->nQuintuplets,nLowerModules*sizeof(unsigned int),cudaMemcpyDeviceToHost,stream);
-
-    short* module_subdets;
-    module_subdets = (short*)cms::cuda::allocate_host(nModules* sizeof(short), stream);
-    cudaMemcpyAsync(module_subdets,modulesInGPU->subdets,nModules*sizeof(short),cudaMemcpyDeviceToHost,stream);
-    short* module_layers;
-    module_layers = (short*)cms::cuda::allocate_host(nLowerModules * sizeof(short), stream);
-    cudaMemcpyAsync(module_layers,modulesInGPU->layers,nLowerModules*sizeof(short),cudaMemcpyDeviceToHost,stream);
-    int* module_quintupletModuleIndices;
-    module_quintupletModuleIndices = (int*)cms::cuda::allocate_host(nLowerModules * sizeof(int), stream);
-    cudaMemcpyAsync(module_quintupletModuleIndices, rangesInGPU->quintupletModuleIndices, nLowerModules * sizeof(int), cudaMemcpyDeviceToHost,stream);
-    cudaStreamSynchronize(stream);
+    auto nQuintupletsCPU_buf = allocBufWrapper<int>(devHost, nLowerModules);
+    alpaka::memcpy(queue, nQuintupletsCPU_buf, quintupletsBuffers->nQuintuplets_buf, nLowerModules);
+
+    auto module_subdets_buf = allocBufWrapper<short>(devHost, nModules);
+    alpaka::memcpy(queue, module_subdets_buf, modulesBuffers->subdets_buf, nModules);
+
+    auto module_layers_buf = allocBufWrapper<short>(devHost, nLowerModules);
+    alpaka::memcpy(queue, module_layers_buf, modulesBuffers->layers_buf, nLowerModules);
+
+    auto module_quintupletModuleIndices_buf = allocBufWrapper<int>(devHost, nLowerModules);
+    alpaka::memcpy(queue, module_quintupletModuleIndices_buf, rangesBuffers->quintupletModuleIndices_buf, nLowerModules);
+
+    alpaka::wait(queue);
+    int* nQuintupletsCPU = alpaka::getPtrNative(nQuintupletsCPU_buf);
+    short* module_subdets = alpaka::getPtrNative(module_subdets_buf);
+    short* module_layers = alpaka::getPtrNative(module_layers_buf);
+    int* module_quintupletModuleIndices = alpaka::getPtrNative(module_quintupletModuleIndices_buf);
+
     for(uint16_t i = 0; i<nLowerModules; i++)
     {
         if(!(nQuintupletsCPU[i] == 0 or module_quintupletModuleIndices[i] == -1))
@@ -1324,26 +1198,24 @@ void SDL::Event::addQuintupletsToEventExplicit()
             }
         }
     }
-    cms::cuda::free_host(nQuintupletsCPU);
-    cms::cuda::free_host(module_layers);
-    cms::cuda::free_host(module_subdets);
-    cms::cuda::free_host(module_quintupletModuleIndices);
 }
 
 void SDL::Event::addTripletsToEventExplicit()
 {
-    unsigned int* nTripletsCPU;
-    nTripletsCPU = (unsigned int*)cms::cuda::allocate_host(nLowerModules * sizeof(unsigned int), stream);
-    cudaMemcpyAsync(nTripletsCPU,tripletsInGPU->nTriplets,nLowerModules*sizeof(unsigned int),cudaMemcpyDeviceToHost,stream);
+    auto nTripletsCPU_buf = allocBufWrapper<int>(devHost, nLowerModules);
+    alpaka::memcpy(queue, nTripletsCPU_buf, tripletsBuffers->nTriplets_buf, nLowerModules);
 
-    short* module_subdets;
-    module_subdets = (short*)cms::cuda::allocate_host(nLowerModules* sizeof(short), stream);
-    cudaMemcpyAsync(module_subdets,modulesInGPU->subdets,nLowerModules*sizeof(short),cudaMemcpyDeviceToHost,stream);
-    short* module_layers;
-    module_layers = (short*)cms::cuda::allocate_host(nLowerModules * sizeof(short), stream);
-    cudaMemcpyAsync(module_layers,modulesInGPU->layers,nLowerModules*sizeof(short),cudaMemcpyDeviceToHost,stream);
+    auto module_subdets_buf = allocBufWrapper<short>(devHost, nLowerModules);
+    alpaka::memcpy(queue, module_subdets_buf, modulesBuffers->subdets_buf, nLowerModules);
+
+    auto module_layers_buf = allocBufWrapper<short>(devHost, nLowerModules);
+    alpaka::memcpy(queue, module_layers_buf, modulesBuffers->layers_buf, nLowerModules);
+
+    alpaka::wait(queue);
+    int* nTripletsCPU = alpaka::getPtrNative(nTripletsCPU_buf);
+    short* module_subdets = alpaka::getPtrNative(module_subdets_buf);
+    short* module_layers = alpaka::getPtrNative(module_layers_buf);
 
-    cudaStreamSynchronize(stream);
     for(uint16_t i = 0; i<nLowerModules; i++)
     {
         if(nTripletsCPU[i] != 0)
@@ -1358,10 +1230,6 @@ void SDL::Event::addTripletsToEventExplicit()
             }
         }
     }
-
-    cms::cuda::free_host(nTripletsCPU);
-    cms::cuda::free_host(module_layers);
-    cms::cuda::free_host(module_subdets);
 }
 
 unsigned int SDL::Event::getNumberOfHits()
@@ -1922,84 +1790,58 @@ SDL::trackCandidatesBuffer<alpaka::DevCpu>* SDL::Event::getTrackCandidatesInCMSS
     return trackCandidatesInCPU;
 }
 
-SDL::modules* SDL::Event::getFullModules()
+SDL::modulesBuffer<alpaka::DevCpu>* SDL::Event::getFullModules()
 {
     if(modulesInCPUFull == nullptr)
     {
-        modulesInCPUFull = new SDL::modules;
-
-        modulesInCPUFull->detIds = new unsigned int[nModules];
-        modulesInCPUFull->moduleMap = new uint16_t[40*nModules];
-        modulesInCPUFull->nConnectedModules = new uint16_t[nModules];
-        modulesInCPUFull->drdzs = new float[nModules];
-        modulesInCPUFull->slopes = new float[nModules];
-        modulesInCPUFull->nModules = new uint16_t[1];
-        modulesInCPUFull->nLowerModules = new uint16_t[1];
-        modulesInCPUFull->layers = new short[nModules];
-        modulesInCPUFull->rings = new short[nModules];
-        modulesInCPUFull->modules = new short[nModules];
-        modulesInCPUFull->rods = new short[nModules];
-        modulesInCPUFull->subdets = new short[nModules];
-        modulesInCPUFull->sides = new short[nModules];
-        modulesInCPUFull->isInverted = new bool[nModules];
-        modulesInCPUFull->isLower = new bool[nModules];
-
-        modulesInCPUFull->moduleType = new ModuleType[nModules];
-        modulesInCPUFull->moduleLayerType = new ModuleLayerType[nModules];
-        cudaMemcpyAsync(modulesInCPUFull->detIds,modulesInGPU->detIds,nModules*sizeof(unsigned int),cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(modulesInCPUFull->moduleMap,modulesInGPU->moduleMap,40*nModules*sizeof(unsigned int),cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(modulesInCPUFull->nConnectedModules,modulesInGPU->nConnectedModules,nModules*sizeof(unsigned int),cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(modulesInCPUFull->drdzs,modulesInGPU->drdzs,sizeof(float)*nModules,cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(modulesInCPUFull->slopes,modulesInGPU->slopes,sizeof(float)*nModules,cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(modulesInCPUFull->nLowerModules,modulesInGPU->nLowerModules,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(modulesInCPUFull->layers,modulesInGPU->layers,nModules*sizeof(short),cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(modulesInCPUFull->rings,modulesInGPU->rings,sizeof(short)*nModules,cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(modulesInCPUFull->modules,modulesInGPU->modules,sizeof(short)*nModules,cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(modulesInCPUFull->rods,modulesInGPU->rods,sizeof(short)*nModules,cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(modulesInCPUFull->subdets,modulesInGPU->subdets,sizeof(short)*nModules,cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(modulesInCPUFull->sides,modulesInGPU->sides,sizeof(short)*nModules,cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(modulesInCPUFull->isInverted,modulesInGPU->isInverted,sizeof(bool)*nModules,cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(modulesInCPUFull->isLower,modulesInGPU->isLower,sizeof(bool)*nModules,cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(modulesInCPUFull->moduleType,modulesInGPU->moduleType,sizeof(ModuleType)*nModules,cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(modulesInCPUFull->moduleLayerType,modulesInGPU->moduleLayerType,sizeof(ModuleLayerType)*nModules,cudaMemcpyDeviceToHost,stream);
-        cudaStreamSynchronize(stream);
+        // The last input here is just a small placeholder for the allocation.
+        modulesInCPUFull = new SDL::modulesBuffer<alpaka::DevCpu>(devHost, nModules, 1);
+        modulesInCPUFull->setData(*modulesInCPUFull);
+
+        alpaka::memcpy(queue, modulesInCPUFull->detIds_buf, modulesBuffers->detIds_buf, nModules);
+        alpaka::memcpy(queue, modulesInCPUFull->moduleMap_buf, modulesBuffers->moduleMap_buf, 40 * nModules);
+        alpaka::memcpy(queue, modulesInCPUFull->nConnectedModules_buf, modulesBuffers->nConnectedModules_buf, nModules);
+        alpaka::memcpy(queue, modulesInCPUFull->drdzs_buf, modulesBuffers->drdzs_buf, nModules);
+        alpaka::memcpy(queue, modulesInCPUFull->slopes_buf, modulesBuffers->slopes_buf, nModules);
+        alpaka::memcpy(queue, modulesInCPUFull->nLowerModules_buf, modulesBuffers->nLowerModules_buf, 1);
+        alpaka::memcpy(queue, modulesInCPUFull->nModules_buf, modulesBuffers->nModules_buf, 1);
+        alpaka::memcpy(queue, modulesInCPUFull->layers_buf, modulesBuffers->layers_buf, nModules);
+        alpaka::memcpy(queue, modulesInCPUFull->rings_buf, modulesBuffers->rings_buf, nModules);
+        alpaka::memcpy(queue, modulesInCPUFull->modules_buf, modulesBuffers->modules_buf, nModules);
+        alpaka::memcpy(queue, modulesInCPUFull->rods_buf, modulesBuffers->rods_buf, nModules);
+        alpaka::memcpy(queue, modulesInCPUFull->subdets_buf, modulesBuffers->subdets_buf, nModules);
+        alpaka::memcpy(queue, modulesInCPUFull->sides_buf, modulesBuffers->sides_buf, nModules);
+        alpaka::memcpy(queue, modulesInCPUFull->isInverted_buf, modulesBuffers->isInverted_buf, nModules);
+        alpaka::memcpy(queue, modulesInCPUFull->isLower_buf, modulesBuffers->isLower_buf, nModules);
+        alpaka::memcpy(queue, modulesInCPUFull->moduleType_buf, modulesBuffers->moduleType_buf, nModules);
+        alpaka::memcpy(queue, modulesInCPUFull->moduleLayerType_buf, modulesBuffers->moduleLayerType_buf, nModules);
+        alpaka::wait(queue);
     }
     return modulesInCPUFull;
 }
 
-SDL::modules* SDL::Event::getModules()
+SDL::modulesBuffer<alpaka::DevCpu>* SDL::Event::getModules()
 {
     if(modulesInCPU == nullptr)
     {
-        modulesInCPU = new SDL::modules;
-        modulesInCPU->nLowerModules = new uint16_t[1];
-        modulesInCPU->nModules = new uint16_t[1];
-        modulesInCPU->detIds = new unsigned int[nModules];
-        modulesInCPU->isLower = new bool[nModules];
-        modulesInCPU->layers = new short[nModules];
-        modulesInCPU->subdets = new short[nModules];
-        modulesInCPU->rings = new short[nModules];
-        modulesInCPU->rods = new short[nModules];
-        modulesInCPU->modules = new short[nModules];
-        modulesInCPU->sides = new short[nModules];
-        modulesInCPU->eta = new float[nModules];
-        modulesInCPU->r = new float[nModules];
-        modulesInCPU->moduleType = new ModuleType[nModules];
-
-        cudaMemcpyAsync(modulesInCPU->nLowerModules, modulesInGPU->nLowerModules, sizeof(uint16_t), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(modulesInCPU->nModules, modulesInGPU->nModules, sizeof(uint16_t), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(modulesInCPU->detIds, modulesInGPU->detIds, nModules * sizeof(unsigned int), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(modulesInCPU->isLower, modulesInGPU->isLower, nModules * sizeof(bool), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(modulesInCPU->layers, modulesInGPU->layers, nModules * sizeof(short), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(modulesInCPU->subdets, modulesInGPU->subdets, nModules * sizeof(short), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(modulesInCPU->rings, modulesInGPU->rings, nModules * sizeof(short), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(modulesInCPU->rods, modulesInGPU->rods, nModules * sizeof(short), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(modulesInCPU->modules, modulesInGPU->modules, nModules * sizeof(short), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(modulesInCPU->sides, modulesInGPU->sides, nModules * sizeof(short), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(modulesInCPU->eta, modulesInGPU->eta, nModules * sizeof(short), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(modulesInCPU->r, modulesInGPU->r, nModules * sizeof(short), cudaMemcpyDeviceToHost,stream);
-        cudaMemcpyAsync(modulesInCPU->moduleType, modulesInGPU->moduleType, nModules * sizeof(ModuleType), cudaMemcpyDeviceToHost, stream);
-        cudaStreamSynchronize(stream);
+        // The last input here is just a small placeholder for the allocation.
+        modulesInCPU = new SDL::modulesBuffer<alpaka::DevCpu>(devHost, nModules, 1);
+        modulesInCPU->setData(*modulesInCPU);
+
+        alpaka::memcpy(queue, modulesInCPU->nLowerModules_buf, modulesBuffers->nLowerModules_buf, 1);
+        alpaka::memcpy(queue, modulesInCPU->nModules_buf, modulesBuffers->nModules_buf, 1);
+        alpaka::memcpy(queue, modulesInCPU->detIds_buf, modulesBuffers->detIds_buf, nModules);
+        alpaka::memcpy(queue, modulesInCPU->isLower_buf, modulesBuffers->isLower_buf, nModules);
+        alpaka::memcpy(queue, modulesInCPU->layers_buf, modulesBuffers->layers_buf, nModules);
+        alpaka::memcpy(queue, modulesInCPU->subdets_buf, modulesBuffers->subdets_buf, nModules);
+        alpaka::memcpy(queue, modulesInCPU->rings_buf, modulesBuffers->rings_buf, nModules);
+        alpaka::memcpy(queue, modulesInCPU->rods_buf, modulesBuffers->rods_buf, nModules);
+        alpaka::memcpy(queue, modulesInCPU->modules_buf, modulesBuffers->modules_buf, nModules);
+        alpaka::memcpy(queue, modulesInCPU->sides_buf, modulesBuffers->sides_buf, nModules);
+        alpaka::memcpy(queue, modulesInCPU->eta_buf, modulesBuffers->eta_buf, nModules);
+        alpaka::memcpy(queue, modulesInCPU->r_buf, modulesBuffers->r_buf, nModules);
+        alpaka::memcpy(queue, modulesInCPU->moduleType_buf, modulesBuffers->moduleType_buf, nModules);
+        alpaka::wait(queue);
     }
     return modulesInCPU;
-}
+}
\ No newline at end of file
diff --git a/SDL/Event.cuh b/SDL/Event.cuh
index f5a671bf..9b70014f 100644
--- a/SDL/Event.cuh
+++ b/SDL/Event.cuh
@@ -12,8 +12,6 @@
 #include "TrackCandidate.cuh"
 #include "Constants.cuh"
 
-#include "allocate.h"
-
 namespace SDL
 {
     class Event
@@ -37,7 +35,6 @@ namespace SDL
         std::array<unsigned int, 5> n_quintuplets_by_layer_endcap_;
 
         //Device stuff
-        int dev;
         int nTotalSegments;
         struct objectRanges* rangesInGPU;
         struct objectRangesBuffer<Acc>* rangesBuffers;
@@ -65,8 +62,8 @@ namespace SDL
         segmentsBuffer<alpaka::DevCpu>* segmentsInCPU;
         tripletsBuffer<alpaka::DevCpu>* tripletsInCPU;
         trackCandidatesBuffer<alpaka::DevCpu>* trackCandidatesInCPU;
-        modules* modulesInCPU;
-        modules* modulesInCPUFull;
+        modulesBuffer<alpaka::DevCpu>* modulesInCPU;
+        modulesBuffer<alpaka::DevCpu>* modulesInCPUFull;
         quintupletsBuffer<alpaka::DevCpu>* quintupletsInCPU;
         pixelTripletsBuffer<alpaka::DevCpu>* pixelTripletsInCPU;
         pixelQuintupletsBuffer<alpaka::DevCpu>* pixelQuintupletsInCPU;
@@ -75,7 +72,6 @@ namespace SDL
         int8_t* pixelTypeCPU;
     public:
         Event(cudaStream_t estream,bool verbose);
-        ~Event();
         void resetEvent();
 
         void addHitToEvent(std::vector<float> x, std::vector<float> y, std::vector<float> z, std::vector<unsigned int> detId, std::vector<unsigned int> idxInNtuple); //call the appropriate hit function, then increment the counter here
@@ -149,18 +145,18 @@ namespace SDL
         trackCandidatesBuffer<alpaka::DevCpu>* getTrackCandidatesInCMSSW();
         pixelTripletsBuffer<alpaka::DevCpu>* getPixelTriplets();
         pixelQuintupletsBuffer<alpaka::DevCpu>* getPixelQuintuplets();
-        modules* getModules();
-        modules* getFullModules();
+        modulesBuffer<alpaka::DevCpu>* getModules();
+        modulesBuffer<alpaka::DevCpu>* getFullModules();
     };
 
     //global stuff
-    extern struct modules* modulesInGPU;
-    extern struct modules* modulesInHost;
+    extern std::shared_ptr<SDL::modules> modulesInGPU;
+    extern std::shared_ptr<SDL::modulesBuffer<Acc>> modulesBuffers;
     extern uint16_t nModules;
     extern uint16_t nLowerModules;
     void initModules(const char* moduleMetaDataFilePath="data/centroid.txt"); //read from file and init
     void cleanModules();
     void initModulesHost(); //read from file and init
-    extern std::unique_ptr<SDL::pixelMap> pixelMapping;
+    extern std::shared_ptr<SDL::pixelMap> pixelMapping;
 }
 #endif
diff --git a/SDL/Module.cu b/SDL/Module.cu
index 649995ba..259e8b8a 100644
--- a/SDL/Module.cu
+++ b/SDL/Module.cu
@@ -1,564 +1,9 @@
 #include "Module.cuh"
 
+// TODO: Move these maps out of global scope.
 std::map <unsigned int, uint16_t> *SDL::detIdToIndex;
 std::map <unsigned int, float> *SDL::module_x;
 std::map <unsigned int, float> *SDL::module_y;
 std::map <unsigned int, float> *SDL::module_z;
 std::map <unsigned int, unsigned int> *SDL::module_type; // 23 : Ph2PSP, 24 : Ph2PSS, 25 : Ph2SS
-// https://github.com/cms-sw/cmssw/blob/5e809e8e0a625578aa265dc4b128a93830cb5429/Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h#L29
-
-void SDL::createModulesInExplicitMemory(struct modules& modulesInGPU,unsigned int nModules,cudaStream_t stream)
-{
-    /* modules stucture object will be created in Event.cu*/
-    cudaMalloc(&(modulesInGPU.detIds),nModules * sizeof(unsigned int));
-    cudaMalloc(&modulesInGPU.moduleMap,nModules * 40 * sizeof(uint16_t));
-    cudaMalloc(&modulesInGPU.mapIdx, nModules*sizeof(uint16_t));
-    cudaMalloc(&modulesInGPU.mapdetId, nModules*sizeof(unsigned int));
-    cudaMalloc(&modulesInGPU.nConnectedModules,nModules * sizeof(uint16_t));
-    cudaMalloc(&modulesInGPU.drdzs,nModules * sizeof(float));
-    cudaMalloc(&modulesInGPU.slopes,nModules * sizeof(float));
-    cudaMalloc(&modulesInGPU.nModules,sizeof(uint16_t));
-    cudaMalloc(&modulesInGPU.nLowerModules,sizeof(uint16_t));
-    cudaMalloc(&modulesInGPU.partnerModuleIndices, nModules * sizeof(uint16_t));
-
-    cudaMalloc(&modulesInGPU.layers,nModules * sizeof(short));
-    cudaMalloc(&modulesInGPU.rings,nModules * sizeof(short));
-    cudaMalloc(&modulesInGPU.modules,nModules * sizeof(short));
-    cudaMalloc(&modulesInGPU.rods,nModules * sizeof(short));
-    cudaMalloc(&modulesInGPU.subdets,nModules * sizeof(short));
-    cudaMalloc(&modulesInGPU.sides,nModules * sizeof(short));
-    cudaMalloc(&modulesInGPU.eta,nModules * sizeof(float));
-    cudaMalloc(&modulesInGPU.r,nModules * sizeof(float));
-    cudaMalloc(&modulesInGPU.isInverted, nModules * sizeof(bool));
-    cudaMalloc(&modulesInGPU.isLower, nModules * sizeof(bool));
-    cudaMalloc(&modulesInGPU.isAnchor, nModules * sizeof(bool));
-    cudaMalloc(&modulesInGPU.moduleType,nModules * sizeof(ModuleType));
-    cudaMalloc(&modulesInGPU.moduleLayerType,nModules * sizeof(ModuleLayerType));
-
-    cudaMemcpyAsync(modulesInGPU.nModules,&nModules,sizeof(uint16_t),cudaMemcpyHostToDevice,stream);
-    cudaStreamSynchronize(stream);
-}
-
-void SDL::freeModules(struct modules& modulesInGPU)
-{
-    cudaFree(modulesInGPU.detIds);
-    cudaFree(modulesInGPU.moduleMap);
-    cudaFree(modulesInGPU.mapIdx);
-    cudaFree(modulesInGPU.mapdetId);
-    cudaFree(modulesInGPU.nConnectedModules);
-    cudaFree(modulesInGPU.drdzs);
-    cudaFree(modulesInGPU.slopes);
-    cudaFree(modulesInGPU.nModules);
-    cudaFree(modulesInGPU.nLowerModules);
-    cudaFree(modulesInGPU.layers);
-    cudaFree(modulesInGPU.rings);
-    cudaFree(modulesInGPU.modules);
-    cudaFree(modulesInGPU.rods);
-    cudaFree(modulesInGPU.subdets);
-    cudaFree(modulesInGPU.sides);
-    cudaFree(modulesInGPU.eta);
-    cudaFree(modulesInGPU.r);
-    cudaFree(modulesInGPU.isInverted);
-    cudaFree(modulesInGPU.isLower);
-    cudaFree(modulesInGPU.isAnchor);
-    cudaFree(modulesInGPU.moduleType);
-    cudaFree(modulesInGPU.moduleLayerType);
-    cudaFree(modulesInGPU.connectedPixels);
-    cudaFree(modulesInGPU.partnerModuleIndices);
-}
-
-void SDL::loadModulesFromFile(struct modules& modulesInGPU, uint16_t& nModules, uint16_t& nLowerModules, struct pixelMap& pixelMapping,cudaStream_t stream, const char* moduleMetaDataFilePath)
-{
-    detIdToIndex = new std::map<unsigned int, uint16_t>;
-    module_x = new std::map<unsigned int, float>;
-    module_y = new std::map<unsigned int, float>;
-    module_z = new std::map<unsigned int, float>;
-    module_type = new std::map<unsigned int, unsigned int>;
-
-    /*modules structure object will be created in Event.cu*/
-    /* Load the whole text file into the map first*/
-
-    std::ifstream ifile;
-    ifile.open(moduleMetaDataFilePath);
-    if(!ifile.is_open())
-    {
-        std::cout<<"ERROR! module list file not present!"<<std::endl;
-    }
-    std::string line;
-    uint16_t counter = 0;
-
-    while(std::getline(ifile,line))
-    {
-        std::stringstream ss(line);
-        std::string token;
-        int count_number = 0;
-
-        unsigned int temp_detId;
-        while(std::getline(ss,token,','))
-        {
-            if(count_number == 0)
-            {
-                temp_detId = stoi(token);
-                (*detIdToIndex)[temp_detId] = counter;
-            }
-            if(count_number == 1)
-                (*module_x)[temp_detId] = std::stof(token);
-            if(count_number == 2)
-                (*module_y)[temp_detId] = std::stof(token);
-            if(count_number == 3)
-                (*module_z)[temp_detId] = std::stof(token);
-            if(count_number == 4)
-            {
-                (*module_type)[temp_detId] = std::stoi(token);
-                counter++;
-            }
-            count_number++;
-            if(count_number>4)
-                break;
-        }
-
-    }
-    (*detIdToIndex)[1] = counter; //pixel module is the last module in the module list
-    counter++;
-    nModules = counter;
-    //std::cout<<"Number of modules = "<<nModules<<std::endl;
-    createModulesInExplicitMemory(modulesInGPU,nModules,stream);
-
-    auto detIds_buf = allocBufWrapper<unsigned int>(devHost, nModules);
-    auto layers_buf = allocBufWrapper<short>(devHost, nModules);
-    auto rings_buf = allocBufWrapper<short>(devHost, nModules);
-    auto rods_buf = allocBufWrapper<short>(devHost, nModules);
-    auto modules_buf = allocBufWrapper<short>(devHost, nModules);
-    auto subdets_buf = allocBufWrapper<short>(devHost, nModules);
-    auto sides_buf = allocBufWrapper<short>(devHost, nModules);
-    auto eta_buf = allocBufWrapper<float>(devHost, nModules);
-    auto r_buf = allocBufWrapper<float>(devHost, nModules);
-    auto isInverted_buf = allocBufWrapper<bool>(devHost, nModules);
-    auto isLower_buf = allocBufWrapper<bool>(devHost, nModules);
-    auto isAnchor_buf = allocBufWrapper<bool>(devHost, nModules);
-    auto moduleType_buf = allocBufWrapper<ModuleType>(devHost, nModules);
-    auto moduleLayerType_buf = allocBufWrapper<ModuleLayerType>(devHost, nModules);
-    auto slopes_buf = allocBufWrapper<float>(devHost, nModules);
-    auto drdzs_buf = allocBufWrapper<float>(devHost, nModules);
-    auto partnerModuleIndices_buf = allocBufWrapper<uint16_t>(devHost, nModules);
-
-    // Getting the underlying data pointers
-    unsigned int* host_detIds = alpaka::getPtrNative(detIds_buf);
-    short* host_layers = alpaka::getPtrNative(layers_buf);
-    short* host_rings = alpaka::getPtrNative(rings_buf);
-    short* host_rods = alpaka::getPtrNative(rods_buf);
-    short* host_modules = alpaka::getPtrNative(modules_buf);
-    short* host_subdets = alpaka::getPtrNative(subdets_buf);
-    short* host_sides = alpaka::getPtrNative(sides_buf);
-    float* host_eta = alpaka::getPtrNative(eta_buf);
-    float* host_r = alpaka::getPtrNative(r_buf);
-    bool* host_isInverted = alpaka::getPtrNative(isInverted_buf);
-    bool* host_isLower = alpaka::getPtrNative(isLower_buf);
-    bool* host_isAnchor = alpaka::getPtrNative(isAnchor_buf);
-    ModuleType* host_moduleType = alpaka::getPtrNative(moduleType_buf);
-    ModuleLayerType* host_moduleLayerType = alpaka::getPtrNative(moduleLayerType_buf);
-    float* host_slopes = alpaka::getPtrNative(slopes_buf);
-    float* host_drdzs = alpaka::getPtrNative(drdzs_buf);
-    uint16_t* host_partnerModuleIndices = alpaka::getPtrNative(partnerModuleIndices_buf);
-    
-    //reassign detIdToIndex indices here
-    nLowerModules = (nModules - 1) / 2;
-    uint16_t lowerModuleCounter = 0;
-    uint16_t upperModuleCounter = nLowerModules + 1;
-    //0 to nLowerModules - 1 => only lower modules, nLowerModules - pixel module, nLowerModules + 1 to nModules => upper modules
-    for(auto it = (*detIdToIndex).begin(); it != (*detIdToIndex).end(); it++)
-    {
-        unsigned int detId = it->first;
-        float m_x = (*module_x)[detId];
-        float m_y = (*module_y)[detId];
-        float m_z = (*module_z)[detId];
-        unsigned int m_t = (*module_type)[detId];
-
-        float eta,r;
-
-        uint16_t index;
-        unsigned short layer,ring,rod,module,subdet,side;
-        bool isInverted, isLower;
-        if(detId == 1)
-        {
-            layer = 0;
-            ring = 0;
-            rod = 0;
-            module = 0;
-            subdet = 0;
-            side = 0;
-            isInverted = false;
-            isLower = false;
-        }
-        else
-        {
-            setDerivedQuantities(detId,layer,ring,rod,module,subdet,side,m_x,m_y,m_z,eta,r);
-            isInverted = modulesInGPU.parseIsInverted(subdet, side, module, layer);
-            isLower = modulesInGPU.parseIsLower(isInverted, detId);
-        }
-        if(isLower)
-        {
-            index = lowerModuleCounter;
-            lowerModuleCounter++;
-        }
-        else if(detId != 1)
-        {
-            index = upperModuleCounter;
-            upperModuleCounter++;
-        }
-        else
-        {
-            index = nLowerModules; //pixel
-        }
-        //reassigning indices!
-        (*detIdToIndex)[detId] = index;   
-        host_detIds[index] = detId;
-        host_layers[index] = layer;
-        host_rings[index] = ring;
-        host_rods[index] = rod;
-        host_modules[index] = module;
-        host_subdets[index] = subdet;
-        host_sides[index] = side;
-        host_eta[index] = eta;
-        host_r[index] = r;
-        host_isInverted[index] = isInverted;
-        host_isLower[index] = isLower;
-
-        //assigning other variables!
-        if(detId == 1)
-        {
-            host_moduleType[index] = PixelModule;
-            host_moduleLayerType[index] = SDL::InnerPixelLayer;
-            host_slopes[index] = 0;
-            host_drdzs[index] = 0;
-            host_isAnchor[index] = false;
-        }
-        else
-        {
-            host_moduleType[index] = ( m_t == 25 ? SDL::TwoS : SDL::PS );
-            host_moduleLayerType[index] = ( m_t == 23 ? SDL::Pixel : SDL::Strip );
-
-            if(host_moduleType[index] == SDL::PS and host_moduleLayerType[index] == SDL::Pixel)
-            {
-                host_isAnchor[index] = true;
-            }
-            else if(host_moduleType[index] == SDL::TwoS and host_isLower[index])
-            {
-                host_isAnchor[index] = true;   
-            }
-            else
-            {
-                host_isAnchor[index] = false;
-            }
-
-            host_slopes[index] = (subdet == Endcap) ? endcapGeometry.getSlopeLower(detId) : tiltedGeometry.getSlope(detId);
-            host_drdzs[index] = (subdet == Barrel) ? tiltedGeometry.getDrDz(detId) : 0;
-        }
-    }
-
-    //partner module stuff, and slopes and drdz move around
-    for(auto it = (*detIdToIndex).begin(); it != (*detIdToIndex).end(); it++)
-    {
-        auto& detId = it->first;
-        auto& index = it->second;
-        if(detId != 1)
-        {
-            host_partnerModuleIndices[index] = (*detIdToIndex)[modulesInGPU.parsePartnerModuleId(detId, host_isLower[index], host_isInverted[index])];
-            //add drdz and slope importing stuff here!
-            if(host_drdzs[index] == 0)
-            {
-                host_drdzs[index] = host_drdzs[host_partnerModuleIndices[index]];
-            }
-            if(host_slopes[index] == 0)
-            {
-                host_slopes[index] = host_slopes[host_partnerModuleIndices[index]];
-            }
-        }
-    }
-
-    cudaMemcpyAsync(modulesInGPU.nLowerModules,&nLowerModules,sizeof(uint16_t),cudaMemcpyHostToDevice,stream);
-    cudaMemcpyAsync(modulesInGPU.detIds,host_detIds,nModules*sizeof(unsigned int),cudaMemcpyHostToDevice,stream);
-
-    cudaMemcpyAsync(modulesInGPU.layers,host_layers,nModules*sizeof(short),cudaMemcpyHostToDevice,stream);
-    cudaMemcpyAsync(modulesInGPU.rings,host_rings,sizeof(short)*nModules,cudaMemcpyHostToDevice,stream);
-    cudaMemcpyAsync(modulesInGPU.rods,host_rods,sizeof(short)*nModules,cudaMemcpyHostToDevice,stream);
-    cudaMemcpyAsync(modulesInGPU.modules,host_modules,sizeof(short)*nModules,cudaMemcpyHostToDevice,stream);
-    cudaMemcpyAsync(modulesInGPU.subdets,host_subdets,sizeof(short)*nModules,cudaMemcpyHostToDevice,stream);
-    cudaMemcpyAsync(modulesInGPU.sides,host_sides,sizeof(short)*nModules,cudaMemcpyHostToDevice,stream);
-    cudaMemcpyAsync(modulesInGPU.eta,host_eta,sizeof(float)*nModules,cudaMemcpyHostToDevice,stream);
-    cudaMemcpyAsync(modulesInGPU.r,host_r,sizeof(float)*nModules,cudaMemcpyHostToDevice,stream);
-    cudaMemcpyAsync(modulesInGPU.isInverted,host_isInverted,sizeof(bool)*nModules,cudaMemcpyHostToDevice,stream);
-    cudaMemcpyAsync(modulesInGPU.isLower,host_isLower,sizeof(bool)*nModules,cudaMemcpyHostToDevice,stream);
-
-    cudaMemcpyAsync(modulesInGPU.moduleType,host_moduleType,sizeof(ModuleType)*nModules,cudaMemcpyHostToDevice,stream);
-    cudaMemcpyAsync(modulesInGPU.moduleLayerType,host_moduleLayerType,sizeof(ModuleLayerType)*nModules,cudaMemcpyHostToDevice,stream);
-    cudaMemcpyAsync(modulesInGPU.slopes,host_slopes,sizeof(float)*nModules,cudaMemcpyHostToDevice,stream);
-    cudaMemcpyAsync(modulesInGPU.isAnchor, host_isAnchor, sizeof(bool) * nModules, cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(modulesInGPU.drdzs,host_drdzs,sizeof(float)*nModules,cudaMemcpyHostToDevice,stream);
-
-    cudaMemcpyAsync(modulesInGPU.partnerModuleIndices, host_partnerModuleIndices, sizeof(uint16_t) * nModules, cudaMemcpyHostToDevice, stream);
-    cudaStreamSynchronize(stream);
-
-    fillConnectedModuleArrayExplicit(modulesInGPU,nModules,stream);
-    fillMapArraysExplicit(modulesInGPU, nModules, stream);
-    fillPixelMap(modulesInGPU,pixelMapping,stream);
-}
-
-void SDL::fillConnectedModuleArray(struct modules& modulesInGPU, unsigned int nModules)
-{
-    for(auto it = (*detIdToIndex).begin(); it != (*detIdToIndex).end(); ++it)
-    {
-        unsigned int detId = it->first;
-        uint16_t index = it->second;
-        auto& connectedModules = moduleConnectionMap.getConnectedModuleDetIds(detId);
-        modulesInGPU.nConnectedModules[index] = connectedModules.size();
-        for(uint16_t i = 0; i< modulesInGPU.nConnectedModules[index];i++)
-        {
-            modulesInGPU.moduleMap[index * 40 + i] = (*detIdToIndex)[connectedModules[i]];
-        }
-    }
-}
-
-void SDL::fillPixelMap(struct modules& modulesInGPU, struct pixelMap& pixelMapping,cudaStream_t stream)
-{
-    std::vector<unsigned int> connectedModuleDetIds;
-    std::vector<unsigned int> connectedModuleDetIds_pos;
-    std::vector<unsigned int> connectedModuleDetIds_neg;
-
-    int totalSizes = 0;
-    int totalSizes_pos = 0;
-    int totalSizes_neg = 0;
-    for(unsigned int isuperbin = 0; isuperbin < size_superbins; isuperbin++)
-    {
-        std::vector<unsigned int> connectedModuleDetIds_pLStoLayer1Subdet5 = SDL::moduleConnectionMap_pLStoLayer1Subdet5.getConnectedModuleDetIds(isuperbin+size_superbins);// index adjustment to get high values
-        std::vector<unsigned int> connectedModuleDetIds_pLStoLayer2Subdet5 = SDL::moduleConnectionMap_pLStoLayer2Subdet5.getConnectedModuleDetIds(isuperbin+size_superbins);// from the high pt bins
-        std::vector<unsigned int> connectedModuleDetIds_pLStoLayer3Subdet5 = SDL::moduleConnectionMap_pLStoLayer3Subdet5.getConnectedModuleDetIds(isuperbin+size_superbins);
-        std::vector<unsigned int> connectedModuleDetIds_pLStoLayer1Subdet4 = SDL::moduleConnectionMap_pLStoLayer1Subdet4.getConnectedModuleDetIds(isuperbin+size_superbins);
-        std::vector<unsigned int> connectedModuleDetIds_pLStoLayer2Subdet4 = SDL::moduleConnectionMap_pLStoLayer2Subdet4.getConnectedModuleDetIds(isuperbin+size_superbins);
-        std::vector<unsigned int> connectedModuleDetIds_pLStoLayer3Subdet4 = SDL::moduleConnectionMap_pLStoLayer3Subdet4.getConnectedModuleDetIds(isuperbin+size_superbins);
-        std::vector<unsigned int> connectedModuleDetIds_pLStoLayer4Subdet4 = SDL::moduleConnectionMap_pLStoLayer4Subdet4.getConnectedModuleDetIds(isuperbin+size_superbins);
-        connectedModuleDetIds.insert(connectedModuleDetIds.end(),connectedModuleDetIds_pLStoLayer1Subdet5.begin(),connectedModuleDetIds_pLStoLayer1Subdet5.end());
-        connectedModuleDetIds.insert(connectedModuleDetIds.end(),connectedModuleDetIds_pLStoLayer2Subdet5.begin(),connectedModuleDetIds_pLStoLayer2Subdet5.end());
-        connectedModuleDetIds.insert(connectedModuleDetIds.end(),connectedModuleDetIds_pLStoLayer3Subdet5.begin(),connectedModuleDetIds_pLStoLayer3Subdet5.end());
-        connectedModuleDetIds.insert(connectedModuleDetIds.end(),connectedModuleDetIds_pLStoLayer1Subdet4.begin(),connectedModuleDetIds_pLStoLayer1Subdet4.end());
-        connectedModuleDetIds.insert(connectedModuleDetIds.end(),connectedModuleDetIds_pLStoLayer2Subdet4.begin(),connectedModuleDetIds_pLStoLayer2Subdet4.end());
-        connectedModuleDetIds.insert(connectedModuleDetIds.end(),connectedModuleDetIds_pLStoLayer3Subdet4.begin(),connectedModuleDetIds_pLStoLayer3Subdet4.end());
-        connectedModuleDetIds.insert(connectedModuleDetIds.end(),connectedModuleDetIds_pLStoLayer4Subdet4.begin(),connectedModuleDetIds_pLStoLayer4Subdet4.end());
-
-        int sizes = 0;
-        sizes += connectedModuleDetIds_pLStoLayer1Subdet5.size();
-        sizes += connectedModuleDetIds_pLStoLayer2Subdet5.size();
-        sizes += connectedModuleDetIds_pLStoLayer3Subdet5.size();
-        sizes += connectedModuleDetIds_pLStoLayer1Subdet4.size();
-        sizes += connectedModuleDetIds_pLStoLayer2Subdet4.size();
-        sizes += connectedModuleDetIds_pLStoLayer3Subdet4.size();
-        sizes += connectedModuleDetIds_pLStoLayer4Subdet4.size();
-        pixelMapping.connectedPixelsIndex[isuperbin] = totalSizes;
-        pixelMapping.connectedPixelsSizes[isuperbin] = sizes;
-        totalSizes += sizes;
-
-        std::vector<unsigned int> connectedModuleDetIds_pLStoLayer1Subdet5_pos = SDL::moduleConnectionMap_pLStoLayer1Subdet5_pos.getConnectedModuleDetIds(isuperbin);
-        std::vector<unsigned int> connectedModuleDetIds_pLStoLayer2Subdet5_pos = SDL::moduleConnectionMap_pLStoLayer2Subdet5_pos.getConnectedModuleDetIds(isuperbin);
-        std::vector<unsigned int> connectedModuleDetIds_pLStoLayer3Subdet5_pos = SDL::moduleConnectionMap_pLStoLayer3Subdet5_pos.getConnectedModuleDetIds(isuperbin);
-        std::vector<unsigned int> connectedModuleDetIds_pLStoLayer1Subdet4_pos = SDL::moduleConnectionMap_pLStoLayer1Subdet4_pos.getConnectedModuleDetIds(isuperbin);
-        std::vector<unsigned int> connectedModuleDetIds_pLStoLayer2Subdet4_pos = SDL::moduleConnectionMap_pLStoLayer2Subdet4_pos.getConnectedModuleDetIds(isuperbin);
-        std::vector<unsigned int> connectedModuleDetIds_pLStoLayer3Subdet4_pos = SDL::moduleConnectionMap_pLStoLayer3Subdet4_pos.getConnectedModuleDetIds(isuperbin);
-        std::vector<unsigned int> connectedModuleDetIds_pLStoLayer4Subdet4_pos = SDL::moduleConnectionMap_pLStoLayer4Subdet4_pos.getConnectedModuleDetIds(isuperbin);
-        connectedModuleDetIds_pos.insert(connectedModuleDetIds_pos.end(),connectedModuleDetIds_pLStoLayer1Subdet5_pos.begin(),connectedModuleDetIds_pLStoLayer1Subdet5_pos.end());
-        connectedModuleDetIds_pos.insert(connectedModuleDetIds_pos.end(),connectedModuleDetIds_pLStoLayer2Subdet5_pos.begin(),connectedModuleDetIds_pLStoLayer2Subdet5_pos.end());
-        connectedModuleDetIds_pos.insert(connectedModuleDetIds_pos.end(),connectedModuleDetIds_pLStoLayer3Subdet5_pos.begin(),connectedModuleDetIds_pLStoLayer3Subdet5_pos.end());
-        connectedModuleDetIds_pos.insert(connectedModuleDetIds_pos.end(),connectedModuleDetIds_pLStoLayer1Subdet4_pos.begin(),connectedModuleDetIds_pLStoLayer1Subdet4_pos.end());
-        connectedModuleDetIds_pos.insert(connectedModuleDetIds_pos.end(),connectedModuleDetIds_pLStoLayer2Subdet4_pos.begin(),connectedModuleDetIds_pLStoLayer2Subdet4_pos.end());
-        connectedModuleDetIds_pos.insert(connectedModuleDetIds_pos.end(),connectedModuleDetIds_pLStoLayer3Subdet4_pos.begin(),connectedModuleDetIds_pLStoLayer3Subdet4_pos.end());
-        connectedModuleDetIds_pos.insert(connectedModuleDetIds_pos.end(),connectedModuleDetIds_pLStoLayer4Subdet4_pos.begin(),connectedModuleDetIds_pLStoLayer4Subdet4_pos.end());
-
-        int sizes_pos = 0;
-        sizes_pos += connectedModuleDetIds_pLStoLayer1Subdet5_pos.size();
-        sizes_pos += connectedModuleDetIds_pLStoLayer2Subdet5_pos.size();
-        sizes_pos += connectedModuleDetIds_pLStoLayer3Subdet5_pos.size();
-        sizes_pos += connectedModuleDetIds_pLStoLayer1Subdet4_pos.size();
-        sizes_pos += connectedModuleDetIds_pLStoLayer2Subdet4_pos.size();
-        sizes_pos += connectedModuleDetIds_pLStoLayer3Subdet4_pos.size();
-        sizes_pos += connectedModuleDetIds_pLStoLayer4Subdet4_pos.size();
-        pixelMapping.connectedPixelsIndexPos[isuperbin] = totalSizes_pos;
-        pixelMapping.connectedPixelsSizesPos[isuperbin] = sizes_pos;
-        totalSizes_pos += sizes_pos;
-
-        std::vector<unsigned int> connectedModuleDetIds_pLStoLayer1Subdet5_neg = SDL::moduleConnectionMap_pLStoLayer1Subdet5_neg.getConnectedModuleDetIds(isuperbin);
-        std::vector<unsigned int> connectedModuleDetIds_pLStoLayer2Subdet5_neg = SDL::moduleConnectionMap_pLStoLayer2Subdet5_neg.getConnectedModuleDetIds(isuperbin);
-        std::vector<unsigned int> connectedModuleDetIds_pLStoLayer3Subdet5_neg = SDL::moduleConnectionMap_pLStoLayer3Subdet5_neg.getConnectedModuleDetIds(isuperbin);
-        std::vector<unsigned int> connectedModuleDetIds_pLStoLayer1Subdet4_neg = SDL::moduleConnectionMap_pLStoLayer1Subdet4_neg.getConnectedModuleDetIds(isuperbin);
-        std::vector<unsigned int> connectedModuleDetIds_pLStoLayer2Subdet4_neg = SDL::moduleConnectionMap_pLStoLayer2Subdet4_neg.getConnectedModuleDetIds(isuperbin);
-        std::vector<unsigned int> connectedModuleDetIds_pLStoLayer3Subdet4_neg = SDL::moduleConnectionMap_pLStoLayer3Subdet4_neg.getConnectedModuleDetIds(isuperbin);
-        std::vector<unsigned int> connectedModuleDetIds_pLStoLayer4Subdet4_neg = SDL::moduleConnectionMap_pLStoLayer4Subdet4_neg.getConnectedModuleDetIds(isuperbin);
-        connectedModuleDetIds_neg.insert(connectedModuleDetIds_neg.end(),connectedModuleDetIds_pLStoLayer1Subdet5_neg.begin(),connectedModuleDetIds_pLStoLayer1Subdet5_neg.end());
-        connectedModuleDetIds_neg.insert(connectedModuleDetIds_neg.end(),connectedModuleDetIds_pLStoLayer2Subdet5_neg.begin(),connectedModuleDetIds_pLStoLayer2Subdet5_neg.end());
-        connectedModuleDetIds_neg.insert(connectedModuleDetIds_neg.end(),connectedModuleDetIds_pLStoLayer3Subdet5_neg.begin(),connectedModuleDetIds_pLStoLayer3Subdet5_neg.end());
-        connectedModuleDetIds_neg.insert(connectedModuleDetIds_neg.end(),connectedModuleDetIds_pLStoLayer1Subdet4_neg.begin(),connectedModuleDetIds_pLStoLayer1Subdet4_neg.end());
-        connectedModuleDetIds_neg.insert(connectedModuleDetIds_neg.end(),connectedModuleDetIds_pLStoLayer2Subdet4_neg.begin(),connectedModuleDetIds_pLStoLayer2Subdet4_neg.end());
-        connectedModuleDetIds_neg.insert(connectedModuleDetIds_neg.end(),connectedModuleDetIds_pLStoLayer3Subdet4_neg.begin(),connectedModuleDetIds_pLStoLayer3Subdet4_neg.end());
-        connectedModuleDetIds_neg.insert(connectedModuleDetIds_neg.end(),connectedModuleDetIds_pLStoLayer4Subdet4_neg.begin(),connectedModuleDetIds_pLStoLayer4Subdet4_neg.end());
-
-        int sizes_neg = 0;
-        sizes_neg += connectedModuleDetIds_pLStoLayer1Subdet5_neg.size();
-        sizes_neg += connectedModuleDetIds_pLStoLayer2Subdet5_neg.size();
-        sizes_neg += connectedModuleDetIds_pLStoLayer3Subdet5_neg.size();
-        sizes_neg += connectedModuleDetIds_pLStoLayer1Subdet4_neg.size();
-        sizes_neg += connectedModuleDetIds_pLStoLayer2Subdet4_neg.size();
-        sizes_neg += connectedModuleDetIds_pLStoLayer3Subdet4_neg.size();
-        sizes_neg += connectedModuleDetIds_pLStoLayer4Subdet4_neg.size();
-        pixelMapping.connectedPixelsIndexNeg[isuperbin] = totalSizes_neg;
-        pixelMapping.connectedPixelsSizesNeg[isuperbin] = sizes_neg;
-        totalSizes_neg += sizes_neg;
-    }
-
-    unsigned int* connectedPixels;
-    connectedPixels = (unsigned int*)cms::cuda::allocate_host((totalSizes+totalSizes_pos+totalSizes_neg) * sizeof(unsigned int), stream);
-    cudaMalloc(&modulesInGPU.connectedPixels,(totalSizes+totalSizes_pos+totalSizes_neg)* sizeof(unsigned int));
-
-    for(int icondet = 0; icondet < totalSizes; icondet++)
-    {
-        connectedPixels[icondet] = (*detIdToIndex)[connectedModuleDetIds[icondet]];
-    }
-    for(int icondet = 0; icondet < totalSizes_pos; icondet++)
-    {
-        connectedPixels[icondet+totalSizes] = (*detIdToIndex)[connectedModuleDetIds_pos[icondet]];
-    }
-    for(int icondet = 0; icondet < totalSizes_neg; icondet++)
-    {
-        connectedPixels[icondet+totalSizes+totalSizes_pos] = (*detIdToIndex)[connectedModuleDetIds_neg[icondet]];
-    }
-    cudaMemcpyAsync(modulesInGPU.connectedPixels,connectedPixels,(totalSizes+totalSizes_pos+totalSizes_neg)*sizeof(unsigned int),cudaMemcpyHostToDevice,stream);
-    cudaStreamSynchronize(stream);
-
-    cms::cuda::free_host(connectedPixels);
-}
-
-void SDL::fillConnectedModuleArrayExplicit(struct modules& modulesInGPU, unsigned int nModules,cudaStream_t stream)
-{
-    uint16_t* moduleMap;
-    uint16_t* nConnectedModules;
-    moduleMap = (uint16_t*)cms::cuda::allocate_host(nModules * 40 * sizeof(uint16_t), stream);
-    nConnectedModules = (uint16_t*)cms::cuda::allocate_host(nModules * sizeof(uint16_t), stream);
-    for(auto it = (*detIdToIndex).begin(); it != (*detIdToIndex).end(); ++it)
-    {
-        unsigned int detId = it->first;
-        uint16_t index = it->second;
-        auto& connectedModules = moduleConnectionMap.getConnectedModuleDetIds(detId);
-        nConnectedModules[index] = connectedModules.size();
-        for(uint16_t i = 0; i< nConnectedModules[index];i++)
-        {
-            moduleMap[index * 40 + i] = (*detIdToIndex)[connectedModules[i]];
-        }
-    }
-    cudaMemcpyAsync(modulesInGPU.moduleMap,moduleMap,nModules*40*sizeof(uint16_t),cudaMemcpyHostToDevice,stream);
-    cudaMemcpyAsync(modulesInGPU.nConnectedModules,nConnectedModules,nModules*sizeof(uint16_t),cudaMemcpyHostToDevice,stream);
-    cudaStreamSynchronize(stream);
-    cms::cuda::free_host(moduleMap);
-    cms::cuda::free_host(nConnectedModules);
-}
-
-void SDL::fillMapArraysExplicit(struct modules& modulesInGPU, unsigned int nModules,cudaStream_t stream)
-{
-    uint16_t* mapIdx;
-    unsigned int* mapdetId;
-    unsigned int counter = 0;
-    mapIdx = (uint16_t*)cms::cuda::allocate_host(nModules * sizeof(uint16_t), stream);
-    mapdetId = (unsigned int*)cms::cuda::allocate_host(nModules * sizeof(unsigned int), stream);
-    for(auto it = (*detIdToIndex).begin(); it != (*detIdToIndex).end(); ++it)
-    {
-        unsigned int detId = it->first;
-        unsigned int index = it->second;
-        mapIdx[counter] = index;
-        mapdetId[counter] = detId;
-        counter++;
-    }
-    cudaMemcpyAsync(modulesInGPU.mapIdx,mapIdx,nModules*sizeof(uint16_t),cudaMemcpyHostToDevice,stream);
-    cudaMemcpyAsync(modulesInGPU.mapdetId,mapdetId,nModules*sizeof(unsigned int),cudaMemcpyHostToDevice,stream);
-    cudaStreamSynchronize(stream);
-    cms::cuda::free_host(mapIdx);
-    cms::cuda::free_host(mapdetId);
-}
-
-void SDL::setDerivedQuantities(unsigned int detId, unsigned short& layer, unsigned short& ring, unsigned short& rod, unsigned short& module, unsigned short& subdet, unsigned short& side, float m_x, float m_y, float m_z, float& eta, float& r)
-{
-    subdet = (detId & (7 << 25)) >> 25;
-    side = (subdet == Endcap) ? (detId & (3 << 23)) >> 23 : (detId & (3 << 18)) >> 18;
-    layer = (subdet == Endcap) ? (detId & (7 << 18)) >> 18 : (detId & (7 << 20)) >> 20;
-    ring = (subdet == Endcap) ? (detId & (15 << 12)) >> 12 : 0;
-    module = (detId & (127 << 2)) >> 2;
-    rod = (subdet == Endcap) ? 0 : (detId & (127 << 10)) >> 10;
-
-    r = std::sqrt(m_x * m_x + m_y * m_y + m_z * m_z);
-    eta = ((m_z > 0) - ( m_z < 0)) * std::acosh(r / std::sqrt(m_x * m_x + m_y * m_y));
-}
-
-bool SDL::modules::parseIsInverted(short subdet, short side, short module, short layer)
-{
-    if (subdet == Endcap)
-    {
-        if (side == NegZ)
-        {
-            return module % 2 == 1;
-        }
-        else if (side == PosZ)
-        {
-            return module % 2 == 0;
-        }
-        else
-        {
-            return 0;
-        }
-    }
-    else if (subdet == Barrel)
-    {
-        if (side == Center)
-        {
-            if (layer <= 3)
-            {
-                return module % 2 == 1;
-            }
-            else if (layer >= 4)
-            {
-                return module % 2 == 0;
-            }
-            else
-            {
-                return 0;
-            }
-        }
-        else if (side == NegZ or side == PosZ)
-        {
-            if (layer <= 2)
-            {
-                return module % 2 == 1;
-            }
-            else if (layer == 3)
-            {
-                return module % 2 == 0;
-            }
-            else
-            {
-                return 0;
-            }
-        }
-        else
-        {
-            return 0;
-        }
-    }
-    else
-    {
-        return 0;
-    }
-}
-
-bool SDL::modules::parseIsLower(bool isInvertedx, unsigned int detId)
-{
-    return (isInvertedx) ? !(detId & 1) : (detId & 1);
-}
-
-unsigned int SDL::modules::parsePartnerModuleId(unsigned int detId, bool isLowerx, bool isInvertedx)
-{
-    return isLowerx ? (isInvertedx ? detId - 1 : detId + 1) : (isInvertedx ? detId + 1 : detId - 1);
-}
+// https://github.com/cms-sw/cmssw/blob/5e809e8e0a625578aa265dc4b128a93830cb5429/Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h#L29
\ No newline at end of file
diff --git a/SDL/Module.cuh b/SDL/Module.cuh
index 3967c764..1015031c 100644
--- a/SDL/Module.cuh
+++ b/SDL/Module.cuh
@@ -8,7 +8,6 @@
 #include "TiltedGeometry.h"
 #include "EndcapGeometry.cuh"
 #include "ModuleConnectionMap.h"
-#include "allocate.h"
 
 namespace SDL
 {
@@ -40,6 +39,12 @@ namespace SDL
         InnerPixelLayer
     };
 
+    extern std::map <unsigned int, uint16_t>* detIdToIndex;
+    extern std::map <unsigned int, float> *module_x;
+    extern std::map <unsigned int, float> *module_y;
+    extern std::map <unsigned int, float> *module_z;
+    extern std::map <unsigned int, unsigned int> *module_type;
+
     struct objectRanges
     {
         int* hitRanges;
@@ -141,48 +146,48 @@ namespace SDL
         Buf<TAcc, unsigned int> device_nTotalQuints_buf;
 
         template<typename TQueue, typename TDevAcc>
-        objectRangesBuffer(unsigned int nModules,
-                           unsigned int nLowerModules,
+        objectRangesBuffer(unsigned int nMod, // sizes the per-module buffers (nMod or nMod*2 elements)
+                           unsigned int nLowerMod, // sizes the per-lower-module buffers (nLowerMod or nLowerMod+1 elements)
                            TDevAcc const & devAccIn,
                            TQueue& queue) :
-            hitRanges_buf(allocBufWrapper<int>(devAccIn, nModules*2)),
-            hitRangesLower_buf(allocBufWrapper<int>(devAccIn, nModules)),
-            hitRangesUpper_buf(allocBufWrapper<int>(devAccIn, nModules)),
-            hitRangesnLower_buf(allocBufWrapper<int8_t>(devAccIn, nModules)),
-            hitRangesnUpper_buf(allocBufWrapper<int8_t>(devAccIn, nModules)),
-            mdRanges_buf(allocBufWrapper<int>(devAccIn, nModules*2)),
-            segmentRanges_buf(allocBufWrapper<int>(devAccIn, nModules*2)),
-            trackletRanges_buf(allocBufWrapper<int>(devAccIn, nModules*2)),
-            tripletRanges_buf(allocBufWrapper<int>(devAccIn, nModules*2)),
-            trackCandidateRanges_buf(allocBufWrapper<int>(devAccIn, nModules*2)),
-            quintupletRanges_buf(allocBufWrapper<int>(devAccIn, nModules*2)),
+            hitRanges_buf(allocBufWrapper<int>(devAccIn, nMod*2)),
+            hitRangesLower_buf(allocBufWrapper<int>(devAccIn, nMod)),
+            hitRangesUpper_buf(allocBufWrapper<int>(devAccIn, nMod)),
+            hitRangesnLower_buf(allocBufWrapper<int8_t>(devAccIn, nMod)),
+            hitRangesnUpper_buf(allocBufWrapper<int8_t>(devAccIn, nMod)),
+            mdRanges_buf(allocBufWrapper<int>(devAccIn, nMod*2)),
+            segmentRanges_buf(allocBufWrapper<int>(devAccIn, nMod*2)),
+            trackletRanges_buf(allocBufWrapper<int>(devAccIn, nMod*2)),
+            tripletRanges_buf(allocBufWrapper<int>(devAccIn, nMod*2)),
+            trackCandidateRanges_buf(allocBufWrapper<int>(devAccIn, nMod*2)),
+            quintupletRanges_buf(allocBufWrapper<int>(devAccIn, nMod*2)),
             nEligibleT5Modules_buf(allocBufWrapper<uint16_t>(devAccIn, 1)),
-            indicesOfEligibleT5Modules_buf(allocBufWrapper<uint16_t>(devAccIn, nLowerModules)),
-            quintupletModuleIndices_buf(allocBufWrapper<int>(devAccIn, nLowerModules)),
-            quintupletModuleOccupancy_buf(allocBufWrapper<int>(devAccIn, nLowerModules)),
-            miniDoubletModuleIndices_buf(allocBufWrapper<int>(devAccIn, nLowerModules+1)),
-            miniDoubletModuleOccupancy_buf(allocBufWrapper<int>(devAccIn, nLowerModules+1)),
-            segmentModuleIndices_buf(allocBufWrapper<int>(devAccIn, nLowerModules+1)),
-            segmentModuleOccupancy_buf(allocBufWrapper<int>(devAccIn, nLowerModules+1)),
-            tripletModuleIndices_buf(allocBufWrapper<int>(devAccIn, nLowerModules)),
-            tripletModuleOccupancy_buf(allocBufWrapper<int>(devAccIn, nLowerModules)),
+            indicesOfEligibleT5Modules_buf(allocBufWrapper<uint16_t>(devAccIn, nLowerMod)),
+            quintupletModuleIndices_buf(allocBufWrapper<int>(devAccIn, nLowerMod)),
+            quintupletModuleOccupancy_buf(allocBufWrapper<int>(devAccIn, nLowerMod)),
+            miniDoubletModuleIndices_buf(allocBufWrapper<int>(devAccIn, nLowerMod+1)),
+            miniDoubletModuleOccupancy_buf(allocBufWrapper<int>(devAccIn, nLowerMod+1)),
+            segmentModuleIndices_buf(allocBufWrapper<int>(devAccIn, nLowerMod+1)),
+            segmentModuleOccupancy_buf(allocBufWrapper<int>(devAccIn, nLowerMod+1)),
+            tripletModuleIndices_buf(allocBufWrapper<int>(devAccIn, nLowerMod)),
+            tripletModuleOccupancy_buf(allocBufWrapper<int>(devAccIn, nLowerMod)),
             device_nTotalMDs_buf(allocBufWrapper<unsigned int>(devAccIn, 1)),
             device_nTotalSegs_buf(allocBufWrapper<unsigned int>(devAccIn, 1)),
             device_nTotalTrips_buf(allocBufWrapper<unsigned int>(devAccIn, 1)),
             device_nTotalQuints_buf(allocBufWrapper<unsigned int>(devAccIn, 1))
         {
-            alpaka::memset(queue, hitRanges_buf, -1, nModules*2);
-            alpaka::memset(queue, hitRangesLower_buf, -1, nModules);
-            alpaka::memset(queue, hitRangesUpper_buf, -1, nModules);
-            alpaka::memset(queue, hitRangesnLower_buf, -1, nModules);
-            alpaka::memset(queue, hitRangesnUpper_buf, -1, nModules);
-            alpaka::memset(queue, mdRanges_buf, -1, nModules*2);
-            alpaka::memset(queue, segmentRanges_buf, -1, nModules*2);
-            alpaka::memset(queue, trackletRanges_buf, -1, nModules*2);
-            alpaka::memset(queue, tripletRanges_buf, -1, nModules*2);
-            alpaka::memset(queue, trackCandidateRanges_buf, -1, nModules*2);
-            alpaka::memset(queue, quintupletRanges_buf, -1, nModules*2);
-            alpaka::memset(queue, quintupletModuleIndices_buf, -1, nLowerModules);
+            alpaka::memset(queue, hitRanges_buf, -1, nMod*2); // NOTE(review): presumably the fill value is applied per byte (0xFF), so int/int8_t elements read back as -1 — confirm against the alpaka memset docs
+            alpaka::memset(queue, hitRangesLower_buf, -1, nMod);
+            alpaka::memset(queue, hitRangesUpper_buf, -1, nMod);
+            alpaka::memset(queue, hitRangesnLower_buf, -1, nMod);
+            alpaka::memset(queue, hitRangesnUpper_buf, -1, nMod);
+            alpaka::memset(queue, mdRanges_buf, -1, nMod*2);
+            alpaka::memset(queue, segmentRanges_buf, -1, nMod*2);
+            alpaka::memset(queue, trackletRanges_buf, -1, nMod*2);
+            alpaka::memset(queue, tripletRanges_buf, -1, nMod*2);
+            alpaka::memset(queue, trackCandidateRanges_buf, -1, nMod*2);
+            alpaka::memset(queue, quintupletRanges_buf, -1, nMod*2);
+            alpaka::memset(queue, quintupletModuleIndices_buf, -1, nLowerMod); // the only nLowerMod-sized buffer filled here; the others get no memset in this constructor
             alpaka::wait(queue);
         }
     };
@@ -214,20 +219,171 @@ namespace SDL
         ModuleType* moduleType;
         ModuleLayerType* moduleLayerType;
 
-        unsigned int parsePartnerModuleId(unsigned int detId, bool isLowerx, bool isInvertedx);
+        unsigned int* connectedPixels;
 
-        bool parseIsInverted(short subdet, short side, short module, short layer);
-        bool parseIsLower(bool isInvertedx,unsigned int detId);
+        // Classify a module as inverted or not from its subdet, side, module
+        // and layer numbers.
+        //
+        // The answer depends on where the module sits and on `module % 2`:
+        //   - Endcap, NegZ                  -> module % 2 == 1
+        //   - Endcap, PosZ                  -> module % 2 == 0
+        //   - Barrel, Center, layer <= 3    -> module % 2 == 1
+        //   - Barrel, Center, layer >= 4    -> module % 2 == 0
+        //   - Barrel, NegZ/PosZ, layer <= 2 -> module % 2 == 1
+        //   - Barrel, NegZ/PosZ, layer == 3 -> module % 2 == 0
+        //   - every other combination       -> false
+        //
+        // Parameters:
+        //   subdet  sub-detector code, compared against Endcap and Barrel
+        //   side    side code, compared against NegZ, PosZ and Center
+        //   module  module number; only `module % 2` is used
+        //   layer   layer number
+        // Returns true when the table above selects the module as inverted.
+        bool parseIsInverted(short subdet, short side, short module, short layer)
+        {
+            // The two remainder tests shared by every rule.
+            const bool remainderIsOne = (module % 2 == 1);
+            const bool remainderIsZero = (module % 2 == 0);
+
+            if (subdet == Endcap)
+            {
+                if (side == NegZ)
+                {
+                    return remainderIsOne;
+                }
+                if (side == PosZ)
+                {
+                    return remainderIsZero;
+                }
+                return false;
+            }
+
+            if (subdet != Barrel)
+            {
+                return false;
+            }
+
+            if (side == Center)
+            {
+                // layer <= 3 and layer >= 4 leave no value uncovered.
+                return (layer <= 3) ? remainderIsOne : remainderIsZero;
+            }
+
+            if (side == NegZ or side == PosZ)
+            {
+                if (layer <= 2)
+                {
+                    return remainderIsOne;
+                }
+                return (layer == 3) ? remainderIsZero : false;
+            }
+
+            return false;
+        };
 
-        unsigned int* connectedPixels;
-        unsigned int* connectedPixelsIndex;
-        unsigned int* connectedPixelsSizes;
-        unsigned int* connectedPixelsPos;
-        unsigned int* connectedPixelsIndexPos;
-        unsigned int* connectedPixelsSizesPos;
-        unsigned int* connectedPixelsNeg;
-        unsigned int* connectedPixelsIndexNeg;
-        unsigned int* connectedPixelsSizesNeg;
+        bool parseIsLower(bool isInvertedx, unsigned int detId)
+        { // lower <=> (lowest bit of detId is set) XOR isInvertedx
+            return ((detId & 1u) != 0u) != isInvertedx;
+        };
+
+        unsigned int parsePartnerModuleId(unsigned int detId, bool isLowerx, bool isInvertedx)
+        { // partner is detId - 1 when the two flags agree, detId + 1 when they differ
+            return (isLowerx == isInvertedx) ? detId - 1 : detId + 1;
+        };
+
+        // Point every raw pointer member of this struct at the storage held by the
+        // matching `<name>_buf` member of `modulesbuf`, obtained via alpaka::getPtrNative.
+        template<typename TBuff>
+        void setData(TBuff& modulesbuf)
+        {
+            this->detIds = alpaka::getPtrNative(modulesbuf.detIds_buf);
+            this->moduleMap = alpaka::getPtrNative(modulesbuf.moduleMap_buf);
+            this->mapdetId = alpaka::getPtrNative(modulesbuf.mapdetId_buf);
+            this->mapIdx = alpaka::getPtrNative(modulesbuf.mapIdx_buf);
+            this->nConnectedModules = alpaka::getPtrNative(modulesbuf.nConnectedModules_buf);
+            this->drdzs = alpaka::getPtrNative(modulesbuf.drdzs_buf);
+            this->slopes = alpaka::getPtrNative(modulesbuf.slopes_buf);
+            this->nModules = alpaka::getPtrNative(modulesbuf.nModules_buf);
+            this->nLowerModules = alpaka::getPtrNative(modulesbuf.nLowerModules_buf);
+            this->partnerModuleIndices = alpaka::getPtrNative(modulesbuf.partnerModuleIndices_buf);
+            this->layers = alpaka::getPtrNative(modulesbuf.layers_buf);
+            this->rings = alpaka::getPtrNative(modulesbuf.rings_buf);
+            this->modules = alpaka::getPtrNative(modulesbuf.modules_buf);
+            this->rods = alpaka::getPtrNative(modulesbuf.rods_buf);
+            this->subdets = alpaka::getPtrNative(modulesbuf.subdets_buf);
+            this->sides = alpaka::getPtrNative(modulesbuf.sides_buf);
+            this->eta = alpaka::getPtrNative(modulesbuf.eta_buf);
+            this->r = alpaka::getPtrNative(modulesbuf.r_buf);
+            this->isInverted = alpaka::getPtrNative(modulesbuf.isInverted_buf);
+            this->isLower = alpaka::getPtrNative(modulesbuf.isLower_buf);
+            this->isAnchor = alpaka::getPtrNative(modulesbuf.isAnchor_buf);
+            this->moduleType = alpaka::getPtrNative(modulesbuf.moduleType_buf);
+            this->moduleLayerType = alpaka::getPtrNative(modulesbuf.moduleLayerType_buf);
+            this->connectedPixels = alpaka::getPtrNative(modulesbuf.connectedPixels_buf);
+        }
+    };
+
+    // Owns the alpaka buffers that back the raw pointers declared in `modules`; the
+    // constructor allocates them and points the inherited pointers at them via setData.
+    template<typename TAcc>
+    struct modulesBuffer : modules
+    {
+        Buf<TAcc, unsigned int> detIds_buf;
+        Buf<TAcc, uint16_t> moduleMap_buf;
+        Buf<TAcc, unsigned int> mapdetId_buf;
+        Buf<TAcc, uint16_t> mapIdx_buf;
+        Buf<TAcc, uint16_t> nConnectedModules_buf;
+        Buf<TAcc, float> drdzs_buf;
+        Buf<TAcc, float> slopes_buf;
+        Buf<TAcc, uint16_t> nModules_buf;
+        Buf<TAcc, uint16_t> nLowerModules_buf;
+        Buf<TAcc, uint16_t> partnerModuleIndices_buf;
+
+        Buf<TAcc, short> layers_buf;
+        Buf<TAcc, short> rings_buf;
+        Buf<TAcc, short> modules_buf;
+        Buf<TAcc, short> rods_buf;
+        Buf<TAcc, short> subdets_buf;
+        Buf<TAcc, short> sides_buf;
+        Buf<TAcc, float> eta_buf;
+        Buf<TAcc, float> r_buf;
+        Buf<TAcc, bool> isInverted_buf;
+        Buf<TAcc, bool> isLower_buf;
+        Buf<TAcc, bool> isAnchor_buf;
+        Buf<TAcc, ModuleType> moduleType_buf;
+        Buf<TAcc, ModuleLayerType> moduleLayerType_buf;
+
+        Buf<TAcc, unsigned int> connectedPixels_buf;
+        // Allocates every buffer on devAccIn (contents are not initialized): per-module buffers are sized from nMod, connectedPixels_buf from nPixs.
+        template<typename TDevAcc>
+        modulesBuffer(TDevAcc const & devAccIn, unsigned int nMod = modules_size, unsigned int nPixs = pix_tot) :
+            detIds_buf(allocBufWrapper<unsigned int>(devAccIn, nMod)),
+            moduleMap_buf(allocBufWrapper<uint16_t>(devAccIn, nMod * 40)),
+            mapdetId_buf(allocBufWrapper<unsigned int>(devAccIn, nMod)),
+            mapIdx_buf(allocBufWrapper<uint16_t>(devAccIn, nMod)),
+            nConnectedModules_buf(allocBufWrapper<uint16_t>(devAccIn, nMod)),
+            drdzs_buf(allocBufWrapper<float>(devAccIn, nMod)),
+            slopes_buf(allocBufWrapper<float>(devAccIn, nMod)),
+            nModules_buf(allocBufWrapper<uint16_t>(devAccIn, 1)),
+            nLowerModules_buf(allocBufWrapper<uint16_t>(devAccIn, 1)),
+            partnerModuleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, nMod)),
+            layers_buf(allocBufWrapper<short>(devAccIn, nMod)),
+            rings_buf(allocBufWrapper<short>(devAccIn, nMod)),
+            modules_buf(allocBufWrapper<short>(devAccIn, nMod)),
+            rods_buf(allocBufWrapper<short>(devAccIn, nMod)),
+            subdets_buf(allocBufWrapper<short>(devAccIn, nMod)),
+            sides_buf(allocBufWrapper<short>(devAccIn, nMod)),
+            eta_buf(allocBufWrapper<float>(devAccIn, nMod)),
+            r_buf(allocBufWrapper<float>(devAccIn, nMod)),
+            isInverted_buf(allocBufWrapper<bool>(devAccIn, nMod)),
+            isLower_buf(allocBufWrapper<bool>(devAccIn, nMod)),
+            isAnchor_buf(allocBufWrapper<bool>(devAccIn, nMod)),
+            moduleType_buf(allocBufWrapper<ModuleType>(devAccIn, nMod)),
+            moduleLayerType_buf(allocBufWrapper<ModuleLayerType>(devAccIn, nMod)),
+            connectedPixels_buf(allocBufWrapper<unsigned int>(devAccIn, nPixs))
+        {
+            setData(*this); // without this the pointers inherited from `modules` are not set by this constructor
+        }
     };
 
     // PixelMap is never allocated on the device.
@@ -267,20 +423,428 @@ namespace SDL
         }
     };
 
-    extern std::map <unsigned int, uint16_t>* detIdToIndex;
-    extern std::map <unsigned int, float> *module_x;
-    extern std::map <unsigned int, float> *module_y;
-    extern std::map  <unsigned int, float> *module_z;
-    extern std::map  <unsigned int, unsigned int> *module_type;
-
-    void loadModulesFromFile(struct modules& modulesInGPU, uint16_t& nModules,uint16_t& nLowerModules,struct pixelMap& pixelMapping,cudaStream_t stream, const char* moduleMetaDataFilePath="data/centroid.txt");
-    void createModulesInExplicitMemory(struct modules& modulesInGPU,unsigned int nModules,cudaStream_t stream);
-    void freeModules(struct modules& modulesInGPU);
-    void fillPixelMap(struct modules& modulesInGPU,struct pixelMap& pixelMapping,cudaStream_t stream);
-    void fillConnectedModuleArrayExplicit(struct modules& modulesInGPU, unsigned int nModules,cudaStream_t stream);
-    void fillMapArraysExplicit(struct modules& modulesInGPU, unsigned int nModules,cudaStream_t stream);
-    void fillConnectedModuleArray(struct modules& modulesInGPU, unsigned int nModules);
-    void setDerivedQuantities(unsigned int detId, unsigned short& layer, unsigned short& ring, unsigned short& rod, unsigned short& module, unsigned short& subdet, unsigned short& side, float m_x, float m_y, float m_z, float& eta, float& r);
-    void createRangesInExplicitMemory(struct objectRanges& rangesInGPU,unsigned int nModules,cudaStream_t stream, unsigned int nLowerModules);
+    template<typename TQueue, typename TAcc>
+    inline void fillPixelMap(struct modulesBuffer<TAcc>* modulesBuf, struct pixelMap& pixelMapping, TQueue queue)
+    { // Builds the pixel connection tables: per-superbin offsets and sizes go into pixelMapping, the flattened list of module indices goes into modulesBuf->connectedPixels_buf.
+        std::vector<unsigned int> connectedModuleDetIds;
+        std::vector<unsigned int> connectedModuleDetIds_pos;
+        std::vector<unsigned int> connectedModuleDetIds_neg;
+        // Running totals; each one doubles as the start offset of the next superbin within its own list.
+        int totalSizes = 0;
+        int totalSizes_pos = 0;
+        int totalSizes_neg = 0;
+        for(unsigned int isuperbin = 0; isuperbin < size_superbins; isuperbin++)
+        {
+            std::vector<unsigned int> connectedModuleDetIds_pLStoLayer1Subdet5 = SDL::moduleConnectionMap_pLStoLayer1Subdet5.getConnectedModuleDetIds(isuperbin+size_superbins);// index adjustment to get high values
+            std::vector<unsigned int> connectedModuleDetIds_pLStoLayer2Subdet5 = SDL::moduleConnectionMap_pLStoLayer2Subdet5.getConnectedModuleDetIds(isuperbin+size_superbins);// from the high pt bins
+            std::vector<unsigned int> connectedModuleDetIds_pLStoLayer3Subdet5 = SDL::moduleConnectionMap_pLStoLayer3Subdet5.getConnectedModuleDetIds(isuperbin+size_superbins);
+            std::vector<unsigned int> connectedModuleDetIds_pLStoLayer1Subdet4 = SDL::moduleConnectionMap_pLStoLayer1Subdet4.getConnectedModuleDetIds(isuperbin+size_superbins);
+            std::vector<unsigned int> connectedModuleDetIds_pLStoLayer2Subdet4 = SDL::moduleConnectionMap_pLStoLayer2Subdet4.getConnectedModuleDetIds(isuperbin+size_superbins);
+            std::vector<unsigned int> connectedModuleDetIds_pLStoLayer3Subdet4 = SDL::moduleConnectionMap_pLStoLayer3Subdet4.getConnectedModuleDetIds(isuperbin+size_superbins);
+            std::vector<unsigned int> connectedModuleDetIds_pLStoLayer4Subdet4 = SDL::moduleConnectionMap_pLStoLayer4Subdet4.getConnectedModuleDetIds(isuperbin+size_superbins);
+            connectedModuleDetIds.insert(connectedModuleDetIds.end(),connectedModuleDetIds_pLStoLayer1Subdet5.begin(),connectedModuleDetIds_pLStoLayer1Subdet5.end());
+            connectedModuleDetIds.insert(connectedModuleDetIds.end(),connectedModuleDetIds_pLStoLayer2Subdet5.begin(),connectedModuleDetIds_pLStoLayer2Subdet5.end());
+            connectedModuleDetIds.insert(connectedModuleDetIds.end(),connectedModuleDetIds_pLStoLayer3Subdet5.begin(),connectedModuleDetIds_pLStoLayer3Subdet5.end());
+            connectedModuleDetIds.insert(connectedModuleDetIds.end(),connectedModuleDetIds_pLStoLayer1Subdet4.begin(),connectedModuleDetIds_pLStoLayer1Subdet4.end());
+            connectedModuleDetIds.insert(connectedModuleDetIds.end(),connectedModuleDetIds_pLStoLayer2Subdet4.begin(),connectedModuleDetIds_pLStoLayer2Subdet4.end());
+            connectedModuleDetIds.insert(connectedModuleDetIds.end(),connectedModuleDetIds_pLStoLayer3Subdet4.begin(),connectedModuleDetIds_pLStoLayer3Subdet4.end());
+            connectedModuleDetIds.insert(connectedModuleDetIds.end(),connectedModuleDetIds_pLStoLayer4Subdet4.begin(),connectedModuleDetIds_pLStoLayer4Subdet4.end());
+            // Record where this superbin's entries start in the first list and how many there are.
+            int sizes = 0;
+            sizes += connectedModuleDetIds_pLStoLayer1Subdet5.size();
+            sizes += connectedModuleDetIds_pLStoLayer2Subdet5.size();
+            sizes += connectedModuleDetIds_pLStoLayer3Subdet5.size();
+            sizes += connectedModuleDetIds_pLStoLayer1Subdet4.size();
+            sizes += connectedModuleDetIds_pLStoLayer2Subdet4.size();
+            sizes += connectedModuleDetIds_pLStoLayer3Subdet4.size();
+            sizes += connectedModuleDetIds_pLStoLayer4Subdet4.size();
+            pixelMapping.connectedPixelsIndex[isuperbin] = totalSizes;
+            pixelMapping.connectedPixelsSizes[isuperbin] = sizes;
+            totalSizes += sizes;
+            // Same procedure for the *_pos maps, which are queried with isuperbin directly (no size_superbins offset).
+            std::vector<unsigned int> connectedModuleDetIds_pLStoLayer1Subdet5_pos = SDL::moduleConnectionMap_pLStoLayer1Subdet5_pos.getConnectedModuleDetIds(isuperbin);
+            std::vector<unsigned int> connectedModuleDetIds_pLStoLayer2Subdet5_pos = SDL::moduleConnectionMap_pLStoLayer2Subdet5_pos.getConnectedModuleDetIds(isuperbin);
+            std::vector<unsigned int> connectedModuleDetIds_pLStoLayer3Subdet5_pos = SDL::moduleConnectionMap_pLStoLayer3Subdet5_pos.getConnectedModuleDetIds(isuperbin);
+            std::vector<unsigned int> connectedModuleDetIds_pLStoLayer1Subdet4_pos = SDL::moduleConnectionMap_pLStoLayer1Subdet4_pos.getConnectedModuleDetIds(isuperbin);
+            std::vector<unsigned int> connectedModuleDetIds_pLStoLayer2Subdet4_pos = SDL::moduleConnectionMap_pLStoLayer2Subdet4_pos.getConnectedModuleDetIds(isuperbin);
+            std::vector<unsigned int> connectedModuleDetIds_pLStoLayer3Subdet4_pos = SDL::moduleConnectionMap_pLStoLayer3Subdet4_pos.getConnectedModuleDetIds(isuperbin);
+            std::vector<unsigned int> connectedModuleDetIds_pLStoLayer4Subdet4_pos = SDL::moduleConnectionMap_pLStoLayer4Subdet4_pos.getConnectedModuleDetIds(isuperbin);
+            connectedModuleDetIds_pos.insert(connectedModuleDetIds_pos.end(),connectedModuleDetIds_pLStoLayer1Subdet5_pos.begin(),connectedModuleDetIds_pLStoLayer1Subdet5_pos.end());
+            connectedModuleDetIds_pos.insert(connectedModuleDetIds_pos.end(),connectedModuleDetIds_pLStoLayer2Subdet5_pos.begin(),connectedModuleDetIds_pLStoLayer2Subdet5_pos.end());
+            connectedModuleDetIds_pos.insert(connectedModuleDetIds_pos.end(),connectedModuleDetIds_pLStoLayer3Subdet5_pos.begin(),connectedModuleDetIds_pLStoLayer3Subdet5_pos.end());
+            connectedModuleDetIds_pos.insert(connectedModuleDetIds_pos.end(),connectedModuleDetIds_pLStoLayer1Subdet4_pos.begin(),connectedModuleDetIds_pLStoLayer1Subdet4_pos.end());
+            connectedModuleDetIds_pos.insert(connectedModuleDetIds_pos.end(),connectedModuleDetIds_pLStoLayer2Subdet4_pos.begin(),connectedModuleDetIds_pLStoLayer2Subdet4_pos.end());
+            connectedModuleDetIds_pos.insert(connectedModuleDetIds_pos.end(),connectedModuleDetIds_pLStoLayer3Subdet4_pos.begin(),connectedModuleDetIds_pLStoLayer3Subdet4_pos.end());
+            connectedModuleDetIds_pos.insert(connectedModuleDetIds_pos.end(),connectedModuleDetIds_pLStoLayer4Subdet4_pos.begin(),connectedModuleDetIds_pLStoLayer4Subdet4_pos.end());
+            // This offset is relative to the start of the _pos list, not to the start of the combined buffer.
+            int sizes_pos = 0;
+            sizes_pos += connectedModuleDetIds_pLStoLayer1Subdet5_pos.size();
+            sizes_pos += connectedModuleDetIds_pLStoLayer2Subdet5_pos.size();
+            sizes_pos += connectedModuleDetIds_pLStoLayer3Subdet5_pos.size();
+            sizes_pos += connectedModuleDetIds_pLStoLayer1Subdet4_pos.size();
+            sizes_pos += connectedModuleDetIds_pLStoLayer2Subdet4_pos.size();
+            sizes_pos += connectedModuleDetIds_pLStoLayer3Subdet4_pos.size();
+            sizes_pos += connectedModuleDetIds_pLStoLayer4Subdet4_pos.size();
+            pixelMapping.connectedPixelsIndexPos[isuperbin] = totalSizes_pos;
+            pixelMapping.connectedPixelsSizesPos[isuperbin] = sizes_pos;
+            totalSizes_pos += sizes_pos;
+            // Same procedure for the *_neg maps.
+            std::vector<unsigned int> connectedModuleDetIds_pLStoLayer1Subdet5_neg = SDL::moduleConnectionMap_pLStoLayer1Subdet5_neg.getConnectedModuleDetIds(isuperbin);
+            std::vector<unsigned int> connectedModuleDetIds_pLStoLayer2Subdet5_neg = SDL::moduleConnectionMap_pLStoLayer2Subdet5_neg.getConnectedModuleDetIds(isuperbin);
+            std::vector<unsigned int> connectedModuleDetIds_pLStoLayer3Subdet5_neg = SDL::moduleConnectionMap_pLStoLayer3Subdet5_neg.getConnectedModuleDetIds(isuperbin);
+            std::vector<unsigned int> connectedModuleDetIds_pLStoLayer1Subdet4_neg = SDL::moduleConnectionMap_pLStoLayer1Subdet4_neg.getConnectedModuleDetIds(isuperbin);
+            std::vector<unsigned int> connectedModuleDetIds_pLStoLayer2Subdet4_neg = SDL::moduleConnectionMap_pLStoLayer2Subdet4_neg.getConnectedModuleDetIds(isuperbin);
+            std::vector<unsigned int> connectedModuleDetIds_pLStoLayer3Subdet4_neg = SDL::moduleConnectionMap_pLStoLayer3Subdet4_neg.getConnectedModuleDetIds(isuperbin);
+            std::vector<unsigned int> connectedModuleDetIds_pLStoLayer4Subdet4_neg = SDL::moduleConnectionMap_pLStoLayer4Subdet4_neg.getConnectedModuleDetIds(isuperbin);
+            connectedModuleDetIds_neg.insert(connectedModuleDetIds_neg.end(),connectedModuleDetIds_pLStoLayer1Subdet5_neg.begin(),connectedModuleDetIds_pLStoLayer1Subdet5_neg.end());
+            connectedModuleDetIds_neg.insert(connectedModuleDetIds_neg.end(),connectedModuleDetIds_pLStoLayer2Subdet5_neg.begin(),connectedModuleDetIds_pLStoLayer2Subdet5_neg.end());
+            connectedModuleDetIds_neg.insert(connectedModuleDetIds_neg.end(),connectedModuleDetIds_pLStoLayer3Subdet5_neg.begin(),connectedModuleDetIds_pLStoLayer3Subdet5_neg.end());
+            connectedModuleDetIds_neg.insert(connectedModuleDetIds_neg.end(),connectedModuleDetIds_pLStoLayer1Subdet4_neg.begin(),connectedModuleDetIds_pLStoLayer1Subdet4_neg.end());
+            connectedModuleDetIds_neg.insert(connectedModuleDetIds_neg.end(),connectedModuleDetIds_pLStoLayer2Subdet4_neg.begin(),connectedModuleDetIds_pLStoLayer2Subdet4_neg.end());
+            connectedModuleDetIds_neg.insert(connectedModuleDetIds_neg.end(),connectedModuleDetIds_pLStoLayer3Subdet4_neg.begin(),connectedModuleDetIds_pLStoLayer3Subdet4_neg.end());
+            connectedModuleDetIds_neg.insert(connectedModuleDetIds_neg.end(),connectedModuleDetIds_pLStoLayer4Subdet4_neg.begin(),connectedModuleDetIds_pLStoLayer4Subdet4_neg.end());
+            // This offset is relative to the start of the _neg list, not to the start of the combined buffer.
+            int sizes_neg = 0;
+            sizes_neg += connectedModuleDetIds_pLStoLayer1Subdet5_neg.size();
+            sizes_neg += connectedModuleDetIds_pLStoLayer2Subdet5_neg.size();
+            sizes_neg += connectedModuleDetIds_pLStoLayer3Subdet5_neg.size();
+            sizes_neg += connectedModuleDetIds_pLStoLayer1Subdet4_neg.size();
+            sizes_neg += connectedModuleDetIds_pLStoLayer2Subdet4_neg.size();
+            sizes_neg += connectedModuleDetIds_pLStoLayer3Subdet4_neg.size();
+            sizes_neg += connectedModuleDetIds_pLStoLayer4Subdet4_neg.size();
+            pixelMapping.connectedPixelsIndexNeg[isuperbin] = totalSizes_neg;
+            pixelMapping.connectedPixelsSizesNeg[isuperbin] = sizes_neg;
+            totalSizes_neg += sizes_neg;
+        }
+        // Host staging buffer laid out as [first list | _pos list | _neg list].
+        auto connectedPixels_buf = allocBufWrapper<unsigned int>(devHost, totalSizes + totalSizes_pos + totalSizes_neg);
+        unsigned int* connectedPixels = alpaka::getPtrNative(connectedPixels_buf);
+        // Translate detIds to module indices. NOTE(review): operator[] inserts a zero-valued entry into detIdToIndex for any detId not already present — confirm that cannot happen.
+        for(int icondet = 0; icondet < totalSizes; icondet++)
+        {
+            connectedPixels[icondet] = (*detIdToIndex)[connectedModuleDetIds[icondet]];
+        }
+        for(int icondet = 0; icondet < totalSizes_pos; icondet++)
+        {
+            connectedPixels[icondet+totalSizes] = (*detIdToIndex)[connectedModuleDetIds_pos[icondet]];
+        }
+        for(int icondet = 0; icondet < totalSizes_neg; icondet++)
+        {
+            connectedPixels[icondet+totalSizes+totalSizes_pos] = (*detIdToIndex)[connectedModuleDetIds_neg[icondet]];
+        }
+        // Copy the staged list into modulesBuf->connectedPixels_buf. NOTE(review): nothing here checks the total against that buffer's size — confirm it always fits.
+        alpaka::memcpy(queue, modulesBuf->connectedPixels_buf, connectedPixels_buf, totalSizes + totalSizes_pos + totalSizes_neg);
+        alpaka::wait(queue);
+    };
+
+    // Build on the host the table of connected module indices (40 slots per module) and the
+    // per-module counts, then copy both into modulesBuf. Lookups use find() because operator[]
+    // would insert unknown detIds into the map being iterated; such detIds are skipped, and at most 40 are stored per module.
+    template<typename TQueue, typename TAcc>
+    inline void fillConnectedModuleArrayExplicit(struct modulesBuffer<TAcc>* modulesBuf, unsigned int nMod, TQueue queue)
+    {
+        auto moduleMap_buf = allocBufWrapper<uint16_t>(devHost, nMod * 40);
+        uint16_t* moduleMap = alpaka::getPtrNative(moduleMap_buf);
+        auto nConnectedModules_buf = allocBufWrapper<uint16_t>(devHost, nMod);
+        uint16_t* nConnectedModules = alpaka::getPtrNative(nConnectedModules_buf);
+        for(auto it = detIdToIndex->begin(); it != detIdToIndex->end(); ++it)
+        {
+            auto& connectedModules = moduleConnectionMap.getConnectedModuleDetIds(it->first);
+            uint16_t nStored = 0; // slots filled so far in this module's row
+            for(unsigned int i = 0; i < connectedModules.size() && nStored < 40; i++)
+            {
+                auto found = detIdToIndex->find(connectedModules[i]);
+                if(found != detIdToIndex->end()) moduleMap[it->second * 40 + nStored++] = found->second;
+            }
+            nConnectedModules[it->second] = nStored;
+        }
+        alpaka::memcpy(queue, modulesBuf->moduleMap_buf, moduleMap_buf, nMod * 40);
+        alpaka::memcpy(queue, modulesBuf->nConnectedModules_buf, nConnectedModules_buf, nMod);
+        alpaka::wait(queue);
+    };
+
+    // Flatten detIdToIndex into two parallel host arrays, in the map's iteration order, and copy
+    // them into modulesBuf: mapdetId[k] is the k-th detId and mapIdx[k] its module index.
+    // At most nMod entries are written, so a map holding more than nMod entries (operator[] lookups insert missing keys) cannot overrun the buffers.
+    template<typename TQueue, typename TAcc>
+    inline void fillMapArraysExplicit(struct modulesBuffer<TAcc>* modulesBuf, unsigned int nMod, TQueue queue)
+    {
+        auto mapIdx_buf = allocBufWrapper<uint16_t>(devHost, nMod);
+        uint16_t* mapIdx = alpaka::getPtrNative(mapIdx_buf);
+
+        auto mapdetId_buf = allocBufWrapper<unsigned int>(devHost, nMod);
+        unsigned int* mapdetId = alpaka::getPtrNative(mapdetId_buf);
+
+        unsigned int counter = 0;
+        for(auto it = detIdToIndex->begin(); it != detIdToIndex->end() && counter < nMod; ++it)
+        {
+            mapIdx[counter] = it->second;
+            mapdetId[counter] = it->first;
+            counter++;
+        }
+        alpaka::memcpy(queue, modulesBuf->mapIdx_buf, mapIdx_buf, nMod);
+        alpaka::memcpy(queue, modulesBuf->mapdetId_buf, mapdetId_buf, nMod);
+        alpaka::wait(queue);
+    };
+
+    inline void setDerivedQuantities(unsigned int detId, unsigned short& layer, unsigned short& ring, unsigned short& rod, unsigned short& module, unsigned short& subdet, unsigned short& side, float m_x, float m_y, float m_z, float& eta, float& r)
+    { // Unpack the detId bit-fields into the output references and derive r and eta from (m_x, m_y, m_z).
+        subdet = (detId >> 25) & 7;
+        const bool inEndcap = (subdet == Endcap); // the two branches below read different bit positions
+        side = inEndcap ? ((detId >> 23) & 3) : ((detId >> 18) & 3);
+        layer = inEndcap ? ((detId >> 18) & 7) : ((detId >> 20) & 7);
+        ring = inEndcap ? ((detId >> 12) & 15) : 0;
+        rod = inEndcap ? 0 : ((detId >> 10) & 127);
+        module = (detId >> 2) & 127;
+        r = std::sqrt(m_x * m_x + m_y * m_y + m_z * m_z);
+        eta = ((m_z > 0) - ( m_z < 0)) * std::acosh(r / std::sqrt(m_x * m_x + m_y * m_y));
+    };
+
+    template<typename TQueue, typename TAcc>
+    void loadModulesFromFile(struct modules* modulesInGPU,
+                             struct modulesBuffer<TAcc>* modulesBuf,
+                             uint16_t& nModules,
+                             uint16_t& nLowerModules,
+                             struct pixelMap& pixelMapping,
+                             cudaStream_t stream,
+                             TQueue& queue,
+                             const char* moduleMetaDataFilePath)
+    {
+        detIdToIndex = new std::map<unsigned int, uint16_t>;
+        module_x = new std::map<unsigned int, float>;
+        module_y = new std::map<unsigned int, float>;
+        module_z = new std::map<unsigned int, float>;
+        module_type = new std::map<unsigned int, unsigned int>;
+
+        /* Load the whole text file into the map first*/
+
+        std::ifstream ifile;
+        ifile.open(moduleMetaDataFilePath);
+        if(!ifile.is_open())
+        {
+            std::cout<<"ERROR! module list file not present!"<<std::endl;
+        }
+        std::string line;
+        uint16_t counter = 0;
+
+        while(std::getline(ifile,line))
+        {
+            std::stringstream ss(line);
+            std::string token;
+            int count_number = 0;
+
+            unsigned int temp_detId;
+            while(std::getline(ss,token,','))
+            {
+                if(count_number == 0)
+                {
+                    temp_detId = stoi(token);
+                    (*detIdToIndex)[temp_detId] = counter;
+                }
+                if(count_number == 1)
+                    (*module_x)[temp_detId] = std::stof(token);
+                if(count_number == 2)
+                    (*module_y)[temp_detId] = std::stof(token);
+                if(count_number == 3)
+                    (*module_z)[temp_detId] = std::stof(token);
+                if(count_number == 4)
+                {
+                    (*module_type)[temp_detId] = std::stoi(token);
+                    counter++;
+                }
+                count_number++;
+                if(count_number>4)
+                    break;
+            }
+
+        }
+        (*detIdToIndex)[1] = counter; //pixel module is the last module in the module list
+        counter++;
+        nModules = counter;
+
+        auto detIds_buf = allocBufWrapper<unsigned int>(devHost, nModules);
+        auto layers_buf = allocBufWrapper<short>(devHost, nModules);
+        auto rings_buf = allocBufWrapper<short>(devHost, nModules);
+        auto rods_buf = allocBufWrapper<short>(devHost, nModules);
+        auto modules_buf = allocBufWrapper<short>(devHost, nModules);
+        auto subdets_buf = allocBufWrapper<short>(devHost, nModules);
+        auto sides_buf = allocBufWrapper<short>(devHost, nModules);
+        auto eta_buf = allocBufWrapper<float>(devHost, nModules);
+        auto r_buf = allocBufWrapper<float>(devHost, nModules);
+        auto isInverted_buf = allocBufWrapper<bool>(devHost, nModules);
+        auto isLower_buf = allocBufWrapper<bool>(devHost, nModules);
+        auto isAnchor_buf = allocBufWrapper<bool>(devHost, nModules);
+        auto moduleType_buf = allocBufWrapper<ModuleType>(devHost, nModules);
+        auto moduleLayerType_buf = allocBufWrapper<ModuleLayerType>(devHost, nModules);
+        auto slopes_buf = allocBufWrapper<float>(devHost, nModules);
+        auto drdzs_buf = allocBufWrapper<float>(devHost, nModules);
+        auto partnerModuleIndices_buf = allocBufWrapper<uint16_t>(devHost, nModules);
+
+        // Getting the underlying data pointers
+        unsigned int* host_detIds = alpaka::getPtrNative(detIds_buf);
+        short* host_layers = alpaka::getPtrNative(layers_buf);
+        short* host_rings = alpaka::getPtrNative(rings_buf);
+        short* host_rods = alpaka::getPtrNative(rods_buf);
+        short* host_modules = alpaka::getPtrNative(modules_buf);
+        short* host_subdets = alpaka::getPtrNative(subdets_buf);
+        short* host_sides = alpaka::getPtrNative(sides_buf);
+        float* host_eta = alpaka::getPtrNative(eta_buf);
+        float* host_r = alpaka::getPtrNative(r_buf);
+        bool* host_isInverted = alpaka::getPtrNative(isInverted_buf);
+        bool* host_isLower = alpaka::getPtrNative(isLower_buf);
+        bool* host_isAnchor = alpaka::getPtrNative(isAnchor_buf);
+        ModuleType* host_moduleType = alpaka::getPtrNative(moduleType_buf);
+        ModuleLayerType* host_moduleLayerType = alpaka::getPtrNative(moduleLayerType_buf);
+        float* host_slopes = alpaka::getPtrNative(slopes_buf);
+        float* host_drdzs = alpaka::getPtrNative(drdzs_buf);
+        uint16_t* host_partnerModuleIndices = alpaka::getPtrNative(partnerModuleIndices_buf);
+        
+        //reassign detIdToIndex indices here
+        nLowerModules = (nModules - 1) / 2;
+        uint16_t lowerModuleCounter = 0;
+        uint16_t upperModuleCounter = nLowerModules + 1;
+        // Index layout: [0, nLowerModules - 1] => lower modules; nLowerModules => pixel module; [nLowerModules + 1, nModules - 1] => upper modules
+        for(auto it = (*detIdToIndex).begin(); it != (*detIdToIndex).end(); it++)
+        {
+            unsigned int detId = it->first;
+            float m_x = (*module_x)[detId];
+            float m_y = (*module_y)[detId];
+            float m_z = (*module_z)[detId];
+            unsigned int m_t = (*module_type)[detId];
+
+            float eta,r;
+
+            uint16_t index;
+            unsigned short layer,ring,rod,module,subdet,side;
+            bool isInverted, isLower;
+            if(detId == 1)
+            {
+                layer = 0;
+                ring = 0;
+                rod = 0;
+                module = 0;
+                subdet = 0;
+                side = 0;
+                isInverted = false;
+                isLower = false;
+            }
+            else
+            {
+                setDerivedQuantities(detId,layer,ring,rod,module,subdet,side,m_x,m_y,m_z,eta,r);
+                isInverted = modulesInGPU->parseIsInverted(subdet, side, module, layer);
+                isLower = modulesInGPU->parseIsLower(isInverted, detId);
+            }
+            if(isLower)
+            {
+                index = lowerModuleCounter;
+                lowerModuleCounter++;
+            }
+            else if(detId != 1)
+            {
+                index = upperModuleCounter;
+                upperModuleCounter++;
+            }
+            else
+            {
+                index = nLowerModules; //pixel
+            }
+            //reassigning indices!
+            (*detIdToIndex)[detId] = index;   
+            host_detIds[index] = detId;
+            host_layers[index] = layer;
+            host_rings[index] = ring;
+            host_rods[index] = rod;
+            host_modules[index] = module;
+            host_subdets[index] = subdet;
+            host_sides[index] = side;
+            host_eta[index] = eta;
+            host_r[index] = r;
+            host_isInverted[index] = isInverted;
+            host_isLower[index] = isLower;
+
+            //assigning other variables!
+            if(detId == 1)
+            {
+                host_moduleType[index] = PixelModule;
+                host_moduleLayerType[index] = SDL::InnerPixelLayer;
+                host_slopes[index] = 0;
+                host_drdzs[index] = 0;
+                host_isAnchor[index] = false;
+            }
+            else
+            {
+                host_moduleType[index] = ( m_t == 25 ? SDL::TwoS : SDL::PS );
+                host_moduleLayerType[index] = ( m_t == 23 ? SDL::Pixel : SDL::Strip );
+
+                if(host_moduleType[index] == SDL::PS and host_moduleLayerType[index] == SDL::Pixel)
+                {
+                    host_isAnchor[index] = true;
+                }
+                else if(host_moduleType[index] == SDL::TwoS and host_isLower[index])
+                {
+                    host_isAnchor[index] = true;   
+                }
+                else
+                {
+                    host_isAnchor[index] = false;
+                }
+
+                host_slopes[index] = (subdet == Endcap) ? endcapGeometry.getSlopeLower(detId) : tiltedGeometry.getSlope(detId);
+                host_drdzs[index] = (subdet == Barrel) ? tiltedGeometry.getDrDz(detId) : 0;
+            }
+        }
+
+        //partner module stuff, and slopes and drdz move around
+        for(auto it = (*detIdToIndex).begin(); it != (*detIdToIndex).end(); it++)
+        {
+            auto& detId = it->first;
+            auto& index = it->second;
+            if(detId != 1)
+            {
+                host_partnerModuleIndices[index] = (*detIdToIndex)[modulesInGPU->parsePartnerModuleId(detId, host_isLower[index], host_isInverted[index])];
+                //add drdz and slope importing stuff here!
+                if(host_drdzs[index] == 0)
+                {
+                    host_drdzs[index] = host_drdzs[host_partnerModuleIndices[index]];
+                }
+                if(host_slopes[index] == 0)
+                {
+                    host_slopes[index] = host_slopes[host_partnerModuleIndices[index]];
+                }
+            }
+        }
+
+        cudaMemcpyAsync(modulesInGPU->nModules,&nModules,sizeof(uint16_t),cudaMemcpyHostToDevice,stream);
+        cudaMemcpyAsync(modulesInGPU->nLowerModules,&nLowerModules,sizeof(uint16_t),cudaMemcpyHostToDevice,stream);
+        cudaStreamSynchronize(stream);
+
+        cudaMemcpyAsync(modulesInGPU->moduleType,host_moduleType,sizeof(ModuleType)*nModules,cudaMemcpyHostToDevice,stream);
+        cudaMemcpyAsync(modulesInGPU->moduleLayerType,host_moduleLayerType,sizeof(ModuleLayerType)*nModules,cudaMemcpyHostToDevice,stream);
+        cudaStreamSynchronize(stream);
+
+        //alpaka::memcpy(queue, modulesBuf->moduleType_buf, moduleType_buf, nModules);
+        //alpaka::memcpy(queue, modulesBuf->moduleLayerType_buf, moduleLayerType_buf, nModules);
+
+        alpaka::memcpy(queue, modulesBuf->detIds_buf, detIds_buf, nModules);
+        alpaka::memcpy(queue, modulesBuf->layers_buf, layers_buf, nModules);
+        alpaka::memcpy(queue, modulesBuf->rings_buf, rings_buf, nModules);
+        alpaka::memcpy(queue, modulesBuf->rods_buf, rods_buf, nModules);
+        alpaka::memcpy(queue, modulesBuf->modules_buf, modules_buf, nModules);
+        alpaka::memcpy(queue, modulesBuf->subdets_buf, subdets_buf, nModules);
+        alpaka::memcpy(queue, modulesBuf->sides_buf, sides_buf, nModules);
+        alpaka::memcpy(queue, modulesBuf->eta_buf, eta_buf, nModules);
+        alpaka::memcpy(queue, modulesBuf->r_buf, r_buf, nModules);
+        alpaka::memcpy(queue, modulesBuf->isInverted_buf, isInverted_buf, nModules);
+        alpaka::memcpy(queue, modulesBuf->isLower_buf, isLower_buf, nModules);
+        alpaka::memcpy(queue, modulesBuf->isAnchor_buf, isAnchor_buf, nModules);
+        alpaka::memcpy(queue, modulesBuf->slopes_buf, slopes_buf, nModules);
+        alpaka::memcpy(queue, modulesBuf->drdzs_buf, drdzs_buf, nModules);
+        alpaka::memcpy(queue, modulesBuf->partnerModuleIndices_buf, partnerModuleIndices_buf, nModules);
+        alpaka::wait(queue);
+
+        fillConnectedModuleArrayExplicit(modulesBuf, nModules, queue);
+        fillMapArraysExplicit(modulesBuf, nModules, queue);
+        fillPixelMap(modulesBuf, pixelMapping, queue);
+    };
 }
 #endif
diff --git a/SDL/allocate.cc b/SDL/allocate.cc
deleted file mode 100644
index 1e926fb9..00000000
--- a/SDL/allocate.cc
+++ /dev/null
@@ -1,66 +0,0 @@
-#include <limits>
-
-#include "allocate.h"
-#include "cudaCheck.h"
-
-#include "getCachingAllocator.h"
-
-namespace {
-  const size_t maxAllocationSize =
-      notcub::CachingDeviceAllocator::IntPow(cms::cuda::allocator::binGrowth, cms::cuda::allocator::maxBin);
-}
-
-namespace cms::cuda {
- // void *allocate_managed(unsigned int nbytes, cudaStream_t stream) {
-  void *allocate_managed(size_t nbytes, cudaStream_t stream) {
-    void *ptr = nullptr;
-//    if constexpr (allocator::useCaching) {
-      if (nbytes > maxAllocationSize) {
-        throw std::runtime_error("Tried to allocate " + std::to_string(nbytes) +
-                                 " bytes, but the allocator maximum is " + std::to_string(maxAllocationSize));
-      }
-      cudaCheck(allocator::getCachingManagedAllocator().ManagedAllocate(&ptr, nbytes, stream));
-//    } else {
-//      cudaCheck(cudaMallocManaged(&ptr, nbytes));
-//    }
-    return ptr;
-  }
-
-  void free_managed(void *ptr) {
-    //if constexpr (allocator::useCaching) {
-      cudaCheck(allocator::getCachingManagedAllocator().ManagedFree(ptr));
-    //} else {
-    //  cudaCheck(cudaFree(ptr));
-    //}
-  }
-
-  void *allocate_device(int dev, size_t nbytes, cudaStream_t stream) {
-    void *ptr = nullptr;
-    if (nbytes > maxAllocationSize) {
-      std::cout<<"at stream"<<stream<<std::endl;
-      throw std::runtime_error("alloate_device : Tried to allocate " + std::to_string(nbytes) +
-                               " bytes, but the allocator maximum is " + std::to_string(maxAllocationSize));
-    }
-    cudaCheck(allocator::getCachingDeviceAllocator().DeviceAllocate(dev, &ptr, nbytes, stream));
-    return ptr;
-  }
-
-  void free_device(int device, void *ptr) {
-    cudaCheck(allocator::getCachingDeviceAllocator().DeviceFree(device, ptr));
-  }
-
-  void *allocate_host(size_t nbytes, cudaStream_t stream) {
-    void *ptr = nullptr;
-    if (nbytes > maxAllocationSize) {
-      throw std::runtime_error("allocate_host: Tried to allocate " + std::to_string(nbytes) +
-                               " bytes, but the allocator maximum is " + std::to_string(maxAllocationSize));
-    }
-    cudaCheck(allocator::getCachingHostAllocator().HostAllocate(&ptr, nbytes, stream));
-    return ptr;
-  }
-
-  void free_host(void *ptr) {
-    cudaCheck(allocator::getCachingHostAllocator().HostFree(ptr));
-  }
-
-}  // namespace cms::cuda
diff --git a/SDL/allocate.h b/SDL/allocate.h
deleted file mode 100644
index 2a3698de..00000000
--- a/SDL/allocate.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef HeterogeneousCore_CUDAUtilities_allocate_managed_h
-#define HeterogeneousCore_CUDAUtilities_allocate_managed_h
-
-#include <cuda_runtime.h>
-
-namespace cms {
-  namespace cuda {
-    // Allocate managed memory (to be called from unique_ptr)
-    //void *allocate_managed(unsigned int nbytes, cudaStream_t stream);
-    void *allocate_managed(size_t nbytes, cudaStream_t stream);
-    void *allocate_device(int dev, size_t nbytes, cudaStream_t stream);
-    void *allocate_host(size_t nbytes, cudaStream_t stream);
-
-    // Free managed memory (to be called from unique_ptr)
-    void free_managed(void *ptr);
-    void free_device(int dev, void *ptr);
-    void free_host(void *ptr);
-  }  // namespace cuda
-}  // namespace cms
-
-#endif
diff --git a/SDL/cudaCheck.h b/SDL/cudaCheck.h
deleted file mode 100644
index 821bfcff..00000000
--- a/SDL/cudaCheck.h
+++ /dev/null
@@ -1,61 +0,0 @@
-#ifndef HeterogeneousCore_CUDAUtilities_cudaCheck_h
-#define HeterogeneousCore_CUDAUtilities_cudaCheck_h
-
-// C++ standard headers
-#include <iostream>
-#include <sstream>
-#include <stdexcept>
-
-// CUDA headers
-#include <cuda.h>
-#include <cuda_runtime.h>
-
-namespace cms {
-  namespace cuda {
-
-    [[noreturn]] inline void abortOnCudaError(const char* file,
-                                              int line,
-                                              const char* cmd,
-                                              const char* error,
-                                              const char* message,
-                                              const char* description = nullptr) {
-      std::ostringstream out;
-      out << "\n";
-      out << file << ", line " << line << ":\n";
-      out << "cudaCheck(" << cmd << ");\n";
-      out << error << ": " << message << "\n";
-      if (description)
-        out << description << "\n";
-      throw std::runtime_error(out.str());
-    }
-
-    inline bool cudaCheck_(
-        const char* file, int line, const char* cmd, CUresult result, const char* description = nullptr) {
-      if (result == CUDA_SUCCESS)
-        return true;
-
-      const char* error;
-      const char* message;
-      cuGetErrorName(result, &error);
-      cuGetErrorString(result, &message);
-      abortOnCudaError(file, line, cmd, error, message, description);
-      return false;
-    }
-
-    inline bool cudaCheck_(
-        const char* file, int line, const char* cmd, cudaError_t result, const char* description = nullptr) {
-      if (result == cudaSuccess)
-        return true;
-
-      const char* error = cudaGetErrorName(result);
-      const char* message = cudaGetErrorString(result);
-      abortOnCudaError(file, line, cmd, error, message, description);
-      return false;
-    }
-
-  }  // namespace cuda
-}  // namespace cms
-
-#define cudaCheck(ARG, ...) (cms::cuda::cudaCheck_(__FILE__, __LINE__, #ARG, (ARG), ##__VA_ARGS__))
-
-#endif  // HeterogeneousCore_CUDAUtilities_cudaCheck_h
diff --git a/SDL/deviceCount.h b/SDL/deviceCount.h
deleted file mode 100644
index 407f6093..00000000
--- a/SDL/deviceCount.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef HeterogenousCore_CUDAUtilities_deviceCount_h
-#define HeterogenousCore_CUDAUtilities_deviceCount_h
-
-#include "cudaCheck.h"
-
-#include <cuda_runtime.h>
-
-namespace cms {
-  namespace cuda {
-    inline int deviceCount() {
-      int ndevices;
-      cudaCheck(cudaGetDeviceCount(&ndevices));
-      return ndevices;
-    }
-  }  // namespace cuda
-}  // namespace cms
-
-#endif
diff --git a/SDL/getCachingAllocator.h b/SDL/getCachingAllocator.h
deleted file mode 100644
index c81afecd..00000000
--- a/SDL/getCachingAllocator.h
+++ /dev/null
@@ -1,75 +0,0 @@
-#ifndef HeterogeneousCore_CUDACore_src_getCachingAllocator
-#define HeterogeneousCore_CUDACore_src_getCachingAllocator
-
-#include "cudaCheck.h"
-#include "deviceCount.h"
-#include "CachingDeviceAllocator.h"
-#include "CachingManagedAllocator.h"
-#include "CachingHostAllocator.h"
-
-namespace cms::cuda::allocator {
-  // Use caching or not
-  constexpr bool useCaching = true;
-  // Growth factor (bin_growth in cub::CachingDeviceAllocator
-  constexpr unsigned int binGrowth = 2;//9;
-  // Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CacingDeviceAllocator
-  constexpr unsigned int minBin = 8;//1;
-  // Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail.
-  constexpr unsigned int maxBin = 30;//10;
-  // Total storage for the allocator. 0 means no limit.
-  constexpr size_t maxCachedBytes = 0;
-  // Fraction of total device memory taken for the allocator. In case there are multiple devices with different amounts of memory, the smallest of them is taken. If maxCachedBytes is non-zero, the smallest of them is taken.
-  constexpr double maxCachedFraction = 0.8;
-  constexpr bool debug = false;
-
-  inline size_t minCachedBytes() {
-    size_t ret = std::numeric_limits<size_t>::max();
-    int currentDevice;
-    cudaCheck(cudaGetDevice(&currentDevice));
-    const int numberOfDevices = deviceCount();
-    for (int i = 0; i < numberOfDevices; ++i) {
-      size_t freeMemory, totalMemory;
-      cudaCheck(cudaSetDevice(i));
-      cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
-      ret = std::min(ret, static_cast<size_t>(maxCachedFraction * freeMemory));
-    }
-    cudaCheck(cudaSetDevice(currentDevice));
-    if (maxCachedBytes > 0) {
-      ret = std::min(ret, maxCachedBytes);
-    }
-    return ret;
-  }
-
-  inline notcub::CachingDeviceAllocator& getCachingDeviceAllocator() {
-    // the public interface is thread safe
-    static notcub::CachingDeviceAllocator allocator{binGrowth,
-                                                    minBin,
-                                                    maxBin,
-                                                    minCachedBytes(),
-                                                    false,  // do not skip cleanup
-                                                    debug};
-    return allocator;
-  }
-
- inline notcub::CachingManagedAllocator& getCachingManagedAllocator() {
-    static notcub::CachingManagedAllocator allocator{binGrowth,
-                                                     minBin,
-                                                     maxBin,
-                                                     minCachedBytes(),
-                                                     false,  // do not skip cleanup
-                                                     debug};
-    return allocator;
-  }
-
-  inline notcub::CachingHostAllocator& getCachingHostAllocator() {
-      static notcub::CachingHostAllocator allocator{binGrowth,
-                                                    minBin,
-                                                    maxBin,
-                                                    minCachedBytes(),
-                                                    false,  // do not skip cleanup
-                                                    debug};
-      return allocator;
-    }
-}  // namespace cms::cuda::allocator
-
-#endif
diff --git a/bin/sdl.cc b/bin/sdl.cc
index beb15dc0..74ff1fd4 100644
--- a/bin/sdl.cc
+++ b/bin/sdl.cc
@@ -506,8 +506,6 @@ void run_sdl()
 
     printTimingInformation(timevec, full_elapsed, avg_elapsed);
 
-    SDL::cleanModules();
-
     if (ana.do_write_ntuple)
     {
         // Writing ttree output to file
diff --git a/code/core/AccessHelper.cc b/code/core/AccessHelper.cc
index f139982b..eaa33df4 100644
--- a/code/core/AccessHelper.cc
+++ b/code/core/AccessHelper.cc
@@ -31,7 +31,7 @@ std::vector<unsigned int> getPixelHitsFrompLS(SDL::Event* event, unsigned int pL
     SDL::segmentsBuffer<alpaka::DevCpu>& segments_ = *(event->getSegments());
     SDL::miniDoubletsBuffer<alpaka::DevCpu>& miniDoublets_ = *(event->getMiniDoublets());
     SDL::objectRangesBuffer<alpaka::DevCpu>& rangesInGPU = (*event->getRanges());
-    SDL::modules& modulesInGPU = (*event->getModules());
+    SDL::modulesBuffer<alpaka::DevCpu>& modulesInGPU = (*event->getModules());
     const unsigned int pLS_offset = rangesInGPU.segmentModuleIndices[*(modulesInGPU.nLowerModules)];
     unsigned int MD_1 = segments_.mdIndices[2 * (pLS + pLS_offset)];
     unsigned int MD_2 = segments_.mdIndices[2 * (pLS + pLS_offset) + 1];
@@ -243,7 +243,7 @@ unsigned int getPixelLSFrompT3(SDL::Event* event, unsigned int pT3)
 {
     SDL::pixelTripletsBuffer<alpaka::DevCpu>& pixelTriplets_ = *(event->getPixelTriplets());
     SDL::objectRangesBuffer<alpaka::DevCpu>& rangesInGPU = (*event->getRanges());
-    SDL::modules& modulesInGPU = (*event->getModules());
+    SDL::modulesBuffer<alpaka::DevCpu>& modulesInGPU = (*event->getModules());
     const unsigned int pLS_offset = rangesInGPU.segmentModuleIndices[*(modulesInGPU.nLowerModules)];
     return pixelTriplets_.pixelSegmentIndices[pT3] - pLS_offset;
 }
@@ -343,7 +343,7 @@ unsigned int getPixelLSFrompT5(SDL::Event* event, unsigned int pT5)
 {
     SDL::pixelQuintupletsBuffer<alpaka::DevCpu>& pixelQuintuplets_ = *(event->getPixelQuintuplets());
     SDL::objectRangesBuffer<alpaka::DevCpu>& rangesInGPU = (*event->getRanges());
-    SDL::modules& modulesInGPU = (*event->getModules());
+    SDL::modulesBuffer<alpaka::DevCpu>& modulesInGPU = (*event->getModules());
     const unsigned int pLS_offset = rangesInGPU.segmentModuleIndices[*(modulesInGPU.nLowerModules)];
     return pixelQuintuplets_.pixelIndices[pT5] - pLS_offset;
 }
diff --git a/code/core/write_sdl_ntuple.cc b/code/core/write_sdl_ntuple.cc
index 33f90068..eb519b9e 100644
--- a/code/core/write_sdl_ntuple.cc
+++ b/code/core/write_sdl_ntuple.cc
@@ -308,7 +308,7 @@ void setPixelQuintupletOutputBranches(SDL::Event* event)
     SDL::pixelQuintupletsBuffer<alpaka::DevCpu>& pixelQuintupletsInGPU = (*event->getPixelQuintuplets());
     SDL::quintupletsBuffer<alpaka::DevCpu>& quintupletsInGPU = (*event->getQuintuplets());
     SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
-    SDL::modules& modulesInGPU = (*event->getModules());
+    SDL::modulesBuffer<alpaka::DevCpu>& modulesInGPU = (*event->getModules());
     int n_accepted_simtrk = ana.tx->getBranch<vector<int>>("sim_TC_matched").size();
 
     const float kRinv1GeVf = (2.99792458e-3 * 3.8);
@@ -393,7 +393,7 @@ void setQuintupletOutputBranches(SDL::Event* event)
 {
     SDL::quintupletsBuffer<alpaka::DevCpu>& quintupletsInGPU = (*event->getQuintuplets());
     SDL::objectRangesBuffer<alpaka::DevCpu>& rangesInGPU = (*event->getRanges());
-    SDL::modules& modulesInGPU = (*event->getModules());
+    SDL::modulesBuffer<alpaka::DevCpu>& modulesInGPU = (*event->getModules());
     const float kRinv1GeVf = (2.99792458e-3 * 3.8);
     int n_accepted_simtrk = ana.tx->getBranch<vector<int>>("sim_TC_matched").size();
 
@@ -475,7 +475,7 @@ void setPixelTripletOutputBranches(SDL::Event* event)
 {
     SDL::pixelTripletsBuffer<alpaka::DevCpu>& pixelTripletsInGPU = (*event->getPixelTriplets());
     SDL::tripletsBuffer<alpaka::DevCpu>& tripletsInGPU = *(event->getTriplets());
-    SDL::modules& modulesInGPU = *(event->getModules());
+    SDL::modulesBuffer<alpaka::DevCpu>& modulesInGPU = *(event->getModules());
     SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = *(event->getSegments());
     SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = *(event->getHits());
     int n_accepted_simtrk = ana.tx->getBranch<vector<int>>("sim_TC_matched").size();
@@ -562,7 +562,7 @@ void setGnnNtupleBranches(SDL::Event* event)
     SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::miniDoubletsBuffer<alpaka::DevCpu>& miniDoubletsInGPU = (*event->getMiniDoublets());
     SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
-    SDL::modules& modulesInGPU = (*event->getModules());
+    SDL::modulesBuffer<alpaka::DevCpu>& modulesInGPU = (*event->getModules());
     SDL::objectRangesBuffer<alpaka::DevCpu>& rangesInGPU = (*event->getRanges());
     SDL::trackCandidatesBuffer<alpaka::DevCpu>& trackCandidatesInGPU = (*event->getTrackCandidates());
 
@@ -1106,7 +1106,7 @@ float computeRadiusFromThreeAnchorHits(float x1, float y1, float x2, float y2, f
 //________________________________________________________________________________________________________________________________
 void printHitMultiplicities(SDL::Event* event)
 {
-    SDL::modules& modulesInGPU = (*event->getModules());
+    SDL::modulesBuffer<alpaka::DevCpu>& modulesInGPU = (*event->getModules());
     SDL::objectRangesBuffer<alpaka::DevCpu>& rangesInGPU = (*event->getRanges());
 
     int nHits = 0;
@@ -1122,7 +1122,7 @@ void printHitMultiplicities(SDL::Event* event)
 void printMiniDoubletMultiplicities(SDL::Event* event)
 {
     SDL::miniDoubletsBuffer<alpaka::DevCpu>& miniDoubletsInGPU = (*event->getMiniDoublets());
-    SDL::modules& modulesInGPU = (*event->getModules());
+    SDL::modulesBuffer<alpaka::DevCpu>& modulesInGPU = (*event->getModules());
 
     int nMiniDoublets = 0;
     int totOccupancyMiniDoublets = 0;
@@ -1152,7 +1152,7 @@ void printMDs(SDL::Event* event)
 {
     SDL::miniDoubletsBuffer<alpaka::DevCpu>& miniDoubletsInGPU = (*event->getMiniDoublets());
     SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
-    SDL::modules& modulesInGPU = (*event->getModules());
+    SDL::modulesBuffer<alpaka::DevCpu>& modulesInGPU = (*event->getModules());
     SDL::objectRangesBuffer<alpaka::DevCpu>& rangesInGPU = (*event->getRanges());
 
     // Then obtain the lower module index
@@ -1176,7 +1176,7 @@ void printLSs(SDL::Event* event)
     SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::miniDoubletsBuffer<alpaka::DevCpu>& miniDoubletsInGPU = (*event->getMiniDoublets());
     SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
-    SDL::modules& modulesInGPU = (*event->getModules());
+    SDL::modulesBuffer<alpaka::DevCpu>& modulesInGPU = (*event->getModules());
     SDL::objectRangesBuffer<alpaka::DevCpu>& rangesInGPU = (*event->getRanges());
 
     int nSegments = 0;
@@ -1209,7 +1209,7 @@ void printpLSs(SDL::Event* event)
     SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::miniDoubletsBuffer<alpaka::DevCpu>& miniDoubletsInGPU = (*event->getMiniDoublets());
     SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
-    SDL::modules& modulesInGPU = (*event->getModules());
+    SDL::modulesBuffer<alpaka::DevCpu>& modulesInGPU = (*event->getModules());
     SDL::objectRangesBuffer<alpaka::DevCpu>& rangesInGPU = (*event->getRanges());
 
     unsigned int i = *(modulesInGPU.nLowerModules);
@@ -1240,7 +1240,7 @@ void printT3s(SDL::Event* event)
     SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::miniDoubletsBuffer<alpaka::DevCpu>& miniDoubletsInGPU = (*event->getMiniDoublets());
     SDL::hitsBuffer<alpaka::DevCpu>& hitsInGPU = (*event->getHits());
-    SDL::modules& modulesInGPU = (*event->getModules());
+    SDL::modulesBuffer<alpaka::DevCpu>& modulesInGPU = (*event->getModules());
     int nTriplets = 0;
     for (unsigned int i = 0; i <  *(modulesInGPU.nLowerModules); ++i)
     {
@@ -1282,7 +1282,7 @@ void debugPrintOutlierMultiplicities(SDL::Event* event)
     SDL::tripletsBuffer<alpaka::DevCpu>& tripletsInGPU = (*event->getTriplets());
     SDL::segmentsBuffer<alpaka::DevCpu>& segmentsInGPU = (*event->getSegments());
     SDL::miniDoubletsBuffer<alpaka::DevCpu>& miniDoubletsInGPU = (*event->getMiniDoublets());
-    SDL::modules& modulesInGPU = (*event->getModules());
+    SDL::modulesBuffer<alpaka::DevCpu>& modulesInGPU = (*event->getModules());
     SDL::objectRangesBuffer<alpaka::DevCpu>& rangesInGPU = (*event->getRanges());
     //int nTrackCandidates = 0;
     for (unsigned int idx = 0; idx <= *(modulesInGPU.nLowerModules); ++idx)

From 68b7b7b065f76982c7c01798ebd7577c2a4be00c Mon Sep 17 00:00:00 2001
From: Gavin Niendorf <gavinniendorf@gmail.com>
Date: Tue, 27 Jun 2023 13:22:48 -0700
Subject: [PATCH 30/44] first working alpaka everything

---
 Makefile             | 16 ++++----
 SDL/Constants.cuh    | 29 +++++++++++---
 SDL/Event.cu         | 95 ++++++++++++++++++++++++++++++++------------
 SDL/Event.cuh        |  5 +--
 SDL/LST.cc           |  5 +--
 SDL/LST.h            |  3 +-
 SDL/Makefile         | 27 +++++--------
 SDL/Module.cuh       | 16 +++++---
 SDL/PixelTriplet.cuh | 12 +++---
 SDL/Quintuplet.cuh   |  6 +--
 bin/sdl.cc           |  6 +--
 setup_cgpu.sh        |  6 +--
 12 files changed, 139 insertions(+), 87 deletions(-)

diff --git a/Makefile b/Makefile
index 44c520d9..c57369d3 100644
--- a/Makefile
+++ b/Makefile
@@ -9,19 +9,19 @@ SOURCES=$(wildcard code/core/*.cc)  #$(wildcard SDL/*.cc)
 OBJECTS=$(SOURCES:.cc=.o) $(wildcard ${TRACKLOOPERDIR}/SDL/libsdl.so)
 HEADERS=$(SOURCES:.cc=.h)
 
-CC          = nvcc
-CXX         = nvcc
-CXXFLAGS    = -g -O2 --compiler-options -Wall --compiler-options -fPIC --compiler-options -Wshadow --compiler-options -Woverloaded-virtual -G -lineinfo  -fopenmp -lgomp --default-stream per-thread
+CC          = g++
+CXX         = g++
+CXXFLAGS    = -g -O2 -Wall -fPIC -std=c++17 -Wshadow -Woverloaded-virtual -lineinfo  -fopenmp -lgomp --default-stream per-thread
 LD          = g++
 LDFLAGS     = -g -O2 -Wall -fPIC -Wshadow -Woverloaded-virtual -I/mnt/data1/dsr/cub
 SOFLAGS     = -g -shared
 CXXFLAGS    = -g -O2 -Wall -fPIC -Wshadow -Woverloaded-virtual
 LDFLAGS     = -g -O2
 ROOTLIBS    = $(shell root-config --libs)
-ROOTCFLAGS  = $(foreach option, $(shell root-config --cflags), --compiler-options $(option))
-ALPAKAINCLUDE = -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
-CFLAGS      = $(ROOTCFLAGS) --compiler-options -Wall --compiler-options -Wno-unused-function --compiler-options -g --compiler-options -O2 --compiler-options -fPIC --compiler-options -fno-var-tracking -ISDL -I$(shell pwd) -Icode  -Icode/core -I/mnt/data1/dsr/cub -I${CUDA_HOME}/include --compiler-options -fopenmp
-EXTRACFLAGS = $(shell rooutil-config)
+ROOTCFLAGS  = $(foreach option, $(shell root-config --cflags), $(option))
+ALPAKAINCLUDE = -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED -DALPAKA_DEBUG=0
+CFLAGS      = $(ROOTCFLAGS)  -Wall  -Wno-unused-function  -g  -O2  -fPIC  -fno-var-tracking -ISDL -I$(shell pwd) -Icode  -Icode/core -I/mnt/data1/dsr/cub -I${CUDA_HOME}/include  -fopenmp
+EXTRACFLAGS = $(shell rooutil-config) -g
 EXTRAFLAGS  = -fPIC -ITMultiDrawTreePlayer -Wunused-variable -lTMVA -lEG -lGenVector -lXMLIO -lMLP -lTreePlayer -L${CUDA_HOME}/lib64 -lcudart -fopenmp
 DOQUINTUPLET = -DFP16_Base -DFP16_dPhi #-DFP16_circle -DFP16_seg -DFP16_T5 #-DDO_QUINTUPLET #-DDO_QUADRUPLET
 PT0P8       =
@@ -53,7 +53,7 @@ bin/sdl: bin/sdl.o $(OBJECTS)
 	$(LD) $(PT0P8) $(T3T3EXTENSION) $(LDFLAGS) $^ $(ROOTLIBS) $(EXTRACFLAGS) $(CUTVALUEFLAG) $(PRIMITIVEFLAG) $(EXTRAFLAGS) $(DOQUINTUPLET) $(ALPAKAINCLUDE) -o $@
 
 %.o: %.cc
-	$(CC) $(PT0P8) $(T3T3EXTENSION) $(CFLAGS) $(EXTRACFLAGS) $(CUTVALUEFLAG) $(PRIMITIVEFLAG) $(DOQUINTUPLET) $(ALPAKAINCLUDE) $< -dc -o $@
+	$(CC) $(PT0P8) $(T3T3EXTENSION) $(CFLAGS) $(EXTRACFLAGS) $(CUTVALUEFLAG) $(PRIMITIVEFLAG) $(DOQUINTUPLET) $(ALPAKAINCLUDE) $< -c -o $@
 
 $(ROOUTIL):
 	$(MAKE) -C code/rooutil/
diff --git a/SDL/Constants.cuh b/SDL/Constants.cuh
index 49412d68..606fe7f0 100644
--- a/SDL/Constants.cuh
+++ b/SDL/Constants.cuh
@@ -1,14 +1,17 @@
 #ifndef Constants_cuh
 #define Constants_cuh
 
-#include <cuda_fp16.h>
 #include <alpaka/alpaka.hpp>
 
 // CUDA headers. Will be removed soon.
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
 #include <cuda.h>
+#include <cuda_fp16.h>
 #include <cuda_runtime.h>
+#endif
 
-#ifdef FP16_Base //This changes pT5 and pT3 and T3 completely. T5 for non regression parameters
+//This changes pT5 and pT3 and T3 completely. T5 for non regression parameters
+#if defined(FP16_Base) && defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
 #define __F2H __float2half  
 #define __H2F __half2float  
 typedef __half FPX;
@@ -17,7 +20,8 @@ typedef __half FPX;
 #define __H2F
 typedef float FPX; 
 #endif
-#ifdef FP16_T5 // changes T5 regression values
+
+#if defined(FP16_T5) && defined(ALPAKA_ACC_GPU_CUDA_ENABLED) // changes T5 regression values
 #define __F2H_T5 __float2half  
 #define __H2F_T5 __half2float  
 typedef __half FPX_T5;
@@ -26,7 +30,8 @@ typedef __half FPX_T5;
 #define __H2F_T5
 typedef float FPX_T5; 
 #endif
-#ifdef FP16_dPhi // changes segment dPhi values
+
+#if defined(FP16_dPhi) && defined(ALPAKA_ACC_GPU_CUDA_ENABLED) // changes segment dPhi values
 #define __F2H_dPhi __float2half  
 #define __H2F_dPhi __half2float  
 typedef __half FPX_dPhi;
@@ -35,7 +40,8 @@ typedef __half FPX_dPhi;
 #define __H2F_dPhi
 typedef float FPX_dPhi; 
 #endif
-#ifdef FP16_circle // changes segment circle values
+
+#if defined(FP16_circle) && defined(ALPAKA_ACC_GPU_CUDA_ENABLED) // changes segment circle values
 #define __F2H_circle __float2half  
 #define __H2F_circle __half2float  
 typedef __half FPX_circle;
@@ -44,7 +50,8 @@ typedef __half FPX_circle;
 #define __H2F_circle
 typedef float FPX_circle; 
 #endif
-#ifdef FP16_seg // changes segment values
+
+#if defined(FP16_seg) && defined(ALPAKA_ACC_GPU_CUDA_ENABLED)  // changes segment values
 #define __F2H_seg __float2half  
 #define __H2F_seg __half2float  
 typedef __half FPX_seg;
@@ -78,6 +85,16 @@ Vec const elementsPerThread(Vec::all(static_cast<Idx>(1)));
     using Acc = alpaka::AccCpuSerial<Dim, Idx>;
 #endif
 
+#ifndef ALPAKA_ACC_GPU_CUDA_ENABLED  // Compiled only when the alpaka CUDA backend macro is not defined.
+struct uint4  // Aggregate of four unsigned ints; presumably a stand-in for CUDA's built-in uint4 in non-CUDA builds -- TODO confirm.
+{
+    unsigned int x;  // first component
+    unsigned int y;  // second component
+    unsigned int z;  // third component
+    unsigned int w;  // fourth component
+};  // NOTE(review): no explicit alignment is requested here; verify callers do not rely on a 16-byte-aligned layout.
+#endif  // !ALPAKA_ACC_GPU_CUDA_ENABLED
+
 auto const devHost = alpaka::getDevByIdx<alpaka::DevCpu>(0u);
 auto const devAcc = alpaka::getDevByIdx<Acc>(0u);
 using QueueAcc = alpaka::Queue<Acc, QueueProperty>;
diff --git a/SDL/Event.cu b/SDL/Event.cu
index eab44436..aee9e862 100644
--- a/SDL/Event.cu
+++ b/SDL/Event.cu
@@ -6,9 +6,8 @@ std::shared_ptr<SDL::pixelMap> SDL::pixelMapping = std::make_shared<pixelMap>();
 uint16_t SDL::nModules;
 uint16_t SDL::nLowerModules;
 
-SDL::Event::Event(cudaStream_t estream, bool verbose): queue(alpaka::getDevByIdx<Acc>(0u))
+SDL::Event::Event(bool verbose): queue(alpaka::getDevByIdx<Acc>(0u))
 {
-    stream = estream;
     addObjects = verbose;
     hitsInGPU = nullptr;
     mdsInGPU = nullptr;
@@ -152,7 +151,6 @@ void SDL::Event::resetEvent()
 
 void SDL::initModules(const char* moduleMetaDataFilePath)
 {
-    cudaStream_t default_stream = 0;
     QueueAcc queue(devAcc);
 
     // Set the relevant data pointers.
@@ -164,7 +162,6 @@ void SDL::initModules(const char* moduleMetaDataFilePath)
                         nModules,
                         nLowerModules,
                         *pixelMapping,
-                        default_stream,
                         queue,
                         moduleMetaDataFilePath);
 }
@@ -243,12 +240,20 @@ void SDL::Event::addHitToEvent(std::vector<float> x, std::vector<float> y, std::
 void SDL::Event::addPixelSegmentToEvent(std::vector<unsigned int> hitIndices0,std::vector<unsigned int> hitIndices1,std::vector<unsigned int> hitIndices2,std::vector<unsigned int> hitIndices3, std::vector<float> dPhiChange, std::vector<float> ptIn, std::vector<float> ptErr, std::vector<float> px, std::vector<float> py, std::vector<float> pz, std::vector<float> eta, std::vector<float> etaErr, std::vector<float> phi, std::vector<int> charge, std::vector<unsigned int> seedIdx, std::vector<int> superbin, std::vector<int8_t> pixelType, std::vector<char> isQuad)
 {
     const int size = ptIn.size();
-    unsigned int mdSize = 2 * size;
+    int mdSize = 2 * size;
     uint16_t pixelModuleIndex = (*detIdToIndex)[1];
 
     if(mdsInGPU == nullptr)
     {
-        cudaMemsetAsync(&rangesInGPU->miniDoubletModuleOccupancy[nLowerModules],N_MAX_PIXEL_MD_PER_MODULES, sizeof(unsigned int),stream);
+        // Create a view for the element nLowerModules inside rangesBuffers->miniDoubletModuleOccupancy
+        auto dst_view_miniDoubletModuleOccupancy = alpaka::createSubView(rangesBuffers->miniDoubletModuleOccupancy_buf, (Idx) 1u, (Idx) nLowerModules);
+
+        // Create a source view for the value to be set
+        int value = N_MAX_PIXEL_MD_PER_MODULES;
+        auto src_view_value = alpaka::createView(devHost, &value, (Idx) 1u);
+
+        alpaka::memcpy(queue, dst_view_miniDoubletModuleOccupancy, src_view_value);
+        alpaka::wait(queue);
 
         Vec const threadsPerBlockCreateMD(static_cast<Idx>(1), static_cast<Idx>(1), static_cast<Idx>(1024));
         Vec const blocksPerGridCreateMD(static_cast<Idx>(1), static_cast<Idx>(1), static_cast<Idx>(1));
@@ -265,16 +270,19 @@ void SDL::Event::addPixelSegmentToEvent(std::vector<unsigned int> hitIndices0,st
         alpaka::wait(queue);
 
         unsigned int nTotalMDs;
-        cudaMemcpyAsync(&nTotalMDs,rangesInGPU->device_nTotalMDs,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream);
-        cudaStreamSynchronize(stream);
+        auto nTotalMDs_view = alpaka::createView(devHost, &nTotalMDs, (Idx) 1u);
+
+        alpaka::memcpy(queue, nTotalMDs_view, rangesBuffers->device_nTotalMDs_buf);
+        alpaka::wait(queue);
+
         nTotalMDs += N_MAX_PIXEL_MD_PER_MODULES;
 
         mdsInGPU = new SDL::miniDoublets();
         miniDoubletsBuffers = new SDL::miniDoubletsBuffer<Acc>(nTotalMDs, nLowerModules, devAcc, queue);
         mdsInGPU->setData(*miniDoubletsBuffers);
 
-        cudaMemcpyAsync(mdsInGPU->nMemoryLocations, &nTotalMDs, sizeof(unsigned int), cudaMemcpyHostToDevice, stream);
-        cudaStreamSynchronize(stream);
+        alpaka::memcpy(queue, miniDoubletsBuffers->nMemoryLocations_buf, nTotalMDs_view);
+        alpaka::wait(queue);
     }
     if(segmentsInGPU == nullptr)
     {
@@ -296,16 +304,19 @@ void SDL::Event::addPixelSegmentToEvent(std::vector<unsigned int> hitIndices0,st
         alpaka::enqueue(queue, createSegmentArrayRangesTask);
         alpaka::wait(queue);
 
-        cudaMemcpyAsync(&nTotalSegments,rangesInGPU->device_nTotalSegs,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream);
-        cudaStreamSynchronize(stream);
+        auto nTotalSegments_view = alpaka::createView(devHost, &nTotalSegments, (Idx) 1u);
+
+        alpaka::memcpy(queue, nTotalSegments_view, rangesBuffers->device_nTotalSegs_buf);
+        alpaka::wait(queue);
+
         nTotalSegments += N_MAX_PIXEL_SEGMENTS_PER_MODULE;
 
         segmentsInGPU = new SDL::segments();
         segmentsBuffers = new SDL::segmentsBuffer<Acc>(nTotalSegments, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE, devAcc, queue);
         segmentsInGPU->setData(*segmentsBuffers);
 
-        cudaMemcpyAsync(segmentsInGPU->nMemoryLocations, &nTotalSegments, sizeof(unsigned int), cudaMemcpyHostToDevice, stream);;
-        cudaStreamSynchronize(stream);
+        alpaka::memcpy(queue, segmentsBuffers->nMemoryLocations_buf, nTotalSegments_view);
+        alpaka::wait(queue);
     }
 
     auto hitIndices0_dev = allocBufWrapper<unsigned int>(devAcc, size);
@@ -334,11 +345,22 @@ void SDL::Event::addPixelSegmentToEvent(std::vector<unsigned int> hitIndices0,st
     alpaka::memcpy(queue, segmentsBuffers->superbin_buf, superbin, size);
     alpaka::memcpy(queue, segmentsBuffers->pixelType_buf, pixelType, size);
 
-    cudaMemcpyAsync(&(segmentsInGPU->nSegments)[pixelModuleIndex], &size, sizeof(int), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(&(segmentsInGPU->totOccupancySegments)[pixelModuleIndex], &size, sizeof(int), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(&(mdsInGPU->nMDs)[pixelModuleIndex], &mdSize, sizeof(unsigned int), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(&(mdsInGPU->totOccupancyMDs)[pixelModuleIndex], &mdSize, sizeof(unsigned int), cudaMemcpyHostToDevice, stream);
-    cudaStreamSynchronize(stream);
+    // Create source views for size and mdSize
+    auto src_view_size = alpaka::createView(devHost, &size, (Idx) 1u);
+    auto src_view_mdSize = alpaka::createView(devHost, &mdSize, (Idx) 1u);
+
+    auto dst_view_segments = alpaka::createSubView(segmentsBuffers->nSegments_buf, (Idx) 1u, (Idx) pixelModuleIndex);
+    alpaka::memcpy(queue, dst_view_segments, src_view_size);
+
+    auto dst_view_totOccupancySegments = alpaka::createSubView(segmentsBuffers->totOccupancySegments_buf, (Idx) 1u, (Idx) pixelModuleIndex);
+    alpaka::memcpy(queue, dst_view_totOccupancySegments, src_view_size);
+
+    auto dst_view_nMDs = alpaka::createSubView(miniDoubletsBuffers->nMDs_buf, (Idx) 1u, (Idx) pixelModuleIndex);
+    alpaka::memcpy(queue, dst_view_nMDs, src_view_mdSize);
+
+    auto dst_view_totOccupancyMDs = alpaka::createSubView(miniDoubletsBuffers->totOccupancyMDs_buf, (Idx) 1u, (Idx) pixelModuleIndex);
+    alpaka::memcpy(queue, dst_view_totOccupancyMDs, src_view_mdSize);
+
     alpaka::wait(queue);
 
     Vec const threadsPerBlock(static_cast<Idx>(1), static_cast<Idx>(1), static_cast<Idx>(256));
@@ -381,6 +403,7 @@ void SDL::Event::addMiniDoubletsToEventExplicit()
     alpaka::memcpy(queue, module_hitRanges_buf, hitsBuffers->hitRanges_buf, nLowerModules*2);
 
     alpaka::wait(queue);
+
     int* nMDsCPU = alpaka::getPtrNative(nMDsCPU_buf);
     short* module_subdets = alpaka::getPtrNative(module_subdets_buf);
     short* module_layers = alpaka::getPtrNative(module_layers_buf);
@@ -398,7 +421,6 @@ void SDL::Event::addMiniDoubletsToEventExplicit()
             {
                 n_minidoublets_by_layer_endcap_[module_layers[i] - 1] += nMDsCPU[i];
             }
-
         }
     }
 }
@@ -415,6 +437,7 @@ void SDL::Event::addSegmentsToEventExplicit()
     alpaka::memcpy(queue, module_layers_buf, modulesBuffers->layers_buf, nLowerModules);
 
     alpaka::wait(queue);
+
     int* nSegmentsCPU = alpaka::getPtrNative(nSegmentsCPU_buf);
     short* module_subdets = alpaka::getPtrNative(module_subdets_buf);
     short* module_layers = alpaka::getPtrNative(module_layers_buf);
@@ -437,7 +460,15 @@ void SDL::Event::addSegmentsToEventExplicit()
 
 void SDL::Event::createMiniDoublets()
 {
-    cudaMemsetAsync(&rangesInGPU->miniDoubletModuleOccupancy[nLowerModules],N_MAX_PIXEL_MD_PER_MODULES, sizeof(unsigned int),stream);
+    // Create a view for the element nLowerModules inside rangesBuffers->miniDoubletModuleOccupancy
+    auto dst_view_miniDoubletModuleOccupancy = alpaka::createSubView(rangesBuffers->miniDoubletModuleOccupancy_buf, (Idx) 1u, (Idx) nLowerModules);
+
+    // Create a source view for the value to be set
+    int value = N_MAX_PIXEL_MD_PER_MODULES;
+    auto src_view_value = alpaka::createView(devHost, &value, (Idx) 1u);
+
+    alpaka::memcpy(queue, dst_view_miniDoubletModuleOccupancy, src_view_value);
+    alpaka::wait(queue);
 
     Vec const threadsPerBlockCreateMD(static_cast<Idx>(1), static_cast<Idx>(1), static_cast<Idx>(1024));
     Vec const blocksPerGridCreateMD(static_cast<Idx>(1), static_cast<Idx>(1), static_cast<Idx>(1));
@@ -679,7 +710,7 @@ void SDL::Event::createTrackCandidates()
 
     Vec const threadsPerBlock_crossCleanpT3(static_cast<Idx>(1), static_cast<Idx>(16), static_cast<Idx>(64));
     Vec const blocksPerGrid_crossCleanpT3(static_cast<Idx>(1), static_cast<Idx>(4), static_cast<Idx>(20));
-    WorkDiv const crossCleanpT3_workDiv(blocksPerGrid_crossCleanpT3, blocksPerGrid_crossCleanpT3, elementsPerThread);
+    WorkDiv const crossCleanpT3_workDiv(blocksPerGrid_crossCleanpT3, threadsPerBlock_crossCleanpT3, elementsPerThread);
 
     SDL::crossCleanpT3 crossCleanpT3_kernel;
     auto const crossCleanpT3Task(alpaka::createTaskKernel<Acc>(
@@ -811,8 +842,13 @@ void SDL::Event::createPixelTriplets()
         pixelTripletsInGPU->setData(*pixelTripletsBuffers);
     }
 
-    unsigned int nInnerSegments;
-    cudaMemcpyAsync(&nInnerSegments, &(segmentsInGPU->nSegments[nLowerModules]), sizeof(int), cudaMemcpyDeviceToHost,stream);
+    int nInnerSegments;
+    auto nInnerSegments_src_view = alpaka::createView(devHost, &nInnerSegments, (size_t) 1u);
+
+    auto dev_view_nSegments = alpaka::createSubView(segmentsBuffers->nSegments_buf, (Idx) 1u, (Idx) nLowerModules);
+
+    alpaka::memcpy(queue, nInnerSegments_src_view, dev_view_nSegments);
+    alpaka::wait(queue);
 
     auto superbins_buf = allocBufWrapper<int>(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
     auto pixelTypes_buf = allocBufWrapper<int8_t>(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
@@ -1043,8 +1079,14 @@ void SDL::Event::createPixelQuintuplets()
         trackCandidatesInGPU->setData(*trackCandidatesBuffers);
     }
 
-    unsigned int nInnerSegments;
-    cudaMemcpyAsync(&nInnerSegments, &(segmentsInGPU->nSegments[nLowerModules]), sizeof(unsigned int), cudaMemcpyDeviceToHost,stream);
+    int nInnerSegments;
+    auto nInnerSegments_src_view = alpaka::createView(devHost, &nInnerSegments, (size_t) 1u);
+
+    // Create a sub-view for the device buffer
+    auto dev_view_nSegments = alpaka::createSubView(segmentsBuffers->nSegments_buf, (Idx) 1u, (Idx) nLowerModules);
+
+    alpaka::memcpy(queue, nInnerSegments_src_view, dev_view_nSegments);
+    alpaka::wait(queue);
 
     auto superbins_buf = allocBufWrapper<int>(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
     auto pixelTypes_buf = allocBufWrapper<int8_t>(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
@@ -1179,6 +1221,7 @@ void SDL::Event::addQuintupletsToEventExplicit()
     alpaka::memcpy(queue, module_quintupletModuleIndices_buf, rangesBuffers->quintupletModuleIndices_buf, nLowerModules);
 
     alpaka::wait(queue);
+
     int* nQuintupletsCPU = alpaka::getPtrNative(nQuintupletsCPU_buf);
     short* module_subdets = alpaka::getPtrNative(module_subdets_buf);
     short* module_layers = alpaka::getPtrNative(module_layers_buf);
diff --git a/SDL/Event.cuh b/SDL/Event.cuh
index 9b70014f..948f2207 100644
--- a/SDL/Event.cuh
+++ b/SDL/Event.cuh
@@ -18,7 +18,6 @@ namespace SDL
     {
     private:
         QueueAcc queue;
-        cudaStream_t stream;
         bool addObjects;
 
         std::array<unsigned int, 6> n_hits_by_layer_barrel_;
@@ -35,7 +34,7 @@ namespace SDL
         std::array<unsigned int, 5> n_quintuplets_by_layer_endcap_;
 
         //Device stuff
-        int nTotalSegments;
+        unsigned int nTotalSegments;
         struct objectRanges* rangesInGPU;
         struct objectRangesBuffer<Acc>* rangesBuffers;
         struct hits* hitsInGPU;
@@ -71,7 +70,7 @@ namespace SDL
         int* superbinCPU;
         int8_t* pixelTypeCPU;
     public:
-        Event(cudaStream_t estream,bool verbose);
+        Event(bool verbose);
         void resetEvent();
 
         void addHitToEvent(std::vector<float> x, std::vector<float> y, std::vector<float> z, std::vector<unsigned int> detId, std::vector<unsigned int> idxInNtuple); //call the appropriate hit function, then increment the counter here
diff --git a/SDL/LST.cc b/SDL/LST.cc
index 9f9930c3..977cb642 100644
--- a/SDL/LST.cc
+++ b/SDL/LST.cc
@@ -10,8 +10,7 @@ void SDL::LST::eventSetup() {
     SDL::initModules(path);
 }
 
-void SDL::LST::run(cudaStream_t stream,
-                   bool verbose,
+void SDL::LST::run(bool verbose,
                    const std::vector<float> see_px,
                    const std::vector<float> see_py,
                    const std::vector<float> see_pz,
@@ -31,7 +30,7 @@ void SDL::LST::run(cudaStream_t stream,
                    const std::vector<float> ph2_x,
                    const std::vector<float> ph2_y,
                    const std::vector<float> ph2_z) {
-    auto event = SDL::Event(stream, verbose);
+    auto event = SDL::Event(verbose);
     prepareInput(see_px,
                  see_py,
                  see_pz,
diff --git a/SDL/LST.h b/SDL/LST.h
index 1315a3ae..3225194a 100644
--- a/SDL/LST.h
+++ b/SDL/LST.h
@@ -17,8 +17,7 @@ namespace SDL {
         LST();
 
         void eventSetup();
-        void run(cudaStream_t stream,
-                 bool verbose,
+        void run(bool verbose,
                  const std::vector<float> see_px,
                  const std::vector<float> see_py,
                  const std::vector<float> see_pz,
diff --git a/SDL/Makefile b/SDL/Makefile
index abc9a160..6ac11fb1 100644
--- a/SDL/Makefile
+++ b/SDL/Makefile
@@ -16,36 +16,29 @@ LIB=libsdl.so
 # flags to keep track of
 #
 
-# AMD Opteron and Intel EM64T (64 bit mode) Linux with gcc 3.x
-CXX                  = nvcc
-CXXFLAGS             =  -g --compiler-options -Wall --compiler-options -Wshadow --compiler-options -Woverloaded-virtual --compiler-options -fPIC --compiler-options -fopenmp -dc -lineinfo --ptxas-options=-v --cudart shared -arch=compute_70 -I/mnt/data1/dsr/cub --use_fast_math --default-stream per-thread -I..
-ROOTCFLAGS           = --compiler-options -pthread --compiler-options -std=c++17 -m64 -I/cvmfs/cms.cern.ch/slc7_amd64_gcc900/cms/cmssw/CMSSW_11_2_0_pre5/external/slc7_amd64_gcc900/bin/../../../../../../../slc7_amd64_gcc900/lcg/root/6.20.06-ghbfee3/include
-ALPAKAINCLUDE        = -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
-ALPAKAFLAGS          = -DALPAKA_ACC_GPU_CUDA_ENABLED -DALPAKA_ACC_GPU_CUDA_ONLY  --expt-relaxed-constexpr -DALPAKA_DEBUG=0
+CXX                  = g++
+CXXFLAGS             = -g -Wall -Wshadow -std=c++17 -Woverloaded-virtual -fPIC -fopenmp -I..
+CXXFLAGS_CUDA        =  -g --compiler-options -Wall --compiler-options -Wshadow --compiler-options -Woverloaded-virtual --compiler-options -fPIC --compiler-options -fopenmp -dc -lineinfo --ptxas-options=-v --cudart shared -arch=compute_70 -I/mnt/data1/dsr/cub --use_fast_math --default-stream per-thread -I..
+ROOTCFLAGS           = -pthread -std=c++17 -m64 -I/cvmfs/cms.cern.ch/slc7_amd64_gcc900/cms/cmssw/CMSSW_11_2_0_pre5/external/slc7_amd64_gcc900/bin/../../../../../../../slc7_amd64_gcc900/lcg/root/6.20.06-ghbfee3/include
+ALPAKAINCLUDE        = -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED -std=c++17
+ALPAKAFLAGS          = -DALPAKA_ACC_GPU_CUDA_ENABLED -DALPAKA_ACC_GPU_CUDA_ONLY  --expt-relaxed-constexpr
 LD                   = nvcc 
 SOFLAGS              = -g -shared --compiler-options -fPIC --cudart shared -arch=compute_70 -code=sm_72
 PRINTFLAG            = -DT4FromT3 #-DWarnings
 DUPLICATES           = -DDUP_pLS -DDUP_T5 -DDUP_pT5 -DDUP_pT3 -DCrossclean_T5 -DCrossclean_pT3 -DFP16_Base -DFP16_dPhi
 MEMFLAG              =
 CACHEFLAG            =
-CUDALAUNCHFLAG       =  
 MEMFLAG_FLAGS        =
 CACHEFLAG_FLAGS      = -DCACHE_ALLOC
-CUDALAUNCHFLAG_FLAGS = 
-PT0P8       =
-PRELOAD       =
-CMSSW12GEOM =
-T3T3EXTENSION=
-#
-# how to make it 
-#
+
 CUTVALUEFLAG = 
 CUTVALUEFLAG_FLAGS = -DCUT_VALUE_DEBUG
+
 %_cuda.o : %.cu %.cuh
-	$(LD) -x cu $(PT0P8) $(PRELOAD) $(T3T3EXTENSION) $(CXXFLAGS) $(LDFLAGS) $(ROOTLIBS) $(MEMFLAG) $(PRINTFLAG) $(CACHEFLAG) $(CUDALAUNCHFLAG) $(CUTVALUEFLAG) $(DUPLICATES) $(ALPAKAINCLUDE) $(ALPAKAFLAGS) $< -o $@
+	$(LD) -x cu $(CXXFLAGS_CUDA) $(LDFLAGS) $(ROOTLIBS) $(MEMFLAG) $(PRINTFLAG) $(CACHEFLAG) $(CUTVALUEFLAG) $(DUPLICATES) $(ALPAKAINCLUDE) $(ALPAKAFLAGS) $< -o $@
 
 %_cpu.o : %.cc %.h
-	$(LD) -O2   $(PT0P8) $(PRELOAD) $(T3T3EXTENSION) $(CXXFLAGS) $(LDFLAGS) $(ROOTLIBS) $(MEMFLAG) $(PRINTFLAG) $(CACHEFLAG) $(CUDALAUNCHFLAG) $(DUPLICATES) $(ROOTCFLAGS) $(ALPAKAINCLUDE) $< -o $@
+	$(CXX) -c -O2 $(CXXFLAGS) $(LDFLAGS) $(ROOTLIBS) $(MEMFLAG) $(PRINTFLAG) $(CACHEFLAG) $(DUPLICATES) $(ROOTCFLAGS) $(ALPAKAINCLUDE) $< -o $@
 
 $(LIB):$(CCOBJECTS) $(CUOBJECTS)
 #$(LIB):$(CUOBJECTS)
diff --git a/SDL/Module.cuh b/SDL/Module.cuh
index 1015031c..225b147d 100644
--- a/SDL/Module.cuh
+++ b/SDL/Module.cuh
@@ -606,7 +606,6 @@ namespace SDL
                              uint16_t& nModules,
                              uint16_t& nLowerModules,
                              struct pixelMap& pixelMapping,
-                             cudaStream_t stream,
                              TQueue& queue,
                              const char* moduleMetaDataFilePath)
     {
@@ -656,8 +655,8 @@ namespace SDL
                 if(count_number>4)
                     break;
             }
-
         }
+
         (*detIdToIndex)[1] = counter; //pixel module is the last module in the module list
         counter++;
         nModules = counter;
@@ -814,13 +813,20 @@ namespace SDL
             }
         }
 
-        cudaMemcpyAsync(modulesInGPU->nModules,&nModules,sizeof(uint16_t),cudaMemcpyHostToDevice,stream);
-        cudaMemcpyAsync(modulesInGPU->nLowerModules,&nLowerModules,sizeof(uint16_t),cudaMemcpyHostToDevice,stream);
-        cudaStreamSynchronize(stream);
+        auto src_view_nModules = alpaka::createView(devHost, &nModules, (Idx) 1u);
+        alpaka::memcpy(queue, modulesBuf->nModules_buf, src_view_nModules);
+
+        auto src_view_nLowerModules = alpaka::createView(devHost, &nLowerModules, (Idx) 1u);
+        alpaka::memcpy(queue, modulesBuf->nLowerModules_buf, src_view_nLowerModules);
 
+        alpaka::wait(queue);
+
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+        cudaStream_t stream = 0;
         cudaMemcpyAsync(modulesInGPU->moduleType,host_moduleType,sizeof(ModuleType)*nModules,cudaMemcpyHostToDevice,stream);
         cudaMemcpyAsync(modulesInGPU->moduleLayerType,host_moduleLayerType,sizeof(ModuleLayerType)*nModules,cudaMemcpyHostToDevice,stream);
         cudaStreamSynchronize(stream);
+#endif
 
         //alpaka::memcpy(queue, modulesBuf->moduleType_buf, moduleType_buf, nModules);
         //alpaka::memcpy(queue, modulesBuf->moduleLayerType_buf, moduleLayerType_buf, nModules);
diff --git a/SDL/PixelTriplet.cuh b/SDL/PixelTriplet.cuh
index 033dea78..0cf22db4 100644
--- a/SDL/PixelTriplet.cuh
+++ b/SDL/PixelTriplet.cuh
@@ -568,7 +568,7 @@ namespace SDL
         }
 
         float tripletRadiusInvMax = (1 + tripletInvRadiusErrorBound)/tripletRadius;
-        float tripletRadiusInvMin = alpaka::math::max(acc, (1 - tripletInvRadiusErrorBound)/tripletRadius, 0);
+        float tripletRadiusInvMin = alpaka::math::max(acc, (1 - tripletInvRadiusErrorBound)/tripletRadius, 0.0f);
 
         float pixelRadiusInvMax = alpaka::math::max(acc, (1 + pixelInvRadiusErrorBound)/pixelRadius, 1.f/(pixelRadius - pixelRadiusError));
         float pixelRadiusInvMin = alpaka::math::min(acc, (1 - pixelInvRadiusErrorBound)/pixelRadius, 1.f/(pixelRadius + pixelRadiusError));
@@ -589,7 +589,7 @@ namespace SDL
         }
 
         float tripletRadiusInvMax = (1 + tripletInvRadiusErrorBound)/tripletRadius;
-        float tripletRadiusInvMin = alpaka::math::max(acc, (1 - tripletInvRadiusErrorBound)/tripletRadius, 0);
+        float tripletRadiusInvMin = alpaka::math::max(acc, (1 - tripletInvRadiusErrorBound)/tripletRadius, 0.0f);
 
         float pixelRadiusInvMax = alpaka::math::max(acc, (1 + pixelInvRadiusErrorBound)/pixelRadius, 1.f/(pixelRadius - pixelRadiusError));
         float pixelRadiusInvMin = alpaka::math::min(acc, (1 - pixelInvRadiusErrorBound)/pixelRadius, 1.f/(pixelRadius + pixelRadiusError));
@@ -610,11 +610,11 @@ namespace SDL
         }
 
         float tripletRadiusInvMax = (1 + tripletInvRadiusErrorBound)/tripletRadius;
-        float tripletRadiusInvMin = alpaka::math::max(acc, (1 - tripletInvRadiusErrorBound)/tripletRadius, 0);
+        float tripletRadiusInvMin = alpaka::math::max(acc, (1 - tripletInvRadiusErrorBound)/tripletRadius, 0.0f);
 
         float pixelRadiusInvMax = alpaka::math::max(acc, (1 + pixelInvRadiusErrorBound)/pixelRadius, 1.f/(pixelRadius - pixelRadiusError));
         float pixelRadiusInvMin = alpaka::math::min(acc, (1 - pixelInvRadiusErrorBound)/pixelRadius, 1.f/(pixelRadius + pixelRadiusError));
-        pixelRadiusInvMin = alpaka::math::max(acc, pixelRadiusInvMin, 0);
+        pixelRadiusInvMin = alpaka::math::max(acc, pixelRadiusInvMin, 0.0f);
 
         return checkIntervalOverlappT3(tripletRadiusInvMin, tripletRadiusInvMax, pixelRadiusInvMin, pixelRadiusInvMax);
     };
@@ -632,11 +632,11 @@ namespace SDL
         }
 
         float tripletRadiusInvMax = (1 + tripletInvRadiusErrorBound)/tripletRadius;
-        float tripletRadiusInvMin = alpaka::math::max(acc, (1 - tripletInvRadiusErrorBound)/tripletRadius, 0);
+        float tripletRadiusInvMin = alpaka::math::max(acc, (1 - tripletInvRadiusErrorBound)/tripletRadius, 0.0f);
 
         float pixelRadiusInvMax = alpaka::math::max(acc, (1 + pixelInvRadiusErrorBound)/pixelRadius, 1.f/(pixelRadius - pixelRadiusError));
         float pixelRadiusInvMin = alpaka::math::min(acc, (1 - pixelInvRadiusErrorBound)/pixelRadius, 1.f/(pixelRadius + pixelRadiusError));
-        pixelRadiusInvMin = alpaka::math::max(acc, 0, pixelRadiusInvMin);
+        pixelRadiusInvMin = alpaka::math::max(acc, 0.0f, pixelRadiusInvMin);
 
         return checkIntervalOverlappT3(tripletRadiusInvMin, tripletRadiusInvMax, pixelRadiusInvMin, pixelRadiusInvMax);
     };
diff --git a/SDL/Quintuplet.cuh b/SDL/Quintuplet.cuh
index 6bf87ca0..25588ed2 100644
--- a/SDL/Quintuplet.cuh
+++ b/SDL/Quintuplet.cuh
@@ -538,8 +538,8 @@ namespace SDL
                 float diffz1 = (solz1-zsi)*100;
                 float diffz2 = (solz2-zsi)*100;
                 // Alpaka : Needs to be moved over
-                if (isnan(diffz1)) diffz = diffz2;
-                else if (isnan(diffz2)) diffz = diffz1;
+                if (alpaka::math::isnan(acc, diffz1)) diffz = diffz2;
+                else if (alpaka::math::isnan(acc, diffz2)) diffz = diffz1;
                 else {diffz = (alpaka::math::abs(acc, diffz1)<alpaka::math::abs(acc, diffz2)) ? diffz1 : diffz2;}
             }
             residual = (layeri>6) ? diffr : diffz ;
@@ -584,7 +584,7 @@ namespace SDL
         // for set rzchi2 cut
         // if the 5 points are linear, helix calculation gives nan
         // Alpaka : Needs to be moved over
-        if (inner_pt > 100 || isnan(rzChiSquared))
+        if (inner_pt > 100 || alpaka::math::isnan(acc, rzChiSquared))
         {
             float slope;
             if(moduleType1 == 0 and moduleType2 == 0 and moduleType3 == 1) //PSPS2S
diff --git a/bin/sdl.cc b/bin/sdl.cc
index 74ff1fd4..eeb82242 100644
--- a/bin/sdl.cc
+++ b/bin/sdl.cc
@@ -382,13 +382,10 @@ void run_sdl()
         file_name.push_back(ana.looper.getCurrentFileName());
     }
 
-    cudaStream_t streams[ana.streams];
     std::vector<SDL::Event*> events;
     for (int s = 0; s < ana.streams; s++)
     {
-
-        cudaStreamCreateWithFlags(&streams[s], cudaStreamNonBlocking);
-        SDL::Event *event = new SDL::Event(streams[s],ana.verbose>=2);
+        SDL::Event *event = new SDL::Event(ana.verbose>=2);
         events.push_back(event);
     }
 
@@ -517,7 +514,6 @@ void run_sdl()
     for (int s = 0; s < ana.streams; s++)
     {
         delete events.at(s);
-        cudaStreamDestroy(streams[s]);
     }
 
     delete ana.output_tfile;
diff --git a/setup_cgpu.sh b/setup_cgpu.sh
index 2a5392dd..a30c0bf8 100644
--- a/setup_cgpu.sh
+++ b/setup_cgpu.sh
@@ -36,7 +36,7 @@ export LSTPERFORMANCEWEBDIR="/home/users/phchang/public_html/LSTPerformanceWeb"
 export LATEST_CPU_BENCHMARK_EFF_MUONGUN="/data2/segmentlinking/muonGun_cpu_efficiencies.root"
 export LATEST_CPU_BENCHMARK_EFF_PU200="/data2/segmentlinking/pu200_cpu_efficiencies.root"
 
-source /cvmfs/cms.cern.ch/slc7_amd64_gcc900/external/alpaka/0.5.0/etc/profile.d/init.sh
-export BOOST_ROOT="/cvmfs/cms.cern.ch/slc7_amd64_gcc900/external/boost/1.72.0-ghbfee3"
-export ALPAKA_ROOT="/cvmfs/cms.cern.ch/slc7_amd64_gcc900/external/alpaka/0.7.0-09bef105568314b218f2a8410a876785"
+source /cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220621-4e96939afa0cdb62448c73ead2bb07e0/etc/profile.d/init.sh
+export BOOST_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/boost/1.78.0-12075919175e8d078539685f9234134a"
+export ALPAKA_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220621-4e96939afa0cdb62448c73ead2bb07e0"
 #eof

From 62e3dc329bd0d05c6f4e02184bc01a35fd8c333f Mon Sep 17 00:00:00 2001
From: Gavin Niendorf <gavinniendorf@gmail.com>
Date: Tue, 27 Jun 2023 13:54:06 -0700
Subject: [PATCH 31/44] move to more recent alpaka version

---
 setup_cgpu.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup_cgpu.sh b/setup_cgpu.sh
index a30c0bf8..fbff025e 100644
--- a/setup_cgpu.sh
+++ b/setup_cgpu.sh
@@ -36,7 +36,7 @@ export LSTPERFORMANCEWEBDIR="/home/users/phchang/public_html/LSTPerformanceWeb"
 export LATEST_CPU_BENCHMARK_EFF_MUONGUN="/data2/segmentlinking/muonGun_cpu_efficiencies.root"
 export LATEST_CPU_BENCHMARK_EFF_PU200="/data2/segmentlinking/pu200_cpu_efficiencies.root"
 
-source /cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220621-4e96939afa0cdb62448c73ead2bb07e0/etc/profile.d/init.sh
+source /cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220902-e80d13b043e1608b43d2007d06ad7e2f/etc/profile.d/init.sh
 export BOOST_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/boost/1.78.0-12075919175e8d078539685f9234134a"
-export ALPAKA_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220621-4e96939afa0cdb62448c73ead2bb07e0"
+export ALPAKA_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220902-e80d13b043e1608b43d2007d06ad7e2f"
 #eof

From 386d40a6d621c3504387ca4d9b36d2542f174b0d Mon Sep 17 00:00:00 2001
From: Gavin Niendorf <gavinniendorf@gmail.com>
Date: Tue, 27 Jun 2023 14:09:50 -0700
Subject: [PATCH 32/44] remove last CUDA runtime calls from Module.cuh

---
 SDL/Module.cuh | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/SDL/Module.cuh b/SDL/Module.cuh
index 225b147d..1855aee0 100644
--- a/SDL/Module.cuh
+++ b/SDL/Module.cuh
@@ -819,17 +819,8 @@ namespace SDL
         auto src_view_nLowerModules = alpaka::createView(devHost, &nLowerModules, (Idx) 1u);
         alpaka::memcpy(queue, modulesBuf->nLowerModules_buf, src_view_nLowerModules);
 
-        alpaka::wait(queue);
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-        cudaStream_t stream = 0;
-        cudaMemcpyAsync(modulesInGPU->moduleType,host_moduleType,sizeof(ModuleType)*nModules,cudaMemcpyHostToDevice,stream);
-        cudaMemcpyAsync(modulesInGPU->moduleLayerType,host_moduleLayerType,sizeof(ModuleLayerType)*nModules,cudaMemcpyHostToDevice,stream);
-        cudaStreamSynchronize(stream);
-#endif
-
-        //alpaka::memcpy(queue, modulesBuf->moduleType_buf, moduleType_buf, nModules);
-        //alpaka::memcpy(queue, modulesBuf->moduleLayerType_buf, moduleLayerType_buf, nModules);
+        alpaka::memcpy(queue, modulesBuf->moduleType_buf, moduleType_buf);
+        alpaka::memcpy(queue, modulesBuf->moduleLayerType_buf, moduleLayerType_buf);
 
         alpaka::memcpy(queue, modulesBuf->detIds_buf, detIds_buf, nModules);
         alpaka::memcpy(queue, modulesBuf->layers_buf, layers_buf, nModules);

From 03948d77ae9958dcf94dea79165e11cc27fa31d4 Mon Sep 17 00:00:00 2001
From: Gavin Niendorf <gavinniendorf@gmail.com>
Date: Tue, 27 Jun 2023 18:56:41 -0700
Subject: [PATCH 33/44] begin integrating the CMSSW alpaka interface and
 caching allocator

---
 SDL/Constants.cuh                             |   7 +
 SDL/Event.cu                                  |  26 +-
 SDL/Makefile                                  |   9 +-
 code/alpaka_interface/AllocatorConfig.h       |  31 ++
 code/alpaka_interface/AllocatorPolicy.h       |  53 +++
 code/alpaka_interface/AlpakaServiceFwd.h      |  33 ++
 code/alpaka_interface/CachedBufAlloc.h        | 207 +++++++++
 code/alpaka_interface/CachingAllocator.h      | 436 ++++++++++++++++++
 code/alpaka_interface/HostOnlyTask.h          |  71 +++
 code/alpaka_interface/ScopedContextFwd.h      |  35 ++
 code/alpaka_interface/config.h                | 164 +++++++
 code/alpaka_interface/devices.h               |  43 ++
 .../getDeviceCachingAllocator.h               |  88 ++++
 .../getHostCachingAllocator.h                 |  32 ++
 code/alpaka_interface/host.h                  |  29 ++
 code/alpaka_interface/memory.h                | 247 ++++++++++
 code/alpaka_interface/stringize.h             |   8 +
 code/alpaka_interface/thread_safety_macros.h  |  12 +
 code/alpaka_interface/traits.h                |  69 +++
 19 files changed, 1583 insertions(+), 17 deletions(-)
 create mode 100644 code/alpaka_interface/AllocatorConfig.h
 create mode 100644 code/alpaka_interface/AllocatorPolicy.h
 create mode 100644 code/alpaka_interface/AlpakaServiceFwd.h
 create mode 100644 code/alpaka_interface/CachedBufAlloc.h
 create mode 100644 code/alpaka_interface/CachingAllocator.h
 create mode 100644 code/alpaka_interface/HostOnlyTask.h
 create mode 100644 code/alpaka_interface/ScopedContextFwd.h
 create mode 100644 code/alpaka_interface/config.h
 create mode 100644 code/alpaka_interface/devices.h
 create mode 100644 code/alpaka_interface/getDeviceCachingAllocator.h
 create mode 100644 code/alpaka_interface/getHostCachingAllocator.h
 create mode 100644 code/alpaka_interface/host.h
 create mode 100644 code/alpaka_interface/memory.h
 create mode 100644 code/alpaka_interface/stringize.h
 create mode 100644 code/alpaka_interface/thread_safety_macros.h
 create mode 100644 code/alpaka_interface/traits.h

diff --git a/SDL/Constants.cuh b/SDL/Constants.cuh
index 606fe7f0..68981441 100644
--- a/SDL/Constants.cuh
+++ b/SDL/Constants.cuh
@@ -2,6 +2,7 @@
 #define Constants_cuh
 
 #include <alpaka/alpaka.hpp>
+#include "../code/alpaka_interface/CachedBufAlloc.h"
 
 // CUDA headers. Will be removed soon.
 #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
@@ -103,11 +104,17 @@ using QueueAcc = alpaka::Queue<Acc, QueueProperty>;
 template<typename TAcc, typename TData>
 using Buf = alpaka::Buf<TAcc, TData, Dim1d, Idx>;
 
+template<typename T, typename TAcc, typename TSize, typename TQueue>
+ALPAKA_FN_HOST ALPAKA_FN_INLINE Buf<TAcc, T> allocBufWrapper(TAcc const & devAccIn, TSize nElements, TQueue queue) {
+    return cms::alpakatools::allocCachedBuf<T, Idx>(devAccIn, queue, Vec1d(static_cast<Idx>(nElements)));
+}
+
 template<typename T, typename TAcc, typename TSize>
 ALPAKA_FN_HOST ALPAKA_FN_INLINE Buf<TAcc, T> allocBufWrapper(TAcc const & devAccIn, TSize nElements) {
     return alpaka::allocBuf<T, Idx>(devAccIn, Vec1d(static_cast<Idx>(nElements)));
 }
 
+
 const unsigned int MAX_BLOCKS = 80;
 const unsigned int MAX_CONNECTED_MODULES = 40;
 const unsigned int N_MAX_PIXEL_MD_PER_MODULES = 100000;
diff --git a/SDL/Event.cu b/SDL/Event.cu
index aee9e862..e20c9a01 100644
--- a/SDL/Event.cu
+++ b/SDL/Event.cu
@@ -319,11 +319,11 @@ void SDL::Event::addPixelSegmentToEvent(std::vector<unsigned int> hitIndices0,st
         alpaka::wait(queue);
     }
 
-    auto hitIndices0_dev = allocBufWrapper<unsigned int>(devAcc, size);
-    auto hitIndices1_dev = allocBufWrapper<unsigned int>(devAcc, size);
-    auto hitIndices2_dev = allocBufWrapper<unsigned int>(devAcc, size);
-    auto hitIndices3_dev = allocBufWrapper<unsigned int>(devAcc, size);
-    auto dPhiChange_dev = allocBufWrapper<float>(devAcc, size);
+    auto hitIndices0_dev = allocBufWrapper<unsigned int>(devAcc, size, queue);
+    auto hitIndices1_dev = allocBufWrapper<unsigned int>(devAcc, size, queue);
+    auto hitIndices2_dev = allocBufWrapper<unsigned int>(devAcc, size, queue);
+    auto hitIndices3_dev = allocBufWrapper<unsigned int>(devAcc, size, queue);
+    auto dPhiChange_dev = allocBufWrapper<float>(devAcc, size, queue);
 
     alpaka::memcpy(queue, hitIndices0_dev, hitIndices0, size);
     alpaka::memcpy(queue, hitIndices1_dev, hitIndices1, size);
@@ -623,7 +623,7 @@ void SDL::Event::createTriplets()
     uint16_t *index = alpaka::getPtrNative(index_buf);
 
     // Allocate device index
-    auto index_gpu_buf = allocBufWrapper<uint16_t>(devAcc, nLowerModules);
+    auto index_gpu_buf = allocBufWrapper<uint16_t>(devAcc, nLowerModules, queue);
 
     // Allocate and copy nSegments from device to host
     auto nSegments_buf = allocBufWrapper<int>(devHost, nLowerModules);
@@ -648,7 +648,7 @@ void SDL::Event::createTriplets()
             index[nonZeroModules] = innerLowerModuleIndex;
             nonZeroModules++;
         }
-        max_InnerSeg = max(max_InnerSeg, nInnerSegments);
+        max_InnerSeg = std::max(max_InnerSeg, nInnerSegments);
     }
 
     // Copy index from host to device
@@ -741,7 +741,7 @@ void SDL::Event::createTrackCandidates()
     alpaka::enqueue(queue, addpT3asTrackCandidatesInGPUTask);
 
     Vec const threadsPerBlockRemoveDupQuints(static_cast<Idx>(1), static_cast<Idx>(16), static_cast<Idx>(32));
-    Vec const blocksPerGridRemoveDupQuints(static_cast<Idx>(1), static_cast<Idx>(max(nEligibleModules/16,1)), static_cast<Idx>(max(nEligibleModules/32,1)));
+    Vec const blocksPerGridRemoveDupQuints(static_cast<Idx>(1), static_cast<Idx>(std::max(nEligibleModules/16,1)), static_cast<Idx>(std::max(nEligibleModules/32,1)));
     WorkDiv const removeDupQuintupletsInGPUBeforeTC_workDiv(blocksPerGridRemoveDupQuints, threadsPerBlockRemoveDupQuints, elementsPerThread);
 
     SDL::removeDupQuintupletsInGPUBeforeTC removeDupQuintupletsInGPUBeforeTC_kernel;
@@ -859,8 +859,8 @@ void SDL::Event::createPixelTriplets()
 
     auto connectedPixelSize_host_buf = allocBufWrapper<unsigned int>(devHost, nInnerSegments);
     auto connectedPixelIndex_host_buf = allocBufWrapper<unsigned int>(devHost, nInnerSegments);
-    auto connectedPixelSize_dev_buf = allocBufWrapper<unsigned int>(devAcc, nInnerSegments);
-    auto connectedPixelIndex_dev_buf = allocBufWrapper<unsigned int>(devAcc, nInnerSegments);
+    auto connectedPixelSize_dev_buf = allocBufWrapper<unsigned int>(devAcc, nInnerSegments, queue);
+    auto connectedPixelIndex_dev_buf = allocBufWrapper<unsigned int>(devAcc, nInnerSegments, queue);
 
     int* superbins = alpaka::getPtrNative(superbins_buf);
     int8_t* pixelTypes = alpaka::getPtrNative(pixelTypes_buf);
@@ -994,7 +994,7 @@ void SDL::Event::createQuintuplets()
     }
 
     Vec const threadsPerBlockQuints(static_cast<Idx>(1), static_cast<Idx>(8), static_cast<Idx>(32));
-    Vec const blocksPerGridQuints(static_cast<Idx>(max(nEligibleT5Modules,1)), static_cast<Idx>(1), static_cast<Idx>(1));
+    Vec const blocksPerGridQuints(static_cast<Idx>(std::max((int) nEligibleT5Modules, 1)), static_cast<Idx>(1), static_cast<Idx>(1));
     WorkDiv const createQuintupletsInGPUv2_workDiv(blocksPerGridQuints, threadsPerBlockQuints, elementsPerThread);
 
     SDL::createQuintupletsInGPUv2 createQuintupletsInGPUv2_kernel;
@@ -1097,8 +1097,8 @@ void SDL::Event::createPixelQuintuplets()
 
     auto connectedPixelSize_host_buf = allocBufWrapper<unsigned int>(devHost, nInnerSegments);
     auto connectedPixelIndex_host_buf = allocBufWrapper<unsigned int>(devHost, nInnerSegments);
-    auto connectedPixelSize_dev_buf = allocBufWrapper<unsigned int>(devAcc, nInnerSegments);
-    auto connectedPixelIndex_dev_buf = allocBufWrapper<unsigned int>(devAcc, nInnerSegments);
+    auto connectedPixelSize_dev_buf = allocBufWrapper<unsigned int>(devAcc, nInnerSegments, queue);
+    auto connectedPixelIndex_dev_buf = allocBufWrapper<unsigned int>(devAcc, nInnerSegments, queue);
 
     int* superbins = alpaka::getPtrNative(superbins_buf);
     int8_t* pixelTypes = alpaka::getPtrNative(pixelTypes_buf);
diff --git a/SDL/Makefile b/SDL/Makefile
index 6ac11fb1..0c1a1cdf 100644
--- a/SDL/Makefile
+++ b/SDL/Makefile
@@ -20,8 +20,9 @@ CXX                  = g++
 CXXFLAGS             = -g -Wall -Wshadow -std=c++17 -Woverloaded-virtual -fPIC -fopenmp -I..
 CXXFLAGS_CUDA        =  -g --compiler-options -Wall --compiler-options -Wshadow --compiler-options -Woverloaded-virtual --compiler-options -fPIC --compiler-options -fopenmp -dc -lineinfo --ptxas-options=-v --cudart shared -arch=compute_70 -I/mnt/data1/dsr/cub --use_fast_math --default-stream per-thread -I..
 ROOTCFLAGS           = -pthread -std=c++17 -m64 -I/cvmfs/cms.cern.ch/slc7_amd64_gcc900/cms/cmssw/CMSSW_11_2_0_pre5/external/slc7_amd64_gcc900/bin/../../../../../../../slc7_amd64_gcc900/lcg/root/6.20.06-ghbfee3/include
-ALPAKAINCLUDE        = -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED -std=c++17
-ALPAKAFLAGS          = -DALPAKA_ACC_GPU_CUDA_ENABLED -DALPAKA_ACC_GPU_CUDA_ONLY  --expt-relaxed-constexpr
+ALPAKAINCLUDE        = -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include --std=c++17
+ALPAKASERIAL         = -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
+ALPAKACUDA           = -DALPAKA_ACC_GPU_CUDA_ENABLED -DALPAKA_ACC_GPU_CUDA_ONLY --expt-relaxed-constexpr
 LD                   = nvcc 
 SOFLAGS              = -g -shared --compiler-options -fPIC --cudart shared -arch=compute_70 -code=sm_72
 PRINTFLAG            = -DT4FromT3 #-DWarnings
@@ -35,10 +36,10 @@ CUTVALUEFLAG =
 CUTVALUEFLAG_FLAGS = -DCUT_VALUE_DEBUG
 
 %_cuda.o : %.cu %.cuh
-	$(LD) -x cu $(CXXFLAGS_CUDA) $(LDFLAGS) $(ROOTLIBS) $(MEMFLAG) $(PRINTFLAG) $(CACHEFLAG) $(CUTVALUEFLAG) $(DUPLICATES) $(ALPAKAINCLUDE) $(ALPAKAFLAGS) $< -o $@
+	$(LD) -x cu $(CXXFLAGS_CUDA) $(LDFLAGS) $(ROOTLIBS) $(MEMFLAG) $(PRINTFLAG) $(CACHEFLAG) $(CUTVALUEFLAG) $(DUPLICATES) $(ALPAKAINCLUDE) $(ALPAKACUDA) $< -o $@
 
 %_cpu.o : %.cc %.h
-	$(CXX) -c -O2 $(CXXFLAGS) $(LDFLAGS) $(ROOTLIBS) $(MEMFLAG) $(PRINTFLAG) $(CACHEFLAG) $(DUPLICATES) $(ROOTCFLAGS) $(ALPAKAINCLUDE) $< -o $@
+	$(CXX) -c -O2 $(CXXFLAGS) $(LDFLAGS) $(ROOTLIBS) $(MEMFLAG) $(PRINTFLAG) $(CACHEFLAG) $(DUPLICATES) $(ROOTCFLAGS) $(ALPAKAINCLUDE) $(ALPAKASERIAL) $< -o $@
 
 $(LIB):$(CCOBJECTS) $(CUOBJECTS)
 #$(LIB):$(CUOBJECTS)
diff --git a/code/alpaka_interface/AllocatorConfig.h b/code/alpaka_interface/AllocatorConfig.h
new file mode 100644
index 00000000..83b5214a
--- /dev/null
+++ b/code/alpaka_interface/AllocatorConfig.h
@@ -0,0 +1,31 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_AllocatorConfig_h
+#define HeterogeneousCore_AlpakaInterface_interface_AllocatorConfig_h
+
+#include <limits>
+
+namespace cms::alpakatools {
+
+  namespace config {
+
+    // bin growth factor (bin_growth in cub::CachingDeviceAllocator)
+    constexpr unsigned int binGrowth = 2;
+
+    // smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CachingDeviceAllocator)
+    constexpr unsigned int minBin = 8;  // 256 bytes
+
+    // largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail.
+    constexpr unsigned int maxBin = 30;  // 1 GB
+
+    // total storage for the allocator; 0 means no limit.
+    constexpr size_t maxCachedBytes = 0;
+
+    // fraction of total device memory taken for the allocator; 0 means no limit.
+    constexpr double maxCachedFraction = 0.8;
+
+    // if both maxCachedBytes and maxCachedFraction are non-zero, the smallest resulting value is used.
+
+  }  // namespace config
+
+}  // namespace cms::alpakatools
+
+#endif  // HeterogeneousCore_AlpakaInterface_interface_AllocatorConfig_h
diff --git a/code/alpaka_interface/AllocatorPolicy.h b/code/alpaka_interface/AllocatorPolicy.h
new file mode 100644
index 00000000..16bf3652
--- /dev/null
+++ b/code/alpaka_interface/AllocatorPolicy.h
@@ -0,0 +1,53 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_AllocatorPolicy_h
+#define HeterogeneousCore_AlpakaInterface_interface_AllocatorPolicy_h
+
+#include <alpaka/alpaka.hpp>
+
+#include "traits.h"
+
+namespace cms::alpakatools {
+
+  // Which memory allocator to use
+  //   - Synchronous:   (device and host) cudaMalloc/hipMalloc and cudaMallocHost/hipMallocHost
+  //   - Asynchronous:  (device only)     cudaMallocAsync (requires CUDA >= 11.2)
+  //   - Caching:       (device and host) caching allocator
+  enum class AllocatorPolicy { Synchronous = 0, Asynchronous = 1, Caching = 2 };
+
+  template <typename TDev, typename = std::enable_if_t<cms::alpakatools::is_device_v<TDev>>>
+  constexpr inline AllocatorPolicy allocator_policy = AllocatorPolicy::Synchronous;
+
+#if defined ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED || defined ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
+  template <>
+  constexpr inline AllocatorPolicy allocator_policy<alpaka::DevCpu> =
+#if !defined ALPAKA_DISABLE_CACHING_ALLOCATOR
+      AllocatorPolicy::Caching;
+#else
+      AllocatorPolicy::Synchronous;
+#endif
+#endif  // defined ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED || defined ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
+
+#if defined ALPAKA_ACC_GPU_CUDA_ENABLED
+  template <>
+  constexpr inline AllocatorPolicy allocator_policy<alpaka::DevCudaRt> =
+#if !defined ALPAKA_DISABLE_CACHING_ALLOCATOR
+      AllocatorPolicy::Caching;
+#elif CUDA_VERSION >= 11020 && !defined ALPAKA_DISABLE_ASYNC_ALLOCATOR
+      AllocatorPolicy::Asynchronous;
+#else
+          AllocatorPolicy::Synchronous;
+#endif
+#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED
+
+#if defined ALPAKA_ACC_GPU_HIP_ENABLED
+  template <>
+  constexpr inline AllocatorPolicy allocator_policy<alpaka::DevHipRt> =
+#if !defined ALPAKA_DISABLE_CACHING_ALLOCATOR
+      AllocatorPolicy::Caching;
+#else
+      AllocatorPolicy::Synchronous;
+#endif
+#endif  // ALPAKA_ACC_GPU_HIP_ENABLED
+
+}  // namespace cms::alpakatools
+
+#endif  // HeterogeneousCore_AlpakaInterface_interface_AllocatorPolicy_h
diff --git a/code/alpaka_interface/AlpakaServiceFwd.h b/code/alpaka_interface/AlpakaServiceFwd.h
new file mode 100644
index 00000000..4345f3f3
--- /dev/null
+++ b/code/alpaka_interface/AlpakaServiceFwd.h
@@ -0,0 +1,33 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_AlpakaServiceFwd_h
+#define HeterogeneousCore_AlpakaInterface_interface_AlpakaServiceFwd_h
+
+// Forward declaration of the alpaka accelerator namespaces and of the AlpakaService for each of them.
+//
+// This file is under HeterogeneousCore/AlpakaInterface to avoid introducing a dependency on
+// HeterogeneousCore/AlpakaServices and HeterogeneousCore/AlpakaCore.
+
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+namespace alpaka_cuda_async {
+  class AlpakaService;
+}  // namespace alpaka_cuda_async
+#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED
+
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+namespace alpaka_hip_async {
+  class AlpakaService;
+}  // namespace alpaka_hip_async
+#endif  // ALPAKA_ACC_GPU_HIP_ENABLED
+
+#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
+namespace alpaka_serial_sync {
+  class AlpakaService;
+}  // namespace alpaka_serial_sync
+#endif  // ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
+
+#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
+namespace alpaka_tbb_async {
+  class AlpakaService;
+}  // namespace alpaka_tbb_async
+#endif  // ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
+
+#endif  // HeterogeneousCore_AlpakaInterface_interface_AlpakaServiceFwd_h
diff --git a/code/alpaka_interface/CachedBufAlloc.h b/code/alpaka_interface/CachedBufAlloc.h
new file mode 100644
index 00000000..c5d7eec3
--- /dev/null
+++ b/code/alpaka_interface/CachedBufAlloc.h
@@ -0,0 +1,207 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_CachedBufAlloc_h
+#define HeterogeneousCore_AlpakaInterface_interface_CachedBufAlloc_h
+
+#include <alpaka/alpaka.hpp>
+
+#include "getDeviceCachingAllocator.h"
+#include "getHostCachingAllocator.h"
+#include "traits.h"
+
+namespace cms::alpakatools {
+
+  namespace traits {
+
+    //! The caching memory allocator trait.
+    template <typename TElem,
+              typename TDim,
+              typename TIdx,
+              typename TDev,
+              typename TQueue,
+              typename = void,
+              typename = std::enable_if_t<cms::alpakatools::is_device_v<TDev> and cms::alpakatools::is_queue_v<TQueue>>>
+    struct CachedBufAlloc {
+      static_assert(alpaka::meta::DependentFalseType<TDev>::value, "This device does not support a caching allocator");
+    };
+
+    //! The caching memory allocator implementation for the CPU device
+    template <typename TElem, typename TDim, typename TIdx, typename TQueue>
+    struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, TQueue, void> {
+      template <typename TExtent>
+      ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev, TQueue queue, TExtent const& extent)
+          -> alpaka::BufCpu<TElem, TDim, TIdx> {
+        // non-cached, queue-ordered asynchronous host-only memory
+        return alpaka::allocAsyncBuf<TElem, TIdx>(queue, extent);
+      }
+    };
+
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+
+    //! The caching memory allocator implementation for the pinned host memory, with a blocking queue
+    template <typename TElem, typename TDim, typename TIdx>
+    struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, alpaka::QueueCudaRtBlocking, void> {
+      template <typename TExtent>
+      ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev,
+                                                alpaka::QueueCudaRtBlocking queue,
+                                                TExtent const& extent) -> alpaka::BufCpu<TElem, TDim, TIdx> {
+        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+        auto& allocator = getHostCachingAllocator<alpaka::QueueCudaRtBlocking>();
+
+        // FIXME the BufCpu does not support a pitch ?
+        size_t size = alpaka::getExtentProduct(extent);
+        size_t sizeBytes = size * sizeof(TElem);
+        void* memPtr = allocator.allocate(sizeBytes, queue);
+
+        // use a custom deleter to return the buffer to the CachingAllocator
+        auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); };
+
+        return alpaka::BufCpu<TElem, TDim, TIdx>(dev, reinterpret_cast<TElem*>(memPtr), std::move(deleter), extent);
+      }
+    };
+
+    //! The caching memory allocator implementation for the pinned host memory, with a non-blocking queue
+    template <typename TElem, typename TDim, typename TIdx>
+    struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, alpaka::QueueCudaRtNonBlocking, void> {
+      template <typename TExtent>
+      ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev,
+                                                alpaka::QueueCudaRtNonBlocking queue,
+                                                TExtent const& extent) -> alpaka::BufCpu<TElem, TDim, TIdx> {
+        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+        auto& allocator = getHostCachingAllocator<alpaka::QueueCudaRtNonBlocking>();
+
+        // FIXME the BufCpu does not support a pitch ?
+        size_t size = alpaka::getExtentProduct(extent);
+        size_t sizeBytes = size * sizeof(TElem);
+        void* memPtr = allocator.allocate(sizeBytes, queue);
+
+        // use a custom deleter to return the buffer to the CachingAllocator
+        auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); };
+
+        return alpaka::BufCpu<TElem, TDim, TIdx>(dev, reinterpret_cast<TElem*>(memPtr), std::move(deleter), extent);
+      }
+    };
+
+    //! The caching memory allocator implementation for the CUDA device
+    template <typename TElem, typename TDim, typename TIdx, typename TQueue>
+    struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCudaRt, TQueue, void> {
+      template <typename TExtent>
+      ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCudaRt const& dev, TQueue queue, TExtent const& extent)
+          -> alpaka::BufCudaRt<TElem, TDim, TIdx> {
+        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+        auto& allocator = getDeviceCachingAllocator<alpaka::DevCudaRt, TQueue>(dev);
+
+        size_t width = alpaka::getWidth(extent);
+        size_t widthBytes = width * static_cast<TIdx>(sizeof(TElem));
+        // TODO implement pitch for TDim > 1
+        size_t pitchBytes = widthBytes;
+        size_t size = alpaka::getExtentProduct(extent);
+        size_t sizeBytes = size * sizeof(TElem);
+        void* memPtr = allocator.allocate(sizeBytes, queue);
+
+        // use a custom deleter to return the buffer to the CachingAllocator
+        auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); };
+
+        return alpaka::BufCudaRt<TElem, TDim, TIdx>(
+            dev, reinterpret_cast<TElem*>(memPtr), std::move(deleter), pitchBytes, extent);
+      }
+    };
+
+#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED
+
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+
+    //! The caching memory allocator implementation for the pinned host memory, with a blocking queue
+    template <typename TElem, typename TDim, typename TIdx>
+    struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, alpaka::QueueHipRtBlocking, void> {
+      template <typename TExtent>
+      ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev,
+                                                alpaka::QueueHipRtBlocking queue,
+                                                TExtent const& extent) -> alpaka::BufCpu<TElem, TDim, TIdx> {
+        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+        auto& allocator = getHostCachingAllocator<alpaka::QueueHipRtBlocking>();
+
+        // FIXME the BufCpu does not support a pitch ?
+        size_t size = alpaka::getExtentProduct(extent);
+        size_t sizeBytes = size * sizeof(TElem);
+        void* memPtr = allocator.allocate(sizeBytes, queue);
+
+        // use a custom deleter to return the buffer to the CachingAllocator
+        auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); };
+
+        return alpaka::BufCpu<TElem, TDim, TIdx>(dev, reinterpret_cast<TElem*>(memPtr), std::move(deleter), extent);
+      }
+    };
+
+    //! The caching memory allocator implementation for the pinned host memory, with a non-blocking queue
+    template <typename TElem, typename TDim, typename TIdx>
+    struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, alpaka::QueueHipRtNonBlocking, void> {
+      template <typename TExtent>
+      ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev,
+                                                alpaka::QueueHipRtNonBlocking queue,
+                                                TExtent const& extent) -> alpaka::BufCpu<TElem, TDim, TIdx> {
+        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+        auto& allocator = getHostCachingAllocator<alpaka::QueueHipRtNonBlocking>();
+
+        // FIXME the BufCpu does not support a pitch ?
+        size_t size = alpaka::getExtentProduct(extent);
+        size_t sizeBytes = size * sizeof(TElem);
+        void* memPtr = allocator.allocate(sizeBytes, queue);
+
+        // use a custom deleter to return the buffer to the CachingAllocator
+        auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); };
+
+        return alpaka::BufCpu<TElem, TDim, TIdx>(dev, reinterpret_cast<TElem*>(memPtr), std::move(deleter), extent);
+      }
+    };
+
+    //! The caching memory allocator implementation for the ROCm/HIP device
+    //!
+    //! Unlike the primary template, this partial specialization declares no defaulted
+    //! `enable_if` parameter: C++ forbids default template arguments on partial specializations,
+    //! and the device/queue constraints are already checked by the primary template's defaults.
+    template <typename TElem, typename TDim, typename TIdx, typename TQueue>
+    struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevHipRt, TQueue, void> {
+      template <typename TExtent>
+      ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevHipRt const& dev, TQueue queue, TExtent const& extent)
+          -> alpaka::BufHipRt<TElem, TDim, TIdx> {
+        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+        auto& allocator = getDeviceCachingAllocator<alpaka::DevHipRt, TQueue>(dev);
+
+        size_t width = alpaka::getWidth(extent);
+        size_t widthBytes = width * static_cast<TIdx>(sizeof(TElem));
+        // TODO implement pitch for TDim > 1
+        size_t pitchBytes = widthBytes;
+        size_t size = alpaka::getExtentProduct(extent);
+        size_t sizeBytes = size * sizeof(TElem);
+        void* memPtr = allocator.allocate(sizeBytes, queue);
+
+        // use a custom deleter to return the buffer to the CachingAllocator
+        auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); };
+
+        return alpaka::BufHipRt<TElem, TDim, TIdx>(
+            dev, reinterpret_cast<TElem*>(memPtr), std::move(deleter), pitchBytes, extent);
+      }
+    };
+
+#endif  // ALPAKA_ACC_GPU_HIP_ENABLED
+
+  }  // namespace traits
+
+  template <typename TElem,
+            typename TIdx,
+            typename TExtent,
+            typename TQueue,
+            typename TDev,
+            typename = std::enable_if_t<cms::alpakatools::is_device_v<TDev> and cms::alpakatools::is_queue_v<TQueue>>>
+  ALPAKA_FN_HOST auto allocCachedBuf(TDev const& dev, TQueue queue, TExtent const& extent = TExtent()) {
+    return traits::CachedBufAlloc<TElem, alpaka::Dim<TExtent>, TIdx, TDev, TQueue>::allocCachedBuf(dev, queue, extent);
+  }
+
+}  // namespace cms::alpakatools
+
+#endif  // HeterogeneousCore_AlpakaInterface_interface_CachedBufAlloc_h
diff --git a/code/alpaka_interface/CachingAllocator.h b/code/alpaka_interface/CachingAllocator.h
new file mode 100644
index 00000000..72a52694
--- /dev/null
+++ b/code/alpaka_interface/CachingAllocator.h
@@ -0,0 +1,436 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_CachingAllocator_h
+#define HeterogeneousCore_AlpakaInterface_interface_CachingAllocator_h
+
+#include <cassert>
+#include <exception>
+#include <iomanip>
+#include <iostream>
+#include <map>
+#include <mutex>
+#include <optional>
+#include <sstream>
+#include <string>
+#include <tuple>
+#include <type_traits>
+
+#include <alpaka/alpaka.hpp>
+
+#include "traits.h"
+#include "AlpakaServiceFwd.h"
+
+// Inspired by cub::CachingDeviceAllocator
+
+namespace cms::alpakatools {
+
+  namespace detail {
+
+    inline constexpr unsigned int power(unsigned int base, unsigned int exponent) {
+      unsigned int power = 1;
+      while (exponent > 0) {
+        if (exponent & 1) {
+          power = power * base;
+        }
+        base = base * base;
+        exponent = exponent >> 1;
+      }
+      return power;
+    }
+
+    // format a memory size in B/kB/MB/GB
+    inline std::string as_bytes(size_t value) {
+      if (value == std::numeric_limits<size_t>::max()) {
+        return "unlimited";
+      } else if (value >= (1 << 30) and value % (1 << 30) == 0) {
+        return std::to_string(value >> 30) + " GB";
+      } else if (value >= (1 << 20) and value % (1 << 20) == 0) {
+        return std::to_string(value >> 20) + " MB";
+      } else if (value >= (1 << 10) and value % (1 << 10) == 0) {
+        return std::to_string(value >> 10) + " kB";
+      } else {
+        return std::to_string(value) + "  B";
+      }
+    }
+
+  }  // namespace detail
+
+  /*
+   * The "memory device" identifies the memory space, i.e. the device where the memory is allocated.
+   * A caching allocator object is associated to a single memory `Device`, set at construction time, and unchanged for
+   * the lifetime of the allocator.
+   *
+   * Each allocation is associated to an event on a queue, that identifies the "synchronisation device" according to
+   * which the synchronisation occurs.
+   * The `Event` type depends only on the synchronisation `Device` type.
+   * The `Queue` type depends on the synchronisation `Device` type and the queue properties, either `Sync` or `Async`.
+   *
+   * **Note**: how to handle different queue and event types in a single allocator ?  store and access type-punned
+   * queues and events ?  or template the internal structures on them, but with a common base class ?
+   * alpaka does rely on the compile-time type for dispatch.
+   *
+   * Common use case #1: accelerator's memory allocations
+   *   - the "memory device" is the accelerator device (e.g. a GPU);
+   *   - the "synchronisation device" is the same accelerator device;
+   *   - the `Queue` type is usually the same for all allocations (either `Sync` or `Async`).
+   *
+   * Common use case #2: pinned host memory allocations
+   *    - the "memory device" is the host device (e.g. system memory);
+   *    - the "synchronisation device" is the accelerator device (e.g. a GPU) whose work queue will access the host
+   *      memory (direct memory access from the accelerator, or scheduling `alpaka::memcpy`/`alpaka::memset`), and can
+   *      be different for each allocation;
+   *    - the synchronisation `Device` _type_ could potentially be different, but memory pinning is currently tied to
+   *      the accelerator's platform (CUDA, HIP, etc.), so the device type needs to be fixed to benefit from caching;
+   *    - the `Queue` type can be either `Sync` _or_ `Async` on any allocation.
+   */
+
+  template <typename TDev,
+            typename TQueue,
+            typename = std::enable_if_t<cms::alpakatools::is_device_v<TDev> and cms::alpakatools::is_queue_v<TQueue>>>
+  class CachingAllocator {
+  public:
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+    friend class alpaka_cuda_async::AlpakaService;
+#endif
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+    friend class alpaka_hip_async::AlpakaService;
+#endif
+#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
+    friend class alpaka_serial_sync::AlpakaService;
+#endif
+#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
+    friend class alpaka_tbb_async::AlpakaService;
+#endif
+
+    using Device = TDev;                 // the "memory device", where the memory will be allocated
+    using Queue = TQueue;                // the queue used to submit the memory operations
+    using Event = alpaka::Event<Queue>;  // the events used to synchronise the operations
+    using Buffer = alpaka::Buf<Device, std::byte, alpaka::DimInt<1u>, size_t>;
+
+    // The "memory device" type can either be the same as the "synchronisation device" type, or be the host CPU.
+    static_assert(std::is_same_v<Device, alpaka::Dev<Queue>> or std::is_same_v<Device, alpaka::DevCpu>,
+                  "The \"memory device\" type can either be the same as the \"synchronisation device\" type, or be the "
+                  "host CPU.");
+
+    struct CachedBytes {
+      size_t free = 0;       // total bytes freed and cached on this device
+      size_t live = 0;       // total bytes currently in use on this device
+      size_t requested = 0;  // total bytes requested and currently in use on this device
+    };
+
+    explicit CachingAllocator(
+        Device const& device,
+        unsigned int binGrowth,          // bin growth factor;
+        unsigned int minBin,             // smallest bin, corresponds to binGrowth^minBin bytes;
+                                         // smaller allocations are rounded to this value;
+        unsigned int maxBin,             // largest bin, corresponds to binGrowth^maxBin bytes;
+                                         // larger allocations will fail;
+        size_t maxCachedBytes,           // total storage for the allocator (0 means no limit);
+        double maxCachedFraction,        // fraction of total device memory taken for the allocator (0 means no limit);
+                                         // if both maxCachedBytes and maxCachedFraction are non-zero,
+                                         // the smallest resulting value is used.
+        bool reuseSameQueueAllocations,  // reuse non-ready allocations if they are in the same queue as the new one;
+                                         // this is safe only if all memory operations are scheduled in the same queue
+        bool debug)
+        : device_(device),
+          binGrowth_(binGrowth),
+          minBin_(minBin),
+          maxBin_(maxBin),
+          minBinBytes_(detail::power(binGrowth, minBin)),
+          maxBinBytes_(detail::power(binGrowth, maxBin)),
+          maxCachedBytes_(cacheSize(maxCachedBytes, maxCachedFraction)),
+          reuseSameQueueAllocations_(reuseSameQueueAllocations),
+          debug_(debug) {
+      if (debug_) {
+        std::ostringstream out;
+        out << "CachingAllocator settings\n"
+            << "  bin growth " << binGrowth_ << "\n"
+            << "  min bin    " << minBin_ << "\n"
+            << "  max bin    " << maxBin_ << "\n"
+            << "  resulting bins:\n";
+        for (auto bin = minBin_; bin <= maxBin_; ++bin) {
+          auto binSize = detail::power(binGrowth, bin);
+          out << "    " << std::right << std::setw(12) << detail::as_bytes(binSize) << '\n';
+        }
+        out << "  maximum amount of cached memory: " << detail::as_bytes(maxCachedBytes_);
+        std::cout << out.str() << std::endl;
+      }
+    }
+
+    ~CachingAllocator() {
+      {
+        // this should never be called while some memory blocks are still live
+        std::scoped_lock lock(mutex_);
+        assert(liveBlocks_.empty());
+        assert(cachedBytes_.live == 0);
+      }
+
+      freeAllCached();
+    }
+
+    // return a copy of the cache allocation status, for monitoring purposes
+    CachedBytes cacheStatus() const {
+      std::scoped_lock lock(mutex_);
+      return cachedBytes_;
+    }
+
+    // Allocate given number of bytes on the current device associated to given queue
+    void* allocate(size_t bytes, Queue queue) {
+      // create a block descriptor for the requested allocation
+      BlockDescriptor block;
+      block.queue = std::move(queue);
+      block.requested = bytes;
+      std::tie(block.bin, block.bytes) = findBin(bytes);
+
+      // try to re-use a cached block, or allocate a new buffer
+      if (not tryReuseCachedBlock(block)) {
+        allocateNewBlock(block);
+      }
+
+      return block.buffer->data();
+    }
+
+    // give back the block at ptr: re-cache it if it fits under maxCachedBytes_, else release it; throws if ptr is not live
+    void free(void* ptr) {
+      std::scoped_lock lock(mutex_);
+
+      auto iBlock = liveBlocks_.find(ptr);
+      if (iBlock == liveBlocks_.end()) {
+        std::stringstream ss;
+        ss << "Trying to free a non-live block at " << ptr;
+        throw std::runtime_error(ss.str());
+      }
+      // remove the block from the list of live blocks, and from the "live" and "requested" counters
+      BlockDescriptor block = std::move(iBlock->second);
+      liveBlocks_.erase(iBlock);
+      cachedBytes_.live -= block.bytes;
+      cachedBytes_.requested -= block.requested;
+
+      bool recache = (cachedBytes_.free + block.bytes <= maxCachedBytes_);
+      if (recache) {
+        alpaka::enqueue(*(block.queue), *(block.event));  // tryReuseCachedBlock() polls this event with isComplete()
+        cachedBytes_.free += block.bytes;
+        // after the call to insert(), cachedBlocks_ shares ownership of the buffer
+        // TODO use std::move ?  (block is still read by the debug output below)
+        cachedBlocks_.insert(std::make_pair(block.bin, block));
+
+        if (debug_) {
+          std::ostringstream out;
+          out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " returned " << block.bytes << " bytes at "
+              << ptr << " from associated queue " << block.queue->m_spQueueImpl.get() << " , event "
+              << block.event->m_spEventImpl.get() << " .\n\t\t " << cachedBlocks_.size() << " available blocks cached ("
+              << cachedBytes_.free << " bytes), " << liveBlocks_.size() << " live blocks (" << cachedBytes_.live
+              << " bytes) outstanding." << std::endl;
+          std::cout << out.str() << std::endl;
+        }
+      } else {
+        // the cache limit would be exceeded: the buffer is freed when block goes out of scope at the end of free()
+        if (debug_) {
+          std::ostringstream out;
+          out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " freed " << block.bytes << " bytes at "
+              << ptr << " from associated queue " << block.queue->m_spQueueImpl.get() << ", event "
+              << block.event->m_spEventImpl.get() << " .\n\t\t " << cachedBlocks_.size() << " available blocks cached ("
+              << cachedBytes_.free << " bytes), " << liveBlocks_.size() << " live blocks (" << cachedBytes_.live
+              << " bytes) outstanding." << std::endl;
+          std::cout << out.str() << std::endl;
+        }
+      }
+    }
+
+  private:
+    struct BlockDescriptor {  // bookkeeping record for one block, stored in liveBlocks_ or cachedBlocks_
+      std::optional<Buffer> buffer;  // the underlying memory; set by allocateNewBlock()
+      std::optional<Queue> queue;    // queue the block is associated to; set by allocate()
+      std::optional<Event> event;    // enqueued by free(), polled by tryReuseCachedBlock()
+      size_t bytes = 0;              // size of the bin, as returned by findBin()
+      size_t requested = 0;          // for monitoring only
+      unsigned int bin = 0;          // index of the bin, used as key into cachedBlocks_
+
+      // the "synchronisation device" for this block; dereferences queue, which must be set
+      auto device() { return alpaka::getDev(*queue); }
+    };
+
+  private:
+    // upper limit on the amount of memory that should be kept in the cache for this device
+    size_t cacheSize(size_t maxCachedBytes, double maxCachedFraction) const {
+      // note that getMemBytes() returns 0 if the platform does not support querying the device memory
+      size_t const deviceBytes = alpaka::getMemBytes(device_);
+      size_t const fractionBytes = static_cast<size_t>(maxCachedFraction * deviceBytes);
+      // start without a limit, then apply the smallest of the limits that are set (a value of 0 means "not set")
+      size_t limit = std::numeric_limits<size_t>::max();
+      auto const tighten = [&limit](size_t candidate) {
+        limit = (candidate > 0 and candidate < limit) ? candidate : limit;
+      };
+      tighten(maxCachedBytes);
+      tighten(fractionBytes);
+      return limit;
+    }
+
+    // return (bin, bin size) of the smallest bin that can hold `bytes`; throws std::runtime_error above maxBinBytes_
+    std::tuple<unsigned int, size_t> findBin(size_t bytes) const {
+      if (bytes < minBinBytes_) {  // small requests are rounded up to the smallest bin
+        return std::make_tuple(minBin_, minBinBytes_);
+      }
+      if (bytes > maxBinBytes_) {
+        throw std::runtime_error("Requested allocation size " + std::to_string(bytes) +
+                                 " bytes is too large for the caching allocator with maximum bin " +
+                                 std::to_string(maxBinBytes_) +
+                                 " bytes. You might want to increase the maximum bin size");
+      }
+      unsigned int bin = minBin_;
+      size_t binBytes = minBinBytes_;
+      while (binBytes < bytes) {  // grow geometrically until the bin is large enough
+        ++bin;
+        binBytes *= binGrowth_;
+      }
+      return std::make_tuple(bin, binBytes);
+    }
+
+    bool tryReuseCachedBlock(BlockDescriptor& block) {  // returns true if block was filled from the cache
+      std::scoped_lock lock(mutex_);
+
+      // iterate through the range of cached blocks in the same bin
+      const auto [begin, end] = cachedBlocks_.equal_range(block.bin);
+      for (auto iBlock = begin; iBlock != end; ++iBlock) {
+        if ((reuseSameQueueAllocations_ and (*block.queue == *(iBlock->second.queue))) or
+            alpaka::isComplete(*(iBlock->second.event))) {
+          // take over the buffer and event of the cached block, keeping the queue and the requested size of the
+          // new allocation; bin and bytes already agree, because both blocks belong to the same bin
+          // TODO cache (or remove) the debug information and use std::move()
+          block.buffer = iBlock->second.buffer;
+          block.event = iBlock->second.event;
+
+          // if the new queue is on different device than the old event, create a new event
+          if (block.device() != alpaka::getDev(*(block.event))) {
+            block.event = Event{block.device()};
+          }
+
+          // insert the cached block into the live blocks
+          // TODO cache (or remove) the debug information and use std::move()
+          liveBlocks_[block.buffer->data()] = block;
+
+          // update the accounting information
+          cachedBytes_.free -= block.bytes;
+          cachedBytes_.live += block.bytes;
+          cachedBytes_.requested += block.requested;
+
+          if (debug_) {
+            std::ostringstream out;
+            out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " reused cached block at "
+                << block.buffer->data() << " (" << block.bytes << " bytes) for queue "
+                << block.queue->m_spQueueImpl.get() << ", event " << block.event->m_spEventImpl.get()
+                << " (previously associated with queue " << iBlock->second.queue->m_spQueueImpl.get() << " , event "
+                << iBlock->second.event->m_spEventImpl.get() << ")." << std::endl;
+            std::cout << out.str() << std::endl;
+          }
+
+          // remove the reused block from the list of cached blocks
+          cachedBlocks_.erase(iBlock);
+          return true;
+        }
+      }
+
+      return false;
+    }
+
+    Buffer allocateBuffer(size_t bytes, Queue const& queue) {  // queue is only read in the host-memory branch
+      if constexpr (std::is_same_v<Device, alpaka::Dev<Queue>>) {
+        // the memory device has the same type as the queue's device: allocate memory on device_
+        return alpaka::allocBuf<std::byte, size_t>(device_, bytes);
+      } else if constexpr (std::is_same_v<Device, alpaka::DevCpu>) {
+        // the memory device is the host CPU: allocate pinned host memory, mapped for the queue's device
+        return alpaka::allocMappedBuf<std::byte, size_t>(device_, alpaka::getDev(queue), bytes);
+      } else {
+        // unsupported combination: the condition below is false in this branch, so compilation fails here
+        static_assert(std::is_same_v<Device, alpaka::Dev<Queue>> or std::is_same_v<Device, alpaka::DevCpu>,
+                      "The \"memory device\" type can either be the same as the \"synchronisation device\" type, or be "
+                      "the host CPU.");
+      }
+    }
+
+    void allocateNewBlock(BlockDescriptor& block) {  // fills block.buffer and block.event, and registers block as live
+      try {
+        block.buffer = allocateBuffer(block.bytes, *block.queue);
+      } catch (std::runtime_error const&) {
+        // the allocation attempt failed: free all cached blocks on the device and retry
+        if (debug_) {
+          std::ostringstream out;
+          out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " failed to allocate " << block.bytes
+              << " bytes for queue " << block.queue->m_spQueueImpl.get()
+              << ", retrying after freeing cached allocations" << std::endl;
+          std::cout << out.str() << std::endl;
+        }
+        // TODO implement a method that frees only up to block.bytes bytes
+        freeAllCached();
+
+        // retry once; if this fails again the exception propagates to the caller
+        block.buffer = allocateBuffer(block.bytes, *block.queue);
+      }
+
+      // create a new event associated to the "synchronisation device"
+      block.event = Event{block.device()};
+
+      {
+        std::scoped_lock lock(mutex_);  // the allocation above is done without holding the lock
+        cachedBytes_.live += block.bytes;
+        cachedBytes_.requested += block.requested;
+        // TODO use std::move() ?
+        liveBlocks_[block.buffer->data()] = block;
+      }
+
+      if (debug_) {
+        std::ostringstream out;
+        out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " allocated new block at "
+            << block.buffer->data() << " (" << block.bytes << " bytes) associated with queue "
+            << block.queue->m_spQueueImpl.get() << ", event " << block.event->m_spEventImpl.get() << "." << std::endl;
+        std::cout << out.str() << std::endl;
+      }
+    }
+
+    void freeAllCached() {  // drop every cached block, keeping the "free" byte counter up to date
+      std::scoped_lock guard(mutex_);
+
+      // erase the blocks one at a time, always from the front, reporting each step in debug mode
+      for (auto cached = cachedBlocks_.begin(); cached != cachedBlocks_.end(); cached = cachedBlocks_.begin()) {
+        auto const freedBytes = cached->second.bytes;
+        cachedBytes_.free -= freedBytes;
+
+        if (debug_) {
+          std::ostringstream out;
+          out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " freed " << freedBytes
+              << " bytes.\n\t\t  " << (cachedBlocks_.size() - 1) << " available blocks cached (" << cachedBytes_.free
+              << " bytes), " << liveBlocks_.size() << " live blocks (" << cachedBytes_.live << " bytes) outstanding."
+              << std::endl;
+          std::cout << out.str() << std::endl;
+        }
+        cachedBlocks_.erase(cached);
+      }
+    }
+
+    // TODO replace with a tbb::concurrent_multimap ?
+    using CachedBlocks = std::multimap<unsigned int, BlockDescriptor>;  // ordered by the allocation bin
+    // TODO replace with a tbb::concurrent_map ?
+    using BusyBlocks = std::map<void*, BlockDescriptor>;  // ordered by the address of the allocated memory
+
+    inline static const std::string deviceType_ = alpaka::core::demangled<Device>;
+
+    mutable std::mutex mutex_;
+    Device device_;  // the device where the memory is allocated
+
+    CachedBytes cachedBytes_;
+    CachedBlocks cachedBlocks_;  // Set of cached device allocations available for reuse
+    BusyBlocks liveBlocks_;      // map of pointers to the live device allocations currently in use
+
+    const unsigned int binGrowth_;  // Geometric growth factor for bin-sizes
+    const unsigned int minBin_;
+    const unsigned int maxBin_;
+
+    const size_t minBinBytes_;
+    const size_t maxBinBytes_;
+    const size_t maxCachedBytes_;  // Maximum aggregate cached bytes per device
+
+    const bool reuseSameQueueAllocations_;
+    const bool debug_;
+  };
+
+}  // namespace cms::alpakatools
+
+#endif  // HeterogeneousCore_AlpakaInterface_interface_CachingAllocator_h
diff --git a/code/alpaka_interface/HostOnlyTask.h b/code/alpaka_interface/HostOnlyTask.h
new file mode 100644
index 00000000..fc07921e
--- /dev/null
+++ b/code/alpaka_interface/HostOnlyTask.h
@@ -0,0 +1,71 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_HostOnlyTask_h
+#define HeterogeneousCore_AlpakaInterface_interface_HostOnlyTask_h
+
+#include <functional>
+#include <memory>
+
+#include <alpaka/alpaka.hpp>
+
+namespace alpaka {
+
+  //! A task that is guaranteed not to call any GPU-related APIs
+  //!
+  //! These tasks can be enqueued directly to the native GPU queues, without the use of a
+  //! dedicated host-side worker thread.
+  class HostOnlyTask {
+  public:
+    HostOnlyTask(std::function<void()> task) : task_(std::move(task)) {}  // takes ownership of the callable
+
+    void operator()() const { task_(); }  // run the wrapped callable
+
+  private:
+    std::function<void()> task_;
+  };
+
+  namespace trait {
+
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+    //! The CUDA async queue enqueue trait specialization for "safe tasks"
+    template <>
+    struct Enqueue<QueueCudaRtNonBlocking, HostOnlyTask> {
+      using TApi = ApiCudaRt;
+
+      static void CUDART_CB callback(cudaStream_t /*queue*/, cudaError_t /*status*/, void* arg) {
+        //ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status);  // NOTE(review): the stream status is ignored — confirm intended
+        std::unique_ptr<HostOnlyTask> pTask(static_cast<HostOnlyTask*>(arg));  // re-own the task released by enqueue()
+        (*pTask)();  // run the task; it is deleted when pTask goes out of scope
+      }
+
+      ALPAKA_FN_HOST static auto enqueue(QueueCudaRtNonBlocking& queue, HostOnlyTask task) -> void {
+        auto pTask = std::make_unique<HostOnlyTask>(std::move(task));  // heap copy; ownership passes to callback()
+        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+            cudaStreamAddCallback(alpaka::getNativeHandle(queue), callback, static_cast<void*>(pTask.release()), 0u));
+      }  // NOTE(review): if registering the callback fails, the released task is presumably never deleted
+    };
+#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED
+
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+    //! The HIP async queue enqueue trait specialization for "safe tasks"
+    template <>
+    struct Enqueue<QueueHipRtNonBlocking, HostOnlyTask> {
+      using TApi = ApiHipRt;
+
+      static void callback(hipStream_t /*queue*/, hipError_t /*status*/, void* arg) {
+        //ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status);  // NOTE(review): the stream status is ignored — confirm intended
+        std::unique_ptr<HostOnlyTask> pTask(static_cast<HostOnlyTask*>(arg));  // re-own the task released by enqueue()
+        (*pTask)();  // run the task; it is deleted when pTask goes out of scope
+      }
+
+      ALPAKA_FN_HOST static auto enqueue(QueueHipRtNonBlocking& queue, HostOnlyTask task) -> void {
+        auto pTask = std::make_unique<HostOnlyTask>(std::move(task));  // heap copy; ownership passes to callback()
+        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+            hipStreamAddCallback(alpaka::getNativeHandle(queue), callback, static_cast<void*>(pTask.release()), 0u));
+      }  // NOTE(review): if registering the callback fails, the released task is presumably never deleted
+    };
+#endif  // ALPAKA_ACC_GPU_HIP_ENABLED
+
+  }  // namespace trait
+
+}  // namespace alpaka
+
+#endif  // HeterogeneousCore_AlpakaInterface_interface_HostOnlyTask_h
diff --git a/code/alpaka_interface/ScopedContextFwd.h b/code/alpaka_interface/ScopedContextFwd.h
new file mode 100644
index 00000000..206824aa
--- /dev/null
+++ b/code/alpaka_interface/ScopedContextFwd.h
@@ -0,0 +1,35 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_ScopedContextFwd_h
+#define HeterogeneousCore_AlpakaInterface_interface_ScopedContextFwd_h
+
+#include "traits.h"
+
+// Forward declaration of the alpaka framework Context classes
+//
+// This file is under HeterogeneousCore/AlpakaInterface to avoid introducing a dependency on
+// HeterogeneousCore/AlpakaCore.
+
+namespace cms::alpakatools {
+
+  namespace impl {
+    template <typename TQueue, typename = std::enable_if_t<cms::alpakatools::is_queue_v<TQueue>>>
+    class ScopedContextBase;
+
+    template <typename TQueue, typename = std::enable_if_t<cms::alpakatools::is_queue_v<TQueue>>>
+    class ScopedContextGetterBase;
+  }  // namespace impl
+
+  template <typename TQueue, typename = std::enable_if_t<cms::alpakatools::is_queue_v<TQueue>>>
+  class ScopedContextAcquire;
+
+  template <typename TQueue, typename = std::enable_if_t<cms::alpakatools::is_queue_v<TQueue>>>
+  class ScopedContextProduce;
+
+  template <typename TQueue, typename = std::enable_if_t<cms::alpakatools::is_queue_v<TQueue>>>
+  class ScopedContextTask;
+
+  template <typename TQueue, typename = std::enable_if_t<cms::alpakatools::is_queue_v<TQueue>>>
+  class ScopedContextAnalyze;
+
+}  // namespace cms::alpakatools
+
+#endif  // HeterogeneousCore_AlpakaInterface_interface_ScopedContextFwd_h
diff --git a/code/alpaka_interface/config.h b/code/alpaka_interface/config.h
new file mode 100644
index 00000000..354a93b9
--- /dev/null
+++ b/code/alpaka_interface/config.h
@@ -0,0 +1,164 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_config_h
+#define HeterogeneousCore_AlpakaInterface_interface_config_h
+
+#include <type_traits>
+
+#include <alpaka/alpaka.hpp>
+
+#include "stringize.h"
+
+namespace alpaka_common {
+
+  // common types and dimensions
+  using Idx = uint32_t;
+  using Extent = uint32_t;
+  using Offsets = Extent;
+
+  using Dim0D = alpaka::DimInt<0u>;
+  using Dim1D = alpaka::DimInt<1u>;
+  using Dim2D = alpaka::DimInt<2u>;
+  using Dim3D = alpaka::DimInt<3u>;
+
+  template <typename TDim>
+  using Vec = alpaka::Vec<TDim, Idx>;
+  using Vec1D = Vec<Dim1D>;
+  using Vec2D = Vec<Dim2D>;
+  using Vec3D = Vec<Dim3D>;
+  using Scalar = Vec<Dim0D>;
+
+  template <typename TDim>
+  using WorkDiv = alpaka::WorkDivMembers<TDim, Idx>;
+  using WorkDiv1D = WorkDiv<Dim1D>;
+  using WorkDiv2D = WorkDiv<Dim2D>;
+  using WorkDiv3D = WorkDiv<Dim3D>;
+
+  // host types
+  using DevHost = alpaka::DevCpu;
+  using PltfHost = alpaka::Pltf<DevHost>;
+
+}  // namespace alpaka_common
+
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+namespace alpaka_cuda_async {
+  using namespace alpaka_common;
+
+  using Platform = alpaka::PltfCudaRt;
+  using Device = alpaka::DevCudaRt;
+  using Queue = alpaka::QueueCudaRtNonBlocking;
+  using Event = alpaka::EventCudaRt;
+
+  template <typename TDim>
+  using Acc = alpaka::AccGpuCudaRt<TDim, Idx>;
+  using Acc1D = Acc<Dim1D>;
+  using Acc2D = Acc<Dim2D>;
+  using Acc3D = Acc<Dim3D>;
+
+}  // namespace alpaka_cuda_async
+
+#ifdef ALPAKA_ACCELERATOR_NAMESPACE
+#define ALPAKA_DUPLICATE_NAMESPACE
+#else
+#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_cuda_async
+#define ALPAKA_TYPE_SUFFIX CudaAsync
+#endif
+
+#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED
+
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+namespace alpaka_hip_async {
+  using namespace alpaka_common;
+
+  using Platform = alpaka::PltfHipRt;
+  using Device = alpaka::DevHipRt;
+  using Queue = alpaka::QueueHipRtNonBlocking;
+  using Event = alpaka::EventHipRt;
+
+  template <typename TDim>
+  using Acc = alpaka::AccGpuHipRt<TDim, Idx>;
+  using Acc1D = Acc<Dim1D>;
+  using Acc2D = Acc<Dim2D>;
+  using Acc3D = Acc<Dim3D>;
+
+}  // namespace alpaka_hip_async
+
+#ifdef ALPAKA_ACCELERATOR_NAMESPACE
+#define ALPAKA_DUPLICATE_NAMESPACE
+#else
+#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_hip_async
+#define ALPAKA_TYPE_SUFFIX HipAsync
+#endif
+
+#endif  // ALPAKA_ACC_GPU_HIP_ENABLED
+
+#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
+namespace alpaka_serial_sync {
+  using namespace alpaka_common;
+
+  using Platform = alpaka::PltfCpu;
+  using Device = alpaka::DevCpu;
+  using Queue = alpaka::QueueCpuBlocking;
+  using Event = alpaka::EventCpu;
+
+  template <typename TDim>
+  using Acc = alpaka::AccCpuSerial<TDim, Idx>;
+  using Acc1D = Acc<Dim1D>;
+  using Acc2D = Acc<Dim2D>;
+  using Acc3D = Acc<Dim3D>;
+
+}  // namespace alpaka_serial_sync
+
+#ifdef ALPAKA_ACCELERATOR_NAMESPACE
+#define ALPAKA_DUPLICATE_NAMESPACE
+#else
+#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_serial_sync
+#define ALPAKA_TYPE_SUFFIX SerialSync
+#endif
+
+#endif  // ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
+
+#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
+namespace alpaka_tbb_async {
+  using namespace alpaka_common;
+
+  using Platform = alpaka::PltfCpu;
+  using Device = alpaka::DevCpu;
+  using Queue = alpaka::QueueCpuNonBlocking;
+  using Event = alpaka::EventCpu;
+
+  template <typename TDim>
+  using Acc = alpaka::AccCpuTbbBlocks<TDim, Idx>;
+  using Acc1D = Acc<Dim1D>;
+  using Acc2D = Acc<Dim2D>;
+  using Acc3D = Acc<Dim3D>;
+
+}  // namespace alpaka_tbb_async
+
+#ifdef ALPAKA_ACCELERATOR_NAMESPACE
+#define ALPAKA_DUPLICATE_NAMESPACE
+#else
+#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_tbb_async
+#define ALPAKA_TYPE_SUFFIX TbbAsync
+#endif
+
+#endif  // ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
+
+#if defined ALPAKA_DUPLICATE_NAMESPACE
+#error Only one alpaka backend symbol can be defined at the same time: ALPAKA_ACC_GPU_CUDA_ENABLED, ALPAKA_ACC_GPU_HIP_ENABLED, ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED.
+#endif
+
+#if defined ALPAKA_ACCELERATOR_NAMESPACE
+
+// create a new backend-specific identifier based on the original type name and a backend-specific suffix
+#define ALPAKA_TYPE_ALIAS__(TYPE, SUFFIX) TYPE##SUFFIX
+#define ALPAKA_TYPE_ALIAS_(TYPE, SUFFIX) ALPAKA_TYPE_ALIAS__(TYPE, SUFFIX)
+#define ALPAKA_TYPE_ALIAS(TYPE) ALPAKA_TYPE_ALIAS_(TYPE, ALPAKA_TYPE_SUFFIX)
+
+// declare the backend-specific identifier as an alias for the namespace-based type name
+#define DECLARE_ALPAKA_TYPE_ALIAS(TYPE) using ALPAKA_TYPE_ALIAS(TYPE) = ALPAKA_ACCELERATOR_NAMESPACE::TYPE
+
+// define a null-terminated string containing the backend-specific identifier
+#define ALPAKA_TYPE_ALIAS_NAME(TYPE) EDM_STRINGIZE(ALPAKA_TYPE_ALIAS(TYPE))
+
+#endif  // ALPAKA_ACCELERATOR_NAMESPACE
+
+#endif  // HeterogeneousCore_AlpakaInterface_interface_config_h
diff --git a/code/alpaka_interface/devices.h b/code/alpaka_interface/devices.h
new file mode 100644
index 00000000..24630ece
--- /dev/null
+++ b/code/alpaka_interface/devices.h
@@ -0,0 +1,43 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_devices_h
+#define HeterogeneousCore_AlpakaInterface_interface_devices_h
+
+#include <cassert>
+#include <vector>
+
+#include <alpaka/alpaka.hpp>
+
+#include "config.h"
+#include "traits.h"
+
+namespace cms::alpakatools {
+
+  namespace detail {
+
+    template <typename TPlatform, typename = std::enable_if_t<is_platform_v<TPlatform>>>
+    inline std::vector<alpaka::Dev<TPlatform>> enumerate_devices() {
+      // collect every device of the platform, in index order
+      uint32_t const count = alpaka::getDevCount<TPlatform>();
+      std::vector<alpaka::Dev<TPlatform>> found;
+      found.reserve(count);
+
+      for (uint32_t index = 0; index != count; ++index) {
+        found.push_back(alpaka::getDevByIdx<TPlatform>(index));
+        // the native handle of each device is expected to match its position in the vector
+        assert(alpaka::getNativeHandle(found.back()) == static_cast<int>(index));
+      }
+
+      return found;
+    }
+
+  }  // namespace detail
+
+  // the alpaka accelerator devices of the given platform, enumerated once on first use and then shared
+  template <typename TPlatform, typename = std::enable_if_t<is_platform_v<TPlatform>>>
+  inline std::vector<alpaka::Dev<TPlatform>> const& devices() {
+    static std::vector<alpaka::Dev<TPlatform>> const enumerated = detail::enumerate_devices<TPlatform>();
+    return enumerated;
+  }
+
+}  // namespace cms::alpakatools
+
+#endif  // HeterogeneousCore_AlpakaInterface_interface_devices_h
diff --git a/code/alpaka_interface/getDeviceCachingAllocator.h b/code/alpaka_interface/getDeviceCachingAllocator.h
new file mode 100644
index 00000000..94e0e7cc
--- /dev/null
+++ b/code/alpaka_interface/getDeviceCachingAllocator.h
@@ -0,0 +1,88 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_getDeviceCachingAllocator_h
+#define HeterogeneousCore_AlpakaInterface_interface_getDeviceCachingAllocator_h
+
+#include <memory>
+
+#include "thread_safety_macros.h"
+#include "AllocatorConfig.h"
+#include "CachingAllocator.h"
+#include "devices.h"
+#include "traits.h"
+
+namespace cms::alpakatools {
+
+  namespace detail {
+
+    template <typename TDev,
+              typename TQueue,
+              typename = std::enable_if_t<cms::alpakatools::is_device_v<TDev> and cms::alpakatools::is_queue_v<TQueue>>>
+    auto allocate_device_allocators() {  // builds one CachingAllocator per device of TDev's platform
+      using Allocator = CachingAllocator<TDev, TQueue>;
+      auto const& devices = cms::alpakatools::devices<alpaka::Pltf<TDev>>();
+      ssize_t const size = devices.size();
+
+      // allocate the raw storage for the objects, without constructing them
+      auto ptr = std::allocator<Allocator>().allocate(size);
+
+      // construct the objects in the storage, one per device and in device order
+      ptrdiff_t index = 0;
+      try {
+        for (; index < size; ++index) {
+#if __cplusplus >= 202002L
+          std::construct_at(
+#else
+          std::allocator<Allocator>().construct(
+#endif
+              ptr + index,
+              devices[index],
+              config::binGrowth,
+              config::minBin,
+              config::maxBin,
+              config::maxCachedBytes,
+              config::maxCachedFraction,
+              true,    // reuseSameQueueAllocations
+              false);  // debug
+        }
+      } catch (...) {
+        --index;  // index is the element whose construction threw; step back to the last complete one
+        // destroy any object that had been successfully constructed, in reverse order
+        while (index >= 0) {
+          std::destroy_at(ptr + index);
+          --index;
+        }
+        // deallocate the storage
+        std::allocator<Allocator>().deallocate(ptr, size);
+        // rethrow the exception
+        throw;
+      }
+
+      // use a custom deleter to destroy all objects (in reverse order) and deallocate the memory
+      auto deleter = [size](Allocator* ptr) {
+        for (size_t i = size; i > 0; --i) {
+          std::destroy_at(ptr + i - 1);
+        }
+        std::allocator<Allocator>().deallocate(ptr, size);
+      };
+
+      return std::unique_ptr<Allocator[], decltype(deleter)>(ptr, deleter);
+    }
+
+  }  // namespace detail
+
+  template <typename TDev,
+            typename TQueue,
+            typename = std::enable_if_t<cms::alpakatools::is_device_v<TDev> and cms::alpakatools::is_queue_v<TQueue>>>
+  inline CachingAllocator<TDev, TQueue>& getDeviceCachingAllocator(TDev const& device) {
+    // one allocator per device, all created together the first time this function is called
+    CMS_THREAD_SAFE static auto pool = detail::allocate_device_allocators<TDev, TQueue>();
+
+    size_t const slot = alpaka::getNativeHandle(device);
+    assert(slot < cms::alpakatools::devices<alpaka::Pltf<TDev>>().size());
+
+    // the public interface of the allocator is thread safe, so a plain reference can be handed out
+    return pool[slot];
+  }
+
+}  // namespace cms::alpakatools
+
+#endif  // HeterogeneousCore_AlpakaInterface_interface_getDeviceCachingAllocator_h
diff --git a/code/alpaka_interface/getHostCachingAllocator.h b/code/alpaka_interface/getHostCachingAllocator.h
new file mode 100644
index 00000000..2ffa1871
--- /dev/null
+++ b/code/alpaka_interface/getHostCachingAllocator.h
@@ -0,0 +1,32 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_getHostCachingAllocator_h
+#define HeterogeneousCore_AlpakaInterface_interface_getHostCachingAllocator_h
+
+#include "thread_safety_macros.h"
+#include "AllocatorConfig.h"
+#include "CachingAllocator.h"
+#include "config.h"
+#include "host.h"
+#include "traits.h"
+
+namespace cms::alpakatools {
+
+  template <typename TQueue, typename = std::enable_if_t<cms::alpakatools::is_queue_v<TQueue>>>
+  inline CachingAllocator<alpaka_common::DevHost, TQueue>& getHostCachingAllocator() {
+    // thread safe initialisation of the host allocator: a function-local static, one instance per TQueue type
+    CMS_THREAD_SAFE static CachingAllocator<alpaka_common::DevHost, TQueue> allocator(
+        host(),
+        config::binGrowth,
+        config::minBin,
+        config::maxBin,
+        config::maxCachedBytes,
+        config::maxCachedFraction,
+        false,   // reuseSameQueueAllocations (the device allocators in getDeviceCachingAllocator.h pass true)
+        false);  // debug
+
+    // the public interface is thread safe
+    return allocator;
+  }
+
+}  // namespace cms::alpakatools
+
+#endif  // HeterogeneousCore_AlpakaInterface_interface_getHostCachingAllocator_h
diff --git a/code/alpaka_interface/host.h b/code/alpaka_interface/host.h
new file mode 100644
index 00000000..0303313d
--- /dev/null
+++ b/code/alpaka_interface/host.h
@@ -0,0 +1,29 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_host_h
+#define HeterogeneousCore_AlpakaInterface_interface_host_h
+
+namespace cms::alpakatools {
+
+  namespace detail {
+
+    inline alpaka::DevCpu enumerate_host() {
+      // the CPU platform is expected to expose exactly one device, with native handle 0
+      assert(alpaka::getDevCount<alpaka::PltfCpu>() == 1);
+
+      alpaka::DevCpu cpu = alpaka::getDevByIdx<alpaka::PltfCpu>(0);
+      assert(alpaka::getNativeHandle(cpu) == 0);
+
+      // (both checks above are compiled out when NDEBUG is defined)
+      return cpu;
+    }
+
+  }  // namespace detail
+
+  // the alpaka host device, looked up once on first use and then shared
+  static inline alpaka::DevCpu const& host() {
+    static alpaka::DevCpu const cpu = detail::enumerate_host();
+    return cpu;
+  }
+
+}  // namespace cms::alpakatools
+
+#endif  // HeterogeneousCore_AlpakaInterface_interface_host_h
diff --git a/code/alpaka_interface/memory.h b/code/alpaka_interface/memory.h
new file mode 100644
index 00000000..cbdc6fc0
--- /dev/null
+++ b/code/alpaka_interface/memory.h
@@ -0,0 +1,247 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_memory_h
+#define HeterogeneousCore_AlpakaInterface_interface_memory_h
+
+#include <type_traits>
+
+#include <alpaka/alpaka.hpp>
+
+#include "AllocatorPolicy.h"
+#include "CachedBufAlloc.h"
+#include "config.h"
+#include "traits.h"
+
+namespace cms::alpakatools {
+
+  // for Extent, Dim1D, Idx
+  using namespace alpaka_common;
+
+  // type deduction helpers: map an element type T, T[] or T[N] to the matching alpaka buffer or view type
+  namespace detail {
+
+    template <typename TDev, typename T, typename = std::enable_if_t<cms::alpakatools::is_device_v<TDev>>>
+    struct buffer_type {  // primary template: T maps to a 0-dimensional (scalar) buffer
+      using type = alpaka::Buf<TDev, T, Dim0D, Idx>;
+    };
+
+    template <typename TDev, typename T>
+    struct buffer_type<TDev, T[]> {  // array of unknown bound: 1-dimensional buffer of T
+      using type = alpaka::Buf<TDev, T, Dim1D, Idx>;
+    };
+
+    template <typename TDev, typename T, int N>
+    struct buffer_type<TDev, T[N]> {  // array of known bound: same 1-dimensional type, N is not part of it
+      using type = alpaka::Buf<TDev, T, Dim1D, Idx>;
+    };
+
+    template <typename TDev, typename T, typename = std::enable_if_t<cms::alpakatools::is_device_v<TDev>>>
+    struct view_type {  // primary template: T maps to a 0-dimensional (scalar) view
+      using type = alpaka::ViewPlainPtr<TDev, T, Dim0D, Idx>;
+    };
+
+    template <typename TDev, typename T>
+    struct view_type<TDev, T[]> {  // array of unknown bound: 1-dimensional view of T
+      using type = alpaka::ViewPlainPtr<TDev, T, Dim1D, Idx>;
+    };
+
+    template <typename TDev, typename T, int N>
+    struct view_type<TDev, T[N]> {  // array of known bound: same 1-dimensional type, N is not part of it
+      using type = alpaka::ViewPlainPtr<TDev, T, Dim1D, Idx>;
+    };
+
+  }  // namespace detail
+
+  // scalar and 1-dimensional host buffers
+
+  template <typename T>
+  using host_buffer = typename detail::buffer_type<DevHost, T>::type;
+
+  template <typename T>
+  using const_host_buffer = alpaka::ViewConst<host_buffer<T>>;
+
+  // non-cached, non-pinned, scalar and 1-dimensional host buffers
+
+  template <typename T>
+  std::enable_if_t<not std::is_array_v<T>, host_buffer<T>> make_host_buffer() {  // scalar overload: T is not an array
+    return alpaka::allocBuf<T, Idx>(host(), Scalar{});
+  }
+
+  template <typename T>
+  std::enable_if_t<cms::is_unbounded_array_v<T> and not std::is_array_v<std::remove_extent_t<T>>, host_buffer<T>>
+  make_host_buffer(Extent extent) {  // T = U[] with non-array U: 1-dimensional buffer of `extent` elements of U
+    return alpaka::allocBuf<std::remove_extent_t<T>, Idx>(host(), Vec1D{extent});
+  }
+
+  template <typename T>
+  std::enable_if_t<cms::is_bounded_array_v<T> and not std::is_array_v<std::remove_extent_t<T>>, host_buffer<T>>
+  make_host_buffer() {  // T = U[N] with non-array U: 1-dimensional buffer of N elements of U
+    return alpaka::allocBuf<std::remove_extent_t<T>, Idx>(host(), Vec1D{std::extent_v<T>});
+  }
+
+  // potentially cached, pinned, scalar and 1-dimensional host buffers, associated to a work queue
+  // the memory is pinned according to the device associated to the queue
+
+  template <typename T, typename TQueue>
+  std::enable_if_t<is_queue_v<TQueue> and not std::is_array_v<T>, host_buffer<T>> make_host_buffer(TQueue const& queue) {
+    if constexpr (allocator_policy<alpaka::Dev<TQueue>> != AllocatorPolicy::Caching) {
+      return alpaka::allocMappedBuf<T, Idx>(host(), alpaka::getDev(queue), Scalar{});  // no caching: direct allocation
+    } else {
+      return allocCachedBuf<T, Idx>(host(), queue, Scalar{});  // scalar taken from the caching allocator
+    }
+  }
+
+  template <typename T, typename TQueue>
+  std::enable_if_t<is_queue_v<TQueue> and cms::is_unbounded_array_v<T> and not std::is_array_v<std::remove_extent_t<T>>,
+                   host_buffer<T>>
+  make_host_buffer(TQueue const& queue, Extent extent) {  // 1-dimensional buffer of `extent` elements
+    if constexpr (allocator_policy<alpaka::Dev<TQueue>> != AllocatorPolicy::Caching) {
+      return alpaka::allocMappedBuf<std::remove_extent_t<T>, Idx>(host(), alpaka::getDev(queue), Vec1D{extent});
+    } else {
+      return allocCachedBuf<std::remove_extent_t<T>, Idx>(host(), queue, Vec1D{extent});
+    }
+  }
+
+  template <typename T, typename TQueue>
+  std::enable_if_t<is_queue_v<TQueue> and cms::is_bounded_array_v<T> and not std::is_array_v<std::remove_extent_t<T>>,
+                   host_buffer<T>>
+  make_host_buffer(TQueue const& queue) {  // 1-dimensional buffer whose size is the array extent of T
+    if constexpr (allocator_policy<alpaka::Dev<TQueue>> != AllocatorPolicy::Caching) {
+      return alpaka::allocMappedBuf<std::remove_extent_t<T>, Idx>(
+          host(), alpaka::getDev(queue), Vec1D{std::extent_v<T>});
+    } else {
+      return allocCachedBuf<std::remove_extent_t<T>, Idx>(host(), queue, Vec1D{std::extent_v<T>});
+    }
+  }
+
+  // scalar and 1-dimensional host views
+
+  template <typename T>
+  using host_view = typename detail::view_type<DevHost, T>::type;
+
+  template <typename T>
+  std::enable_if_t<not std::is_array_v<T>, host_view<T>> make_host_view(T& data) {  // scalar view; does not own data
+    return alpaka::ViewPlainPtr<DevHost, T, Dim0D, Idx>(&data, host(), Scalar{});
+  }
+
+  template <typename T>
+  host_view<T[]> make_host_view(T* data, Extent extent) {  // 1-dimensional view over `extent` elements at data
+    return alpaka::ViewPlainPtr<DevHost, T, Dim1D, Idx>(data, host(), Vec1D{extent});
+  }
+
+  template <typename T>
+  std::enable_if_t<cms::is_unbounded_array_v<T> and not std::is_array_v<std::remove_extent_t<T>>, host_view<T>>
+  make_host_view(T& data, Extent extent) {  // T = U[]: 1D view over `data` with the run-time extent `extent`
+    return alpaka::ViewPlainPtr<DevHost, std::remove_extent_t<T>, Dim1D, Idx>(data, host(), Vec1D{extent});
+  }
+
+  template <typename T>
+  std::enable_if_t<cms::is_bounded_array_v<T> and not std::is_array_v<std::remove_extent_t<T>>, host_view<T>>
+  make_host_view(T& data) {  // T = U[N]: 1D view over `data` with extent std::extent_v<T>
+    return alpaka::ViewPlainPtr<DevHost, std::remove_extent_t<T>, Dim1D, Idx>(data, host(), Vec1D{std::extent_v<T>});
+  }
+
+  // scalar and 1-dimensional device buffers
+
+  template <typename TDev, typename T, typename = std::enable_if_t<cms::alpakatools::is_device_v<TDev>>>
+  using device_buffer = typename detail::buffer_type<TDev, T>::type;  // buffer type for T on TDev, per detail::buffer_type
+
+  template <typename TDev, typename T, typename = std::enable_if_t<cms::alpakatools::is_device_v<TDev>>>
+  using const_device_buffer = alpaka::ViewConst<device_buffer<TDev, T>>;  // alpaka::ViewConst over device_buffer<TDev, T>
+
+  // non-cached, scalar and 1-dimensional device buffers
+
+  template <typename T, typename TDev>
+  std::enable_if_t<is_device_v<TDev> and not std::is_array_v<T>, device_buffer<TDev, T>> make_device_buffer(
+      TDev const& device) {
+    return alpaka::allocBuf<T, Idx>(device, Scalar{});  // non-array T: scalar buffer straight from alpaka::allocBuf
+  }
+
+  template <typename T, typename TDev>
+  std::enable_if_t<is_device_v<TDev> and cms::is_unbounded_array_v<T> and not std::is_array_v<std::remove_extent_t<T>>,
+                   device_buffer<TDev, T>>
+  make_device_buffer(TDev const& device, Extent extent) {  // T = U[]: 1D buffer of U with the run-time extent `extent`
+    return alpaka::allocBuf<std::remove_extent_t<T>, Idx>(device, Vec1D{extent});
+  }
+
+  template <typename T, typename TDev>
+  std::enable_if_t<is_device_v<TDev> and cms::is_bounded_array_v<T> and not std::is_array_v<std::remove_extent_t<T>>,
+                   device_buffer<TDev, T>>
+  make_device_buffer(TDev const& device) {  // T = U[N]: 1D buffer of U with extent std::extent_v<T>
+    return alpaka::allocBuf<std::remove_extent_t<T>, Idx>(device, Vec1D{std::extent_v<T>});
+  }
+
+  // potentially-cached, scalar and 1-dimensional device buffers with queue-ordered semantic
+
+  template <typename T, typename TQueue>
+  std::enable_if_t<is_queue_v<TQueue> and not std::is_array_v<T>, device_buffer<alpaka::Dev<TQueue>, T>>
+  make_device_buffer(TQueue const& queue) {
+    // Scalar buffer, allocated per the device's allocator_policy; an unhandled policy is a compile-time error.
+    if constexpr (allocator_policy<alpaka::Dev<TQueue>> == AllocatorPolicy::Caching) {
+      return allocCachedBuf<T, Idx>(alpaka::getDev(queue), queue, Scalar{});
+    } else if constexpr (allocator_policy<alpaka::Dev<TQueue>> == AllocatorPolicy::Asynchronous) {
+      return alpaka::allocAsyncBuf<T, Idx>(queue, Scalar{});
+    } else {
+      static_assert(allocator_policy<alpaka::Dev<TQueue>> == AllocatorPolicy::Synchronous, "unhandled AllocatorPolicy");
+      return alpaka::allocBuf<T, Idx>(alpaka::getDev(queue), Scalar{});
+    }
+  }
+
+  template <typename T, typename TQueue>
+  std::enable_if_t<is_queue_v<TQueue> and cms::is_unbounded_array_v<T> and not std::is_array_v<std::remove_extent_t<T>>,
+                   device_buffer<alpaka::Dev<TQueue>, T>>
+  make_device_buffer(TQueue const& queue, Extent extent) {
+    // T = U[]: 1D buffer of extent `extent`, per allocator_policy; an unhandled policy is a compile-time error.
+    if constexpr (allocator_policy<alpaka::Dev<TQueue>> == AllocatorPolicy::Caching) {
+      return allocCachedBuf<std::remove_extent_t<T>, Idx>(alpaka::getDev(queue), queue, Vec1D{extent});
+    } else if constexpr (allocator_policy<alpaka::Dev<TQueue>> == AllocatorPolicy::Asynchronous) {
+      return alpaka::allocAsyncBuf<std::remove_extent_t<T>, Idx>(queue, Vec1D{extent});
+    } else {
+      static_assert(allocator_policy<alpaka::Dev<TQueue>> == AllocatorPolicy::Synchronous, "unhandled AllocatorPolicy");
+      return alpaka::allocBuf<std::remove_extent_t<T>, Idx>(alpaka::getDev(queue), Vec1D{extent});
+    }
+  }
+
+  template <typename T, typename TQueue>
+  std::enable_if_t<is_queue_v<TQueue> and cms::is_bounded_array_v<T> and not std::is_array_v<std::remove_extent_t<T>>,
+                   device_buffer<alpaka::Dev<TQueue>, T>>
+  make_device_buffer(TQueue const& queue) {
+    // T = U[N]: 1D buffer of extent std::extent_v<T>, per allocator_policy; an unhandled policy fails to compile.
+    if constexpr (allocator_policy<alpaka::Dev<TQueue>> == AllocatorPolicy::Caching) {
+      return allocCachedBuf<std::remove_extent_t<T>, Idx>(alpaka::getDev(queue), queue, Vec1D{std::extent_v<T>});
+    } else if constexpr (allocator_policy<alpaka::Dev<TQueue>> == AllocatorPolicy::Asynchronous) {
+      return alpaka::allocAsyncBuf<std::remove_extent_t<T>, Idx>(queue, Vec1D{std::extent_v<T>});
+    } else {
+      static_assert(allocator_policy<alpaka::Dev<TQueue>> == AllocatorPolicy::Synchronous, "unhandled AllocatorPolicy");
+      return alpaka::allocBuf<std::remove_extent_t<T>, Idx>(alpaka::getDev(queue), Vec1D{std::extent_v<T>});
+    }
+  }
+
+  // scalar and 1-dimensional device views
+
+  template <typename TDev, typename T, typename = std::enable_if_t<cms::alpakatools::is_device_v<TDev>>>
+  using device_view = typename detail::view_type<TDev, T>::type;  // view type for T on TDev, per detail::view_type
+
+  template <typename T, typename TDev>
+  std::enable_if_t<not std::is_array_v<T>, device_view<TDev, T>> make_device_view(TDev const& device, T& data) {
+    return alpaka::ViewPlainPtr<TDev, T, Dim0D, Idx>(&data, device, Scalar{});  // non-array T: Dim0D view at &data
+  }
+
+  template <typename T, typename TDev>
+  device_view<TDev, T[]> make_device_view(TDev const& device, T* data, Extent extent) {
+    return alpaka::ViewPlainPtr<TDev, T, Dim1D, Idx>(data, device, Vec1D{extent});  // raw pointer: 1D view at `data`
+  }
+
+  template <typename T, typename TDev>
+  std::enable_if_t<cms::is_unbounded_array_v<T> and not std::is_array_v<std::remove_extent_t<T>>, device_view<TDev, T>>
+  make_device_view(TDev const& device, T& data, Extent extent) {  // T = U[]: 1D view with run-time extent `extent`
+    return alpaka::ViewPlainPtr<TDev, std::remove_extent_t<T>, Dim1D, Idx>(data, device, Vec1D{extent});
+  }
+
+  template <typename T, typename TDev>
+  std::enable_if_t<cms::is_bounded_array_v<T> and not std::is_array_v<std::remove_extent_t<T>>, device_view<TDev, T>>
+  make_device_view(TDev const& device, T& data) {  // T = U[N]: 1D view with extent std::extent_v<T>
+    return alpaka::ViewPlainPtr<TDev, std::remove_extent_t<T>, Dim1D, Idx>(data, device, Vec1D{std::extent_v<T>});
+  }
+
+}  // namespace cms::alpakatools
+
+#endif  // HeterogeneousCore_AlpakaInterface_interface_memory_h
diff --git a/code/alpaka_interface/stringize.h b/code/alpaka_interface/stringize.h
new file mode 100644
index 00000000..549d5cbc
--- /dev/null
+++ b/code/alpaka_interface/stringize.h
@@ -0,0 +1,8 @@
+#ifndef FWCore_Utilities_interface_stringize_h
+#define FWCore_Utilities_interface_stringize_h
+
+// Convert the macro argument to a string literal: EDM_STRINGIZE macro-expands its argument first, EDM_STRINGIZE_ does not.
+#define EDM_STRINGIZE_(token) #token
+#define EDM_STRINGIZE(token) EDM_STRINGIZE_(token)
+
+#endif  // FWCore_Utilities_interface_stringize_h
diff --git a/code/alpaka_interface/thread_safety_macros.h b/code/alpaka_interface/thread_safety_macros.h
new file mode 100644
index 00000000..3abbe0b9
--- /dev/null
+++ b/code/alpaka_interface/thread_safety_macros.h
@@ -0,0 +1,12 @@
+#ifndef FWCore_Utilites_thread_safe_macros_h
+#define FWCore_Utilites_thread_safe_macros_h
+#if !defined __CLING__ && !defined __INTEL_COMPILER && !defined __NVCC__  // emit the cms:: attributes
+#define CMS_THREAD_SAFE [[cms::thread_safe]]
+#define CMS_SA_ALLOW [[cms::sa_allow]]
+#define CMS_THREAD_GUARD(_var_) [[cms::thread_guard(#_var_)]]
+#else  // cling, the Intel compiler or nvcc: the macros expand to nothing
+#define CMS_THREAD_SAFE
+#define CMS_SA_ALLOW
+#define CMS_THREAD_GUARD(_var_)
+#endif  // !__CLING__ && !__INTEL_COMPILER && !__NVCC__
+#endif
diff --git a/code/alpaka_interface/traits.h b/code/alpaka_interface/traits.h
new file mode 100644
index 00000000..8235a416
--- /dev/null
+++ b/code/alpaka_interface/traits.h
@@ -0,0 +1,69 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_traits_h
+#define HeterogeneousCore_AlpakaInterface_interface_traits_h
+
+#include <type_traits>
+
+#if __cplusplus >= 202002L  // C++20 and later: take both traits from the standard library
+namespace cms {
+  using std::is_bounded_array;
+  using std::is_unbounded_array;
+}  // namespace cms
+#else  // before C++20: take both traits from Boost instead
+#include <boost/type_traits/is_bounded_array.hpp>
+#include <boost/type_traits/is_unbounded_array.hpp>
+namespace cms {
+  using boost::is_bounded_array;
+  using boost::is_unbounded_array;
+}  // namespace cms
+#endif  // __cplusplus >= 202002L
+
+namespace cms {
+  template <typename T>
+  inline constexpr bool is_bounded_array_v = is_bounded_array<T>::value;  // shorthand for is_bounded_array<T>::value
+
+  template <typename T>
+  inline constexpr bool is_unbounded_array_v = is_unbounded_array<T>::value;  // shorthand for is_unbounded_array<T>::value
+}  // namespace cms
+
+#include <alpaka/alpaka.hpp>
+
+namespace cms::alpakatools {
+
+  // is_platform: compile-time check that T implements alpaka::ConceptPltf
+
+  template <typename T>
+  struct is_platform
+      : std::integral_constant<bool, alpaka::concepts::ImplementsConcept<alpaka::ConceptPltf, T>::value> {};
+
+  template <typename T>
+  inline constexpr bool is_platform_v = is_platform<T>::value;  // shorthand for is_platform<T>::value
+
+  // is_device: compile-time check that T implements alpaka::ConceptDev
+
+  template <typename T>
+  struct is_device : std::integral_constant<bool, alpaka::concepts::ImplementsConcept<alpaka::ConceptDev, T>::value> {};
+
+  template <typename T>
+  inline constexpr bool is_device_v = is_device<T>::value;  // shorthand for is_device<T>::value
+
+  // is_accelerator: compile-time check that T implements alpaka::ConceptAcc
+
+  template <typename T>
+  struct is_accelerator
+      : std::integral_constant<bool, alpaka::concepts::ImplementsConcept<alpaka::ConceptAcc, T>::value> {};
+
+  template <typename T>
+  inline constexpr bool is_accelerator_v = is_accelerator<T>::value;  // shorthand for is_accelerator<T>::value
+
+  // is_queue: compile-time check that T implements alpaka::ConceptQueue
+
+  template <typename T>
+  struct is_queue : std::integral_constant<bool, alpaka::concepts::ImplementsConcept<alpaka::ConceptQueue, T>::value> {
+  };
+
+  template <typename T>
+  inline constexpr bool is_queue_v = is_queue<T>::value;  // shorthand for is_queue<T>::value
+
+}  // namespace cms::alpakatools
+
+#endif  // HeterogeneousCore_AlpakaInterface_interface_traits_h

From 8105c8dd3f1f7dbdfa8bf9ec449d96c3668df84f Mon Sep 17 00:00:00 2001
From: Gavin Niendorf <gavinniendorf@gmail.com>
Date: Tue, 27 Jun 2023 19:12:10 -0700
Subject: [PATCH 34/44] bring back caching allocator toggle

---
 SDL/Constants.cuh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/SDL/Constants.cuh b/SDL/Constants.cuh
index 68981441..d3b698cd 100644
--- a/SDL/Constants.cuh
+++ b/SDL/Constants.cuh
@@ -106,7 +106,11 @@ using Buf = alpaka::Buf<TAcc, TData, Dim1d, Idx>;
 
 template<typename T, typename TAcc, typename TSize, typename TQueue>
 ALPAKA_FN_HOST ALPAKA_FN_INLINE Buf<TAcc, T> allocBufWrapper(TAcc const & devAccIn, TSize nElements, TQueue queue) {
+#ifdef CACHE_ALLOC
     return cms::alpakatools::allocCachedBuf<T, Idx>(devAccIn, queue, Vec1d(static_cast<Idx>(nElements)));
+#else
+    return alpaka::allocBuf<T, Idx>(devAccIn, Vec1d(static_cast<Idx>(nElements)));
+#endif
 }
 
 template<typename T, typename TAcc, typename TSize>

From ad2048bca755a528ba9fe36126a062f15a79eb2c Mon Sep 17 00:00:00 2001
From: Gavin Niendorf <gavinniendorf@gmail.com>
Date: Tue, 27 Jun 2023 20:17:57 -0700
Subject: [PATCH 35/44] full alpaka caching allocator

---
 SDL/Event.cu           | 104 ++++++++++++++++++++---------------------
 SDL/Hit.cuh            |  38 +++++++--------
 SDL/MiniDoublet.cuh    |  70 +++++++++++++--------------
 SDL/Module.cuh         |  50 ++++++++++----------
 SDL/PixelTriplet.cuh   |  82 ++++++++++++++++----------------
 SDL/Quintuplet.cuh     |  48 +++++++++----------
 SDL/Segment.cuh        |  68 +++++++++++++--------------
 SDL/TrackCandidate.cuh |  28 +++++------
 SDL/Triplet.cuh        |  56 +++++++++++-----------
 setup_cgpu.sh          |   4 +-
 10 files changed, 274 insertions(+), 274 deletions(-)

diff --git a/SDL/Event.cu b/SDL/Event.cu
index e20c9a01..36e09d15 100644
--- a/SDL/Event.cu
+++ b/SDL/Event.cu
@@ -172,7 +172,7 @@ void SDL::Event::addHitToEvent(std::vector<float> x, std::vector<float> y, std::
     const int nHits = x.size();
 
     // Needed for the memcpy to hitsInGPU below. Will be replaced with a View.
-    auto nHits_buf = allocBufWrapper<unsigned int>(devHost, 1);
+    auto nHits_buf = allocBufWrapper<unsigned int>(devHost, 1, queue);
     *alpaka::getPtrNative(nHits_buf) = nHits;
 
     // Initialize space on device/host for next event.
@@ -390,16 +390,16 @@ void SDL::Event::addPixelSegmentToEvent(std::vector<unsigned int> hitIndices0,st
 
 void SDL::Event::addMiniDoubletsToEventExplicit()
 {
-    auto nMDsCPU_buf = allocBufWrapper<int>(devHost, nLowerModules);
+    auto nMDsCPU_buf = allocBufWrapper<int>(devHost, nLowerModules, queue);
     alpaka::memcpy(queue, nMDsCPU_buf, miniDoubletsBuffers->nMDs_buf, nLowerModules);
 
-    auto module_subdets_buf = allocBufWrapper<short>(devHost, nLowerModules);
+    auto module_subdets_buf = allocBufWrapper<short>(devHost, nLowerModules, queue);
     alpaka::memcpy(queue, module_subdets_buf, modulesBuffers->subdets_buf, nLowerModules);
 
-    auto module_layers_buf = allocBufWrapper<short>(devHost, nLowerModules);
+    auto module_layers_buf = allocBufWrapper<short>(devHost, nLowerModules, queue);
     alpaka::memcpy(queue, module_layers_buf, modulesBuffers->layers_buf, nLowerModules);
 
-    auto module_hitRanges_buf = allocBufWrapper<int>(devHost, nLowerModules*2);
+    auto module_hitRanges_buf = allocBufWrapper<int>(devHost, nLowerModules*2, queue);
     alpaka::memcpy(queue, module_hitRanges_buf, hitsBuffers->hitRanges_buf, nLowerModules*2);
 
     alpaka::wait(queue);
@@ -427,13 +427,13 @@ void SDL::Event::addMiniDoubletsToEventExplicit()
 
 void SDL::Event::addSegmentsToEventExplicit()
 {
-    auto nSegmentsCPU_buf = allocBufWrapper<int>(devHost, nLowerModules);
+    auto nSegmentsCPU_buf = allocBufWrapper<int>(devHost, nLowerModules, queue);
     alpaka::memcpy(queue, nSegmentsCPU_buf, segmentsBuffers->nSegments_buf, nLowerModules);
 
-    auto module_subdets_buf = allocBufWrapper<short>(devHost, nLowerModules);
+    auto module_subdets_buf = allocBufWrapper<short>(devHost, nLowerModules, queue);
     alpaka::memcpy(queue, module_subdets_buf, modulesBuffers->subdets_buf, nLowerModules);
 
-    auto module_layers_buf = allocBufWrapper<short>(devHost, nLowerModules);
+    auto module_layers_buf = allocBufWrapper<short>(devHost, nLowerModules, queue);
     alpaka::memcpy(queue, module_layers_buf, modulesBuffers->layers_buf, nLowerModules);
 
     alpaka::wait(queue);
@@ -484,7 +484,7 @@ void SDL::Event::createMiniDoublets()
     alpaka::enqueue(queue, createMDArrayRangesGPUTask);
     alpaka::wait(queue);
 
-    auto nTotalMDs_buf = allocBufWrapper<unsigned int>(devHost, 1);
+    auto nTotalMDs_buf = allocBufWrapper<unsigned int>(devHost, 1, queue);
 
     alpaka::memcpy(queue, nTotalMDs_buf, rangesBuffers->device_nTotalMDs_buf, 1);
     alpaka::wait(queue);
@@ -602,7 +602,7 @@ void SDL::Event::createTriplets()
         alpaka::wait(queue);
 
         // TODO: Why are we pulling this back down only to put it back on the device in a new struct?
-        auto maxTriplets_buf = allocBufWrapper<unsigned int>(devHost, 1);
+        auto maxTriplets_buf = allocBufWrapper<unsigned int>(devHost, 1, queue);
 
         alpaka::memcpy(queue, maxTriplets_buf, rangesBuffers->device_nTotalTrips_buf, 1);
         alpaka::wait(queue);
@@ -619,21 +619,21 @@ void SDL::Event::createTriplets()
     unsigned int max_InnerSeg = 0;
 
     // Allocate host index
-    auto index_buf = allocBufWrapper<uint16_t>(devHost, nLowerModules);
+    auto index_buf = allocBufWrapper<uint16_t>(devHost, nLowerModules, queue);
     uint16_t *index = alpaka::getPtrNative(index_buf);
 
     // Allocate device index
     auto index_gpu_buf = allocBufWrapper<uint16_t>(devAcc, nLowerModules, queue);
 
     // Allocate and copy nSegments from device to host
-    auto nSegments_buf = allocBufWrapper<int>(devHost, nLowerModules);
+    auto nSegments_buf = allocBufWrapper<int>(devHost, nLowerModules, queue);
     alpaka::memcpy(queue, nSegments_buf, segmentsBuffers->nSegments_buf, nLowerModules);
     alpaka::wait(queue);
 
     int *nSegments = alpaka::getPtrNative(nSegments_buf);
 
     // Allocate and copy module_nConnectedModules from device to host
-    auto module_nConnectedModules_buf = allocBufWrapper<uint16_t>(devHost, nLowerModules);
+    auto module_nConnectedModules_buf = allocBufWrapper<uint16_t>(devHost, nLowerModules, queue);
     alpaka::memcpy(queue, module_nConnectedModules_buf, modulesBuffers->nConnectedModules_buf, nLowerModules);
     alpaka::wait(queue);
 
@@ -704,7 +704,7 @@ void SDL::Event::createTrackCandidates()
     }
 
     // Pull nEligibleT5Modules from the device.
-    auto nEligibleModules_buf = allocBufWrapper<uint16_t>(devHost, 1);
+    auto nEligibleModules_buf = allocBufWrapper<uint16_t>(devHost, 1, queue);
     alpaka::memcpy(queue, nEligibleModules_buf, rangesBuffers->nEligibleT5Modules_buf, 1);
     uint16_t nEligibleModules = *alpaka::getPtrNative(nEligibleModules_buf);
 
@@ -850,15 +850,15 @@ void SDL::Event::createPixelTriplets()
     alpaka::memcpy(queue, nInnerSegments_src_view, dev_view_nSegments);
     alpaka::wait(queue);
 
-    auto superbins_buf = allocBufWrapper<int>(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
-    auto pixelTypes_buf = allocBufWrapper<int8_t>(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
+    auto superbins_buf = allocBufWrapper<int>(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE, queue);
+    auto pixelTypes_buf = allocBufWrapper<int8_t>(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE, queue);
 
     alpaka::memcpy(queue, superbins_buf, segmentsBuffers->superbin_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
     alpaka::memcpy(queue, pixelTypes_buf, segmentsBuffers->pixelType_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
     alpaka::wait(queue);
 
-    auto connectedPixelSize_host_buf = allocBufWrapper<unsigned int>(devHost, nInnerSegments);
-    auto connectedPixelIndex_host_buf = allocBufWrapper<unsigned int>(devHost, nInnerSegments);
+    auto connectedPixelSize_host_buf = allocBufWrapper<unsigned int>(devHost, nInnerSegments, queue);
+    auto connectedPixelIndex_host_buf = allocBufWrapper<unsigned int>(devHost, nInnerSegments, queue);
     auto connectedPixelSize_dev_buf = allocBufWrapper<unsigned int>(devAcc, nInnerSegments, queue);
     auto connectedPixelIndex_dev_buf = allocBufWrapper<unsigned int>(devAcc, nInnerSegments, queue);
 
@@ -931,7 +931,7 @@ void SDL::Event::createPixelTriplets()
     alpaka::wait(queue);
 
 #ifdef Warnings
-    auto nPixelTriplets_buf = allocBufWrapper<int>(devHost, 1);
+    auto nPixelTriplets_buf = allocBufWrapper<int>(devHost, 1, queue);
 
     alpaka::memcpy(queue, nPixelTriplets_buf, pixelTripletsBuffers->nPixelTriplets_buf, 1);
     alpaka::wait(queue);
@@ -973,8 +973,8 @@ void SDL::Event::createQuintuplets()
     alpaka::enqueue(queue, createEligibleModulesListForQuintupletsGPUTask);
     alpaka::wait(queue);
 
-    auto nEligibleT5Modules_buf = allocBufWrapper<uint16_t>(devHost, 1);
-    auto nTotalQuintuplets_buf = allocBufWrapper<unsigned int>(devHost, 1);
+    auto nEligibleT5Modules_buf = allocBufWrapper<uint16_t>(devHost, 1, queue);
+    auto nTotalQuintuplets_buf = allocBufWrapper<unsigned int>(devHost, 1, queue);
 
     alpaka::memcpy(queue, nEligibleT5Modules_buf, rangesBuffers->nEligibleT5Modules_buf, 1);
     alpaka::memcpy(queue, nTotalQuintuplets_buf, rangesBuffers->device_nTotalQuints_buf, 1);
@@ -1088,15 +1088,15 @@ void SDL::Event::createPixelQuintuplets()
     alpaka::memcpy(queue, nInnerSegments_src_view, dev_view_nSegments);
     alpaka::wait(queue);
 
-    auto superbins_buf = allocBufWrapper<int>(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
-    auto pixelTypes_buf = allocBufWrapper<int8_t>(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
+    auto superbins_buf = allocBufWrapper<int>(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE, queue);
+    auto pixelTypes_buf = allocBufWrapper<int8_t>(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE, queue);
 
     alpaka::memcpy(queue, superbins_buf, segmentsBuffers->superbin_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
     alpaka::memcpy(queue, pixelTypes_buf, segmentsBuffers->pixelType_buf, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
     alpaka::wait(queue);
 
-    auto connectedPixelSize_host_buf = allocBufWrapper<unsigned int>(devHost, nInnerSegments);
-    auto connectedPixelIndex_host_buf = allocBufWrapper<unsigned int>(devHost, nInnerSegments);
+    auto connectedPixelSize_host_buf = allocBufWrapper<unsigned int>(devHost, nInnerSegments, queue);
+    auto connectedPixelIndex_host_buf = allocBufWrapper<unsigned int>(devHost, nInnerSegments, queue);
     auto connectedPixelSize_dev_buf = allocBufWrapper<unsigned int>(devAcc, nInnerSegments, queue);
     auto connectedPixelIndex_dev_buf = allocBufWrapper<unsigned int>(devAcc, nInnerSegments, queue);
 
@@ -1197,7 +1197,7 @@ void SDL::Event::createPixelQuintuplets()
     alpaka::wait(queue);
 
 #ifdef Warnings
-    auto nPixelQuintuplets_buf = allocBufWrapper<int>(devHost, 1);
+    auto nPixelQuintuplets_buf = allocBufWrapper<int>(devHost, 1, queue);
 
     alpaka::memcpy(queue, nPixelQuintuplets_buf, pixelQuintupletsBuffers->nPixelQuintuplets_buf, 1);
     alpaka::wait(queue);
@@ -1208,16 +1208,16 @@ void SDL::Event::createPixelQuintuplets()
 
 void SDL::Event::addQuintupletsToEventExplicit()
 {
-    auto nQuintupletsCPU_buf = allocBufWrapper<int>(devHost, nLowerModules);
+    auto nQuintupletsCPU_buf = allocBufWrapper<int>(devHost, nLowerModules, queue);
     alpaka::memcpy(queue, nQuintupletsCPU_buf, quintupletsBuffers->nQuintuplets_buf, nLowerModules);
 
-    auto module_subdets_buf = allocBufWrapper<short>(devHost, nModules);
+    auto module_subdets_buf = allocBufWrapper<short>(devHost, nModules, queue);
     alpaka::memcpy(queue, module_subdets_buf, modulesBuffers->subdets_buf, nModules);
 
-    auto module_layers_buf = allocBufWrapper<short>(devHost, nLowerModules);
+    auto module_layers_buf = allocBufWrapper<short>(devHost, nLowerModules, queue);
     alpaka::memcpy(queue, module_layers_buf, modulesBuffers->layers_buf, nLowerModules);
 
-    auto module_quintupletModuleIndices_buf = allocBufWrapper<int>(devHost, nLowerModules);
+    auto module_quintupletModuleIndices_buf = allocBufWrapper<int>(devHost, nLowerModules, queue);
     alpaka::memcpy(queue, module_quintupletModuleIndices_buf, rangesBuffers->quintupletModuleIndices_buf, nLowerModules);
 
     alpaka::wait(queue);
@@ -1245,13 +1245,13 @@ void SDL::Event::addQuintupletsToEventExplicit()
 
 void SDL::Event::addTripletsToEventExplicit()
 {
-    auto nTripletsCPU_buf = allocBufWrapper<int>(devHost, nLowerModules);
+    auto nTripletsCPU_buf = allocBufWrapper<int>(devHost, nLowerModules, queue);
     alpaka::memcpy(queue, nTripletsCPU_buf, tripletsBuffers->nTriplets_buf, nLowerModules);
 
-    auto module_subdets_buf = allocBufWrapper<short>(devHost, nLowerModules);
+    auto module_subdets_buf = allocBufWrapper<short>(devHost, nLowerModules, queue);
     alpaka::memcpy(queue, module_subdets_buf, modulesBuffers->subdets_buf, nLowerModules);
 
-    auto module_layers_buf = allocBufWrapper<short>(devHost, nLowerModules);
+    auto module_layers_buf = allocBufWrapper<short>(devHost, nLowerModules, queue);
     alpaka::memcpy(queue, module_layers_buf, modulesBuffers->layers_buf, nLowerModules);
 
     alpaka::wait(queue);
@@ -1409,7 +1409,7 @@ unsigned int SDL::Event::getNumberOfTripletsByLayerEndcap(unsigned int layer)
 
 int SDL::Event::getNumberOfPixelTriplets()
 {
-    auto nPixelTriplets_buf = allocBufWrapper<int>(devHost, 1);
+    auto nPixelTriplets_buf = allocBufWrapper<int>(devHost, 1, queue);
 
     alpaka::memcpy(queue, nPixelTriplets_buf, pixelTripletsBuffers->nPixelTriplets_buf, 1);
     alpaka::wait(queue);
@@ -1421,7 +1421,7 @@ int SDL::Event::getNumberOfPixelTriplets()
 
 int SDL::Event::getNumberOfPixelQuintuplets()
 {
-    auto nPixelQuintuplets_buf = allocBufWrapper<int>(devHost, 1);
+    auto nPixelQuintuplets_buf = allocBufWrapper<int>(devHost, 1, queue);
 
     alpaka::memcpy(queue, nPixelQuintuplets_buf, pixelQuintupletsBuffers->nPixelQuintuplets_buf, 1);
     alpaka::wait(queue);
@@ -1466,7 +1466,7 @@ unsigned int SDL::Event::getNumberOfQuintupletsByLayerEndcap(unsigned int layer)
 
 int SDL::Event::getNumberOfTrackCandidates()
 {
-    auto nTrackCandidates_buf = allocBufWrapper<int>(devHost, 1);
+    auto nTrackCandidates_buf = allocBufWrapper<int>(devHost, 1, queue);
 
     alpaka::memcpy(queue, nTrackCandidates_buf, trackCandidatesBuffers->nTrackCandidates_buf, 1);
     alpaka::wait(queue);
@@ -1478,7 +1478,7 @@ int SDL::Event::getNumberOfTrackCandidates()
 
 int SDL::Event::getNumberOfPT5TrackCandidates()
 {
-    auto nTrackCandidatesPT5_buf = allocBufWrapper<int>(devHost, 1);
+    auto nTrackCandidatesPT5_buf = allocBufWrapper<int>(devHost, 1, queue);
 
     alpaka::memcpy(queue, nTrackCandidatesPT5_buf, trackCandidatesBuffers->nTrackCandidatespT5_buf, 1);
     alpaka::wait(queue);
@@ -1490,7 +1490,7 @@ int SDL::Event::getNumberOfPT5TrackCandidates()
 
 int SDL::Event::getNumberOfPT3TrackCandidates()
 {
-    auto nTrackCandidatesPT3_buf = allocBufWrapper<int>(devHost, 1);
+    auto nTrackCandidatesPT3_buf = allocBufWrapper<int>(devHost, 1, queue);
 
     alpaka::memcpy(queue, nTrackCandidatesPT3_buf, trackCandidatesBuffers->nTrackCandidatespT3_buf, 1);
     alpaka::wait(queue);
@@ -1502,7 +1502,7 @@ int SDL::Event::getNumberOfPT3TrackCandidates()
 
 int SDL::Event::getNumberOfPLSTrackCandidates()
 {
-    auto nTrackCandidatesPLS_buf = allocBufWrapper<int>(devHost, 1);
+    auto nTrackCandidatesPLS_buf = allocBufWrapper<int>(devHost, 1, queue);
 
     alpaka::memcpy(queue, nTrackCandidatesPLS_buf, trackCandidatesBuffers->nTrackCandidatespLS_buf, 1);
     alpaka::wait(queue);
@@ -1514,8 +1514,8 @@ int SDL::Event::getNumberOfPLSTrackCandidates()
 
 int SDL::Event::getNumberOfPixelTrackCandidates()
 {
-    auto nTrackCandidates_buf = allocBufWrapper<int>(devHost, 1);
-    auto nTrackCandidatesT5_buf = allocBufWrapper<int>(devHost, 1);
+    auto nTrackCandidates_buf = allocBufWrapper<int>(devHost, 1, queue);
+    auto nTrackCandidatesT5_buf = allocBufWrapper<int>(devHost, 1, queue);
 
     alpaka::memcpy(queue, nTrackCandidates_buf, trackCandidatesBuffers->nTrackCandidates_buf, 1);
     alpaka::memcpy(queue, nTrackCandidatesT5_buf, trackCandidatesBuffers->nTrackCandidatesT5_buf, 1);
@@ -1529,7 +1529,7 @@ int SDL::Event::getNumberOfPixelTrackCandidates()
 
 int SDL::Event::getNumberOfT5TrackCandidates()
 {
-    auto nTrackCandidatesT5_buf = allocBufWrapper<int>(devHost, 1);
+    auto nTrackCandidatesT5_buf = allocBufWrapper<int>(devHost, 1, queue);
 
     alpaka::memcpy(queue, nTrackCandidatesT5_buf, trackCandidatesBuffers->nTrackCandidatesT5_buf, 1);
     alpaka::wait(queue);
@@ -1543,7 +1543,7 @@ SDL::hitsBuffer<alpaka::DevCpu>* SDL::Event::getHits() //std::shared_ptr should
 {
     if(hitsInCPU == nullptr)
     {
-        auto nHits_buf = allocBufWrapper<unsigned int>(devHost, 1);
+        auto nHits_buf = allocBufWrapper<unsigned int>(devHost, 1, queue);
         alpaka::memcpy(queue, nHits_buf, hitsBuffers->nHits_buf, 1);
         alpaka::wait(queue);
 
@@ -1567,7 +1567,7 @@ SDL::hitsBuffer<alpaka::DevCpu>* SDL::Event::getHitsInCMSSW()
 {
     if(hitsInCPU == nullptr)
     {
-        auto nHits_buf = allocBufWrapper<unsigned int>(devHost, 1);
+        auto nHits_buf = allocBufWrapper<unsigned int>(devHost, 1, queue);
         alpaka::memcpy(queue, nHits_buf, hitsBuffers->nHits_buf, 1);
         alpaka::wait(queue);
 
@@ -1604,7 +1604,7 @@ SDL::miniDoubletsBuffer<alpaka::DevCpu>* SDL::Event::getMiniDoublets()
     if(mdsInCPU == nullptr)
     {
         // Get nMemoryLocations parameter to initialize host based mdsInCPU
-        auto nMemLocal_buf = allocBufWrapper<unsigned int>(devHost, 1);
+        auto nMemLocal_buf = allocBufWrapper<unsigned int>(devHost, 1, queue);
         alpaka::memcpy(queue, nMemLocal_buf, miniDoubletsBuffers->nMemoryLocations_buf, 1);
         alpaka::wait(queue);
 
@@ -1628,7 +1628,7 @@ SDL::segmentsBuffer<alpaka::DevCpu>* SDL::Event::getSegments()
     if(segmentsInCPU == nullptr)
     {
         // Get nMemoryLocations parameter to initilize host based segmentsInCPU
-        auto nMemLocal_buf = allocBufWrapper<unsigned int>(devHost, 1);
+        auto nMemLocal_buf = allocBufWrapper<unsigned int>(devHost, 1, queue);
         alpaka::memcpy(queue, nMemLocal_buf, segmentsBuffers->nMemoryLocations_buf, 1);
         alpaka::wait(queue);
 
@@ -1659,7 +1659,7 @@ SDL::tripletsBuffer<alpaka::DevCpu>* SDL::Event::getTriplets()
     if(tripletsInCPU == nullptr)
     {
         // Get nMemoryLocations parameter to initilize host based tripletsInCPU
-        auto nMemLocal_buf = allocBufWrapper<unsigned int>(devHost, 1);
+        auto nMemLocal_buf = allocBufWrapper<unsigned int>(devHost, 1, queue);
         alpaka::memcpy(queue, nMemLocal_buf, tripletsBuffers->nMemoryLocations_buf, 1);
         alpaka::wait(queue);
 
@@ -1700,7 +1700,7 @@ SDL::quintupletsBuffer<alpaka::DevCpu>* SDL::Event::getQuintuplets()
     if(quintupletsInCPU == nullptr)
     {
         // Get nMemoryLocations parameter to initilize host based quintupletsInCPU
-        auto nMemLocal_buf = allocBufWrapper<unsigned int>(devHost, 1);
+        auto nMemLocal_buf = allocBufWrapper<unsigned int>(devHost, 1, queue);
         alpaka::memcpy(queue, nMemLocal_buf, quintupletsBuffers->nMemoryLocations_buf, 1);
         alpaka::wait(queue);
 
@@ -1733,7 +1733,7 @@ SDL::pixelTripletsBuffer<alpaka::DevCpu>* SDL::Event::getPixelTriplets()
     if(pixelTripletsInCPU == nullptr)
     {
         // Get nMemoryLocations parameter to initilize host based quintupletsInCPU
-        auto nPixelTriplets_buf = allocBufWrapper<int>(devHost, 1);
+        auto nPixelTriplets_buf = allocBufWrapper<int>(devHost, 1, queue);
         alpaka::memcpy(queue, nPixelTriplets_buf, pixelTripletsBuffers->nPixelTriplets_buf, 1);
         alpaka::wait(queue);
 
@@ -1764,7 +1764,7 @@ SDL::pixelQuintupletsBuffer<alpaka::DevCpu>* SDL::Event::getPixelQuintuplets()
     if(pixelQuintupletsInCPU == nullptr)
     {
         // Get nMemoryLocations parameter to initilize host based quintupletsInCPU
-        auto nPixelQuintuplets_buf = allocBufWrapper<int>(devHost, 1);
+        auto nPixelQuintuplets_buf = allocBufWrapper<int>(devHost, 1, queue);
         alpaka::memcpy(queue, nPixelQuintuplets_buf, pixelQuintupletsBuffers->nPixelQuintuplets_buf, 1);
         alpaka::wait(queue);
 
@@ -1791,7 +1791,7 @@ SDL::trackCandidatesBuffer<alpaka::DevCpu>* SDL::Event::getTrackCandidates()
     if(trackCandidatesInCPU == nullptr)
     {
         // Get nTrackLocal parameter to initialize host based trackCandidatesInCPU
-        auto nTrackLocal_buf = allocBufWrapper<int>(devHost, 1);
+        auto nTrackLocal_buf = allocBufWrapper<int>(devHost, 1, queue);
         alpaka::memcpy(queue, nTrackLocal_buf, trackCandidatesBuffers->nTrackCandidates_buf, 1);
         alpaka::wait(queue);
 
@@ -1816,7 +1816,7 @@ SDL::trackCandidatesBuffer<alpaka::DevCpu>* SDL::Event::getTrackCandidatesInCMSS
     if(trackCandidatesInCPU == nullptr)
     {
         // Get nTrackLocal parameter to initialize host based trackCandidatesInCPU
-        auto nTrackLocal_buf = allocBufWrapper<int>(devHost, 1);
+        auto nTrackLocal_buf = allocBufWrapper<int>(devHost, 1, queue);
         alpaka::memcpy(queue, nTrackLocal_buf, trackCandidatesBuffers->nTrackCandidates_buf, 1);
         alpaka::wait(queue);
 
diff --git a/SDL/Hit.cuh b/SDL/Hit.cuh
index 8e69bc96..df348127 100644
--- a/SDL/Hit.cuh
+++ b/SDL/Hit.cuh
@@ -81,25 +81,25 @@ namespace SDL
                    unsigned int nMaxHits,
                    TDevAcc const & devAccIn,
                    TQueue& queue) :
-            nHits_buf(allocBufWrapper<unsigned int>(devAccIn, 1)),
-            xs_buf(allocBufWrapper<float>(devAccIn, nMaxHits)),
-            ys_buf(allocBufWrapper<float>(devAccIn, nMaxHits)),
-            zs_buf(allocBufWrapper<float>(devAccIn, nMaxHits)),
-            moduleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, nMaxHits)),
-            idxs_buf(allocBufWrapper<unsigned int>(devAccIn, nMaxHits)),
-            detid_buf(allocBufWrapper<unsigned int>(devAccIn, nMaxHits)),
-            rts_buf(allocBufWrapper<float>(devAccIn, nMaxHits)),
-            phis_buf(allocBufWrapper<float>(devAccIn, nMaxHits)),
-            etas_buf(allocBufWrapper<float>(devAccIn, nMaxHits)),
-            highEdgeXs_buf(allocBufWrapper<float>(devAccIn, nMaxHits)),
-            highEdgeYs_buf(allocBufWrapper<float>(devAccIn, nMaxHits)),
-            lowEdgeXs_buf(allocBufWrapper<float>(devAccIn, nMaxHits)),
-            lowEdgeYs_buf(allocBufWrapper<float>(devAccIn, nMaxHits)),
-            hitRanges_buf(allocBufWrapper<int>(devAccIn, nModules*2)),
-            hitRangesLower_buf(allocBufWrapper<int>(devAccIn, nModules)),
-            hitRangesUpper_buf(allocBufWrapper<int>(devAccIn, nModules)),
-            hitRangesnLower_buf(allocBufWrapper<int8_t>(devAccIn, nModules)),
-            hitRangesnUpper_buf(allocBufWrapper<int8_t>(devAccIn, nModules))
+            nHits_buf(allocBufWrapper<unsigned int>(devAccIn, 1u, queue)),
+            xs_buf(allocBufWrapper<float>(devAccIn, nMaxHits, queue)),
+            ys_buf(allocBufWrapper<float>(devAccIn, nMaxHits, queue)),
+            zs_buf(allocBufWrapper<float>(devAccIn, nMaxHits, queue)),
+            moduleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, nMaxHits, queue)),
+            idxs_buf(allocBufWrapper<unsigned int>(devAccIn, nMaxHits, queue)),
+            detid_buf(allocBufWrapper<unsigned int>(devAccIn, nMaxHits, queue)),
+            rts_buf(allocBufWrapper<float>(devAccIn, nMaxHits, queue)),
+            phis_buf(allocBufWrapper<float>(devAccIn, nMaxHits, queue)),
+            etas_buf(allocBufWrapper<float>(devAccIn, nMaxHits, queue)),
+            highEdgeXs_buf(allocBufWrapper<float>(devAccIn, nMaxHits, queue)),
+            highEdgeYs_buf(allocBufWrapper<float>(devAccIn, nMaxHits, queue)),
+            lowEdgeXs_buf(allocBufWrapper<float>(devAccIn, nMaxHits, queue)),
+            lowEdgeYs_buf(allocBufWrapper<float>(devAccIn, nMaxHits, queue)),
+            hitRanges_buf(allocBufWrapper<int>(devAccIn, nModules*2, queue)),
+            hitRangesLower_buf(allocBufWrapper<int>(devAccIn, nModules, queue)),
+            hitRangesUpper_buf(allocBufWrapper<int>(devAccIn, nModules, queue)),
+            hitRangesnLower_buf(allocBufWrapper<int8_t>(devAccIn, nModules, queue)),
+            hitRangesnUpper_buf(allocBufWrapper<int8_t>(devAccIn, nModules, queue))
         {
             alpaka::memset(queue, hitRanges_buf, -1, nModules*2);
             alpaka::memset(queue, hitRangesLower_buf, -1, nModules);
diff --git a/SDL/MiniDoublet.cuh b/SDL/MiniDoublet.cuh
index 74897eb3..1574a662 100644
--- a/SDL/MiniDoublet.cuh
+++ b/SDL/MiniDoublet.cuh
@@ -141,41 +141,41 @@ namespace SDL
                            uint16_t nLowerModules,
                            TDevAcc const & devAccIn,
                            TQueue& queue) :
-            nMemoryLocations_buf(allocBufWrapper<unsigned int>(devAccIn, 1)),
-            anchorHitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, nMemoryLoc)),
-            outerHitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, nMemoryLoc)),
-            moduleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, nMemoryLoc)),
-            nMDs_buf(allocBufWrapper<int>(devAccIn, nLowerModules+1)),
-            totOccupancyMDs_buf(allocBufWrapper<int>(devAccIn, nLowerModules+1)),
-            dphichanges_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
-            dzs_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
-            dphis_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
-            shiftedXs_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
-            shiftedYs_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
-            shiftedZs_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
-            noShiftedDzs_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
-            noShiftedDphis_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
-            noShiftedDphiChanges_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
-            anchorX_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
-            anchorY_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
-            anchorZ_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
-            anchorRt_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
-            anchorPhi_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
-            anchorEta_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
-            anchorHighEdgeX_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
-            anchorHighEdgeY_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
-            anchorLowEdgeX_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
-            anchorLowEdgeY_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
-            outerX_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
-            outerY_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
-            outerZ_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
-            outerRt_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
-            outerPhi_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
-            outerEta_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
-            outerHighEdgeX_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
-            outerHighEdgeY_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
-            outerLowEdgeX_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc)),
-            outerLowEdgeY_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc))
+            nMemoryLocations_buf(allocBufWrapper<unsigned int>(devAccIn, 1, queue)),
+            anchorHitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, nMemoryLoc, queue)),
+            outerHitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, nMemoryLoc, queue)),
+            moduleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, nMemoryLoc, queue)),
+            nMDs_buf(allocBufWrapper<int>(devAccIn, nLowerModules+1, queue)),
+            totOccupancyMDs_buf(allocBufWrapper<int>(devAccIn, nLowerModules+1, queue)),
+            dphichanges_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc, queue)),
+            dzs_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc, queue)),
+            dphis_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc, queue)),
+            shiftedXs_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc, queue)),
+            shiftedYs_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc, queue)),
+            shiftedZs_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc, queue)),
+            noShiftedDzs_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc, queue)),
+            noShiftedDphis_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc, queue)),
+            noShiftedDphiChanges_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc, queue)),
+            anchorX_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc, queue)),
+            anchorY_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc, queue)),
+            anchorZ_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc, queue)),
+            anchorRt_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc, queue)),
+            anchorPhi_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc, queue)),
+            anchorEta_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc, queue)),
+            anchorHighEdgeX_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc, queue)),
+            anchorHighEdgeY_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc, queue)),
+            anchorLowEdgeX_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc, queue)),
+            anchorLowEdgeY_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc, queue)),
+            outerX_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc, queue)),
+            outerY_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc, queue)),
+            outerZ_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc, queue)),
+            outerRt_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc, queue)),
+            outerPhi_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc, queue)),
+            outerEta_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc, queue)),
+            outerHighEdgeX_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc, queue)),
+            outerHighEdgeY_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc, queue)),
+            outerLowEdgeX_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc, queue)),
+            outerLowEdgeY_buf(allocBufWrapper<float>(devAccIn, nMemoryLoc, queue))
         {
             alpaka::memset(queue, nMDs_buf, 0, nLowerModules+1);
             alpaka::memset(queue, totOccupancyMDs_buf, 0, nLowerModules+1);
diff --git a/SDL/Module.cuh b/SDL/Module.cuh
index 1855aee0..d4d38af7 100644
--- a/SDL/Module.cuh
+++ b/SDL/Module.cuh
@@ -150,31 +150,31 @@ namespace SDL
                            unsigned int nLowerMod,
                            TDevAcc const & devAccIn,
                            TQueue& queue) :
-            hitRanges_buf(allocBufWrapper<int>(devAccIn, nMod*2)),
-            hitRangesLower_buf(allocBufWrapper<int>(devAccIn, nMod)),
-            hitRangesUpper_buf(allocBufWrapper<int>(devAccIn, nMod)),
-            hitRangesnLower_buf(allocBufWrapper<int8_t>(devAccIn, nMod)),
-            hitRangesnUpper_buf(allocBufWrapper<int8_t>(devAccIn, nMod)),
-            mdRanges_buf(allocBufWrapper<int>(devAccIn, nMod*2)),
-            segmentRanges_buf(allocBufWrapper<int>(devAccIn, nMod*2)),
-            trackletRanges_buf(allocBufWrapper<int>(devAccIn, nMod*2)),
-            tripletRanges_buf(allocBufWrapper<int>(devAccIn, nMod*2)),
-            trackCandidateRanges_buf(allocBufWrapper<int>(devAccIn, nMod*2)),
-            quintupletRanges_buf(allocBufWrapper<int>(devAccIn, nMod*2)),
-            nEligibleT5Modules_buf(allocBufWrapper<uint16_t>(devAccIn, 1)),
-            indicesOfEligibleT5Modules_buf(allocBufWrapper<uint16_t>(devAccIn, nLowerMod)),
-            quintupletModuleIndices_buf(allocBufWrapper<int>(devAccIn, nLowerMod)),
-            quintupletModuleOccupancy_buf(allocBufWrapper<int>(devAccIn, nLowerMod)),
-            miniDoubletModuleIndices_buf(allocBufWrapper<int>(devAccIn, nLowerMod+1)),
-            miniDoubletModuleOccupancy_buf(allocBufWrapper<int>(devAccIn, nLowerMod+1)),
-            segmentModuleIndices_buf(allocBufWrapper<int>(devAccIn, nLowerMod+1)),
-            segmentModuleOccupancy_buf(allocBufWrapper<int>(devAccIn, nLowerMod+1)),
-            tripletModuleIndices_buf(allocBufWrapper<int>(devAccIn, nLowerMod)),
-            tripletModuleOccupancy_buf(allocBufWrapper<int>(devAccIn, nLowerMod)),
-            device_nTotalMDs_buf(allocBufWrapper<unsigned int>(devAccIn, 1)),
-            device_nTotalSegs_buf(allocBufWrapper<unsigned int>(devAccIn, 1)),
-            device_nTotalTrips_buf(allocBufWrapper<unsigned int>(devAccIn, 1)),
-            device_nTotalQuints_buf(allocBufWrapper<unsigned int>(devAccIn, 1))
+            hitRanges_buf(allocBufWrapper<int>(devAccIn, nMod*2, queue)),
+            hitRangesLower_buf(allocBufWrapper<int>(devAccIn, nMod, queue)),
+            hitRangesUpper_buf(allocBufWrapper<int>(devAccIn, nMod, queue)),
+            hitRangesnLower_buf(allocBufWrapper<int8_t>(devAccIn, nMod, queue)),
+            hitRangesnUpper_buf(allocBufWrapper<int8_t>(devAccIn, nMod, queue)),
+            mdRanges_buf(allocBufWrapper<int>(devAccIn, nMod*2, queue)),
+            segmentRanges_buf(allocBufWrapper<int>(devAccIn, nMod*2, queue)),
+            trackletRanges_buf(allocBufWrapper<int>(devAccIn, nMod*2, queue)),
+            tripletRanges_buf(allocBufWrapper<int>(devAccIn, nMod*2, queue)),
+            trackCandidateRanges_buf(allocBufWrapper<int>(devAccIn, nMod*2, queue)),
+            quintupletRanges_buf(allocBufWrapper<int>(devAccIn, nMod*2, queue)),
+            nEligibleT5Modules_buf(allocBufWrapper<uint16_t>(devAccIn, 1, queue)),
+            indicesOfEligibleT5Modules_buf(allocBufWrapper<uint16_t>(devAccIn, nLowerMod, queue)),
+            quintupletModuleIndices_buf(allocBufWrapper<int>(devAccIn, nLowerMod, queue)),
+            quintupletModuleOccupancy_buf(allocBufWrapper<int>(devAccIn, nLowerMod, queue)),
+            miniDoubletModuleIndices_buf(allocBufWrapper<int>(devAccIn, nLowerMod+1, queue)),
+            miniDoubletModuleOccupancy_buf(allocBufWrapper<int>(devAccIn, nLowerMod+1, queue)),
+            segmentModuleIndices_buf(allocBufWrapper<int>(devAccIn, nLowerMod+1, queue)),
+            segmentModuleOccupancy_buf(allocBufWrapper<int>(devAccIn, nLowerMod+1, queue)),
+            tripletModuleIndices_buf(allocBufWrapper<int>(devAccIn, nLowerMod, queue)),
+            tripletModuleOccupancy_buf(allocBufWrapper<int>(devAccIn, nLowerMod, queue)),
+            device_nTotalMDs_buf(allocBufWrapper<unsigned int>(devAccIn, 1, queue)),
+            device_nTotalSegs_buf(allocBufWrapper<unsigned int>(devAccIn, 1, queue)),
+            device_nTotalTrips_buf(allocBufWrapper<unsigned int>(devAccIn, 1, queue)),
+            device_nTotalQuints_buf(allocBufWrapper<unsigned int>(devAccIn, 1, queue))
         {
             alpaka::memset(queue, hitRanges_buf, -1, nMod*2);
             alpaka::memset(queue, hitRangesLower_buf, -1, nMod);
diff --git a/SDL/PixelTriplet.cuh b/SDL/PixelTriplet.cuh
index 0cf22db4..a8e76c2e 100644
--- a/SDL/PixelTriplet.cuh
+++ b/SDL/PixelTriplet.cuh
@@ -101,29 +101,29 @@ namespace SDL
         pixelTripletsBuffer(unsigned int maxPixelTriplets,
                             TDevAcc const & devAccIn,
                             TQueue& queue) :
-            pixelSegmentIndices_buf(allocBufWrapper<unsigned int>(devAccIn, maxPixelTriplets)),
-            tripletIndices_buf(allocBufWrapper<unsigned int>(devAccIn, maxPixelTriplets)),
-            nPixelTriplets_buf(allocBufWrapper<int>(devAccIn, 1)),
-            totOccupancyPixelTriplets_buf(allocBufWrapper<int>(devAccIn, 1)),
-            pixelRadius_buf(allocBufWrapper<FPX>(devAccIn, maxPixelTriplets)),
-            tripletRadius_buf(allocBufWrapper<FPX>(devAccIn, maxPixelTriplets)),
-            pt_buf(allocBufWrapper<FPX>(devAccIn, maxPixelTriplets)),
-            eta_buf(allocBufWrapper<FPX>(devAccIn, maxPixelTriplets)),
-            phi_buf(allocBufWrapper<FPX>(devAccIn, maxPixelTriplets)),
-            eta_pix_buf(allocBufWrapper<FPX>(devAccIn, maxPixelTriplets)),
-            phi_pix_buf(allocBufWrapper<FPX>(devAccIn, maxPixelTriplets)),
-            score_buf(allocBufWrapper<FPX>(devAccIn, maxPixelTriplets)),
-            isDup_buf(allocBufWrapper<bool>(devAccIn, maxPixelTriplets)),
-            partOfPT5_buf(allocBufWrapper<bool>(devAccIn, maxPixelTriplets)),
-            logicalLayers_buf(allocBufWrapper<uint8_t>(devAccIn, maxPixelTriplets*5)),
-            hitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, maxPixelTriplets*10)),
-            lowerModuleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, maxPixelTriplets*5)),
-            centerX_buf(allocBufWrapper<FPX>(devAccIn, maxPixelTriplets)),
-            centerY_buf(allocBufWrapper<FPX>(devAccIn, maxPixelTriplets)),
-            pixelRadiusError_buf(allocBufWrapper<float>(devAccIn, maxPixelTriplets)),
-            rPhiChiSquared_buf(allocBufWrapper<float>(devAccIn, maxPixelTriplets)),
-            rPhiChiSquaredInwards_buf(allocBufWrapper<float>(devAccIn, maxPixelTriplets)),
-            rzChiSquared_buf(allocBufWrapper<float>(devAccIn, maxPixelTriplets))
+            pixelSegmentIndices_buf(allocBufWrapper<unsigned int>(devAccIn, maxPixelTriplets, queue)),
+            tripletIndices_buf(allocBufWrapper<unsigned int>(devAccIn, maxPixelTriplets, queue)),
+            nPixelTriplets_buf(allocBufWrapper<int>(devAccIn, 1, queue)),
+            totOccupancyPixelTriplets_buf(allocBufWrapper<int>(devAccIn, 1, queue)),
+            pixelRadius_buf(allocBufWrapper<FPX>(devAccIn, maxPixelTriplets, queue)),
+            tripletRadius_buf(allocBufWrapper<FPX>(devAccIn, maxPixelTriplets, queue)),
+            pt_buf(allocBufWrapper<FPX>(devAccIn, maxPixelTriplets, queue)),
+            eta_buf(allocBufWrapper<FPX>(devAccIn, maxPixelTriplets, queue)),
+            phi_buf(allocBufWrapper<FPX>(devAccIn, maxPixelTriplets, queue)),
+            eta_pix_buf(allocBufWrapper<FPX>(devAccIn, maxPixelTriplets, queue)),
+            phi_pix_buf(allocBufWrapper<FPX>(devAccIn, maxPixelTriplets, queue)),
+            score_buf(allocBufWrapper<FPX>(devAccIn, maxPixelTriplets, queue)),
+            isDup_buf(allocBufWrapper<bool>(devAccIn, maxPixelTriplets, queue)),
+            partOfPT5_buf(allocBufWrapper<bool>(devAccIn, maxPixelTriplets, queue)),
+            logicalLayers_buf(allocBufWrapper<uint8_t>(devAccIn, maxPixelTriplets*5, queue)),
+            hitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, maxPixelTriplets*10, queue)),
+            lowerModuleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, maxPixelTriplets*5, queue)),
+            centerX_buf(allocBufWrapper<FPX>(devAccIn, maxPixelTriplets, queue)),
+            centerY_buf(allocBufWrapper<FPX>(devAccIn, maxPixelTriplets, queue)),
+            pixelRadiusError_buf(allocBufWrapper<float>(devAccIn, maxPixelTriplets, queue)),
+            rPhiChiSquared_buf(allocBufWrapper<float>(devAccIn, maxPixelTriplets, queue)),
+            rPhiChiSquaredInwards_buf(allocBufWrapper<float>(devAccIn, maxPixelTriplets, queue)),
+            rzChiSquared_buf(allocBufWrapper<float>(devAccIn, maxPixelTriplets, queue))
         {
             alpaka::memset(queue, nPixelTriplets_buf, 0, 1);
             alpaka::memset(queue, totOccupancyPixelTriplets_buf, 0, 1);
@@ -1511,24 +1511,24 @@ namespace SDL
         pixelQuintupletsBuffer(unsigned int maxPixelQuintuplets,
                                TDevAcc const & devAccIn,
                                TQueue& queue) :
-            pixelIndices_buf(allocBufWrapper<unsigned int>(devAccIn, maxPixelQuintuplets)),
-            T5Indices_buf(allocBufWrapper<unsigned int>(devAccIn, maxPixelQuintuplets)),
-            nPixelQuintuplets_buf(allocBufWrapper<int>(devAccIn, 1)),
-            totOccupancyPixelQuintuplets_buf(allocBufWrapper<int>(devAccIn, 1)),
-            isDup_buf(allocBufWrapper<bool>(devAccIn, maxPixelQuintuplets)),
-            score_buf(allocBufWrapper<FPX>(devAccIn, maxPixelQuintuplets)),
-            eta_buf(allocBufWrapper<FPX>(devAccIn, maxPixelQuintuplets)),
-            phi_buf(allocBufWrapper<FPX>(devAccIn, maxPixelQuintuplets)),
-            logicalLayers_buf(allocBufWrapper<uint8_t>(devAccIn, maxPixelQuintuplets*7)),
-            hitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, maxPixelQuintuplets*14)),
-            lowerModuleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, maxPixelQuintuplets*7)),
-            pixelRadius_buf(allocBufWrapper<FPX>(devAccIn, maxPixelQuintuplets)),
-            quintupletRadius_buf(allocBufWrapper<FPX>(devAccIn, maxPixelQuintuplets)),
-            centerX_buf(allocBufWrapper<FPX>(devAccIn, maxPixelQuintuplets)),
-            centerY_buf(allocBufWrapper<FPX>(devAccIn, maxPixelQuintuplets)),
-            rzChiSquared_buf(allocBufWrapper<float>(devAccIn, maxPixelQuintuplets)),
-            rPhiChiSquared_buf(allocBufWrapper<float>(devAccIn, maxPixelQuintuplets)),
-            rPhiChiSquaredInwards_buf(allocBufWrapper<float>(devAccIn, maxPixelQuintuplets))
+            pixelIndices_buf(allocBufWrapper<unsigned int>(devAccIn, maxPixelQuintuplets, queue)),
+            T5Indices_buf(allocBufWrapper<unsigned int>(devAccIn, maxPixelQuintuplets, queue)),
+            nPixelQuintuplets_buf(allocBufWrapper<int>(devAccIn, 1, queue)),
+            totOccupancyPixelQuintuplets_buf(allocBufWrapper<int>(devAccIn, 1, queue)),
+            isDup_buf(allocBufWrapper<bool>(devAccIn, maxPixelQuintuplets, queue)),
+            score_buf(allocBufWrapper<FPX>(devAccIn, maxPixelQuintuplets, queue)),
+            eta_buf(allocBufWrapper<FPX>(devAccIn, maxPixelQuintuplets, queue)),
+            phi_buf(allocBufWrapper<FPX>(devAccIn, maxPixelQuintuplets, queue)),
+            logicalLayers_buf(allocBufWrapper<uint8_t>(devAccIn, maxPixelQuintuplets*7, queue)),
+            hitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, maxPixelQuintuplets*14, queue)),
+            lowerModuleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, maxPixelQuintuplets*7, queue)),
+            pixelRadius_buf(allocBufWrapper<FPX>(devAccIn, maxPixelQuintuplets, queue)),
+            quintupletRadius_buf(allocBufWrapper<FPX>(devAccIn, maxPixelQuintuplets, queue)),
+            centerX_buf(allocBufWrapper<FPX>(devAccIn, maxPixelQuintuplets, queue)),
+            centerY_buf(allocBufWrapper<FPX>(devAccIn, maxPixelQuintuplets, queue)),
+            rzChiSquared_buf(allocBufWrapper<float>(devAccIn, maxPixelQuintuplets, queue)),
+            rPhiChiSquared_buf(allocBufWrapper<float>(devAccIn, maxPixelQuintuplets, queue)),
+            rPhiChiSquaredInwards_buf(allocBufWrapper<float>(devAccIn, maxPixelQuintuplets, queue))
         {
             alpaka::memset(queue, nPixelQuintuplets_buf, 0, 1);
             alpaka::memset(queue, totOccupancyPixelQuintuplets_buf, 0, 1);
diff --git a/SDL/Quintuplet.cuh b/SDL/Quintuplet.cuh
index 25588ed2..e29d8bde 100644
--- a/SDL/Quintuplet.cuh
+++ b/SDL/Quintuplet.cuh
@@ -107,30 +107,30 @@ namespace SDL
                           unsigned int nLowerModules,
                           TDevAcc const & devAccIn,
                           TQueue& queue) :
-            tripletIndices_buf(allocBufWrapper<unsigned int>(devAccIn, 2 * nTotalQuintuplets)),
-            lowerModuleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, 5 * nTotalQuintuplets)),
-            nQuintuplets_buf(allocBufWrapper<int>(devAccIn, nLowerModules)),
-            totOccupancyQuintuplets_buf(allocBufWrapper<int>(devAccIn, nLowerModules)),
-            nMemoryLocations_buf(allocBufWrapper<unsigned int>(devAccIn, 1)),
-            innerRadius_buf(allocBufWrapper<FPX>(devAccIn, nTotalQuintuplets)),
-            bridgeRadius_buf(allocBufWrapper<FPX>(devAccIn, nTotalQuintuplets)),
-            outerRadius_buf(allocBufWrapper<FPX>(devAccIn, nTotalQuintuplets)),
-            pt_buf(allocBufWrapper<FPX>(devAccIn, nTotalQuintuplets)),
-            eta_buf(allocBufWrapper<FPX>(devAccIn, nTotalQuintuplets)),
-            phi_buf(allocBufWrapper<FPX>(devAccIn, nTotalQuintuplets)),
-            score_rphisum_buf(allocBufWrapper<FPX>(devAccIn, nTotalQuintuplets)),
-            layer_buf(allocBufWrapper<uint8_t>(devAccIn, nTotalQuintuplets)),
-            isDup_buf(allocBufWrapper<bool>(devAccIn, nTotalQuintuplets)),
-            TightCutFlag_buf(allocBufWrapper<bool>(devAccIn, nTotalQuintuplets)),
-            partOfPT5_buf(allocBufWrapper<bool>(devAccIn, nTotalQuintuplets)),
-            regressionRadius_buf(allocBufWrapper<float>(devAccIn, nTotalQuintuplets)),
-            regressionG_buf(allocBufWrapper<float>(devAccIn, nTotalQuintuplets)),
-            regressionF_buf(allocBufWrapper<float>(devAccIn, nTotalQuintuplets)),
-            logicalLayers_buf(allocBufWrapper<uint8_t>(devAccIn, 5 * nTotalQuintuplets)),
-            hitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, 10 * nTotalQuintuplets)),
-            rzChiSquared_buf(allocBufWrapper<float>(devAccIn, nTotalQuintuplets)),
-            chiSquared_buf(allocBufWrapper<float>(devAccIn, nTotalQuintuplets)),
-            nonAnchorChiSquared_buf(allocBufWrapper<float>(devAccIn, nTotalQuintuplets))
+            tripletIndices_buf(allocBufWrapper<unsigned int>(devAccIn, 2 * nTotalQuintuplets, queue)),
+            lowerModuleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, 5 * nTotalQuintuplets, queue)),
+            nQuintuplets_buf(allocBufWrapper<int>(devAccIn, nLowerModules, queue)),
+            totOccupancyQuintuplets_buf(allocBufWrapper<int>(devAccIn, nLowerModules, queue)),
+            nMemoryLocations_buf(allocBufWrapper<unsigned int>(devAccIn, 1, queue)),
+            innerRadius_buf(allocBufWrapper<FPX>(devAccIn, nTotalQuintuplets, queue)),
+            bridgeRadius_buf(allocBufWrapper<FPX>(devAccIn, nTotalQuintuplets, queue)),
+            outerRadius_buf(allocBufWrapper<FPX>(devAccIn, nTotalQuintuplets, queue)),
+            pt_buf(allocBufWrapper<FPX>(devAccIn, nTotalQuintuplets, queue)),
+            eta_buf(allocBufWrapper<FPX>(devAccIn, nTotalQuintuplets, queue)),
+            phi_buf(allocBufWrapper<FPX>(devAccIn, nTotalQuintuplets, queue)),
+            score_rphisum_buf(allocBufWrapper<FPX>(devAccIn, nTotalQuintuplets, queue)),
+            layer_buf(allocBufWrapper<uint8_t>(devAccIn, nTotalQuintuplets, queue)),
+            isDup_buf(allocBufWrapper<bool>(devAccIn, nTotalQuintuplets, queue)),
+            TightCutFlag_buf(allocBufWrapper<bool>(devAccIn, nTotalQuintuplets, queue)),
+            partOfPT5_buf(allocBufWrapper<bool>(devAccIn, nTotalQuintuplets, queue)),
+            regressionRadius_buf(allocBufWrapper<float>(devAccIn, nTotalQuintuplets, queue)),
+            regressionG_buf(allocBufWrapper<float>(devAccIn, nTotalQuintuplets, queue)),
+            regressionF_buf(allocBufWrapper<float>(devAccIn, nTotalQuintuplets, queue)),
+            logicalLayers_buf(allocBufWrapper<uint8_t>(devAccIn, 5 * nTotalQuintuplets, queue)),
+            hitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, 10 * nTotalQuintuplets, queue)),
+            rzChiSquared_buf(allocBufWrapper<float>(devAccIn, nTotalQuintuplets, queue)),
+            chiSquared_buf(allocBufWrapper<float>(devAccIn, nTotalQuintuplets, queue)),
+            nonAnchorChiSquared_buf(allocBufWrapper<float>(devAccIn, nTotalQuintuplets, queue))
         {
             alpaka::memset(queue, nQuintuplets_buf, 0, nLowerModules);
             alpaka::memset(queue, totOccupancyQuintuplets_buf, 0, nLowerModules);
diff --git a/SDL/Segment.cuh b/SDL/Segment.cuh
index c2f9aef5..ab8dd13d 100644
--- a/SDL/Segment.cuh
+++ b/SDL/Segment.cuh
@@ -130,40 +130,40 @@ namespace SDL
                         unsigned int maxPixelSegments,
                         TDevAcc const & devAccIn,
                         TQueue& queue) :
-            dPhis_buf(allocBufWrapper<FPX>(devAccIn, nMemoryLocationsIn)),
-            dPhiMins_buf(allocBufWrapper<FPX>(devAccIn, nMemoryLocationsIn)),
-            dPhiMaxs_buf(allocBufWrapper<FPX>(devAccIn, nMemoryLocationsIn)),
-            dPhiChanges_buf(allocBufWrapper<FPX>(devAccIn, nMemoryLocationsIn)),
-            dPhiChangeMins_buf(allocBufWrapper<FPX>(devAccIn, nMemoryLocationsIn)),
-            dPhiChangeMaxs_buf(allocBufWrapper<FPX>(devAccIn, nMemoryLocationsIn)),
-            innerLowerModuleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, nMemoryLocationsIn)),
-            outerLowerModuleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, nMemoryLocationsIn)),
-            seedIdx_buf(allocBufWrapper<unsigned int>(devAccIn, maxPixelSegments)),
-            mdIndices_buf(allocBufWrapper<unsigned int>(devAccIn, nMemoryLocationsIn*2)),
-            innerMiniDoubletAnchorHitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, nMemoryLocationsIn)),
-            outerMiniDoubletAnchorHitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, nMemoryLocationsIn)),
-            nMemoryLocations_buf(allocBufWrapper<unsigned int>(devAccIn, 1)),
-            nSegments_buf(allocBufWrapper<int>(devAccIn, nLowerModules + 1)),
-            totOccupancySegments_buf(allocBufWrapper<int>(devAccIn, nLowerModules + 1)),
-            charge_buf(allocBufWrapper<int>(devAccIn, maxPixelSegments)),
-            superbin_buf(allocBufWrapper<int>(devAccIn, maxPixelSegments)),
-            pLSHitsIdxs_buf(allocBufWrapper<uint4>(devAccIn, maxPixelSegments)),
-            pixelType_buf(allocBufWrapper<int8_t>(devAccIn, maxPixelSegments)),
-            isQuad_buf(allocBufWrapper<char>(devAccIn, maxPixelSegments)),
-            isDup_buf(allocBufWrapper<bool>(devAccIn, maxPixelSegments)),
-            partOfPT5_buf(allocBufWrapper<bool>(devAccIn, maxPixelSegments)),
-            ptIn_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
-            ptErr_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
-            px_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
-            py_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
-            pz_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
-            etaErr_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
-            eta_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
-            phi_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
-            score_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
-            circleCenterX_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
-            circleCenterY_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments)),
-            circleRadius_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments))
+            dPhis_buf(allocBufWrapper<FPX>(devAccIn, nMemoryLocationsIn, queue)),
+            dPhiMins_buf(allocBufWrapper<FPX>(devAccIn, nMemoryLocationsIn, queue)),
+            dPhiMaxs_buf(allocBufWrapper<FPX>(devAccIn, nMemoryLocationsIn, queue)),
+            dPhiChanges_buf(allocBufWrapper<FPX>(devAccIn, nMemoryLocationsIn, queue)),
+            dPhiChangeMins_buf(allocBufWrapper<FPX>(devAccIn, nMemoryLocationsIn, queue)),
+            dPhiChangeMaxs_buf(allocBufWrapper<FPX>(devAccIn, nMemoryLocationsIn, queue)),
+            innerLowerModuleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, nMemoryLocationsIn, queue)),
+            outerLowerModuleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, nMemoryLocationsIn, queue)),
+            seedIdx_buf(allocBufWrapper<unsigned int>(devAccIn, maxPixelSegments, queue)),
+            mdIndices_buf(allocBufWrapper<unsigned int>(devAccIn, nMemoryLocationsIn*2, queue)),
+            innerMiniDoubletAnchorHitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, nMemoryLocationsIn, queue)),
+            outerMiniDoubletAnchorHitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, nMemoryLocationsIn, queue)),
+            nMemoryLocations_buf(allocBufWrapper<unsigned int>(devAccIn, 1, queue)),
+            nSegments_buf(allocBufWrapper<int>(devAccIn, nLowerModules + 1, queue)),
+            totOccupancySegments_buf(allocBufWrapper<int>(devAccIn, nLowerModules + 1, queue)),
+            charge_buf(allocBufWrapper<int>(devAccIn, maxPixelSegments, queue)),
+            superbin_buf(allocBufWrapper<int>(devAccIn, maxPixelSegments, queue)),
+            pLSHitsIdxs_buf(allocBufWrapper<uint4>(devAccIn, maxPixelSegments, queue)),
+            pixelType_buf(allocBufWrapper<int8_t>(devAccIn, maxPixelSegments, queue)),
+            isQuad_buf(allocBufWrapper<char>(devAccIn, maxPixelSegments, queue)),
+            isDup_buf(allocBufWrapper<bool>(devAccIn, maxPixelSegments, queue)),
+            partOfPT5_buf(allocBufWrapper<bool>(devAccIn, maxPixelSegments, queue)),
+            ptIn_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments, queue)),
+            ptErr_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments, queue)),
+            px_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments, queue)),
+            py_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments, queue)),
+            pz_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments, queue)),
+            etaErr_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments, queue)),
+            eta_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments, queue)),
+            phi_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments, queue)),
+            score_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments, queue)),
+            circleCenterX_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments, queue)),
+            circleCenterY_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments, queue)),
+            circleRadius_buf(allocBufWrapper<float>(devAccIn, maxPixelSegments, queue))
         {
             alpaka::memset(queue, nSegments_buf, 0u, nLowerModules + 1);
             alpaka::memset(queue, totOccupancySegments_buf, 0u, nLowerModules + 1);
diff --git a/SDL/TrackCandidate.cuh b/SDL/TrackCandidate.cuh
index 9abaa754..9bdfd799 100644
--- a/SDL/TrackCandidate.cuh
+++ b/SDL/TrackCandidate.cuh
@@ -80,20 +80,20 @@ namespace SDL
         trackCandidatesBuffer(unsigned int maxTrackCandidates,
                             TDevAcc const & devAccIn,
                             TQueue& queue) :
-            trackCandidateType_buf(allocBufWrapper<short>(devAccIn, maxTrackCandidates)),
-            directObjectIndices_buf(allocBufWrapper<unsigned int>(devAccIn, maxTrackCandidates)),
-            objectIndices_buf(allocBufWrapper<unsigned int>(devAccIn, 2 * maxTrackCandidates)),
-            nTrackCandidates_buf(allocBufWrapper<int>(devAccIn, 1)),
-            nTrackCandidatespT3_buf(allocBufWrapper<int>(devAccIn, 1)),
-            nTrackCandidatespT5_buf(allocBufWrapper<int>(devAccIn, 1)),
-            nTrackCandidatespLS_buf(allocBufWrapper<int>(devAccIn, 1)),
-            nTrackCandidatesT5_buf(allocBufWrapper<int>(devAccIn, 1)),
-            logicalLayers_buf(allocBufWrapper<uint8_t>(devAccIn, 7 * maxTrackCandidates)),
-            hitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, 14 * maxTrackCandidates)),
-            pixelSeedIndex_buf(allocBufWrapper<int>(devAccIn, maxTrackCandidates)),
-            lowerModuleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, 7 * maxTrackCandidates)),
-            centerX_buf(allocBufWrapper<FPX>(devAccIn, maxTrackCandidates)),
-            centerY_buf(allocBufWrapper<FPX>(devAccIn, maxTrackCandidates)),
+            trackCandidateType_buf(allocBufWrapper<short>(devAccIn, maxTrackCandidates, queue)),
+            directObjectIndices_buf(allocBufWrapper<unsigned int>(devAccIn, maxTrackCandidates, queue)),
+            objectIndices_buf(allocBufWrapper<unsigned int>(devAccIn, 2 * maxTrackCandidates, queue)),
+            nTrackCandidates_buf(allocBufWrapper<int>(devAccIn, 1, queue)),
+            nTrackCandidatespT3_buf(allocBufWrapper<int>(devAccIn, 1, queue)),
+            nTrackCandidatespT5_buf(allocBufWrapper<int>(devAccIn, 1, queue)),
+            nTrackCandidatespLS_buf(allocBufWrapper<int>(devAccIn, 1, queue)),
+            nTrackCandidatesT5_buf(allocBufWrapper<int>(devAccIn, 1, queue)),
+            logicalLayers_buf(allocBufWrapper<uint8_t>(devAccIn, 7 * maxTrackCandidates, queue)),
+            hitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, 14 * maxTrackCandidates, queue)),
+            pixelSeedIndex_buf(allocBufWrapper<int>(devAccIn, maxTrackCandidates, queue)),
+            lowerModuleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, 7 * maxTrackCandidates, queue)),
+            centerX_buf(allocBufWrapper<FPX>(devAccIn, maxTrackCandidates, queue)),
+            centerY_buf(allocBufWrapper<FPX>(devAccIn, maxTrackCandidates, queue)),
             radius_buf(allocBufWrapper<FPX>(devAccIn, maxTrackCandidates))
         {
             alpaka::memset(queue, nTrackCandidates_buf, 0, 1);
diff --git a/SDL/Triplet.cuh b/SDL/Triplet.cuh
index 045e2ee2..08c59d34 100644
--- a/SDL/Triplet.cuh
+++ b/SDL/Triplet.cuh
@@ -120,35 +120,35 @@ namespace SDL
                     unsigned int nLowerModules,
                     TDevAcc const & devAccIn,
                     TQueue& queue) :
-            segmentIndices_buf(allocBufWrapper<unsigned int>(devAccIn, 2 * maxTriplets)),
-            lowerModuleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, 3 * maxTriplets)),
-            nTriplets_buf(allocBufWrapper<int>(devAccIn, nLowerModules)),
-            totOccupancyTriplets_buf(allocBufWrapper<int>(devAccIn, nLowerModules)),
-            nMemoryLocations_buf(allocBufWrapper<unsigned int>(devAccIn, 1)),
-            logicalLayers_buf(allocBufWrapper<uint8_t>(devAccIn, maxTriplets * 3)),
-            hitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, maxTriplets * 6)),
-            betaIn_buf(allocBufWrapper<FPX>(devAccIn, maxTriplets)),
-            betaOut_buf(allocBufWrapper<FPX>(devAccIn, maxTriplets)),
-            pt_beta_buf(allocBufWrapper<FPX>(devAccIn, maxTriplets)),
-            partOfPT5_buf(allocBufWrapper<bool>(devAccIn, maxTriplets)),
-            partOfT5_buf(allocBufWrapper<bool>(devAccIn, maxTriplets)),
-            partOfPT3_buf(allocBufWrapper<bool>(devAccIn, maxTriplets))
+            segmentIndices_buf(allocBufWrapper<unsigned int>(devAccIn, 2 * maxTriplets, queue)),
+            lowerModuleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, 3 * maxTriplets, queue)),
+            nTriplets_buf(allocBufWrapper<int>(devAccIn, nLowerModules, queue)),
+            totOccupancyTriplets_buf(allocBufWrapper<int>(devAccIn, nLowerModules, queue)),
+            nMemoryLocations_buf(allocBufWrapper<unsigned int>(devAccIn, 1, queue)),
+            logicalLayers_buf(allocBufWrapper<uint8_t>(devAccIn, maxTriplets * 3, queue)),
+            hitIndices_buf(allocBufWrapper<unsigned int>(devAccIn, maxTriplets * 6, queue)),
+            betaIn_buf(allocBufWrapper<FPX>(devAccIn, maxTriplets, queue)),
+            betaOut_buf(allocBufWrapper<FPX>(devAccIn, maxTriplets, queue)),
+            pt_beta_buf(allocBufWrapper<FPX>(devAccIn, maxTriplets, queue)),
+            partOfPT5_buf(allocBufWrapper<bool>(devAccIn, maxTriplets, queue)),
+            partOfT5_buf(allocBufWrapper<bool>(devAccIn, maxTriplets, queue)),
+            partOfPT3_buf(allocBufWrapper<bool>(devAccIn, maxTriplets, queue))
 #ifdef CUT_VALUE_DEBUG
-            ,zOut_buf(allocBufWrapper<float>(devAccIn, maxTriplets)),
-            rtOut_buf(allocBufWrapper<float>(devAccIn, maxTriplets)),
-            deltaPhiPos_buf(allocBufWrapper<float>(devAccIn, maxTriplets)),
-            deltaPhi_buf(allocBufWrapper<float>(devAccIn, maxTriplets)),
-            zLo_buf(allocBufWrapper<float>(devAccIn, maxTriplets)),
-            zHi_buf(allocBufWrapper<float>(devAccIn, maxTriplets)),
-            zLoPointed_buf(allocBufWrapper<float>(devAccIn, maxTriplets)),
-            zHiPointed_buf(allocBufWrapper<float>(devAccIn, maxTriplets)),
-            sdlCut_buf(allocBufWrapper<float>(devAccIn, maxTriplets)),
-            betaInCut_buf(allocBufWrapper<float>(devAccIn, maxTriplets)),
-            betaOutCut_buf(allocBufWrapper<float>(devAccIn, maxTriplets)),
-            deltaBetaCut_buf(allocBufWrapper<float>(devAccIn, maxTriplets)),
-            rtLo_buf(allocBufWrapper<float>(devAccIn, maxTriplets)),
-            rtHi_buf(allocBufWrapper<float>(devAccIn, maxTriplets)),
-            kZ_buf(allocBufWrapper<float>(devAccIn, maxTriplets))
+            ,zOut_buf(allocBufWrapper<float>(devAccIn, maxTriplets, queue)),
+            rtOut_buf(allocBufWrapper<float>(devAccIn, maxTriplets, queue)),
+            deltaPhiPos_buf(allocBufWrapper<float>(devAccIn, maxTriplets, queue)),
+            deltaPhi_buf(allocBufWrapper<float>(devAccIn, maxTriplets, queue)),
+            zLo_buf(allocBufWrapper<float>(devAccIn, maxTriplets, queue)),
+            zHi_buf(allocBufWrapper<float>(devAccIn, maxTriplets, queue)),
+            zLoPointed_buf(allocBufWrapper<float>(devAccIn, maxTriplets, queue)),
+            zHiPointed_buf(allocBufWrapper<float>(devAccIn, maxTriplets, queue)),
+            sdlCut_buf(allocBufWrapper<float>(devAccIn, maxTriplets, queue)),
+            betaInCut_buf(allocBufWrapper<float>(devAccIn, maxTriplets, queue)),
+            betaOutCut_buf(allocBufWrapper<float>(devAccIn, maxTriplets, queue)),
+            deltaBetaCut_buf(allocBufWrapper<float>(devAccIn, maxTriplets, queue)),
+            rtLo_buf(allocBufWrapper<float>(devAccIn, maxTriplets, queue)),
+            rtHi_buf(allocBufWrapper<float>(devAccIn, maxTriplets, queue)),
+            kZ_buf(allocBufWrapper<float>(devAccIn, maxTriplets, queue))
 #endif
         {
             alpaka::memset(queue, nTriplets_buf, 0, nLowerModules);
diff --git a/setup_cgpu.sh b/setup_cgpu.sh
index fbff025e..a30c0bf8 100644
--- a/setup_cgpu.sh
+++ b/setup_cgpu.sh
@@ -36,7 +36,7 @@ export LSTPERFORMANCEWEBDIR="/home/users/phchang/public_html/LSTPerformanceWeb"
 export LATEST_CPU_BENCHMARK_EFF_MUONGUN="/data2/segmentlinking/muonGun_cpu_efficiencies.root"
 export LATEST_CPU_BENCHMARK_EFF_PU200="/data2/segmentlinking/pu200_cpu_efficiencies.root"
 
-source /cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220902-e80d13b043e1608b43d2007d06ad7e2f/etc/profile.d/init.sh
+source /cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220621-4e96939afa0cdb62448c73ead2bb07e0/etc/profile.d/init.sh
 export BOOST_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/boost/1.78.0-12075919175e8d078539685f9234134a"
-export ALPAKA_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220902-e80d13b043e1608b43d2007d06ad7e2f"
+export ALPAKA_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220621-4e96939afa0cdb62448c73ead2bb07e0"
 #eof

From 8d907e47958e0b3c83f86b1290a42b3b21156989 Mon Sep 17 00:00:00 2001
From: Gavin Niendorf <gavinniendorf@gmail.com>
Date: Tue, 27 Jun 2023 20:34:01 -0700
Subject: [PATCH 36/44] cleanup

---
 SDL/TrackCandidate.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SDL/TrackCandidate.cuh b/SDL/TrackCandidate.cuh
index 9bdfd799..250f4558 100644
--- a/SDL/TrackCandidate.cuh
+++ b/SDL/TrackCandidate.cuh
@@ -94,7 +94,7 @@ namespace SDL
             lowerModuleIndices_buf(allocBufWrapper<uint16_t>(devAccIn, 7 * maxTrackCandidates, queue)),
             centerX_buf(allocBufWrapper<FPX>(devAccIn, maxTrackCandidates, queue)),
             centerY_buf(allocBufWrapper<FPX>(devAccIn, maxTrackCandidates, queue)),
-            radius_buf(allocBufWrapper<FPX>(devAccIn, maxTrackCandidates))
+            radius_buf(allocBufWrapper<FPX>(devAccIn, maxTrackCandidates, queue))
         {
             alpaka::memset(queue, nTrackCandidates_buf, 0, 1);
             alpaka::memset(queue, nTrackCandidatesT5_buf, 0, 1);

From dd3b92a84e12c2f838bae57d3bf0d9417902671a Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Wed, 5 Jul 2023 12:10:47 -0400
Subject: [PATCH 37/44] setup for lnx7188

---
 README.md        |  6 ++++++
 setup_lnx7188.sh | 14 ++++++++------
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index a6b8d96c..8d6d967a 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,12 @@ For lnx7188 this needs to be done once
     cd /cdat/tem/${USER}/
     git clone git@github.com:SegmentLinking/LSTPerformanceWeb.git
 
+### Setting up container (only for lnx7188)
+
+For lnx7188 this needs to be done before compiling or running the code:
+
+    singularity shell --nv --bind /mnt/data1:/data --bind /data2/segmentlinking/ --bind /opt --bind /nfs --bind /mnt --bind /cvmfs  /cvmfs/unpacked.cern.ch/registry.hub.docker.com/cmssw/el8:x86_64
+
 ### Running the code
 
     git clone --recursive git@github.com:SegmentLinking/TrackLooper.git
diff --git a/setup_lnx7188.sh b/setup_lnx7188.sh
index 73c42f60..a56a41c3 100644
--- a/setup_lnx7188.sh
+++ b/setup_lnx7188.sh
@@ -5,8 +5,11 @@
 ###########################################################################################################
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 source $DIR/code/rooutil/thisrooutil.sh
-export SCRAM_ARCH=slc7_amd64_gcc900
-export CMSSW_VERSION=CMSSW_11_2_0_pre5
+
+export SCRAM_ARCH=el8_amd64_gcc10
+export CMSSW_VERSION=CMSSW_12_5_0_pre5
+export CUDA_HOME=/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/cuda/11.5.2-c927b7e765e06433950d8a7eab9eddb4/
+
 source /cvmfs/cms.cern.ch/cmsset_default.sh
 cd /cvmfs/cms.cern.ch/$SCRAM_ARCH/cms/cmssw/$CMSSW_VERSION/src
 eval `scramv1 runtime -sh`
@@ -14,7 +17,6 @@ cd - > /dev/null
 echo "Setup following ROOT.  Make sure it's slc7 variant. Otherwise the looper won't compile."
 which root
 
-export CUDA_HOME=/cvmfs/cms.cern.ch/slc7_amd64_gcc900/external/cuda/11.0.3/
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 export LD_LIBRARY_PATH=$DIR:$LD_LIBRARY_PATH
 export PATH=$DIR/bin:$PATH
@@ -34,7 +36,7 @@ export LSTPERFORMANCEWEBDIR="/cdat/tem/${USER}/LSTPerformanceWeb"
 export LATEST_CPU_BENCHMARK_EFF_MUONGUN="/data2/segmentlinking/muonGun_cpu_efficiencies.root"
 export LATEST_CPU_BENCHMARK_EFF_PU200="/data2/segmentlinking/pu200_cpu_efficiencies.root"
 
-source /cvmfs/cms.cern.ch/slc7_amd64_gcc900/external/alpaka/0.5.0/etc/profile.d/init.sh
-export BOOST_ROOT="/cvmfs/cms.cern.ch/slc7_amd64_gcc900/external/boost/1.72.0-ghbfee3"
-export ALPAKA_ROOT="/cvmfs/cms.cern.ch/slc7_amd64_gcc900/external/alpaka/0.7.0-09bef105568314b218f2a8410a876785"
+source /cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220621-4e96939afa0cdb62448c73ead2bb07e0/etc/profile.d/init.sh
+export BOOST_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/boost/1.78.0-12075919175e8d078539685f9234134a"
+export ALPAKA_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220621-4e96939afa0cdb62448c73ead2bb07e0"
 #eof

From aaf1e60127f60b51ea27002690eb14059e443fcd Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Wed, 5 Jul 2023 18:31:02 -0400
Subject: [PATCH 38/44] fix caching allocator bug

---
 SDL/Event.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/SDL/Event.cu b/SDL/Event.cu
index 36e09d15..98fef91e 100644
--- a/SDL/Event.cu
+++ b/SDL/Event.cu
@@ -706,6 +706,7 @@ void SDL::Event::createTrackCandidates()
     // Pull nEligibleT5Modules from the device.
     auto nEligibleModules_buf = allocBufWrapper<uint16_t>(devHost, 1, queue);
     alpaka::memcpy(queue, nEligibleModules_buf, rangesBuffers->nEligibleT5Modules_buf, 1);
+    alpaka::wait(queue);
     uint16_t nEligibleModules = *alpaka::getPtrNative(nEligibleModules_buf);
 
     Vec const threadsPerBlock_crossCleanpT3(static_cast<Idx>(1), static_cast<Idx>(16), static_cast<Idx>(64));

From 54c1103dd9ab2f6ed15895ba92f7b5a79c8355a9 Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Mon, 10 Jul 2023 13:48:42 -0400
Subject: [PATCH 39/44] move to most recent cmssw alpaka interface + newer
 alpaka version

---
 code/alpaka_interface/CachingAllocator.h      |   4 +-
 code/alpaka_interface/TransferToHost.h        |  21 ++
 .../getDeviceCachingAllocator.h               |   1 +
 code/alpaka_interface/host.h                  |   4 +-
 code/alpaka_interface/memory.h                |  29 +-
 code/alpaka_interface/traits.h                |  19 +-
 code/alpaka_interface/vec.h                   |  42 +++
 code/alpaka_interface/workdivision.h          | 266 ++++++++++++++++++
 setup_cgpu.sh                                 |   6 +-
 setup_lnx7188.sh                              |   6 +-
 10 files changed, 374 insertions(+), 24 deletions(-)
 create mode 100644 code/alpaka_interface/TransferToHost.h
 create mode 100644 code/alpaka_interface/vec.h
 create mode 100644 code/alpaka_interface/workdivision.h

diff --git a/code/alpaka_interface/CachingAllocator.h b/code/alpaka_interface/CachingAllocator.h
index 72a52694..0a0dee82 100644
--- a/code/alpaka_interface/CachingAllocator.h
+++ b/code/alpaka_interface/CachingAllocator.h
@@ -336,8 +336,8 @@ namespace cms::alpakatools {
         // allocate device memory
         return alpaka::allocBuf<std::byte, size_t>(device_, bytes);
       } else if constexpr (std::is_same_v<Device, alpaka::DevCpu>) {
-        // allocate pinned host memory
-        return alpaka::allocMappedBuf<std::byte, size_t>(device_, alpaka::getDev(queue), bytes);
+        // allocate pinned host memory accessible by the queue's platform
+        return alpaka::allocMappedBuf<alpaka::Pltf<alpaka::Dev<Queue>>, std::byte, size_t>(device_, bytes);
       } else {
         // unsupported combination
         static_assert(std::is_same_v<Device, alpaka::Dev<Queue>> or std::is_same_v<Device, alpaka::DevCpu>,
diff --git a/code/alpaka_interface/TransferToHost.h b/code/alpaka_interface/TransferToHost.h
new file mode 100644
index 00000000..e6bacef1
--- /dev/null
+++ b/code/alpaka_interface/TransferToHost.h
@@ -0,0 +1,21 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_TransferToHost_h
+#define HeterogeneousCore_AlpakaInterface_interface_TransferToHost_h
+
+// TODO: better package?
+
+namespace cms::alpakatools {
+  // Primary template: only declared here, so using it for a TDeviceData without a specialization fails to compile.
+  // TODO: would a more informative error message from compiler than "indeterminate type" be helpful?
+  template <typename TDeviceData>
+  struct TransferToHost;
+
+  // Each specialization is expected to define:
+  //   using HostDataType = <corresponding host data type>;
+  //
+  //   template <typename TQueue>
+  //   static HostDataType transferAsync(TQueue& queue, TDeviceData const& deviceData);
+  //
+  // transferAsync should allocate a HostDataType object and launch the transfers.
+}  // namespace cms::alpakatools
+
+#endif
diff --git a/code/alpaka_interface/getDeviceCachingAllocator.h b/code/alpaka_interface/getDeviceCachingAllocator.h
index 94e0e7cc..ee466f94 100644
--- a/code/alpaka_interface/getDeviceCachingAllocator.h
+++ b/code/alpaka_interface/getDeviceCachingAllocator.h
@@ -1,6 +1,7 @@
 #ifndef HeterogeneousCore_AlpakaInterface_interface_getDeviceCachingAllocator_h
 #define HeterogeneousCore_AlpakaInterface_interface_getDeviceCachingAllocator_h
 
+#include <cassert>
 #include <memory>
 
 #include "thread_safety_macros.h"
diff --git a/code/alpaka_interface/host.h b/code/alpaka_interface/host.h
index 0303313d..acb9c9a9 100644
--- a/code/alpaka_interface/host.h
+++ b/code/alpaka_interface/host.h
@@ -1,6 +1,8 @@
 #ifndef HeterogeneousCore_AlpakaInterface_interface_host_h
 #define HeterogeneousCore_AlpakaInterface_interface_host_h
 
+#include <cassert>
+
 namespace cms::alpakatools {
 
   namespace detail {
@@ -19,7 +21,7 @@ namespace cms::alpakatools {
   }  // namespace detail
 
   // returns the alpaka host device
-  static inline alpaka::DevCpu const& host() {
+  inline alpaka::DevCpu const& host() {
     static const auto host = detail::enumerate_host();
     return host;
   }
diff --git a/code/alpaka_interface/memory.h b/code/alpaka_interface/memory.h
index cbdc6fc0..f572ab45 100644
--- a/code/alpaka_interface/memory.h
+++ b/code/alpaka_interface/memory.h
@@ -77,6 +77,26 @@ namespace cms::alpakatools {
     return alpaka::allocBuf<std::remove_extent_t<T>, Idx>(host(), Vec1D{std::extent_v<T>});
   }
 
+  // non-cached, pinned, scalar and 1-dimensional host buffers: allocated directly, without going through allocCachedBuf;
+  // the memory is pinned according to the device associated to the platform, passed explicitly as <T, TPlatform>
+
+  template <typename T, typename TPlatform>  // T is not an array type: buffer holding a single T
+  std::enable_if_t<not std::is_array_v<T>, host_buffer<T>> make_host_buffer() {
+    return alpaka::allocMappedBuf<TPlatform, T, Idx>(host(), Scalar{});
+  }
+
+  template <typename T, typename TPlatform>  // T is an unbounded 1-dimensional array: buffer of "extent" elements
+  std::enable_if_t<cms::is_unbounded_array_v<T> and not std::is_array_v<std::remove_extent_t<T>>, host_buffer<T>>
+  make_host_buffer(Extent extent) {
+    return alpaka::allocMappedBuf<TPlatform, std::remove_extent_t<T>, Idx>(host(), Vec1D{extent});
+  }
+
+  template <typename T, typename TPlatform>  // T is a bounded 1-dimensional array: buffer of std::extent_v<T> elements
+  std::enable_if_t<cms::is_bounded_array_v<T> and not std::is_array_v<std::remove_extent_t<T>>, host_buffer<T>>
+  make_host_buffer() {
+    return alpaka::allocMappedBuf<TPlatform, std::remove_extent_t<T>, Idx>(host(), Vec1D{std::extent_v<T>});
+  }
+
   // potentially cached, pinned, scalar and 1-dimensional host buffers, associated to a work queue
   // the memory is pinned according to the device associated to the queue
 
@@ -85,7 +105,7 @@ namespace cms::alpakatools {
     if constexpr (allocator_policy<alpaka::Dev<TQueue>> == AllocatorPolicy::Caching) {
       return allocCachedBuf<T, Idx>(host(), queue, Scalar{});
     } else {
-      return alpaka::allocMappedBuf<T, Idx>(host(), alpaka::getDev(queue), Scalar{});
+      return alpaka::allocMappedBuf<alpaka::Pltf<alpaka::Dev<TQueue>>, T, Idx>(host(), Scalar{});
     }
   }
 
@@ -96,7 +116,8 @@ namespace cms::alpakatools {
     if constexpr (allocator_policy<alpaka::Dev<TQueue>> == AllocatorPolicy::Caching) {
       return allocCachedBuf<std::remove_extent_t<T>, Idx>(host(), queue, Vec1D{extent});
     } else {
-      return alpaka::allocMappedBuf<std::remove_extent_t<T>, Idx>(host(), alpaka::getDev(queue), Vec1D{extent});
+      return alpaka::allocMappedBuf<alpaka::Pltf<alpaka::Dev<TQueue>>, std::remove_extent_t<T>, Idx>(host(),
+                                                                                                     Vec1D{extent});
     }
   }
 
@@ -107,8 +128,8 @@ namespace cms::alpakatools {
     if constexpr (allocator_policy<alpaka::Dev<TQueue>> == AllocatorPolicy::Caching) {
       return allocCachedBuf<std::remove_extent_t<T>, Idx>(host(), queue, Vec1D{std::extent_v<T>});
     } else {
-      return alpaka::allocMappedBuf<std::remove_extent_t<T>, Idx>(
-          host(), alpaka::getDev(queue), Vec1D{std::extent_v<T>});
+      return alpaka::allocMappedBuf<alpaka::Pltf<alpaka::Dev<TQueue>>, std::remove_extent_t<T>, Idx>(
+          host(), Vec1D{std::extent_v<T>});
     }
   }
 
diff --git a/code/alpaka_interface/traits.h b/code/alpaka_interface/traits.h
index 8235a416..3083cda7 100644
--- a/code/alpaka_interface/traits.h
+++ b/code/alpaka_interface/traits.h
@@ -32,37 +32,34 @@ namespace cms::alpakatools {
   // is_platform
 
   template <typename T>
-  struct is_platform
-      : std::integral_constant<bool, alpaka::concepts::ImplementsConcept<alpaka::ConceptPltf, T>::value> {};
+  using is_platform = alpaka::concepts::ImplementsConcept<alpaka::ConceptPltf, T>;
 
   template <typename T>
-  constexpr bool is_platform_v = is_platform<T>::value;
+  inline constexpr bool is_platform_v = is_platform<T>::value;
 
   // is_device
 
   template <typename T>
-  struct is_device : std::integral_constant<bool, alpaka::concepts::ImplementsConcept<alpaka::ConceptDev, T>::value> {};
+  using is_device = alpaka::concepts::ImplementsConcept<alpaka::ConceptDev, T>;
 
   template <typename T>
-  constexpr bool is_device_v = is_device<T>::value;
+  inline constexpr bool is_device_v = is_device<T>::value;
 
   // is_accelerator
 
   template <typename T>
-  struct is_accelerator
-      : std::integral_constant<bool, alpaka::concepts::ImplementsConcept<alpaka::ConceptAcc, T>::value> {};
+  using is_accelerator = alpaka::concepts::ImplementsConcept<alpaka::ConceptAcc, T>;
 
   template <typename T>
-  constexpr bool is_accelerator_v = is_accelerator<T>::value;
+  inline constexpr bool is_accelerator_v = is_accelerator<T>::value;
 
   // is_queue
 
   template <typename T>
-  struct is_queue : std::integral_constant<bool, alpaka::concepts::ImplementsConcept<alpaka::ConceptQueue, T>::value> {
-  };
+  using is_queue = alpaka::concepts::ImplementsConcept<alpaka::ConceptQueue, T>;
 
   template <typename T>
-  constexpr bool is_queue_v = is_queue<T>::value;
+  inline constexpr bool is_queue_v = is_queue<T>::value;
 
 }  // namespace cms::alpakatools
 
diff --git a/code/alpaka_interface/vec.h b/code/alpaka_interface/vec.h
new file mode 100644
index 00000000..4126eecf
--- /dev/null
+++ b/code/alpaka_interface/vec.h
@@ -0,0 +1,42 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_vec_h
+#define HeterogeneousCore_AlpakaInterface_interface_vec_h
+
+#include <type_traits>
+
+#include <alpaka/alpaka.hpp>
+
+namespace alpaka {
+
+  //! \return The vector whose i-th component is the minimum of the i-th components of p and of every vector in qs.
+  ALPAKA_NO_HOST_ACC_WARNING
+  template <typename TDim,
+            typename TVal,
+            typename... Vecs,
+            typename = std::enable_if_t<(std::is_same_v<Vec<TDim, TVal>, Vecs> && ...)>>  // every Vecs is Vec<TDim, TVal>
+  ALPAKA_FN_HOST_ACC constexpr auto elementwise_min(Vec<TDim, TVal> const& p, Vecs const&... qs) -> Vec<TDim, TVal> {
+    Vec<TDim, TVal> r;
+    if constexpr (TDim::value > 0) {  // a zero-dimensional vector has no components to compare
+      for (typename TDim::value_type i = 0; i < TDim::value; ++i)
+        r[i] = std::min({p[i], qs[i]...});  // initializer-list overload: minimum over all arguments at once
+    }
+    return r;
+  }
+
+  //! \return The vector whose i-th component is the maximum of the i-th components of p and of every vector in qs.
+  ALPAKA_NO_HOST_ACC_WARNING
+  template <typename TDim,
+            typename TVal,
+            typename... Vecs,
+            typename = std::enable_if_t<(std::is_same_v<Vec<TDim, TVal>, Vecs> && ...)>>  // every Vecs is Vec<TDim, TVal>
+  ALPAKA_FN_HOST_ACC constexpr auto elementwise_max(Vec<TDim, TVal> const& p, Vecs const&... qs) -> Vec<TDim, TVal> {
+    Vec<TDim, TVal> r;
+    if constexpr (TDim::value > 0) {  // a zero-dimensional vector has no components to compare
+      for (typename TDim::value_type i = 0; i < TDim::value; ++i)
+        r[i] = std::max({p[i], qs[i]...});  // initializer-list overload: maximum over all arguments at once
+    }
+    return r;
+  }
+
+}  // namespace alpaka
+
+#endif  // HeterogeneousCore_AlpakaInterface_interface_vec_h
diff --git a/code/alpaka_interface/workdivision.h b/code/alpaka_interface/workdivision.h
new file mode 100644
index 00000000..fd3e10b3
--- /dev/null
+++ b/code/alpaka_interface/workdivision.h
@@ -0,0 +1,266 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_workdivision_h
+#define HeterogeneousCore_AlpakaInterface_interface_workdivision_h
+
+#include <type_traits>
+
+#include <alpaka/alpaka.hpp>
+
+#include "config.h"
+#include "traits.h"
+#include "vec.h"
+
+namespace cms::alpakatools {
+
+  using namespace alpaka_common;
+
+  // Round "value" up to the next multiple of "divisor" (a multiple is returned unchanged); "divisor" must not be zero
+  inline constexpr Idx round_up_by(Idx value, Idx divisor) { return (value + divisor - 1) / divisor * divisor; }
+
+  // Return "value" divided by "divisor", rounded up to the next integer (ceiling division); "divisor" must not be zero
+  inline constexpr Idx divide_up_by(Idx value, Idx divisor) { return (value + divisor - 1) / divisor; }
+
+  // Create an accelerator-dependent work division for 1-dimensional kernels: "blocks" blocks of "elements" elements each
+  template <typename TAcc,
+            typename = std::enable_if_t<cms::alpakatools::is_accelerator_v<TAcc> and alpaka::Dim<TAcc>::value == 1>>
+  inline WorkDiv<Dim1D> make_workdiv(Idx blocks, Idx elements) {
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+    if constexpr (std::is_same_v<TAcc, alpaka::AccGpuCudaRt<Dim1D, Idx>>) {
+      // On GPU backends, each thread is looking at a single element:
+      //   - the number of threads per block is "elements";
+      //   - the number of elements per thread is always 1.
+      return WorkDiv<Dim1D>(blocks, elements, Idx{1});
+    } else
+#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+        if constexpr (std::is_same_v<TAcc, alpaka::AccGpuHipRt<Dim1D, Idx>>) {
+      // On GPU backends, each thread is looking at a single element:
+      //   - the number of threads per block is "elements";
+      //   - the number of elements per thread is always 1.
+      return WorkDiv<Dim1D>(blocks, elements, Idx{1});
+    } else
+#endif  // ALPAKA_ACC_GPU_HIP_ENABLED
+    {
+      // On CPU backends, run serially with a single thread per block:
+      //   - the number of threads per block is always 1;
+      //   - the number of elements per thread is "elements".
+      return WorkDiv<Dim1D>(blocks, Idx{1}, elements);
+    }
+  }
+
+  // Create the accelerator-dependent workdiv for N-dimensional kernels; "blocks" and "elements" hold one count per dimension
+  template <typename TAcc, typename = std::enable_if_t<cms::alpakatools::is_accelerator_v<TAcc>>>
+  inline WorkDiv<alpaka::Dim<TAcc>> make_workdiv(const Vec<alpaka::Dim<TAcc>>& blocks,
+                                                 const Vec<alpaka::Dim<TAcc>>& elements) {
+    using Dim = alpaka::Dim<TAcc>;
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+    if constexpr (std::is_same_v<TAcc, alpaka::AccGpuCudaRt<Dim, Idx>>) {
+      // On GPU backends, each thread is looking at a single element:
+      //   - the number of threads per block is "elements";
+      //   - the number of elements per thread is always 1.
+      return WorkDiv<Dim>(blocks, elements, Vec<Dim>::ones());
+    } else
+#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+        if constexpr (std::is_same_v<TAcc, alpaka::AccGpuHipRt<Dim, Idx>>) {
+      // On GPU backends, each thread is looking at a single element:
+      //   - the number of threads per block is "elements";
+      //   - the number of elements per thread is always 1.
+      return WorkDiv<Dim>(blocks, elements, Vec<Dim>::ones());
+    } else
+#endif  // ALPAKA_ACC_GPU_HIP_ENABLED
+    {
+      // On CPU backends, run serially with a single thread per block:
+      //   - the number of threads per block is always 1;
+      //   - the number of elements per thread is "elements".
+      return WorkDiv<Dim>(blocks, Vec<Dim>::ones(), elements);
+    }
+  }
+
+  template <typename TAcc,
+            typename = std::enable_if_t<cms::alpakatools::is_accelerator_v<TAcc> and alpaka::Dim<TAcc>::value == 1>>
+  class elements_with_stride {  // range of the element indices in [0, extent) assigned to the calling thread
+  public:
+    ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc)
+        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]},
+          first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
+          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
+          extent_{stride_} {}  // no explicit extent: cover exactly one grid-wide stride
+
+    ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc, Idx extent)
+        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]},
+          first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
+          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
+          extent_{extent} {}
+
+    class iterator {
+      friend class elements_with_stride;
+
+      ALPAKA_FN_ACC inline iterator(Idx elements, Idx stride, Idx extent, Idx first)
+          : elements_{elements},
+            stride_{stride},
+            extent_{extent},
+            first_{std::min(first, extent)},
+            index_{first_},
+            last_{std::min(first + elements, extent)} {}
+
+    public:
+      ALPAKA_FN_ACC inline Idx operator*() const { return index_; }
+
+      // pre-increment the iterator
+      ALPAKA_FN_ACC inline iterator& operator++() {
+        // increment the index along the elements processed by the current thread
+        ++index_;
+        if (index_ < last_)
+          return *this;
+
+        // move to this thread's next group of elements; stride_ already is (threads in the grid) * (elements per thread)
+        first_ += stride_;
+        index_ = first_;
+        last_ = std::min(first_ + elements_, extent_);
+        if (index_ < extent_)
+          return *this;
+
+        // the iterator has reached or passed the end of the extent, clamp it to the extent
+        first_ = extent_;
+        index_ = extent_;
+        last_ = extent_;
+        return *this;
+      }
+
+      // post-increment the iterator
+      ALPAKA_FN_ACC inline iterator operator++(int) {
+        iterator old = *this;
+        ++(*this);
+        return old;
+      }
+
+      ALPAKA_FN_ACC inline bool operator==(iterator const& other) const {
+        return (index_ == other.index_) and (first_ == other.first_);
+      }
+
+      ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); }
+
+    private:
+      // non-const to support iterator copy and assignment
+      Idx elements_;  // elements processed by each thread
+      Idx stride_;    // distance, in elements, between two consecutive groups of the same thread
+      Idx extent_;    // one past the largest index that may be returned
+      // modified by the pre/post-increment operator
+      Idx first_;  // first index of the current group
+      Idx index_;  // index returned by operator*
+      Idx last_;   // one past the last index of the current group, clamped to extent_
+    };
+
+    ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, first_); }
+
+    ALPAKA_FN_ACC inline iterator end() const { return iterator(elements_, stride_, extent_, extent_); }
+
+  private:
+    const Idx elements_;
+    const Idx first_;
+    const Idx stride_;
+    const Idx extent_;
+  };
+
+  template <typename TAcc,
+            typename = std::enable_if_t<cms::alpakatools::is_accelerator_v<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
+  class elements_with_stride_nd {  // N-dimensional range of the element indices assigned to the calling thread
+  public:
+    using Dim = alpaka::Dim<TAcc>;
+    using Vec = alpaka::Vec<Dim, Idx>;
+
+    ALPAKA_FN_ACC inline elements_with_stride_nd(TAcc const& acc)
+        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)},
+          first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
+          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_},
+          extent_{stride_} {}  // no explicit extent: cover exactly one grid-wide stride in every dimension
+
+    ALPAKA_FN_ACC inline elements_with_stride_nd(TAcc const& acc, Vec extent)
+        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)},
+          first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
+          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_},
+          extent_{extent} {}
+
+    class iterator {
+      friend class elements_with_stride_nd;
+      constexpr static const auto last_dimension = Dim::value - 1;  // unsigned, like Dim::value
+
+      ALPAKA_FN_ACC inline iterator(Vec elements, Vec stride, Vec extent, Vec first)
+          : elements_{elements},
+            stride_{stride},
+            extent_{extent},
+            first_{alpaka::elementwise_min(first, extent)},
+            index_{first_},
+            last_{std::min(first[last_dimension] + elements[last_dimension], extent[last_dimension])} {}
+
+    public:
+      ALPAKA_FN_ACC inline Vec operator*() const { return index_; }
+
+      // pre-increment the iterator
+      ALPAKA_FN_ACC inline iterator& operator++() {
+        // increment the index along the elements processed by the current thread
+        ++index_[last_dimension];
+        if (index_[last_dimension] < last_)
+          return *this;
+
+        // move to this thread's next group along the last dimension; stride_ already is (threads) * (elements per thread)
+        first_[last_dimension] += stride_[last_dimension];
+        index_[last_dimension] = first_[last_dimension];
+        last_ = std::min(first_[last_dimension] + elements_[last_dimension], extent_[last_dimension]);
+        if (index_[last_dimension] < extent_[last_dimension])
+          return *this;
+
+        // increment the thread index along the outer dimensions with the grid stride; "dimension" is unsigned, count down to 0
+        if constexpr (last_dimension > 0)
+          for (auto dimension = last_dimension; dimension-- > 0;) {
+            first_[dimension] += stride_[dimension];
+            index_[dimension] = first_[dimension];
+            if (index_[dimension] < extent_[dimension])
+              return *this;  // NOTE(review): index_[last_dimension] is not rewound here and is still >= its extent - verify
+          }
+
+        // the iterator has reached or passed the end of the extent, clamp it to the extent
+        first_ = extent_;
+        index_ = extent_;
+        last_ = extent_[last_dimension];
+        return *this;
+      }
+
+      // post-increment the iterator
+      ALPAKA_FN_ACC inline iterator operator++(int) {
+        iterator old = *this;
+        ++(*this);
+        return old;
+      }
+
+      ALPAKA_FN_ACC inline bool operator==(iterator const& other) const {
+        return (index_ == other.index_) and (first_ == other.first_);
+      }
+
+      ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); }
+
+    private:
+      // non-const to support iterator copy and assignment
+      Vec elements_;  // elements processed by each thread, per dimension
+      Vec stride_;    // distance, in elements, between two consecutive groups of the same thread, per dimension
+      Vec extent_;    // one past the largest index that may be returned, per dimension
+      // modified by the pre/post-increment operator
+      Vec first_;  // first index of the current group
+      Vec index_;  // index returned by operator*
+      Idx last_;   // one past the last index of the current group along the last dimension, clamped to the extent
+    };
+
+    ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, first_); }
+
+    ALPAKA_FN_ACC inline iterator end() const { return iterator(elements_, stride_, extent_, extent_); }
+
+  private:
+    const Vec elements_;
+    const Vec first_;
+    const Vec stride_;
+    const Vec extent_;
+  };
+
+}  // namespace cms::alpakatools
+
+#endif  // HeterogeneousCore_AlpakaInterface_interface_workdivision_h
diff --git a/setup_cgpu.sh b/setup_cgpu.sh
index a30c0bf8..3d9b909f 100644
--- a/setup_cgpu.sh
+++ b/setup_cgpu.sh
@@ -7,7 +7,7 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 source $DIR/code/rooutil/thisrooutil.sh
 
 export SCRAM_ARCH=el8_amd64_gcc10
-export CMSSW_VERSION=CMSSW_12_5_0_pre5
+export CMSSW_VERSION=CMSSW_13_0_0_pre2
 export CUDA_HOME=/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/cuda/11.5.2-c927b7e765e06433950d8a7eab9eddb4/
 
 source /cvmfs/cms.cern.ch/cmsset_default.sh
@@ -36,7 +36,7 @@ export LSTPERFORMANCEWEBDIR="/home/users/phchang/public_html/LSTPerformanceWeb"
 export LATEST_CPU_BENCHMARK_EFF_MUONGUN="/data2/segmentlinking/muonGun_cpu_efficiencies.root"
 export LATEST_CPU_BENCHMARK_EFF_PU200="/data2/segmentlinking/pu200_cpu_efficiencies.root"
 
-source /cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220621-4e96939afa0cdb62448c73ead2bb07e0/etc/profile.d/init.sh
+source /cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220902-e80d13b043e1608b43d2007d06ad7e2f/etc/profile.d/init.sh
 export BOOST_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/boost/1.78.0-12075919175e8d078539685f9234134a"
-export ALPAKA_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220621-4e96939afa0cdb62448c73ead2bb07e0"
+export ALPAKA_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220902-e80d13b043e1608b43d2007d06ad7e2f"
 #eof
diff --git a/setup_lnx7188.sh b/setup_lnx7188.sh
index a56a41c3..69ae75a0 100644
--- a/setup_lnx7188.sh
+++ b/setup_lnx7188.sh
@@ -7,7 +7,7 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 source $DIR/code/rooutil/thisrooutil.sh
 
 export SCRAM_ARCH=el8_amd64_gcc10
-export CMSSW_VERSION=CMSSW_12_5_0_pre5
+export CMSSW_VERSION=CMSSW_13_0_0_pre2
 export CUDA_HOME=/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/cuda/11.5.2-c927b7e765e06433950d8a7eab9eddb4/
 
 source /cvmfs/cms.cern.ch/cmsset_default.sh
@@ -36,7 +36,7 @@ export LSTPERFORMANCEWEBDIR="/cdat/tem/${USER}/LSTPerformanceWeb"
 export LATEST_CPU_BENCHMARK_EFF_MUONGUN="/data2/segmentlinking/muonGun_cpu_efficiencies.root"
 export LATEST_CPU_BENCHMARK_EFF_PU200="/data2/segmentlinking/pu200_cpu_efficiencies.root"
 
-source /cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220621-4e96939afa0cdb62448c73ead2bb07e0/etc/profile.d/init.sh
+source /cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220902-e80d13b043e1608b43d2007d06ad7e2f/etc/profile.d/init.sh
 export BOOST_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/boost/1.78.0-12075919175e8d078539685f9234134a"
-export ALPAKA_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220621-4e96939afa0cdb62448c73ead2bb07e0"
+export ALPAKA_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220902-e80d13b043e1608b43d2007d06ad7e2f"
 #eof

From 37d4217c5e3d38f7d414aadc0d4f169f1fe3fd4f Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Thu, 13 Jul 2023 21:33:08 -0400
Subject: [PATCH 40/44] remove no_host_acc_warnings and cleanup

---
 SDL/Hit.cuh            | 2 --
 SDL/Kernels.cuh        | 5 -----
 SDL/MiniDoublet.cuh    | 7 -------
 SDL/PixelTriplet.cuh   | 2 --
 SDL/Quintuplet.cuh     | 3 ---
 SDL/Segment.cuh        | 4 ----
 SDL/TrackCandidate.cuh | 7 -------
 SDL/Triplet.cuh        | 3 ---
 8 files changed, 33 deletions(-)

diff --git a/SDL/Hit.cuh b/SDL/Hit.cuh
index df348127..f4651e1a 100644
--- a/SDL/Hit.cuh
+++ b/SDL/Hit.cuh
@@ -215,7 +215,6 @@ namespace SDL
 
     struct moduleRangesKernel
     {
-        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
             TAcc const & acc,
@@ -246,7 +245,6 @@ namespace SDL
 
     struct hitLoopKernel
     {
-        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
             TAcc const & acc,
diff --git a/SDL/Kernels.cuh b/SDL/Kernels.cuh
index 51ff3e95..a3f761eb 100644
--- a/SDL/Kernels.cuh
+++ b/SDL/Kernels.cuh
@@ -202,7 +202,6 @@ namespace SDL
 
     struct removeDupQuintupletsInGPUAfterBuild
     {
-        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -273,7 +272,6 @@ namespace SDL
 
     struct removeDupQuintupletsInGPUBeforeTC
     {
-        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -359,7 +357,6 @@ namespace SDL
 
     struct removeDupPixelTripletsInGPUFromMap
     {
-        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -407,7 +404,6 @@ namespace SDL
 
     struct removeDupPixelQuintupletsInGPUFromMap
     {
-        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -452,7 +448,6 @@ namespace SDL
 
     struct checkHitspLS
     {
-        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
diff --git a/SDL/MiniDoublet.cuh b/SDL/MiniDoublet.cuh
index 1574a662..ef8abcd8 100644
--- a/SDL/MiniDoublet.cuh
+++ b/SDL/MiniDoublet.cuh
@@ -770,7 +770,6 @@ namespace SDL
 
     struct createMiniDoubletsInGPUv2
     {
-        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -792,10 +791,6 @@ namespace SDL
                 int nLowerHits = hitsInGPU.hitRangesnLower[lowerModuleIndex];
                 int nUpperHits = hitsInGPU.hitRangesnUpper[lowerModuleIndex];
                 if(hitsInGPU.hitRangesLower[lowerModuleIndex] == -1) continue;
-                if(hitsInGPU.hitRangesLower[lowerModuleIndex] == -1)
-                {
-                    printf("IS THIS EVER RUN");
-                }
                 const int maxHits = alpaka::math::max(acc, nUpperHits, nLowerHits);
                 unsigned int upHitArrayIndex = hitsInGPU.hitRangesUpper[lowerModuleIndex];
                 unsigned int loHitArrayIndex = hitsInGPU.hitRangesLower[lowerModuleIndex];
@@ -844,7 +839,6 @@ namespace SDL
 
     struct createMDArrayRangesGPU
     {
-        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -914,7 +908,6 @@ namespace SDL
 
     struct addMiniDoubletRangesToEventExplicit
     {
-        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
diff --git a/SDL/PixelTriplet.cuh b/SDL/PixelTriplet.cuh
index a8e76c2e..7f0a1f16 100644
--- a/SDL/PixelTriplet.cuh
+++ b/SDL/PixelTriplet.cuh
@@ -846,7 +846,6 @@ namespace SDL
 
     struct createPixelTripletsInGPUFromMapv2
     {
-        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -2246,7 +2245,6 @@ namespace SDL
 
     struct createPixelQuintupletsInGPUFromMapv2
     {
-        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
diff --git a/SDL/Quintuplet.cuh b/SDL/Quintuplet.cuh
index e29d8bde..67e9fb54 100644
--- a/SDL/Quintuplet.cuh
+++ b/SDL/Quintuplet.cuh
@@ -2164,7 +2164,6 @@ namespace SDL
 
     struct createQuintupletsInGPUv2
     {
-        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -2259,7 +2258,6 @@ namespace SDL
 
     struct createEligibleModulesListForQuintupletsGPU
     {
-        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -2335,7 +2333,6 @@ namespace SDL
 
     struct addQuintupletRangesToEventExplicit
     {
-        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
diff --git a/SDL/Segment.cuh b/SDL/Segment.cuh
index ab8dd13d..c3d2475f 100644
--- a/SDL/Segment.cuh
+++ b/SDL/Segment.cuh
@@ -678,7 +678,6 @@ namespace SDL
 
     struct createSegmentsInGPUv2
     {
-        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -755,7 +754,6 @@ namespace SDL
 
     struct createSegmentArrayRanges
     {
-        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -833,7 +831,6 @@ namespace SDL
 
     struct addSegmentRangesToEventExplicit
     {
-        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -866,7 +863,6 @@ namespace SDL
 
     struct addPixelSegmentToEventKernel
     {
-        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
             TAcc const & acc,
diff --git a/SDL/TrackCandidate.cuh b/SDL/TrackCandidate.cuh
index 250f4558..12a602f2 100644
--- a/SDL/TrackCandidate.cuh
+++ b/SDL/TrackCandidate.cuh
@@ -190,7 +190,6 @@ namespace SDL
 
     struct crossCleanpT3
     {
-        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -239,7 +238,6 @@ namespace SDL
 
     struct crossCleanT5
     {
-        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -306,7 +304,6 @@ namespace SDL
     // This will eliminate the need for another kernel just for adding the pLS, because we can __syncthreads()
     struct crossCleanpLS
     {
-        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -393,7 +390,6 @@ namespace SDL
 
     struct addpT3asTrackCandidatesInGPU
     {
-        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -429,7 +425,6 @@ namespace SDL
 
     struct addT5asTrackCandidateInGPU
     {
-        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -467,7 +462,6 @@ namespace SDL
 
     struct addpLSasTrackCandidateInGPU
     {
-        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -497,7 +491,6 @@ namespace SDL
 
     struct addpT5asTrackCandidateInGPU
     {
-        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
diff --git a/SDL/Triplet.cuh b/SDL/Triplet.cuh
index 08c59d34..49454a90 100644
--- a/SDL/Triplet.cuh
+++ b/SDL/Triplet.cuh
@@ -1296,7 +1296,6 @@ namespace SDL
 
     struct createTripletsInGPUv2
     {
-        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -1372,7 +1371,6 @@ namespace SDL
 
     struct createTripletArrayRanges
     {
-        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,
@@ -1449,7 +1447,6 @@ namespace SDL
 
     struct addTripletRangesToEventExplicit
     {
-        ALPAKA_NO_HOST_ACC_WARNING
         template<typename TAcc>
         ALPAKA_FN_ACC void operator()(
                 TAcc const & acc,

From 37cd75fe7b705ef16e05b8f462f0407da96d6732 Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Fri, 14 Jul 2023 12:59:25 -0400
Subject: [PATCH 41/44] turn off half precision code

---
 SDL/Constants.cuh | 42 ++++++++++++++++++++----------------------
 1 file changed, 20 insertions(+), 22 deletions(-)

diff --git a/SDL/Constants.cuh b/SDL/Constants.cuh
index d3b698cd..854fe419 100644
--- a/SDL/Constants.cuh
+++ b/SDL/Constants.cuh
@@ -6,56 +6,54 @@
 
 // CUDA headers. Will be removed soon.
 #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-#include <cuda.h>
 #include <cuda_fp16.h>
-#include <cuda_runtime.h>
 #endif
 
 //This changes pT5 and pT3 and T3 completely. T5 for non regression parameters
-#if defined(FP16_Base) && defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-#define __F2H __float2half  
-#define __H2F __half2float  
-typedef __half FPX;
+#if defined(FP16_Base)
+#define __F2H //__float2half  
+#define __H2F //__half2float  
+typedef float FPX;
 #else
 #define __F2H
 #define __H2F
 typedef float FPX; 
 #endif
 
-#if defined(FP16_T5) && defined(ALPAKA_ACC_GPU_CUDA_ENABLED) // changes T5 regression values
-#define __F2H_T5 __float2half  
-#define __H2F_T5 __half2float  
-typedef __half FPX_T5;
+#if defined(FP16_T5) // changes T5 regression values
+#define __F2H_T5 //__float2half  
+#define __H2F_T5 //__half2float  
+typedef float FPX_T5;
 #else
 #define __F2H_T5
 #define __H2F_T5
 typedef float FPX_T5; 
 #endif
 
-#if defined(FP16_dPhi) && defined(ALPAKA_ACC_GPU_CUDA_ENABLED) // changes segment dPhi values
-#define __F2H_dPhi __float2half  
-#define __H2F_dPhi __half2float  
-typedef __half FPX_dPhi;
+#if defined(FP16_dPhi) // changes segment dPhi values
+#define __F2H_dPhi //__float2half  
+#define __H2F_dPhi //__half2float  
+typedef float FPX_dPhi;
 #else
 #define __F2H_dPhi
 #define __H2F_dPhi
 typedef float FPX_dPhi; 
 #endif
 
-#if defined(FP16_circle) && defined(ALPAKA_ACC_GPU_CUDA_ENABLED) // changes segment circle values
-#define __F2H_circle __float2half  
-#define __H2F_circle __half2float  
-typedef __half FPX_circle;
+#if defined(FP16_circle) // changes segment circle values
+#define __F2H_circle //__float2half  
+#define __H2F_circle //__half2float  
+typedef float FPX_circle;
 #else
 #define __F2H_circle
 #define __H2F_circle
 typedef float FPX_circle; 
 #endif
 
-#if defined(FP16_seg) && defined(ALPAKA_ACC_GPU_CUDA_ENABLED)  // changes segment values
-#define __F2H_seg __float2half  
-#define __H2F_seg __half2float  
-typedef __half FPX_seg;
+#if defined(FP16_seg) // changes segment values
+#define __F2H_seg //__float2half  
+#define __H2F_seg //__half2float  
+typedef float FPX_seg;
 #else
 #define __F2H_seg
 #define __H2F_seg

From 8b966b7cc7d61c461115c15e8c3936a9b8121752 Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Fri, 14 Jul 2023 13:21:50 -0400
Subject: [PATCH 42/44] remove unused wrapper functions

---
 SDL/Constants.cuh | 51 +++++------------------------------------------
 1 file changed, 5 insertions(+), 46 deletions(-)

diff --git a/SDL/Constants.cuh b/SDL/Constants.cuh
index 854fe419..e52de941 100644
--- a/SDL/Constants.cuh
+++ b/SDL/Constants.cuh
@@ -4,60 +4,19 @@
 #include <alpaka/alpaka.hpp>
 #include "../code/alpaka_interface/CachedBufAlloc.h"
 
-// CUDA headers. Will be removed soon.
 #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
 #include <cuda_fp16.h>
 #endif
 
-//This changes pT5 and pT3 and T3 completely. T5 for non regression parameters
+// Half precision wrapper functions, turned off.
 #if defined(FP16_Base)
-#define __F2H //__float2half  
-#define __H2F //__half2float  
-typedef float FPX;
+#define __F2H //__float2half
+#define __H2F //__half2float
+typedef /*__half*/ float FPX;
 #else
 #define __F2H
 #define __H2F
-typedef float FPX; 
-#endif
-
-#if defined(FP16_T5) // changes T5 regression values
-#define __F2H_T5 //__float2half  
-#define __H2F_T5 //__half2float  
-typedef float FPX_T5;
-#else
-#define __F2H_T5
-#define __H2F_T5
-typedef float FPX_T5; 
-#endif
-
-#if defined(FP16_dPhi) // changes segment dPhi values
-#define __F2H_dPhi //__float2half  
-#define __H2F_dPhi //__half2float  
-typedef float FPX_dPhi;
-#else
-#define __F2H_dPhi
-#define __H2F_dPhi
-typedef float FPX_dPhi; 
-#endif
-
-#if defined(FP16_circle) // changes segment circle values
-#define __F2H_circle //__float2half  
-#define __H2F_circle //__half2float  
-typedef float FPX_circle;
-#else
-#define __F2H_circle
-#define __H2F_circle
-typedef float FPX_circle; 
-#endif
-
-#if defined(FP16_seg) // changes segment values
-#define __F2H_seg //__float2half  
-#define __H2F_seg //__half2float  
-typedef float FPX_seg;
-#else
-#define __F2H_seg
-#define __H2F_seg
-typedef float FPX_seg; 
+typedef float FPX;
 #endif
 
 using Idx = std::size_t;

From 9a84b03a5aad32a38a6ccce3ee3fea9be4ac9947 Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Sun, 16 Jul 2023 12:49:53 -0400
Subject: [PATCH 43/44] removed unused score variables

---
 SDL/Kernels.cuh | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/SDL/Kernels.cuh b/SDL/Kernels.cuh
index a3f761eb..ed845382 100644
--- a/SDL/Kernels.cuh
+++ b/SDL/Kernels.cuh
@@ -372,15 +372,13 @@ namespace SDL
 
             for (unsigned int ix = globalThreadIdx[1]; ix < *pixelTripletsInGPU.nPixelTriplets; ix += gridThreadExtent[1])
             {
-                float score1 = __H2F(pixelTripletsInGPU.score[ix]);
                 for(unsigned int jx = globalThreadIdx[2]; jx < *pixelTripletsInGPU.nPixelTriplets; jx += gridThreadExtent[2])
                 {
-                    float score2 = __H2F(pixelTripletsInGPU.score[jx]);
                     if(ix == jx)
                         continue;
 
                     int nMatched[2];
-                    checkHitspT3(ix,jx,pixelTripletsInGPU,nMatched);
+                    checkHitspT3(ix, jx, pixelTripletsInGPU, nMatched);
                     if((nMatched[0] + nMatched[1]) >= 5)
                     {
                         // Check the layers

From 46b91b8e5fae97b960786b5a1ee64fce341fb085 Mon Sep 17 00:00:00 2001
From: GNiendorf <gavinniendorf@gmail.com>
Date: Sun, 16 Jul 2023 15:41:13 -0400
Subject: [PATCH 44/44] group addXtoEventExplicit functions

---
 SDL/Event.cu | 140 +++++++++++++++++++++++++--------------------------
 1 file changed, 70 insertions(+), 70 deletions(-)

diff --git a/SDL/Event.cu b/SDL/Event.cu
index 98fef91e..46595d63 100644
--- a/SDL/Event.cu
+++ b/SDL/Event.cu
@@ -388,76 +388,6 @@ void SDL::Event::addPixelSegmentToEvent(std::vector<unsigned int> hitIndices0,st
     alpaka::wait(queue);
 }
 
-void SDL::Event::addMiniDoubletsToEventExplicit()
-{
-    auto nMDsCPU_buf = allocBufWrapper<int>(devHost, nLowerModules, queue);
-    alpaka::memcpy(queue, nMDsCPU_buf, miniDoubletsBuffers->nMDs_buf, nLowerModules);
-
-    auto module_subdets_buf = allocBufWrapper<short>(devHost, nLowerModules, queue);
-    alpaka::memcpy(queue, module_subdets_buf, modulesBuffers->subdets_buf, nLowerModules);
-
-    auto module_layers_buf = allocBufWrapper<short>(devHost, nLowerModules, queue);
-    alpaka::memcpy(queue, module_layers_buf, modulesBuffers->layers_buf, nLowerModules);
-
-    auto module_hitRanges_buf = allocBufWrapper<int>(devHost, nLowerModules*2, queue);
-    alpaka::memcpy(queue, module_hitRanges_buf, hitsBuffers->hitRanges_buf, nLowerModules*2);
-
-    alpaka::wait(queue);
-
-    int* nMDsCPU = alpaka::getPtrNative(nMDsCPU_buf);
-    short* module_subdets = alpaka::getPtrNative(module_subdets_buf);
-    short* module_layers = alpaka::getPtrNative(module_layers_buf);
-    int* module_hitRanges = alpaka::getPtrNative(module_hitRanges_buf);
-
-    for(unsigned int i = 0; i<nLowerModules; i++)
-    {
-        if(!(nMDsCPU[i] == 0 or module_hitRanges[i * 2] == -1))
-        {
-            if(module_subdets[i] == Barrel)
-            {
-                n_minidoublets_by_layer_barrel_[module_layers[i] -1] += nMDsCPU[i];
-            }
-            else
-            {
-                n_minidoublets_by_layer_endcap_[module_layers[i] - 1] += nMDsCPU[i];
-            }
-        }
-    }
-}
-
-void SDL::Event::addSegmentsToEventExplicit()
-{
-    auto nSegmentsCPU_buf = allocBufWrapper<int>(devHost, nLowerModules, queue);
-    alpaka::memcpy(queue, nSegmentsCPU_buf, segmentsBuffers->nSegments_buf, nLowerModules);
-
-    auto module_subdets_buf = allocBufWrapper<short>(devHost, nLowerModules, queue);
-    alpaka::memcpy(queue, module_subdets_buf, modulesBuffers->subdets_buf, nLowerModules);
-
-    auto module_layers_buf = allocBufWrapper<short>(devHost, nLowerModules, queue);
-    alpaka::memcpy(queue, module_layers_buf, modulesBuffers->layers_buf, nLowerModules);
-
-    alpaka::wait(queue);
-
-    int* nSegmentsCPU = alpaka::getPtrNative(nSegmentsCPU_buf);
-    short* module_subdets = alpaka::getPtrNative(module_subdets_buf);
-    short* module_layers = alpaka::getPtrNative(module_layers_buf);
-
-    for(unsigned int i = 0; i<nLowerModules; i++)
-    {
-        if(!(nSegmentsCPU[i] == 0))
-        {
-            if(module_subdets[i] == Barrel)
-            {
-                n_segments_by_layer_barrel_[module_layers[i] - 1] += nSegmentsCPU[i];
-            }
-            else
-            {
-                n_segments_by_layer_endcap_[module_layers[i] -1] += nSegmentsCPU[i];
-            }
-        }
-    }
-}
-
 void SDL::Event::createMiniDoublets()
 {
     // Create a view for the element nLowerModules inside rangesBuffers->miniDoubletModuleOccupancy
@@ -1207,6 +1137,76 @@ void SDL::Event::createPixelQuintuplets()
 #endif
 }
 
+// Adds each lower module's mini-doublet count to the per-layer barrel or endcap totals.
+void SDL::Event::addMiniDoubletsToEventExplicit()
+{
+    // Stage the per-module arrays in buffers allocated on devHost (two hit-range entries per module).
+    auto countsHost_buf = allocBufWrapper<int>(devHost, nLowerModules, queue);
+    alpaka::memcpy(queue, countsHost_buf, miniDoubletsBuffers->nMDs_buf, nLowerModules);
+
+    auto subdetsHost_buf = allocBufWrapper<short>(devHost, nLowerModules, queue);
+    alpaka::memcpy(queue, subdetsHost_buf, modulesBuffers->subdets_buf, nLowerModules);
+
+    auto layersHost_buf = allocBufWrapper<short>(devHost, nLowerModules, queue);
+    alpaka::memcpy(queue, layersHost_buf, modulesBuffers->layers_buf, nLowerModules);
+
+    auto hitRangesHost_buf = allocBufWrapper<int>(devHost, nLowerModules*2, queue);
+    alpaka::memcpy(queue, hitRangesHost_buf, hitsBuffers->hitRanges_buf, nLowerModules*2);
+
+    // Wait for the queue so the copies above have completed before the data is read.
+    alpaka::wait(queue);
+
+    const int* counts = alpaka::getPtrNative(countsHost_buf);
+    const short* subdets = alpaka::getPtrNative(subdetsHost_buf);
+    const short* layers = alpaka::getPtrNative(layersHost_buf);
+    const int* hitRanges = alpaka::getPtrNative(hitRangesHost_buf);
+
+    for(unsigned int iModule = 0; iModule < nLowerModules; ++iModule)
+    {
+        // Nothing to add when the module has no mini-doublets or its first hit-range entry is -1.
+        if(counts[iModule] == 0 or hitRanges[iModule * 2] == -1) continue;
+
+        const int layerSlot = layers[iModule] - 1; // counters are indexed by layer - 1
+        if(subdets[iModule] == Barrel)
+            n_minidoublets_by_layer_barrel_[layerSlot] += counts[iModule];
+        else
+            n_minidoublets_by_layer_endcap_[layerSlot] += counts[iModule];
+    }
+}
+
+// Adds each lower module's segment count to the per-layer barrel or endcap totals.
+void SDL::Event::addSegmentsToEventExplicit()
+{
+    // Stage the per-module arrays in buffers allocated on devHost.
+    auto countsHost_buf = allocBufWrapper<int>(devHost, nLowerModules, queue);
+    alpaka::memcpy(queue, countsHost_buf, segmentsBuffers->nSegments_buf, nLowerModules);
+
+    auto subdetsHost_buf = allocBufWrapper<short>(devHost, nLowerModules, queue);
+    alpaka::memcpy(queue, subdetsHost_buf, modulesBuffers->subdets_buf, nLowerModules);
+
+    auto layersHost_buf = allocBufWrapper<short>(devHost, nLowerModules, queue);
+    alpaka::memcpy(queue, layersHost_buf, modulesBuffers->layers_buf, nLowerModules);
+
+    // Wait for the queue so the copies above have completed before the data is read.
+    alpaka::wait(queue);
+
+    const int* counts = alpaka::getPtrNative(countsHost_buf);
+    const short* subdets = alpaka::getPtrNative(subdetsHost_buf);
+    const short* layers = alpaka::getPtrNative(layersHost_buf);
+
+    for(unsigned int iModule = 0; iModule < nLowerModules; ++iModule)
+    {
+        // Modules without segments contribute nothing.
+        if(counts[iModule] == 0) continue;
+
+        const int layerSlot = layers[iModule] - 1; // counters are indexed by layer - 1
+        if(subdets[iModule] == Barrel)
+            n_segments_by_layer_barrel_[layerSlot] += counts[iModule];
+        else
+            n_segments_by_layer_endcap_[layerSlot] += counts[iModule];
+    }
+}
+
 void SDL::Event::addQuintupletsToEventExplicit()
 {
     auto nQuintupletsCPU_buf = allocBufWrapper<int>(devHost, nLowerModules, queue);