-
Notifications
You must be signed in to change notification settings - Fork 35
/
Copy pathgpuClusterTracksByDensity.h
234 lines (200 loc) · 7.42 KB
/
gpuClusterTracksByDensity.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksByDensity_h
#define RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksByDensity_h
#include <algorithm>
#include <cmath>
#include <cstdint>
#include "CUDACore/HistoContainer.h"
#include "CUDACore/cuda_assert.h"
#include "gpuVertexFinder.h"
namespace gpuVertexFinder {
// this algo does not really scale as it works in a single block...
// enough for <10K tracks we have
//
// based on Rodrighez&Laio algo
//
__device__ __forceinline__ void clusterTracksByDensity(gpuVertexFinder::ZVertices* pdata,
gpuVertexFinder::WorkSpace* pws,
int minT, // min number of neighbours to be "seed"
float eps, // max absolute distance to cluster
float errmax, // max error to be "seed"
float chi2max // max normalized distance to cluster
) {
using namespace gpuVertexFinder;
constexpr bool verbose = false; // in principle the compiler should optmize out if false
if (verbose && 0 == threadIdx.x)
printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);
auto er2mx = errmax * errmax;
auto& __restrict__ data = *pdata;
auto& __restrict__ ws = *pws;
auto nt = ws.ntrks;
float const* __restrict__ zt = ws.zt;
float const* __restrict__ ezt2 = ws.ezt2;
uint32_t& nvFinal = data.nvFinal;
uint32_t& nvIntermediate = ws.nvIntermediate;
uint8_t* __restrict__ izt = ws.izt;
int32_t* __restrict__ nn = data.ndof;
int32_t* __restrict__ iv = ws.iv;
assert(pdata);
assert(zt);
using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
__shared__ Hist hist;
__shared__ typename Hist::Counter hws[32];
for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
hist.off[j] = 0;
}
__syncthreads();
if (verbose && 0 == threadIdx.x)
printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);
assert(nt <= hist.capacity());
// fill hist (bin shall be wider than "eps")
for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
assert(i < ZVertices::MAXTRACKS);
int iz = int(zt[i] * 10.); // valid if eps<=0.1
// iz = std::clamp(iz, INT8_MIN, INT8_MAX); // sorry c++17 only
iz = std::min(std::max(iz, INT8_MIN), INT8_MAX);
izt[i] = iz - INT8_MIN;
assert(iz - INT8_MIN >= 0);
assert(iz - INT8_MIN < 256);
hist.count(izt[i]);
iv[i] = i;
nn[i] = 0;
}
__syncthreads();
if (threadIdx.x < 32)
hws[threadIdx.x] = 0; // used by prefix scan...
__syncthreads();
hist.finalize(hws);
__syncthreads();
assert(hist.size() == nt);
for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
hist.fill(izt[i], uint16_t(i));
}
__syncthreads();
// count neighbours
for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
if (ezt2[i] > er2mx)
continue;
auto loop = [&](uint32_t j) {
if (i == j)
return;
auto dist = std::abs(zt[i] - zt[j]);
if (dist > eps)
return;
if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
return;
nn[i]++;
};
cms::cuda::forEachInBins(hist, izt[i], 1, loop);
}
__syncthreads();
// find closest above me .... (we ignore the possibility of two j at same distance from i)
for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
float mdist = eps;
auto loop = [&](uint32_t j) {
if (nn[j] < nn[i])
return;
if (nn[j] == nn[i] && zt[j] >= zt[i])
return; // if equal use natural order...
auto dist = std::abs(zt[i] - zt[j]);
if (dist > mdist)
return;
if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
return; // (break natural order???)
mdist = dist;
iv[i] = j; // assign to cluster (better be unique??)
};
cms::cuda::forEachInBins(hist, izt[i], 1, loop);
}
__syncthreads();
#ifdef GPU_DEBUG
// mini verification
for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
if (iv[i] != int(i))
assert(iv[iv[i]] != int(i));
}
__syncthreads();
#endif
// consolidate graph (percolate index of seed)
for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
auto m = iv[i];
while (m != iv[m])
m = iv[m];
iv[i] = m;
}
#ifdef GPU_DEBUG
__syncthreads();
// mini verification
for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
if (iv[i] != int(i))
assert(iv[iv[i]] != int(i));
}
#endif
#ifdef GPU_DEBUG
// and verify that we did not spit any cluster...
for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
auto minJ = i;
auto mdist = eps;
auto loop = [&](uint32_t j) {
if (nn[j] < nn[i])
return;
if (nn[j] == nn[i] && zt[j] >= zt[i])
return; // if equal use natural order...
auto dist = std::abs(zt[i] - zt[j]);
if (dist > mdist)
return;
if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
return;
mdist = dist;
minJ = j;
};
cms::cuda::forEachInBins(hist, izt[i], 1, loop);
// should belong to the same cluster...
assert(iv[i] == iv[minJ]);
assert(nn[i] <= nn[iv[i]]);
}
__syncthreads();
#endif
__shared__ unsigned int foundClusters;
foundClusters = 0;
__syncthreads();
// find the number of different clusters, identified by a tracks with clus[i] == i and density larger than threshold;
// mark these tracks with a negative id.
for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
if (iv[i] == int(i)) {
if (nn[i] >= minT) {
auto old = atomicInc(&foundClusters, 0xffffffff);
iv[i] = -(old + 1);
} else { // noise
iv[i] = -9998;
}
}
}
__syncthreads();
assert(foundClusters < ZVertices::MAXVTX);
// propagate the negative id to all the tracks in the cluster.
for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
if (iv[i] >= 0) {
// mark each track in a cluster with the same id as the first one
iv[i] = iv[iv[i]];
}
}
__syncthreads();
// adjust the cluster id to be a positive value starting from 0
for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
iv[i] = -iv[i] - 1;
}
nvIntermediate = nvFinal = foundClusters;
if (verbose && 0 == threadIdx.x)
printf("found %d proto vertices\n", foundClusters);
}
__global__ void clusterTracksByDensityKernel(gpuVertexFinder::ZVertices* pdata,
gpuVertexFinder::WorkSpace* pws,
int minT, // min number of neighbours to be "seed"
float eps, // max absolute distance to cluster
float errmax, // max error to be "seed"
float chi2max // max normalized distance to cluster
) {
clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
}
} // namespace gpuVertexFinder
#endif // RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksByDensity_h