From 25bca57efa970c25a703aebf611bedb96d199fab Mon Sep 17 00:00:00 2001 From: jaycedowell Date: Mon, 8 Jul 2019 11:23:07 -0600 Subject: [PATCH 01/11] Initial commit of the Project Orville wideband gridder and imager. --- python/bifrost/orville.py | 266 +++++++++ src/bifrost/orville.h | 53 ++ src/bifrost/romein.h | 51 ++ src/orville.cu | 1095 +++++++++++++++++++++++++++++++++++++ src/romein_kernels.cuh | 42 ++ 5 files changed, 1507 insertions(+) create mode 100644 python/bifrost/orville.py create mode 100644 src/bifrost/orville.h create mode 100644 src/bifrost/romein.h create mode 100644 src/orville.cu create mode 100644 src/romein_kernels.cuh diff --git a/python/bifrost/orville.py b/python/bifrost/orville.py new file mode 100644 index 000000000..0bba31dfe --- /dev/null +++ b/python/bifrost/orville.py @@ -0,0 +1,266 @@ +# Copyright (c) 2018, The Bifrost Authors. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of The Bifrost Authors nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from libbifrost import _bf, _check, _get, BifrostObject, _string2space +from ndarray import ndarray, zeros, asarray, memset_array, copy_array + +from bifrost.fft import Fft +from bifrost.map import map +from bifrost.reduce import reduce + +import ctypes +import numpy + +import time + +from bifrost.device import stream_synchronize as BFSync + +class Orville(BifrostObject): + def __init__(self): + BifrostObject.__init__(self, _bf.bfOrvilleCreate, _bf.bfOrvilleDestroy) + def init(self, positions, weights, kernel, gridsize, gridres, gridwres=0.5, oversample=1, polmajor=True, space='cuda'): + bspace = _string2space(space) + psize = None + _check( _bf.bfOrvilleInit(self.obj, + asarray(positions).as_BFarray(), + asarray(weights).as_BFarray(), + asarray(kernel).as_BFarray(), + gridsize, + gridres, + gridwres, + oversample, + polmajor, + bspace, + 0, + psize) ) + + # Cache the kernel for later + self._kernel = kernel + + # Cache some metdata for later + self._origsize = gridsize + self._res = gridres + self._maxsupport = kernel.size/oversample + self._oversample = oversample + self._dtype = kernel.dtype + self._space = space + + # Build the image correction kernel and the gridding kernels + self._set_image_correction_kernel() + self._set_projection_kernels() + + def __del__(self): + try: + del self._subgrid + del self._stcgrid + del self._w_kernels + del self._fft + except AttributeError: + pass + + BifrostObject.__del__(self) + + def _get_gridding_setup(self, force=False): + try: + self._ntimechan + if force: + raise AttributeError + except AttributeError: + # Get the projection setup information + ntimechan, gridsize, npol, nplane = ctypes.c_int(0), ctypes.c_int(0), ctypes.c_int(0), ctypes.c_int(0) + midpoints = zeros(128, dtype='f32', space='system') + _check( _bf.bfOrvilleGetProjectionSetup(self.obj, + ctypes.byref(ntimechan), + ctypes.byref(gridsize), + ctypes.byref(npol), + ctypes.byref(nplane), + asarray(midpoints).as_BFarray()) ) + ntimechan, gridsize, npol, nplane = ntimechan.value, gridsize.value, npol.value, nplane.value + midpoints = midpoints[:nplane] + self._ntimechan = ntimechan + self._gridsize = gridsize + self._npol = npol + self._nplane = nplane + self._midpoints = midpoints + print "Working with %i planes" % self._nplane + + def _set_image_correction_kernel(self, force=False): + self._get_gridding_setup(force=force) + + # Cleanup the old image correction kernel + try: + del self._img_correction + except AttributeError: + pass + + # Get the new image correction kernel + corr = numpy.zeros(self._gridsize*self._oversample, dtype=self._dtype) + c = corr.size/2 + d = self._kernel.size/2 + corr[c-d:c+d] = self._kernel.copy(space='system') + corr[:c-d] = numpy.arange(c-d) * corr[c-d]/(c-d) + corr[c+d:] = corr[c-d] - numpy.arange(c-d) * corr[c-d]/(c-d) + corr = numpy.fft.ifftshift(corr) + corr = numpy.fft.ifft(corr).real + corr = numpy.fft.fftshift(corr) + corr = corr[corr.size/2-self._gridsize/2-self._gridsize%2:corr.size/2+self._gridsize/2] * self._oversample + #import pylab + #pylab.plot(corr) + #pylab.plot(1/corr) + #pylab.show() + corr = numpy.fft.fftshift(corr) + self._img_correction = ndarray(corr, dtype='f32', space='cuda') + + def _get_lm(self, gridsize, gridres): + m,l = numpy.indices((gridsize,gridsize)) + l,m = numpy.where(l > gridsize/2, gridsize-l, -l), numpy.where(m > gridsize/2, m-gridsize, m) + l,m = l.astype(numpy.float32)/gridsize/gridres, m.astype(numpy.float32)/gridsize/gridres + return l,m + + def _set_projection_kernels(self): + self._get_gridding_setup() + + # Set the sub and stacked grids + try: + del self._subgrid + del self._stcgrid + except AttributeError: + pass + self._subgrid = zeros((self._nplane,self._ntimechan,self._npol,self._gridsize,self._gridsize), + dtype=self._dtype, + space=self._space) + self._stcgrid = zeros((1,self._ntimechan,self._npol,self._gridsize,self._gridsize), + dtype=self._dtype, + space=self._space) + + # Set the w projection kernels + try: + del self._w_kernels + except AttributeError: + pass + w_kernels = numpy.zeros((self._nplane,self._gridsize,self._gridsize), dtype=self._dtype) + l,m = self._get_lm(self._gridsize, self._res) + theta = numpy.sqrt(1.0 + 0.0j - l**2 - m**2) + for i,avg_w in enumerate(self._midpoints): + sub_w_kernel = numpy.exp(2j*numpy.pi*avg_w*(1.0-theta)) / self._gridsize**2 / self._gridsize**2 + w_kernels[i,:,:] = sub_w_kernel + self._w_kernels = ndarray(w_kernels, space=self._space) + + def set_positions(self, positions): + _check( _bf.bfOrvilleSetPositions(self.obj, + asarray(positions).as_BFarray()) ) + self._set_projection_kernels(force=True) + + def set_weights(self, weights): + weights = numpy.fft.fftshift(weights) + _check( _bf.bfOrvilleSetWeights(self.obj, + asarray(weights).as_BFarray()) ) + + def set_kernel(self, kernel): + self._kernel = kernel + _check( _bf.bfOrvilleSetKernel(self.obj, + asarray(kernel).as_BFarray()) ) + self._set_image_correction_kernel() + + def execute(self, idata, odata): + # TODO: Work out how to integrate CUDA stream + # Zero out the subgrids + memset_array(self._subgrid, 0) + self._stcgrid = self._stcgrid.reshape(1,self._ntimechan,self._npol,self._gridsize,self._gridsize) + + # Grid + _check( _bf.bfOrvilleExecute(self.obj, + asarray(idata).as_BFarray(), + asarray(self._subgrid).as_BFarray()) ) + junk = self._subgrid.copy(space='system') + print junk.max() + + if self._subgrid.shape[0] > 1: + # W project + ## FFT + self._subgrid = self._subgrid.reshape(-1,self._gridsize,self._gridsize) + try: + self._fft.execute(self._subgrid, self._subgrid, inverse=True) + except AttributeError: + self._fft = Fft() + self._fft.init(self._subgrid, self._subgrid, axes=(1,2)) + self._fft.execute(self._subgrid, self._subgrid, inverse=True) + self._subgrid = self._subgrid.reshape(self._nplane,self._ntimechan,self._npol,self._gridsize,self._gridsize) + + ## Project + map('a(w,c,p,i,j) *= b(w,i,j)', + {'a':self._subgrid, 'b':self._w_kernels}, + axis_names=('w','c','p','i','j'), shape=self._subgrid.shape) + + ## IFFT + self._subgrid = self._subgrid.reshape(-1,self._gridsize,self._gridsize) + self._fft.execute(self._subgrid, self._subgrid, inverse=False) + self._subgrid = self._subgrid.reshape(self._nplane,self._ntimechan,self._npol,self._gridsize,self._gridsize) + junk = self._subgrid.copy(space='system') + print 'post-W', junk.max() + + # Stack + reduce(self._subgrid, self._stcgrid, op='sum') + + else: + copy_array(self._stcgrid, self._subgrid) + + # (u,v) plane -> image + ## IFFT + self._stcgrid = self._stcgrid.reshape(-1,self._gridsize,self._gridsize) + try: + self._fft.execute(self._stcgrid, self._stcgrid, inverse=True) + except AttributeError: + self._fft = Fft() + self._fft.init(self._stcgrid, self._stcgrid, axes=(1,2)) + self._fft.execute(self._stcgrid, self._stcgrid, inverse=True) + junk = self._stcgrid.copy(space='system') + print 'post-IFFT', junk.max() + + ## Correct for the kernel + map('img(c,i,j) /= corr(i)*corr(j)', + {'img':self._stcgrid, 'corr':self._img_correction}, + axis_names=('c','i','j'), shape=self._stcgrid.shape) + junk = self._stcgrid.copy(space='system') + print 'post-corr', junk.max() + + # Shift and accumulate onto odata + oshape = odata.shape + odata = odata.reshape(-1,odata.shape[-2],odata.shape[-1]) + padding = self._gridsize - self._origsize + offset = padding/2# - padding%2 + map(""" + auto k = i + {offset} - {gridsize}/2; + auto l = j + {offset} - {gridsize}/2; + if( k < 0 ) k += {gridsize}; + if( l < 0 ) l += {gridsize}; + odata(c,i,j) += idata(c,k,l).real; + """.format(gridsize=self._gridsize, offset=offset), + {'odata':odata, 'idata':self._stcgrid}, + axis_names=('c','i','j'), shape=odata.shape) + odata = odata.reshape(oshape) + + return odata diff --git a/src/bifrost/orville.h b/src/bifrost/orville.h new file mode 100644 index 000000000..dc9bf5c26 --- /dev/null +++ b/src/bifrost/orville.h @@ -0,0 +1,53 @@ +#ifndef BF_ORVILLE_H_INCLUDE_GUARD_ +#define BF_ORVILLE_INCLUDE_GUARD_ + +// Bifrost Includes +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct BForville_impl* BForville; + +BFstatus bfOrvilleCreate(BForville* plan); +BFstatus bfOrvilleInit(BForville plan, + BFarray const* positions, + BFarray const* weights, + BFarray const* kernel1D, + BFsize gridsize, + double gridres, + double gridwres, + BFsize oversample, + BFbool polmajor, + BFspace space, + void* plan_storage, + BFsize* plan_storage_size); +BFstatus bfOrvilleSetStream(BForville plan, + void const* stream); +BFstatus bfOrvilleGetStream(BForville plan, + void* stream); +BFstatus bfOrvilleSetPositions(BForville plan, + BFarray const* positions); +BFstatus bfOrvilleGetProjectionSetup(BForville plan, + int* ntimechan, + int* gridsize, + int* npol, + int* nplane, + BFarray const* midpoints); +BFstatus bfOrvilleSetWeights(BForville plan, + BFarray const* weights); +BFstatus bfOrvilleSetKernel(BForville plan, + BFarray const* kernel1D); +BFstatus bfOrvilleExecute(BForville plan, + BFarray const* in, + BFarray const* out); +BFstatus bfOrvilleDestroy(BForville plan); + +#ifdef __cplusplus +} +#endif + +#endif // BF_ORVILLE_H_INCLUDE_GUARD diff --git a/src/bifrost/romein.h b/src/bifrost/romein.h new file mode 100644 index 000000000..0f0cedc73 --- /dev/null +++ b/src/bifrost/romein.h @@ -0,0 +1,51 @@ +#ifndef BF_ROMEIN_H_INCLUDE_GUARD_ +#define BF_ROMEIN_H_INCLUDE_GUARD_ + +// Bifrost Includes +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct BFromein_impl* BFromein; + +BFstatus bfRomeinCreate(BFromein* plan); +BFstatus bfRomeinInit(BFromein plan, + BFarray const* positions, + BFarray const* kernels, + BFsize ngrid, + BFbool polmajor); +BFstatus bfRomeinSetStream(BFromein plan, + void const* stream); +BFstatus bfRomeinSetPositions(BFromein plan, + BFarray const* positions); +BFstatus bfRomeinSetKernels(BFromein plan, + BFarray const* kernels); +BFstatus bfRomeinExecute(BFromein plan, + BFarray const* in, + BFarray const* out); +BFstatus bfRomeinDestroy(BFromein plan); + +/***************************** + Host Functions + *****************************/ + +BFstatus romein_float(BFarray const* data, + BFarray const* uvgrid, + BFarray const* illum, + BFarray const* data_xloc, + BFarray const* data_yloc, + BFarray const* data_zloc, + int max_support, + int grid_size, + int data_size, + int nbatch); + +#ifdef __cplusplus +} +#endif + +#endif // BF_ROMEIN_H_INCLUDE_GUARD diff --git a/src/orville.cu b/src/orville.cu new file mode 100644 index 000000000..c4abbae79 --- /dev/null +++ b/src/orville.cu @@ -0,0 +1,1095 @@ +/* + * Copyright (c) 2018, The Bifrost Authors. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of The Bifrost Authors nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + +Implements the Romein convolutional algorithm onto a GPU using CUDA. + +*/ +#include +#include +#include "romein_kernels.cuh" + +#include "assert.hpp" +#include "trace.hpp" +#include "utils.hpp" +#include "workspace.hpp" +#include "cuda.hpp" +#include "cuda/stream.hpp" + +#include "Complex.hpp" + +#include + +#define MAX_W_PLANES 128 + +struct __attribute__((aligned(1))) nibble2 { + // Yikes! This is dicey since the packing order is implementation dependent! + signed char y:4, x:4; +}; + +struct __attribute__((aligned(1))) blenib2 { + // Yikes! This is dicey since the packing order is implementation dependent! + signed char x:4, y:4; +}; + + +template +__host__ __device__ +inline Complex Complexfcma(Complex x, Complex y, Complex d) { + RealType real_res; + RealType imag_res; + + real_res = (x.x * y.x) + d.x; + imag_res = (x.x * y.y) + d.y; + + real_res = (x.y * y.y) + real_res; + imag_res = -(x.y * y.x) + imag_res; + + return Complex(real_res, imag_res); +} + + +template +__device__ +inline OutType fetch_cf(cudaTextureObject_t tex, + int idx) { + return OutType(0); +} + +template<> +__device__ +inline Complex32 fetch_cf(cudaTextureObject_t tex, + int idx) { + float2 p = tex1Dfetch(tex, idx); + return Complex32(p.x, p.y); +} + +template<> +__device__ +inline Complex64 fetch_cf(cudaTextureObject_t tex, + int idx) { + uint4 p = tex1Dfetch(tex, idx); + return Complex64(__hiloint2double(p.x, p.y), + __hiloint2double(p.z, p.w)); +} + + +template +__device__ +inline OutType WeightAndInterp2D(OutType w, + cudaTextureObject_t kernel_tex, + IndexType x, + IndexType y, + int size) { + OutType kx0 = fetch_cf(kernel_tex, (int) x); + OutType kx1 = fetch_cf(kernel_tex, (int) x + 1); + OutType ky0 = fetch_cf(kernel_tex, (int) y); + OutType ky1 = fetch_cf(kernel_tex, (int) y + 1); + + OutType kx = ((int)x + 1 - x)*kx0 + (x - (int)x)*kx1; + OutType ky = ((int)y + 1 - y)*ky0 + (y - (int)y)*ky1; + + return w*kx*ky; +} + + +template +__global__ void orville_kernel_spln(int nbaseline, + int npol, + int maxsupport, + int oversample, + int gridsize, + double gridres, + int nbatch, + int nplane, + const float* __restrict__ planes, + const float* __restrict__ x, + const float* __restrict__ y, + const float* __restrict__ z, + const OutType* __restrict__ weight, + cudaTextureObject_t kernel_tex, + const InType* __restrict__ d_in, + OutType* d_out) { + int batch_no = blockIdx.x; + int pol_no = threadIdx.y; + int vi_s = batch_no*nbaseline*npol+pol_no; + int grid_s = batch_no*npol*gridsize*gridsize + pol_no*gridsize*gridsize; + + extern __shared__ float shared[]; + + float* zplanes = shared; + + for(int i = threadIdx.x; i < nplane+1; i += blockDim.x){ + zplanes[i] = planes[i]; + } + + __syncthreads(); + + for(int i = threadIdx.x; i < maxsupport * maxsupport; i += blockDim.x) { + int myU = i % maxsupport; + int myV = i / maxsupport; + + int grid_point_u = myU; + int grid_point_v = myV; + int grid_point_p = 0; + OutType sum = OutType(0.0, 0.0); + + int vi, plane, ci_s; + vi = plane = 0; + ci_s = (vi + vi_s) / npol; + while(plane < nplane) { + if( z[ci_s] >= zplanes[plane] && z[ci_s] < zplanes[plane+1] ) { + break; + } + plane++; + } + + for(vi = 0; vi < (nbaseline*npol); vi+=npol) { + ci_s = (vi + vi_s) / npol; + //if( x[ci_s]/gridres < -gridsize/2 ) continue; + //if( x[ci_s]/gridres > gridsize/2 ) continue; + //if( y[ci_s]/gridres < -gridsize/2 ) continue; + //if( y[ci_s]/gridres > gridsize/2 ) continue; + float xf = x[ci_s]/gridres + gridsize/2 - maxsupport/2 + (1 - maxsupport%2); + int xl = (int) round(xf); + float yf = y[ci_s]/gridres + gridsize/2 - maxsupport/2 + (1 - maxsupport%2); + int yl = (int) round(yf); + + // Find the correct z-plane to grid to + if( z[ci_s] < zplanes[plane] || z[ci_s] >= zplanes[plane+1] ) { + plane = 0; + while(plane < nplane) { + if( z[ci_s] >= zplanes[plane] && z[ci_s] < zplanes[plane+1] ) { + break; + } + plane++; + } + } + + // Determine convolution point. This is basically just an + // optimised way to calculate. + //int myConvU = myU - u; + //int myConvV = myV - v; + int myConvU = 0; + int myConvV = 0; + if( maxsupport > 1 ) { + myConvU = (xl - myU) % maxsupport; + myConvV = (yl - myV) % maxsupport; + if (myConvU < 0) myConvU += maxsupport; + if (myConvV < 0) myConvV += maxsupport; + } + float myConvUO = myConvU*oversample - ((xf - xl)*oversample) + oversample/2 - (1 - oversample%2); + float myConvVO = myConvV*oversample - ((yf - yl)*oversample) + oversample/2 - (1 - oversample%2); + if (myConvUO < 0) myConvUO += maxsupport*oversample; + if (myConvVO < 0) myConvVO += maxsupport*oversample; + + // Determine grid point. Because of the above we know here that + // myGridU % max_supp = myU + // myGridV % max_supp = myV + int myGridU = xl + myConvU - gridsize/2; + int myGridV = yl + myConvV - gridsize/2; +// if( myGridU <= -gridsize/2 ) continue; +// if( myGridV <= -gridsize/2 ) continue; +// if( myGridU >= gridsize/2 ) continue; +// if( myGridU >= gridsize/2 ) continue; + //if( myGridU + gridsize/2 >= gridsize - maxsupport/2 ) continue; + //if( myGridV + gridsize/2 >= gridsize - maxsupport/2 ) continue; + if( myGridU < 0 ) myGridU += gridsize; + if( myGridV < 0 ) myGridV += gridsize; + + // Grid point changed? + if (myGridU == grid_point_u && myGridV == grid_point_v && plane == grid_point_p) { + // Nothin' + } else { + // Atomically add to grid. This is the bottleneck of this kernel. + if( grid_point_u >= 0 && grid_point_u < gridsize && \ + grid_point_v >= 0 && grid_point_v < gridsize ) { + atomicAdd(&d_out[grid_point_p*nbatch*npol*gridsize*gridsize + grid_s + gridsize*grid_point_v + grid_point_u].x, sum.x); + atomicAdd(&d_out[grid_point_p*nbatch*npol*gridsize*gridsize + grid_s + gridsize*grid_point_v + grid_point_u].y, sum.y); + } + // Switch to new point + sum = OutType(0.0, 0.0); + grid_point_u = myGridU; + grid_point_v = myGridV; + grid_point_p = plane; + } + + //TODO: Re-do the w-kernel/gcf for our data. + OutType px = WeightAndInterp2D(weight[vi+vi_s], kernel_tex, myConvVO, myConvUO, maxsupport * oversample); + // Sum up + InType temp = d_in[vi+vi_s]; + OutType vi_v = OutType(temp.x, temp.y); + sum = Complexfcma(px, vi_v, sum); + } + + if( grid_point_u >= 0 && grid_point_u < gridsize && \ + grid_point_v >= 0 && grid_point_v < gridsize ) { + atomicAdd(&d_out[grid_point_p*nbatch*npol*gridsize*gridsize + grid_s + gridsize*grid_point_v + grid_point_u].x, sum.x); + atomicAdd(&d_out[grid_point_p*nbatch*npol*gridsize*gridsize + grid_s + gridsize*grid_point_v + grid_point_u].y, sum.y); + } + } +} + + +template +__global__ void orville_kernel_sloc(int nbaseline, + int npol, + int maxsupport, + int oversample, + int gridsize, + double gridres, + int nbatch, + int nplane, + const float* __restrict__ planes, + const float* __restrict__ x, + const float* __restrict__ y, + const float* __restrict__ z, + const OutType* __restrict__ weight, + cudaTextureObject_t kernel_tex, + const InType* __restrict__ d_in, + OutType* d_out) { + int batch_no = blockIdx.x; + int pol_no = threadIdx.y; + int vi_s = batch_no*nbaseline*npol+pol_no; + int grid_s = batch_no*npol*gridsize*gridsize + pol_no*gridsize*gridsize; + + extern __shared__ float shared[]; + + float* xdata = shared; + float* ydata = xdata + nbaseline * npol; + float* zdata = ydata + nbaseline * npol; + + for(int i = threadIdx.x; i < nbaseline; i += blockDim.x){ + xdata[i*npol + pol_no] = x[(vi_s + npol * i) / npol]; + ydata[i*npol + pol_no] = y[(vi_s + npol * i) / npol]; + zdata[i*npol + pol_no] = z[(vi_s + npol * i) / npol]; + } + + __syncthreads(); + + for(int i = threadIdx.x; i < maxsupport * maxsupport; i += blockDim.x) { + int myU = i % maxsupport; + int myV = i / maxsupport; + + int grid_point_u = myU; + int grid_point_v = myV; + int grid_point_p = 0; + OutType sum = OutType(0.0, 0.0); + + int vi, plane; + vi = plane = 0; + while(plane < nplane) { + if( z[vi+vi_s] >= planes[plane] && z[vi+vi_s] < planes[plane+1] ) { + break; + } + plane++; + } + + for(vi = 0; vi < (nbaseline*npol); vi+=npol) { + float xf = xdata[vi+pol_no]/gridres + gridsize/2 - maxsupport/2 + (1 - maxsupport%2); + int xl = (int) (xf); + float yf = ydata[vi+pol_no]/gridres + gridsize/2 - maxsupport/2 + (1 - maxsupport%2); + int yl = (int) (yf); + + // Find the correct z-plane to grid to + if( z[vi+pol_no] < planes[plane] || z[vi+pol_no] >= planes[plane+1] ) { + plane = 0; + while(plane < nplane) { + if( zdata[vi+pol_no] >= planes[plane] && zdata[vi+pol_no] < planes[plane+1] ) { + break; + } + plane++; + } + } + + // Determine convolution point. This is basically just an + // optimised way to calculate. + //int myConvU = myU - u; + //int myConvV = myV - v; + int myConvU = 0; + int myConvV = 0; + if( maxsupport > 1 ) { + myConvU = (xl - myU) % maxsupport; + myConvV = (yl - myV) % maxsupport; + if (myConvU < 0) myConvU += maxsupport; + if (myConvV < 0) myConvV += maxsupport; + } + float myConvUO = myConvU*oversample - ((xf - xl)*oversample) + oversample/2 - (1 - oversample%2); + float myConvVO = myConvV*oversample - ((yf - yl)*oversample) + oversample/2 - (1 - oversample%2); + if (myConvUO < 0) myConvUO += maxsupport*oversample; + if (myConvVO < 0) myConvVO += maxsupport*oversample; + + // Determine grid point. Because of the above we know here that + // myGridU % max_supp = myU + // myGridV % max_supp = myV + int myGridU = xl + myConvU - gridsize/2; + int myGridV = yl + myConvV - gridsize/2; + if( myGridU < 0 ) myGridU += gridsize; + if( myGridV < 0 ) myGridV += gridsize; + + // Grid point changed? + if (myGridU == grid_point_u && myGridV == grid_point_v && plane == grid_point_p) { + // Nothin' + } else { + // Atomically add to grid. This is the bottleneck of this kernel. + if( grid_point_u >= 0 && grid_point_u < gridsize && \ + grid_point_v >= 0 && grid_point_v < gridsize ) { + atomicAdd(&d_out[grid_point_p*nbatch*npol*gridsize*gridsize + grid_s + gridsize*grid_point_v + grid_point_u].x, sum.x); + atomicAdd(&d_out[grid_point_p*nbatch*npol*gridsize*gridsize + grid_s + gridsize*grid_point_v + grid_point_u].y, sum.y); + } + // Switch to new point + sum = OutType(0.0, 0.0); + grid_point_u = myGridU; + grid_point_v = myGridV; + grid_point_p = plane; + } + + //TODO: Re-do the w-kernel/gcf for our data. + OutType px = WeightAndInterp2D(weight[vi+vi_s], kernel_tex, myConvVO, myConvUO, maxsupport * oversample); + // Sum up + InType temp = d_in[vi+vi_s]; + OutType vi_v = OutType(temp.x, temp.y); + sum = Complexfcma(px, vi_v, sum); + } + + if( grid_point_u >= 0 && grid_point_u < gridsize && \ + grid_point_v >= 0 && grid_point_v < gridsize ) { + atomicAdd(&d_out[grid_point_p*nbatch*npol*gridsize*gridsize + grid_s + gridsize*grid_point_v + grid_point_u].x, sum.x); + atomicAdd(&d_out[grid_point_p*nbatch*npol*gridsize*gridsize + grid_s + gridsize*grid_point_v + grid_point_u].y, sum.y); + } + } +} + + +template +inline void launch_orville_kernel(int nbaseline, + int npol, + bool polmajor, + int maxsupport, + int oversample, + int gridsize, + double gridres, + int nbatch, + int nplane, + float* planes, + float* xpos, + float* ypos, + float* zpos, + OutType* weight, + OutType* kernel, + InType* d_in, + OutType* d_out, + cudaStream_t stream=0) { + //cout << "LAUNCH for " << nelement << endl; + dim3 block(8,1); + dim3 grid(nbatch*npol,1); + if( polmajor ) { + npol = 1; + } else { + block.y = npol; + grid.x = nbatch; + } + /* + cout << " Block size is " << block.x << " by " << block.y << endl; + cout << " Grid size is " << grid.x << " by " << grid.y << endl; + */ + + // Determine how to create the texture object + // NOTE: Assumes some type of complex float + cudaChannelFormatKind channel_format = cudaChannelFormatKindFloat; + int dx = 32; + int dy = 32; + int dz = 0; + int dw = 0; + if( sizeof(OutType) == sizeof(Complex64) ) { + channel_format = cudaChannelFormatKindUnsigned; + dz = 32; + dw = 32; + } + + // Create texture object + cudaResourceDesc resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + resDesc.resType = cudaResourceTypeLinear; + resDesc.res.linear.devPtr = kernel; + resDesc.res.linear.desc.f = channel_format; + resDesc.res.linear.desc.x = dx; + resDesc.res.linear.desc.y = dy; + resDesc.res.linear.desc.z = dz; + resDesc.res.linear.desc.w = dw; + resDesc.res.linear.sizeInBytes = maxsupport*oversample*sizeof(OutType); + + cudaTextureDesc texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.readMode = cudaReadModeElementType; + + cudaTextureObject_t kernel_tex; + BF_CHECK_CUDA_EXCEPTION(cudaCreateTextureObject(&kernel_tex, &resDesc, &texDesc, NULL), + BF_STATUS_INTERNAL_ERROR); + + void* args[] = {&nbaseline, + &npol, + &maxsupport, + &oversample, + &gridsize, + &gridres, + &nbatch, + &nplane, + &planes, + &xpos, + &ypos, + &zpos, + &weight, + &kernel_tex, + &d_in, + &d_out}; + size_t loc_size = 3 * nbaseline * npol * sizeof(float); + size_t shared_mem_size = 16384; //Just keep this vanilla for now + if(loc_size <= shared_mem_size) { + BF_CHECK_CUDA_EXCEPTION(cudaLaunchKernel((void*)orville_kernel_sloc, + grid, block, + &args[0], 3*nbaseline*npol*sizeof(float), stream), + BF_STATUS_INTERNAL_ERROR); + } else { + BF_CHECK_CUDA_EXCEPTION(cudaLaunchKernel((void*)orville_kernel_spln, + grid, block, + &args[0], (nplane+1)*sizeof(float), stream), + BF_STATUS_INTERNAL_ERROR); + } + + BF_CHECK_CUDA_EXCEPTION(cudaDestroyTextureObject(kernel_tex), + BF_STATUS_INTERNAL_ERROR); +} + + +int compare_z_values(const void* a, const void* b) { + if( *(float*)a < *(float*)b ) return -1; + if( *(float*)a > *(float*)b ) return 1; + return 0; +} + +template +InType signed_sqrt(InType x) { + if( x < 0 ) { + return -sqrt(-x); + } else { + return sqrt(x); + } +} + +class BForville_impl { + typedef int IType; + typedef double FType; +public: // HACK WAR for what looks like a bug in the CUDA 7.0 compiler + typedef float DType; +private: + IType _ntimechan; + IType _nbaseline; + IType _npol; + bool _polmajor; + IType _maxsupport; + IType _oversample; + IType _gridsize; + FType _gridres; + FType _gridwres; + IType _nxyz = 0; + float* _x = NULL; + float* _y = NULL; + float* _z = NULL; + void* _weight = NULL; + BFdtype _tkernel = BF_DTYPE_INT_TYPE; + void* _kernel = NULL; + IType _nplane = 0; + float* _planes = NULL; + float* _midpoints = NULL; + IType _plan_stride; + Workspace _plan_storage; + // TODO: Use something other than Thrust + thrust::device_vector _dv_plan_storage; + cudaStream_t _stream; +public: + BForville_impl() : _ntimechan(1), _nbaseline(1), _npol(1), _polmajor(true), \ + _maxsupport(1), _oversample(1), _stream(g_cuda_stream) {} + inline IType ntimechan() const { return _ntimechan; } + inline IType nbaseline() const { return _nbaseline; } + inline IType npol() const { return _npol; } + inline bool polmajor() const { return _polmajor; } + inline IType maxsupport() const { return _maxsupport; } + inline IType oversample() const { return _oversample; } + inline IType gridsize() const { return _gridsize; } + inline FType gridres() const { return _gridres; } + inline FType grdiwres() const { return _gridwres; } + inline IType nxyz() const { return _nxyz; } + inline IType nplane() const { return _nplane; } + inline IType tkernel() const { return _tkernel; } + void init(IType ntimechan, + IType nbaseline, + IType npol, + bool polmajor, + IType maxsupport, + IType oversample, + IType gridsize, + FType gridres, + FType gridwres) { + BF_TRACE(); + _ntimechan = ntimechan; + _nbaseline = nbaseline; + _npol = npol; + _polmajor = polmajor; + _maxsupport = maxsupport; + _oversample = oversample; + _gridsize = gridsize + maxsupport; + // Find an optimal grid size that allows for at least 'maxsupport' padding + // NOTE: "Optimal" here means that is can be factored into 2, 3, 5, and 7, + // exclusively. + IType i, value; + bool found_opt_gridsize = false; + for(i=_gridsize; i<(IType) round(_gridsize*1.15); i++) { + value = i; + while( value > 1 ) { + if( value % 2 == 0 ) { + value /= 2; + } else if( value % 3 == 0 ) { + value /= 3; + } else if( value % 5 == 0 ) { + value /= 5; + } else if( value % 7 == 0 ) { + value /= 7; + } else { + break; + } + } + if( value == 1 ) { + found_opt_gridsize = true; + _gridsize = i; + break; + } + } + BF_ASSERT_EXCEPTION(found_opt_gridsize, BF_STATUS_INVALID_SHAPE); + _gridres = 1.0 / (2.0*_gridsize) / sin(gridres*M_PI/180.0/2.0); + _gridwres = gridwres; + } + bool init_plan_storage(void* storage_ptr, BFsize* storage_size) { + BF_TRACE(); + BF_TRACE_STREAM(_stream); + enum { + ALIGNMENT_BYTES = 512, + ALIGNMENT_ELMTS = ALIGNMENT_BYTES / sizeof(float) + }; + Workspace workspace(ALIGNMENT_BYTES); + _plan_stride = round_up(MAX_W_PLANES+1, ALIGNMENT_ELMTS); + workspace.reserve(MAX_W_PLANES+1, &_planes); + workspace.reserve(MAX_W_PLANES+1, &_midpoints); + if( storage_size ) { + if( !storage_ptr ) { + // Return required storage size + *storage_size = workspace.size(); + return false; + } else { + BF_ASSERT_EXCEPTION(*storage_size >= workspace.size(), + BF_STATUS_INSUFFICIENT_STORAGE); + } + } else { + // Auto-allocate storage + BF_ASSERT_EXCEPTION(!storage_ptr, BF_STATUS_INVALID_ARGUMENT); + _dv_plan_storage.resize(workspace.size()); + storage_ptr = thrust::raw_pointer_cast(&_dv_plan_storage[0]); + } + workspace.commit(storage_ptr); + + this->reset_state(); + return true; + } + void set_positions(BFarray const* positions) { + BF_TRACE(); + BF_TRACE_STREAM(_stream); + BF_ASSERT_EXCEPTION(positions->dtype == BF_DTYPE_F32, BF_STATUS_UNSUPPORTED_DTYPE); + + int npositions = positions->shape[1]; + int nbl = positions->shape[positions->ndim-1]; + int stride = positions->shape[1]; + for(int i=2; indim-1; ++i) { + npositions *= positions->shape[i]; + stride *= positions->shape[i]; + } + stride *= positions->shape[positions->ndim-1]; + _nxyz = npositions; + _x = (float *) positions->data; + _y = _x + stride; + _z = _y + stride; + + float* zsorted; + zsorted = (float*) malloc(npositions*nbl*sizeof(float)); + + BF_CHECK_CUDA_EXCEPTION(cudaMemcpyAsync(zsorted, + _z, + npositions*nbl*sizeof(float), + cudaMemcpyDeviceToHost, + _stream), + BF_STATUS_MEM_OP_FAILED); + + BF_CHECK_CUDA_EXCEPTION(cudaStreamSynchronize(_stream), + BF_STATUS_DEVICE_ERROR ); + + qsort(zsorted, npositions*nbl, sizeof(float), &compare_z_values); + /* + std::cout << npositions*nbl << " z values range from " << *(zsorted+0) << " to " << *(zsorted+npositions-1) << std::endl; + */ + + _nplane = 1; + int count = 1; + float* planes; + float* midpoints; + planes = (float*) malloc((MAX_W_PLANES+1)*sizeof(float)); + midpoints = (float*) malloc((MAX_W_PLANES+1)*sizeof(float)); + *(planes + 0) = *(zsorted + 0); + *(midpoints + 0) = *(zsorted + 0); + for(int i=1; i signed_sqrt(*(planes + _nplane - 1)) + _gridwres ) { + *(midpoints + _nplane - 1) /= count; + count = 0; + + _nplane++; + BF_ASSERT_EXCEPTION(_nplane < MAX_W_PLANES, BF_STATUS_INTERNAL_ERROR); + *(planes + _nplane - 1) = *(zsorted + i); + *(midpoints + _nplane - 1) = *(zsorted + i); + count++; + } else { + *(midpoints + _nplane - 1) += *(zsorted + i); + count++; + } + } + *(planes + _nplane) = *(zsorted + npositions*nbl - 1) + _gridwres; + *(midpoints + _nplane - 1) += *(zsorted + npositions*nbl - 1); + count++; + *(midpoints + _nplane - 1) /= count; + + /* std::cout << "Planes are:" << std::endl; + for(int i=0; i<_nplane; i++) { + std::cout << " " << i << ": " << *(planes + i) << " to " << *(planes + i + 1) \ + << " with " << *(midpoints + i) << std::endl; + } */ + + BF_CHECK_CUDA_EXCEPTION(cudaMemcpyAsync(_planes, + planes, + (MAX_W_PLANES+1)*sizeof(float), + cudaMemcpyHostToDevice, + _stream), + BF_STATUS_MEM_OP_FAILED); + BF_CHECK_CUDA_EXCEPTION(cudaMemcpyAsync(_midpoints, + midpoints, + (MAX_W_PLANES+1)*sizeof(float), + cudaMemcpyHostToDevice, + _stream), + BF_STATUS_MEM_OP_FAILED); + + BF_CHECK_CUDA_EXCEPTION(cudaStreamSynchronize(_stream), + BF_STATUS_DEVICE_ERROR ); + + free(zsorted); + free(planes); + free(midpoints); + } + void get_midpoints(BFarray const* midpoints) { + BF_CHECK_CUDA_EXCEPTION(cudaMemcpyAsync(midpoints->data, + _midpoints, + _nplane*sizeof(float), + cudaMemcpyDeviceToHost, + _stream), + BF_STATUS_MEM_OP_FAILED); + + BF_CHECK_CUDA_EXCEPTION( cudaStreamSynchronize(_stream), + BF_STATUS_DEVICE_ERROR ); + + BF_CHECK_CUDA_EXCEPTION(cudaGetLastError(), BF_STATUS_INTERNAL_ERROR); + } + void set_weights(BFarray const* weights) { + BF_TRACE(); + BF_TRACE_STREAM(_stream); + BF_ASSERT_EXCEPTION(weights->dtype == BF_DTYPE_CF32 \ + || BF_DTYPE_CF64, BF_STATUS_UNSUPPORTED_DTYPE); + + if( _kernel == NULL ) { + _tkernel = weights->dtype; + } + _weight = (void*) weights->data; + } + void set_kernel(BFarray const* kernel) { + BF_TRACE(); + BF_TRACE_STREAM(_stream); + BF_ASSERT_EXCEPTION(kernel->dtype == BF_DTYPE_CF32 \ + || BF_DTYPE_CF64, BF_STATUS_UNSUPPORTED_DTYPE); + + if( _weight == NULL ) { + _tkernel = kernel->dtype; + } + _kernel = (void*) kernel->data; + } + void reset_state() { + BF_ASSERT_EXCEPTION(_planes != NULL, BF_STATUS_INVALID_STATE); + + BF_CHECK_CUDA_EXCEPTION(cudaGetLastError(), BF_STATUS_INTERNAL_ERROR); + + // Reset the state + _nplane = 0; + BF_CHECK_CUDA_EXCEPTION( cudaMemsetAsync(_planes, + 0, + sizeof(float)*(MAX_W_PLANES+1), + _stream), + BF_STATUS_MEM_OP_FAILED ); + BF_CHECK_CUDA_EXCEPTION( cudaMemsetAsync(_midpoints, + 0, + sizeof(float)*(MAX_W_PLANES+1), + _stream), + BF_STATUS_MEM_OP_FAILED ); + + BF_CHECK_CUDA_EXCEPTION( cudaStreamSynchronize(_stream), + BF_STATUS_DEVICE_ERROR ); + + BF_CHECK_CUDA_EXCEPTION(cudaGetLastError(), BF_STATUS_INTERNAL_ERROR); + } + void execute(BFarray const* in, BFarray const* out) { + BF_TRACE(); + BF_TRACE_STREAM(_stream); + BF_ASSERT_EXCEPTION(_x != NULL, BF_STATUS_INVALID_STATE); + BF_ASSERT_EXCEPTION(_y != NULL, BF_STATUS_INVALID_STATE); + BF_ASSERT_EXCEPTION(_z != NULL, BF_STATUS_INVALID_STATE); + BF_ASSERT_EXCEPTION(_weight != NULL, BF_STATUS_INVALID_STATE); + BF_ASSERT_EXCEPTION(_kernel != NULL, BF_STATUS_INVALID_STATE); + BF_ASSERT_EXCEPTION(out->dtype == BF_DTYPE_CF32 \ + || BF_DTYPE_CF64, BF_STATUS_UNSUPPORTED_DTYPE); + + BF_CHECK_CUDA_EXCEPTION(cudaGetLastError(), BF_STATUS_INTERNAL_ERROR); + + int nbatch = in->shape[0]; + +#define LAUNCH_ORVILLE_KERNEL(IterType,OterType) \ + launch_orville_kernel(_nbaseline, _npol, _polmajor, _maxsupport, _oversample, \ + _gridsize, _gridres, nbatch, _nplane, _planes, \ + _x, _y, _z, (OterType)_weight, (OterType)_kernel, \ + (IterType)in->data, (OterType)out->data, \ + _stream) + + switch( in->dtype ) { + case BF_DTYPE_CI4: + if( in->big_endian ) { + switch( out->dtype ) { + case BF_DTYPE_CF32: LAUNCH_ORVILLE_KERNEL(nibble2*, Complex32*); break; + case BF_DTYPE_CF64: LAUNCH_ORVILLE_KERNEL(nibble2*, Complex64*); break; + default: BF_ASSERT_EXCEPTION(false, BF_STATUS_UNSUPPORTED_DTYPE); + }; + } else { + switch( out->dtype ) { + case BF_DTYPE_CF32: LAUNCH_ORVILLE_KERNEL(blenib2*, Complex32*); break; + case BF_DTYPE_CF64: LAUNCH_ORVILLE_KERNEL(blenib2*, Complex64*); break; + default: BF_ASSERT_EXCEPTION(false, BF_STATUS_UNSUPPORTED_DTYPE); + }; + } + break; + case BF_DTYPE_CI8: + switch( out->dtype ) { + case BF_DTYPE_CF32: LAUNCH_ORVILLE_KERNEL(char2*, Complex32*); break; + case BF_DTYPE_CF64: LAUNCH_ORVILLE_KERNEL(char2*, Complex64*); break; + default: BF_ASSERT_EXCEPTION(false, BF_STATUS_UNSUPPORTED_DTYPE); + }; + break; + case BF_DTYPE_CI16: + switch( out->dtype ) { + case BF_DTYPE_CF32: LAUNCH_ORVILLE_KERNEL(short2*, Complex32*); break; + case BF_DTYPE_CF64: LAUNCH_ORVILLE_KERNEL(short2*, Complex64*); break; + default: BF_ASSERT_EXCEPTION(false, BF_STATUS_UNSUPPORTED_DTYPE); + } + break; + case BF_DTYPE_CI32: + switch( out->dtype ) { + case BF_DTYPE_CF32: LAUNCH_ORVILLE_KERNEL(int2*, Complex32*); break; + case BF_DTYPE_CF64: LAUNCH_ORVILLE_KERNEL(int2*, Complex64*); break; + default: BF_ASSERT_EXCEPTION(false, BF_STATUS_UNSUPPORTED_DTYPE); + } + break; + case BF_DTYPE_CI64: + switch( out->dtype ) { + case BF_DTYPE_CF32: LAUNCH_ORVILLE_KERNEL(long2*, Complex32*); break; + case BF_DTYPE_CF64: LAUNCH_ORVILLE_KERNEL(long2*, Complex64*); break; + default: BF_ASSERT_EXCEPTION(false, BF_STATUS_UNSUPPORTED_DTYPE); + } + break; + case BF_DTYPE_CF32: + switch( out->dtype ) { + case BF_DTYPE_CF32: LAUNCH_ORVILLE_KERNEL(float2*, Complex32*); break; + case BF_DTYPE_CF64: LAUNCH_ORVILLE_KERNEL(float2*, Complex64*); break; + default: BF_ASSERT_EXCEPTION(false, BF_STATUS_UNSUPPORTED_DTYPE); + } + break; + case BF_DTYPE_CF64: + switch( out->dtype ) { + case BF_DTYPE_CF32: LAUNCH_ORVILLE_KERNEL(double2*, Complex32*); break; + case BF_DTYPE_CF64: LAUNCH_ORVILLE_KERNEL(double2*, Complex64*); break; + default: BF_ASSERT_EXCEPTION(false, BF_STATUS_UNSUPPORTED_DTYPE); + } + break; + default: BF_ASSERT_EXCEPTION(false, BF_STATUS_UNSUPPORTED_DTYPE); + } +#undef LAUNCH_ORVILLE_KERNEL + + BF_CHECK_CUDA_EXCEPTION(cudaGetLastError(), BF_STATUS_INTERNAL_ERROR); + } + cudaStream_t get_stream() { + return _stream; + } + void set_stream(cudaStream_t stream) { + _stream = stream; + } +}; + +BFstatus bfOrvilleCreate(BForville* plan_ptr) { + BF_TRACE(); + BF_ASSERT(plan_ptr, BF_STATUS_INVALID_POINTER); + BF_TRY_RETURN_ELSE(*plan_ptr = new BForville_impl(), + *plan_ptr = 0); +} + +BFstatus bfOrvilleInit(BForville plan, + BFarray const* positions, + BFarray const* weights, + BFarray const* kernel1D, + BFsize gridsize, + double gridres, + double gridwres, + BFsize oversample, + BFbool polmajor, + BFspace space, + void* plan_storage, + BFsize* plan_storage_size) { + BF_TRACE(); + BF_ASSERT(plan, BF_STATUS_INVALID_HANDLE); + BF_ASSERT(positions, BF_STATUS_INVALID_POINTER); + BF_ASSERT(positions->ndim >= 3, BF_STATUS_INVALID_SHAPE); + BF_ASSERT(positions->shape[0] == 3, BF_STATUS_INVALID_SHAPE); + BF_ASSERT(space_accessible_from(positions->space, BF_SPACE_CUDA), BF_STATUS_INVALID_SPACE); + BF_ASSERT(weights, BF_STATUS_INVALID_POINTER); + BF_ASSERT(weights->ndim >= 3, BF_STATUS_INVALID_SHAPE); + BF_ASSERT(space_accessible_from(weights->space, BF_SPACE_CUDA), BF_STATUS_INVALID_SPACE); + BF_ASSERT(kernel1D, BF_STATUS_INVALID_POINTER); + BF_ASSERT(kernel1D->ndim == 1, BF_STATUS_INVALID_SHAPE); + BF_ASSERT(space_accessible_from(kernel1D->space, BF_SPACE_CUDA), BF_STATUS_INVALID_SPACE); + BF_ASSERT(space_accessible_from(space, BF_SPACE_CUDA), + BF_STATUS_UNSUPPORTED_SPACE); + + // Discover the dimensions of the positions/weights/kernel. + int npositions, nweights,nbaseline, npol, maxsupport; + npositions = positions->shape[1]; + for(int i=2; indim-1; ++i) { + npositions *= positions->shape[i]; + } + + nweights = weights->shape[0]; + for(int i=1; indim-2; ++i) { + nweights *= weights->shape[i]; + } + if( polmajor ) { + npol = weights->shape[weights->ndim-2]; + nbaseline = weights->shape[weights->ndim-1]; + } else { + nbaseline = weights->shape[weights->ndim-2]; + npol = weights->shape[weights->ndim-1]; + } + + std::cout << "npositions: " << npositions << std::endl; + std::cout << "nbaseline: " << nbaseline << std::endl; + std::cout << "npol: " << npol << std::endl; + std::cout << "nweights: " << nweights << std::endl; + + maxsupport = kernel1D->shape[kernel1D->ndim-1] / oversample; + + // Validate + BF_ASSERT(npositions == nweights, BF_STATUS_INVALID_SHAPE); + BF_ASSERT(positions->shape[positions->ndim-1] == nbaseline, BF_STATUS_INVALID_SHAPE); + BF_ASSERT(maxsupport % 2 == 1, BF_STATUS_UNSUPPORTED_SHAPE); +// BF_ASSERT(oversample % 2 == 1, BF_STATUS_UNSUPPORTED_SHAPE); + + BF_TRY(plan->init(npositions, nbaseline, npol, polmajor, maxsupport, oversample, gridsize, gridres, gridwres)); + BF_TRY(plan->init_plan_storage(plan_storage, plan_storage_size)); + BF_TRY(plan->set_positions(positions)); + BF_TRY(plan->set_weights(weights)); + BF_TRY_RETURN(plan->set_kernel(kernel1D)); +} +BFstatus bfOrvilleGetStream(BForville plan, + void* stream) { + BF_TRACE(); + BF_ASSERT(plan, BF_STATUS_INVALID_HANDLE); + BF_ASSERT(stream, BF_STATUS_INVALID_POINTER); + BF_TRY_RETURN(*(cudaStream_t*)stream = plan->get_stream()); +} +BFstatus bfOrvilleSetStream(BForville plan, + void const* stream) { + BF_TRACE(); + BF_ASSERT(plan, BF_STATUS_INVALID_HANDLE); + BF_ASSERT(stream, BF_STATUS_INVALID_POINTER); + BF_TRY_RETURN(plan->set_stream(*(cudaStream_t*)stream)); +} +BFstatus bfOrvilleSetPositions(BForville plan, + BFarray const* positions) { + BF_ASSERT(plan, BF_STATUS_INVALID_HANDLE); + BF_ASSERT(positions, BF_STATUS_INVALID_POINTER); + BF_ASSERT(positions->ndim >= 3, BF_STATUS_INVALID_SHAPE ); + BF_ASSERT(positions->shape[positions->ndim-2] == plan->nbaseline(), BF_STATUS_INVALID_SHAPE ); + BF_ASSERT(positions->shape[0] == 3, BF_STATUS_INVALID_SHAPE ); + BF_ASSERT(space_accessible_from(positions->space, BF_SPACE_CUDA), BF_STATUS_INVALID_SPACE); + + BF_TRY_RETURN(plan->set_positions(positions)); +} +BFstatus bfOrvilleGetProjectionSetup(BForville plan, + int* ntimechan, + int* gridsize, + int* npol, + int* nplane, + BFarray const* midpoints) { + BF_ASSERT(plan, BF_STATUS_INVALID_HANDLE); + BF_ASSERT(midpoints, BF_STATUS_INVALID_POINTER); + BF_ASSERT(midpoints->ndim == 1, BF_STATUS_INVALID_SHAPE ); + BF_ASSERT(midpoints->shape[0] >= plan->nplane(), BF_STATUS_INVALID_SHAPE); + + *ntimechan = plan->ntimechan(); + *gridsize = plan->gridsize(); + *npol = plan->npol(); + *nplane = plan->nplane(); + + BF_TRY_RETURN(plan->get_midpoints(midpoints)); +} +BFstatus bfOrvilleSetWeights(BForville plan, + BFarray const* weights) { + BF_ASSERT(plan, BF_STATUS_INVALID_HANDLE); + BF_ASSERT(weights, BF_STATUS_INVALID_POINTER); + BF_ASSERT(weights->ndim >= 3, BF_STATUS_INVALID_SHAPE ); + if( plan->polmajor() ) { + BF_ASSERT(weights->shape[weights->ndim-2] == plan->npol(), BF_STATUS_INVALID_SHAPE ); + BF_ASSERT(weights->shape[weights->ndim-1] == plan->nbaseline(), BF_STATUS_INVALID_SHAPE ); + } else { + BF_ASSERT(weights->shape[weights->ndim-2] == plan->nbaseline(), BF_STATUS_INVALID_SHAPE ); + BF_ASSERT(weights->shape[weights->ndim-1] == plan->npol(), BF_STATUS_INVALID_SHAPE ); + } + BF_ASSERT(space_accessible_from(weights->space, BF_SPACE_CUDA), BF_STATUS_INVALID_SPACE); + + if( plan->tkernel() != BF_DTYPE_INT_TYPE) { + BF_ASSERT(weights->dtype == plan->tkernel(), BF_STATUS_UNSUPPORTED_DTYPE); + } + + BF_TRY_RETURN(plan->set_weights(weights)); +} +BFstatus bfOrvilleSetKernel(BForville plan, + BFarray const* kernel1D) { + BF_ASSERT(plan, BF_STATUS_INVALID_HANDLE); + BF_ASSERT(kernel1D, BF_STATUS_INVALID_POINTER); + BF_ASSERT(kernel1D->ndim == 1, BF_STATUS_INVALID_SHAPE ); + BF_ASSERT(kernel1D->shape[kernel1D->ndim-1] == plan->maxsupport()*plan->oversample(), \ + BF_STATUS_INVALID_SHAPE ); + BF_ASSERT(space_accessible_from(kernel1D->space, BF_SPACE_CUDA), BF_STATUS_INVALID_SPACE); + + if( plan->tkernel() != BF_DTYPE_INT_TYPE) { + BF_ASSERT(kernel1D->dtype == plan->tkernel(), BF_STATUS_UNSUPPORTED_DTYPE); + } + + BF_TRY_RETURN(plan->set_kernel(kernel1D)); +} +BFstatus bfOrvilleExecute(BForville plan, + BFarray const* in, + BFarray const* out) { + BF_TRACE(); + BF_ASSERT(plan, BF_STATUS_INVALID_HANDLE); + BF_ASSERT(in, BF_STATUS_INVALID_POINTER); + BF_ASSERT(out, BF_STATUS_INVALID_POINTER); + BF_ASSERT( in->ndim >= 3, BF_STATUS_INVALID_SHAPE); + BF_ASSERT(out->ndim == in->ndim+2, BF_STATUS_INVALID_SHAPE); + + BFarray in_flattened; + if( in->ndim > 3 ) { + // Keep the last two dim but attempt to flatten all others + unsigned long keep_dims_mask = padded_dims_mask(in); + keep_dims_mask |= 0x1 << (in->ndim-1); + keep_dims_mask |= 0x1 << (in->ndim-2); + keep_dims_mask |= 0x1 << (in->ndim-3); + flatten(in, &in_flattened, keep_dims_mask); + in = &in_flattened; + BF_ASSERT(in_flattened.ndim == 3, BF_STATUS_UNSUPPORTED_SHAPE); + } + /* + std::cout << "ndim = " << in->ndim << std::endl; + std::cout << " 0 = " << in->shape[0] << std::endl; + std::cout << " 1 = " << in->shape[1] << std::endl; + std::cout << " 2 = " << in->shape[2] << std::endl; + */ + BF_ASSERT( in->shape[0] == plan->nxyz(), BF_STATUS_INVALID_SHAPE); + if( plan->polmajor() ) { + BF_ASSERT( in->shape[1] == plan->npol(), BF_STATUS_INVALID_SHAPE); + BF_ASSERT( in->shape[2] == plan->nbaseline(), BF_STATUS_INVALID_SHAPE); + } else { + BF_ASSERT( in->shape[1] == plan->nbaseline(), BF_STATUS_INVALID_SHAPE); + BF_ASSERT( in->shape[2] == plan->npol(), BF_STATUS_INVALID_SHAPE); + } + + BFarray out_flattened; + if( out->ndim > 4 ) { + // Keep the last three dim but attempt to flatten all others + unsigned long keep_dims_mask = padded_dims_mask(out); + keep_dims_mask |= 0x1; + keep_dims_mask |= 0x1 << (out->ndim-1); + keep_dims_mask |= 0x1 << (out->ndim-2); + keep_dims_mask |= 0x1 << (out->ndim-3); + keep_dims_mask |= 0x1 << (out->ndim-4); + flatten(out, &out_flattened, keep_dims_mask); + out = &out_flattened; + BF_ASSERT(out_flattened.ndim == 5, BF_STATUS_UNSUPPORTED_SHAPE); + } + /* + std::cout << "ndim = " << out->ndim << std::endl; + std::cout << " 0 = " << out->shape[0] << std::endl; + std::cout << " 1 = " << out->shape[1] << std::endl; + std::cout << " 2 = " << out->shape[2] << std::endl; + std::cout << " 3 = " << out->shape[3] << std::endl; + */ + BF_ASSERT(out->shape[0] == plan->nplane(), BF_STATUS_INVALID_SHAPE); + BF_ASSERT(out->shape[1] == plan->nxyz(), BF_STATUS_INVALID_SHAPE); + BF_ASSERT(out->shape[2] == plan->npol(), BF_STATUS_INVALID_SHAPE); + BF_ASSERT(out->shape[3] == plan->gridsize(), BF_STATUS_INVALID_SHAPE); + BF_ASSERT(out->shape[4] == plan->gridsize(), BF_STATUS_INVALID_SHAPE); + + BF_ASSERT(out->dtype == plan->tkernel(), BF_STATUS_UNSUPPORTED_DTYPE); + + BF_ASSERT(space_accessible_from( in->space, BF_SPACE_CUDA), BF_STATUS_INVALID_SPACE); + BF_ASSERT(space_accessible_from(out->space, BF_SPACE_CUDA), BF_STATUS_INVALID_SPACE); + BF_TRY_RETURN(plan->execute(in, out)); +} + +BFstatus bfOrvilleDestroy(BForville plan) { + BF_TRACE(); + BF_ASSERT(plan, BF_STATUS_INVALID_HANDLE); + delete plan; + return BF_STATUS_SUCCESS; +} diff --git a/src/romein_kernels.cuh b/src/romein_kernels.cuh new file mode 100644 index 000000000..c89b09b72 --- /dev/null +++ b/src/romein_kernels.cuh @@ -0,0 +1,42 @@ +//CUDA Includes +#include +#include "cuda.h" +#include "cuda_runtime_api.h" +#include "math.h" +#include + +//Romein Include +#include "bifrost/romein.h" + +/***************************** + Device Functions +******************************/ + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 + + +#else //Pre-pascal devices. + +__device__ inline double atomicAdd(double* address, double val) +{ + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + } while (assumed != old); + return __longlong_as_double(old); +} + +#endif + +__global__ void scatter_grid_kernel(cuComplex* fdata, + cuComplex* uvgrid, //Our UV-Grid + cuComplex* illum, // Illumination Pattern + int* x, + int* y, + int* z, + int max_support, // Convolution size + int grid_size, + int data_size); \ No newline at end of file From a9378469c46426ac5fbf7a2c19080d3413cd74ef Mon Sep 17 00:00:00 2001 From: jaycedowell Date: Mon, 8 Jul 2019 11:23:23 -0600 Subject: [PATCH 02/11] Updated the Makefile to build orville.cu. --- src/Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index 793b613f9..6885bf86e 100644 --- a/src/Makefile +++ b/src/Makefile @@ -15,6 +15,8 @@ LIBBIFROST_OBJS = \ address.o \ udp_socket.o \ udp_capture.o \ + packet_writer.o \ + disk_writer.o \ udp_transmit.o \ unpack.o \ quantize.o \ @@ -33,7 +35,8 @@ ifndef NOCUDA reduce.o \ fir.o \ guantize.o \ - gunpack.o + gunpack.o \ + orville.o endif JIT_SOURCES ?= \ From 7aaaa76c9bd14c1170fb692171527abe1be7018c Mon Sep 17 00:00:00 2001 From: jaycedowell Date: Tue, 9 Jul 2019 16:09:12 -0600 Subject: [PATCH 03/11] Removed a few debugging statements from orville.py. --- python/bifrost/orville.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/python/bifrost/orville.py b/python/bifrost/orville.py index 0bba31dfe..84fd79c7a 100644 --- a/python/bifrost/orville.py +++ b/python/bifrost/orville.py @@ -195,8 +195,6 @@ def execute(self, idata, odata): _check( _bf.bfOrvilleExecute(self.obj, asarray(idata).as_BFarray(), asarray(self._subgrid).as_BFarray()) ) - junk = self._subgrid.copy(space='system') - print junk.max() if self._subgrid.shape[0] > 1: # W project @@ -219,8 +217,6 @@ def execute(self, idata, odata): self._subgrid = self._subgrid.reshape(-1,self._gridsize,self._gridsize) self._fft.execute(self._subgrid, self._subgrid, inverse=False) self._subgrid = self._subgrid.reshape(self._nplane,self._ntimechan,self._npol,self._gridsize,self._gridsize) - junk = self._subgrid.copy(space='system') - print 'post-W', junk.max() # Stack reduce(self._subgrid, self._stcgrid, op='sum') @@ -237,16 +233,12 @@ def execute(self, idata, odata): self._fft = Fft() self._fft.init(self._stcgrid, self._stcgrid, axes=(1,2)) self._fft.execute(self._stcgrid, self._stcgrid, inverse=True) - junk = self._stcgrid.copy(space='system') - print 'post-IFFT', junk.max() - + ## Correct for the kernel map('img(c,i,j) /= corr(i)*corr(j)', {'img':self._stcgrid, 'corr':self._img_correction}, axis_names=('c','i','j'), shape=self._stcgrid.shape) - junk = self._stcgrid.copy(space='system') - print 'post-corr', junk.max() - + # Shift and accumulate onto odata oshape = odata.shape odata = odata.reshape(-1,odata.shape[-2],odata.shape[-1]) From eec391a1a3f679b6546d74788d2bcf61735a9516 Mon Sep 17 00:00:00 2001 From: jaycedowell Date: Fri, 22 Nov 2019 10:30:32 -0700 Subject: [PATCH 04/11] Fixed a bug in orville.py that caused two different FFT plans to be treated as the same. --- python/bifrost/orville.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/python/bifrost/orville.py b/python/bifrost/orville.py index 84fd79c7a..bf0e8e1c0 100644 --- a/python/bifrost/orville.py +++ b/python/bifrost/orville.py @@ -77,7 +77,8 @@ def __del__(self): del self._subgrid del self._stcgrid del self._w_kernels - del self._fft + del self._fft_im + del self._fft_wp except AttributeError: pass @@ -127,10 +128,6 @@ def _set_image_correction_kernel(self, force=False): corr = numpy.fft.ifft(corr).real corr = numpy.fft.fftshift(corr) corr = corr[corr.size/2-self._gridsize/2-self._gridsize%2:corr.size/2+self._gridsize/2] * self._oversample - #import pylab - #pylab.plot(corr) - #pylab.plot(1/corr) - #pylab.show() corr = numpy.fft.fftshift(corr) self._img_correction = ndarray(corr, dtype='f32', space='cuda') @@ -201,11 +198,11 @@ def execute(self, idata, odata): ## FFT self._subgrid = self._subgrid.reshape(-1,self._gridsize,self._gridsize) try: - self._fft.execute(self._subgrid, self._subgrid, inverse=True) + self._fft_wp.execute(self._subgrid, self._subgrid, inverse=True) except AttributeError: - self._fft = Fft() - self._fft.init(self._subgrid, self._subgrid, axes=(1,2)) - self._fft.execute(self._subgrid, self._subgrid, inverse=True) + self._fft_wp = Fft() + self._fft_wp.init(self._subgrid, self._subgrid, axes=(1,2)) + self._fft_wp.execute(self._subgrid, self._subgrid, inverse=True) self._subgrid = self._subgrid.reshape(self._nplane,self._ntimechan,self._npol,self._gridsize,self._gridsize) ## Project @@ -215,7 +212,7 @@ def execute(self, idata, odata): ## IFFT self._subgrid = self._subgrid.reshape(-1,self._gridsize,self._gridsize) - self._fft.execute(self._subgrid, self._subgrid, inverse=False) + self._fft_wp.execute(self._subgrid, self._subgrid, inverse=False) self._subgrid = self._subgrid.reshape(self._nplane,self._ntimechan,self._npol,self._gridsize,self._gridsize) # Stack @@ -228,11 +225,11 @@ def execute(self, idata, odata): ## IFFT self._stcgrid = self._stcgrid.reshape(-1,self._gridsize,self._gridsize) try: - self._fft.execute(self._stcgrid, self._stcgrid, inverse=True) + self._fft_im.execute(self._stcgrid, self._stcgrid, inverse=True) except AttributeError: - self._fft = Fft() - self._fft.init(self._stcgrid, self._stcgrid, axes=(1,2)) - self._fft.execute(self._stcgrid, self._stcgrid, inverse=True) + self._fft_im = Fft() + self._fft_im.init(self._stcgrid, self._stcgrid, axes=(1,2)) + self._fft_im.execute(self._stcgrid, self._stcgrid, inverse=True) ## Correct for the kernel map('img(c,i,j) /= corr(i)*corr(j)', From 95afddc3c02e67640ffb661eb6afde8863652a4d Mon Sep 17 00:00:00 2001 From: jaycedowell Date: Wed, 27 Nov 2019 09:34:49 -0700 Subject: [PATCH 05/11] Added a nearest neighbor kernel sampler as an alternative to the interpolation. Cleaned up some of the orville.cu macro names. --- src/orville.cu | 48 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/src/orville.cu b/src/orville.cu index c4abbae79..f606995db 100644 --- a/src/orville.cu +++ b/src/orville.cu @@ -46,7 +46,11 @@ Implements the Romein convolutional algorithm onto a GPU using CUDA. #include -#define MAX_W_PLANES 128 +// Maximum number of w planes to allow +#define ORVILLE_MAX_W_PLANES 128 + +// Use bilinear interpolation for the gridding kernel rather than nearest neighbor +#define ORVILLE_USE_KERNEL_INTERP struct __attribute__((aligned(1))) nibble2 { // Yikes! This is dicey since the packing order is implementation dependent! @@ -119,6 +123,20 @@ inline OutType WeightAndInterp2D(OutType w, } +template +__device__ +inline OutType WeightAndNearestNeighbor2D(OutType w, + cudaTextureObject_t kernel_tex, + IndexType x, + IndexType y, + int size) { + OutType kxnn = fetch_cf(kernel_tex, (int) round(x)); + OutType kynn = fetch_cf(kernel_tex, (int) round(y)); + + return w*knn*knn; +} + + template __global__ void orville_kernel_spln(int nbaseline, int npol, @@ -241,7 +259,11 @@ __global__ void orville_kernel_spln(int nbaseline, } //TODO: Re-do the w-kernel/gcf for our data. +#ifdef ORVILLE_USE_KERNEL_INTERP OutType px = WeightAndInterp2D(weight[vi+vi_s], kernel_tex, myConvVO, myConvUO, maxsupport * oversample); +#else + OutType px = WeightAndNearestNeighbor2D(weight[vi+vi_s], kernel_tex, myConvVO, myConvUO, maxsupport * oversample); +#endif // Sum up InType temp = d_in[vi+vi_s]; OutType vi_v = OutType(temp.x, temp.y); @@ -371,7 +393,11 @@ __global__ void orville_kernel_sloc(int nbaseline, } //TODO: Re-do the w-kernel/gcf for our data. +#ifdef ORVILLE_USE_KERNEL_INTERP OutType px = WeightAndInterp2D(weight[vi+vi_s], kernel_tex, myConvVO, myConvUO, maxsupport * oversample); +#else + OutType px = WeightAndNearestNeighbor2D(weight[vi+vi_s], kernel_tex, myConvVO, myConvUO, maxsupport * oversample); +#endif // Sum up InType temp = d_in[vi+vi_s]; OutType vi_v = OutType(temp.x, temp.y); @@ -603,9 +629,9 @@ public: ALIGNMENT_ELMTS = ALIGNMENT_BYTES / sizeof(float) }; Workspace workspace(ALIGNMENT_BYTES); - _plan_stride = round_up(MAX_W_PLANES+1, ALIGNMENT_ELMTS); - workspace.reserve(MAX_W_PLANES+1, &_planes); - workspace.reserve(MAX_W_PLANES+1, &_midpoints); + _plan_stride = round_up(ORVILLE_MAX_W_PLANES+1, ALIGNMENT_ELMTS); + workspace.reserve(ORVILLE_MAX_W_PLANES+1, &_planes); + workspace.reserve(ORVILLE_MAX_W_PLANES+1, &_midpoints); if( storage_size ) { if( !storage_ptr ) { // Return required storage size @@ -666,8 +692,8 @@ public: int count = 1; float* planes; float* midpoints; - planes = (float*) malloc((MAX_W_PLANES+1)*sizeof(float)); - midpoints = (float*) malloc((MAX_W_PLANES+1)*sizeof(float)); + planes = (float*) malloc((ORVILLE_MAX_W_PLANES+1)*sizeof(float)); + midpoints = (float*) malloc((ORVILLE_MAX_W_PLANES+1)*sizeof(float)); *(planes + 0) = *(zsorted + 0); *(midpoints + 0) = *(zsorted + 0); for(int i=1; i Date: Wed, 27 Nov 2019 09:45:22 -0700 Subject: [PATCH 06/11] Fixed a typo. --- src/orville.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/orville.cu b/src/orville.cu index f606995db..47388b586 100644 --- a/src/orville.cu +++ b/src/orville.cu @@ -133,7 +133,7 @@ inline OutType WeightAndNearestNeighbor2D(OutType w, OutType kxnn = fetch_cf(kernel_tex, (int) round(x)); OutType kynn = fetch_cf(kernel_tex, (int) round(y)); - return w*knn*knn; + return w*kxnn*kynn; } From af65bc3ba86578cfa1a6c7975c09468ec13c89ee Mon Sep 17 00:00:00 2001 From: jaycedowell Date: Wed, 27 Nov 2019 12:43:51 -0700 Subject: [PATCH 07/11] Merged the kernel correction and shift/accumulate map calls into one. --- python/bifrost/orville.py | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/python/bifrost/orville.py b/python/bifrost/orville.py index bf0e8e1c0..2c525f87c 100644 --- a/python/bifrost/orville.py +++ b/python/bifrost/orville.py @@ -231,12 +231,28 @@ def execute(self, idata, odata): self._fft_im.init(self._stcgrid, self._stcgrid, axes=(1,2)) self._fft_im.execute(self._stcgrid, self._stcgrid, inverse=True) - ## Correct for the kernel - map('img(c,i,j) /= corr(i)*corr(j)', - {'img':self._stcgrid, 'corr':self._img_correction}, - axis_names=('c','i','j'), shape=self._stcgrid.shape) - - # Shift and accumulate onto odata + ### Correct for the kernel + #map('img(c,i,j) /= corr(i)*corr(j)', + # {'img':self._stcgrid, 'corr':self._img_correction}, + # axis_names=('c','i','j'), shape=self._stcgrid.shape) + # + ## Shift and accumulate onto odata + #oshape = odata.shape + #odata = odata.reshape(-1,odata.shape[-2],odata.shape[-1]) + #padding = self._gridsize - self._origsize + #offset = padding/2# - padding%2 + #map(""" + # auto k = i + {offset} - {gridsize}/2; + # auto l = j + {offset} - {gridsize}/2; + # if( k < 0 ) k += {gridsize}; + # if( l < 0 ) l += {gridsize}; + # odata(c,i,j) += idata(c,k,l).real; + # """.format(gridsize=self._gridsize, offset=offset), + # {'odata':odata, 'idata':self._stcgrid}, + # axis_names=('c','i','j'), shape=odata.shape) + #odata = odata.reshape(oshape) + + # Correct for the kernel, shift, and accumulate onto odata oshape = odata.shape odata = odata.reshape(-1,odata.shape[-2],odata.shape[-1]) padding = self._gridsize - self._origsize @@ -246,9 +262,9 @@ def execute(self, idata, odata): auto l = j + {offset} - {gridsize}/2; if( k < 0 ) k += {gridsize}; if( l < 0 ) l += {gridsize}; - odata(c,i,j) += idata(c,k,l).real; + odata(c,i,j) += idata(c,k,l).real / (corr(k) * corr(l)); """.format(gridsize=self._gridsize, offset=offset), - {'odata':odata, 'idata':self._stcgrid}, + {'odata':odata, 'idata':self._stcgrid, 'corr':self._img_correction}, axis_names=('c','i','j'), shape=odata.shape) odata = odata.reshape(oshape) From a4a47ba7c445ddb3268e8d516ab72f3975930267 Mon Sep 17 00:00:00 2001 From: jaycedowell Date: Wed, 2 Jun 2021 16:23:52 -0600 Subject: [PATCH 08/11] Python3 changes. --- python/bifrost/orville.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/python/bifrost/orville.py b/python/bifrost/orville.py index 2c525f87c..5b2fcba0c 100644 --- a/python/bifrost/orville.py +++ b/python/bifrost/orville.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018, The Bifrost Authors. All rights reserved. +# Copyright (c) 2018-2021, The Bifrost Authors. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,8 +24,11 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from libbifrost import _bf, _check, _get, BifrostObject, _string2space -from ndarray import ndarray, zeros, asarray, memset_array, copy_array +# Python2 compatibility +from __future__ import absolute_import, print_function, division + +from bifrost.libbifrost import _bf, _check, _get, BifrostObject, _string2space +from bifrost.ndarray import ndarray, zeros, asarray, memset_array, copy_array from bifrost.fft import Fft from bifrost.map import map @@ -106,7 +109,7 @@ def _get_gridding_setup(self, force=False): self._npol = npol self._nplane = nplane self._midpoints = midpoints - print "Working with %i planes" % self._nplane + print("Working with %i planes" % self._nplane) def _set_image_correction_kernel(self, force=False): self._get_gridding_setup(force=force) @@ -119,21 +122,21 @@ def _set_image_correction_kernel(self, force=False): # Get the new image correction kernel corr = numpy.zeros(self._gridsize*self._oversample, dtype=self._dtype) - c = corr.size/2 - d = self._kernel.size/2 + c = corr.size//2 + d = self._kernel.size//2 corr[c-d:c+d] = self._kernel.copy(space='system') corr[:c-d] = numpy.arange(c-d) * corr[c-d]/(c-d) corr[c+d:] = corr[c-d] - numpy.arange(c-d) * corr[c-d]/(c-d) corr = numpy.fft.ifftshift(corr) corr = numpy.fft.ifft(corr).real corr = numpy.fft.fftshift(corr) - corr = corr[corr.size/2-self._gridsize/2-self._gridsize%2:corr.size/2+self._gridsize/2] * self._oversample + corr = corr[corr.size//2-self._gridsize//2-self._gridsize%2:corr.size//2+self._gridsize//2] * self._oversample corr = numpy.fft.fftshift(corr) self._img_correction = ndarray(corr, dtype='f32', space='cuda') def _get_lm(self, gridsize, gridres): m,l = numpy.indices((gridsize,gridsize)) - l,m = numpy.where(l > gridsize/2, gridsize-l, -l), numpy.where(m > gridsize/2, m-gridsize, m) + l,m = numpy.where(l > gridsize//2, gridsize-l, -l), numpy.where(m > gridsize//2, m-gridsize, m) l,m = l.astype(numpy.float32)/gridsize/gridres, m.astype(numpy.float32)/gridsize/gridres return l,m From e28e88275910c6643e991375707fdfaeb6c6eda1 Mon Sep 17 00:00:00 2001 From: jaycedowell Date: Wed, 8 Jan 2025 09:23:21 -0700 Subject: [PATCH 09/11] Add support for uniform and Briggs weighting. --- python/bifrost/orville.py | 71 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 66 insertions(+), 5 deletions(-) diff --git a/python/bifrost/orville.py b/python/bifrost/orville.py index 5b2fcba0c..f1edd932c 100644 --- a/python/bifrost/orville.py +++ b/python/bifrost/orville.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, The Bifrost Authors. All rights reserved. +# Copyright (c) 2018-2025, The Bifrost Authors. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -28,7 +28,7 @@ from __future__ import absolute_import, print_function, division from bifrost.libbifrost import _bf, _check, _get, BifrostObject, _string2space -from bifrost.ndarray import ndarray, zeros, asarray, memset_array, copy_array +from bifrost.ndarray import ndarray, zeros, empty_like, asarray, memset_array, copy_array from bifrost.fft import Fft from bifrost.map import map @@ -41,7 +41,7 @@ from bifrost.device import stream_synchronize as BFSync -class Orville(BifrostObject): +class Orville2(BifrostObject): def __init__(self): BifrostObject.__init__(self, _bf.bfOrvilleCreate, _bf.bfOrvilleDestroy) def init(self, positions, weights, kernel, gridsize, gridres, gridwres=0.5, oversample=1, polmajor=True, space='cuda'): @@ -70,6 +70,7 @@ def init(self, positions, weights, kernel, gridsize, gridres, gridwres=0.5, over self._oversample = oversample self._dtype = kernel.dtype self._space = space + self._update_beam = True # Build the image correction kernel and the gridding kernels self._set_image_correction_kernel() @@ -173,20 +174,45 @@ def set_positions(self, positions): _check( _bf.bfOrvilleSetPositions(self.obj, asarray(positions).as_BFarray()) ) self._set_projection_kernels(force=True) + self._update_beam = True def set_weights(self, weights): weights = numpy.fft.fftshift(weights) _check( _bf.bfOrvilleSetWeights(self.obj, asarray(weights).as_BFarray()) ) + self._update_beam = True def set_kernel(self, kernel): self._kernel = kernel _check( _bf.bfOrvilleSetKernel(self.obj, asarray(kernel).as_BFarray()) ) self._set_image_correction_kernel() + self._update_beam = True - def execute(self, idata, odata): + def execute(self, idata, odata, weighting='natural'): # TODO: Work out how to integrate CUDA stream + + if self._update_beam: + bidata = empty_like(idata) + map("bdata = Complex(1,0)", {'bdata': bidata}, shape=bidata.shape) + self._raw_beam = zeros(shape=(1,self._ntimechan,self._npol,self._gridsize,self._gridsize), + dtype=self._stcgrid.dtype, space='cuda') + + self._update_beam = False + try: + self._raw_beam = self.execute(bidata, self._raw_beam, weighting='_raw_beam') + except Exception as e: + self._update_beam = True + raise e + + rb = self._raw_beam.reshape(1,self._ntimechan,self._npol,self._gridsize*self._gridsize) + self._wavg = zeros(shape=(1,self._ntimechan,self._npol,1), + dtype='cf32', space='cuda') + reduce(rb, self._wavg, op='sum') + self._w2avg = zeros(shape=(1,self._ntimechan,self._npol,1), + dtype='f32', space='cuda') + reduce(rb, self._w2avg, op='pwrsum') + # Zero out the subgrids memset_array(self._subgrid, 0) self._stcgrid = self._stcgrid.reshape(1,self._ntimechan,self._npol,self._gridsize,self._gridsize) @@ -224,6 +250,41 @@ def execute(self, idata, odata): else: copy_array(self._stcgrid, self._subgrid) + # Apply the weighting + if weighting == '_raw_beam': + copy_array(odata, self._stcgrid) + return odata + + elif weighting == 'natural': + pass + + elif weighting == 'uniform': + map(""" + auto weight = bdata(0,i,j,k,l).real; + if( weight != 0.0 ) { + idata(0,i,j,k,l) = idata(0,i,j,k,l) / weight; + } + """, + {'idata': self._stcgrid, 'bdata': self._raw_beam}, + axis_names=('i','j','k','l'), shape=self._stcgrid.shape[1:] + ) + + elif weighting.startswith('briggs'): + R = float(weighting.split('@', 1)[1]) + map(""" + auto weight = bdata(0,i,j,k,l).real; + auto S2 = 25.0 * powf(10.0, -2*{R}) / (w2avg(0,i,j,0) / wavg(0,i,j,0)); + if( weight != 0.0 ) {{ + idata(0,i,j,k,l) = idata(0,i,j,k,l) / (1.0 + weight * S2); + }} + """.format(R=R), + {'idata': self._stcgrid, 'bdata': self._raw_beam, 'wavg': self._wavg, 'w2avg': self._w2avg}, + axis_names=('i','j','k','l'), shape=self._stcgrid.shape[1:] + ) + + else: + raise ValueError("Unknown weighting '%s'" % weighting) + # (u,v) plane -> image ## IFFT self._stcgrid = self._stcgrid.reshape(-1,self._gridsize,self._gridsize) @@ -259,7 +320,7 @@ def execute(self, idata, odata): oshape = odata.shape odata = odata.reshape(-1,odata.shape[-2],odata.shape[-1]) padding = self._gridsize - self._origsize - offset = padding/2# - padding%2 + offset = padding//2# - padding%2 map(""" auto k = i + {offset} - {gridsize}/2; auto l = j + {offset} - {gridsize}/2; From 3e39288d8def86cf1d60c13cc34c89416a8888be Mon Sep 17 00:00:00 2001 From: jaycedowell Date: Wed, 8 Jan 2025 09:24:15 -0700 Subject: [PATCH 10/11] Orville2 -> Orville --- python/bifrost/orville.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/bifrost/orville.py b/python/bifrost/orville.py index f1edd932c..272a22f96 100644 --- a/python/bifrost/orville.py +++ b/python/bifrost/orville.py @@ -41,7 +41,7 @@ from bifrost.device import stream_synchronize as BFSync -class Orville2(BifrostObject): +class Orville(BifrostObject): def __init__(self): BifrostObject.__init__(self, _bf.bfOrvilleCreate, _bf.bfOrvilleDestroy) def init(self, positions, weights, kernel, gridsize, gridres, gridwres=0.5, oversample=1, polmajor=True, space='cuda'): From e018626f085e9577071c171059209c612ea69f40 Mon Sep 17 00:00:00 2001 From: jaycedowell Date: Wed, 8 Jan 2025 09:57:03 -0700 Subject: [PATCH 11/11] Drop Py2 stuff and remove commented out code. --- python/bifrost/orville.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/python/bifrost/orville.py b/python/bifrost/orville.py index 272a22f96..891a881a9 100644 --- a/python/bifrost/orville.py +++ b/python/bifrost/orville.py @@ -24,9 +24,6 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# Python2 compatibility -from __future__ import absolute_import, print_function, division - from bifrost.libbifrost import _bf, _check, _get, BifrostObject, _string2space from bifrost.ndarray import ndarray, zeros, empty_like, asarray, memset_array, copy_array @@ -295,27 +292,6 @@ def execute(self, idata, odata, weighting='natural'): self._fft_im.init(self._stcgrid, self._stcgrid, axes=(1,2)) self._fft_im.execute(self._stcgrid, self._stcgrid, inverse=True) - ### Correct for the kernel - #map('img(c,i,j) /= corr(i)*corr(j)', - # {'img':self._stcgrid, 'corr':self._img_correction}, - # axis_names=('c','i','j'), shape=self._stcgrid.shape) - # - ## Shift and accumulate onto odata - #oshape = odata.shape - #odata = odata.reshape(-1,odata.shape[-2],odata.shape[-1]) - #padding = self._gridsize - self._origsize - #offset = padding/2# - padding%2 - #map(""" - # auto k = i + {offset} - {gridsize}/2; - # auto l = j + {offset} - {gridsize}/2; - # if( k < 0 ) k += {gridsize}; - # if( l < 0 ) l += {gridsize}; - # odata(c,i,j) += idata(c,k,l).real; - # """.format(gridsize=self._gridsize, offset=offset), - # {'odata':odata, 'idata':self._stcgrid}, - # axis_names=('c','i','j'), shape=odata.shape) - #odata = odata.reshape(oshape) - # Correct for the kernel, shift, and accumulate onto odata oshape = odata.shape odata = odata.reshape(-1,odata.shape[-2],odata.shape[-1])