From 25bca57efa970c25a703aebf611bedb96d199fab Mon Sep 17 00:00:00 2001
From: jaycedowell <jayce.dowell@gmail.com>
Date: Mon, 8 Jul 2019 11:23:07 -0600
Subject: [PATCH 01/11] Initial commit of the Project Orville wideband gridder
 and imager.

---
 python/bifrost/orville.py |  266 +++++++++
 src/bifrost/orville.h     |   53 ++
 src/bifrost/romein.h      |   51 ++
 src/orville.cu            | 1095 +++++++++++++++++++++++++++++++++++++
 src/romein_kernels.cuh    |   42 ++
 5 files changed, 1507 insertions(+)
 create mode 100644 python/bifrost/orville.py
 create mode 100644 src/bifrost/orville.h
 create mode 100644 src/bifrost/romein.h
 create mode 100644 src/orville.cu
 create mode 100644 src/romein_kernels.cuh

diff --git a/python/bifrost/orville.py b/python/bifrost/orville.py
new file mode 100644
index 000000000..0bba31dfe
--- /dev/null
+++ b/python/bifrost/orville.py
@@ -0,0 +1,266 @@
+# Copyright (c) 2018, The Bifrost Authors. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+# * Neither the name of The Bifrost Authors nor the names of its
+#   contributors may be used to endorse or promote products derived
+#   from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from libbifrost import _bf, _check, _get, BifrostObject, _string2space
+from ndarray import ndarray, zeros, asarray, memset_array, copy_array
+
+from bifrost.fft import Fft
+from bifrost.map import map
+from bifrost.reduce import reduce
+
+import ctypes
+import numpy
+
+import time
+
+from bifrost.device import stream_synchronize as BFSync
+
+class Orville(BifrostObject):
+    def __init__(self):
+        BifrostObject.__init__(self, _bf.bfOrvilleCreate, _bf.bfOrvilleDestroy)
+    def init(self, positions, weights, kernel, gridsize, gridres, gridwres=0.5, oversample=1, polmajor=True, space='cuda'):
+        bspace = _string2space(space)
+        psize = None
+        _check( _bf.bfOrvilleInit(self.obj, 
+                                  asarray(positions).as_BFarray(), 
+                                  asarray(weights).as_BFarray(), 
+                                  asarray(kernel).as_BFarray(),
+                                  gridsize,
+                                  gridres,
+                                  gridwres,
+                                  oversample,
+                                  polmajor,
+                                  bspace,
+                                  0,
+                                  psize) )
+        
+        # Cache the kernel for later
+        self._kernel = kernel
+        
+        # Cache some metdata for later
+        self._origsize = gridsize
+        self._res = gridres
+        self._maxsupport = kernel.size/oversample
+        self._oversample = oversample
+        self._dtype = kernel.dtype
+        self._space = space
+        
+        # Build the image correction kernel and the gridding kernels
+        self._set_image_correction_kernel()
+        self._set_projection_kernels()
+        
+    def __del__(self):
+        try:
+            del self._subgrid
+            del self._stcgrid
+            del self._w_kernels
+            del self._fft
+        except AttributeError:
+            pass
+        
+        BifrostObject.__del__(self)
+        
+    def _get_gridding_setup(self, force=False):
+        try:
+            self._ntimechan
+            if force:
+                raise AttributeError
+        except AttributeError:
+            # Get the  projection setup information
+            ntimechan, gridsize, npol, nplane = ctypes.c_int(0), ctypes.c_int(0), ctypes.c_int(0), ctypes.c_int(0)
+            midpoints = zeros(128, dtype='f32', space='system')
+            _check( _bf.bfOrvilleGetProjectionSetup(self.obj,
+                                                    ctypes.byref(ntimechan),
+                                                    ctypes.byref(gridsize),
+                                                    ctypes.byref(npol),
+                                                    ctypes.byref(nplane),
+                                                    asarray(midpoints).as_BFarray()) )
+            ntimechan, gridsize, npol, nplane = ntimechan.value, gridsize.value, npol.value, nplane.value
+            midpoints = midpoints[:nplane]
+            self._ntimechan = ntimechan
+            self._gridsize = gridsize
+            self._npol = npol
+            self._nplane = nplane
+            self._midpoints = midpoints
+            print "Working with %i planes" % self._nplane
+            
+    def _set_image_correction_kernel(self, force=False):
+        self._get_gridding_setup(force=force)
+        
+        # Cleanup the old image correction kernel
+        try:
+            del self._img_correction
+        except AttributeError:
+            pass
+            
+        # Get the new image correction kernel
+        corr = numpy.zeros(self._gridsize*self._oversample, dtype=self._dtype)
+        c = corr.size/2
+        d = self._kernel.size/2
+        corr[c-d:c+d] = self._kernel.copy(space='system')
+        corr[:c-d] = numpy.arange(c-d) * corr[c-d]/(c-d)
+        corr[c+d:] = corr[c-d] - numpy.arange(c-d) * corr[c-d]/(c-d)
+        corr = numpy.fft.ifftshift(corr)
+        corr = numpy.fft.ifft(corr).real
+        corr = numpy.fft.fftshift(corr)
+        corr = corr[corr.size/2-self._gridsize/2-self._gridsize%2:corr.size/2+self._gridsize/2] * self._oversample
+        #import pylab
+        #pylab.plot(corr)
+        #pylab.plot(1/corr)
+        #pylab.show()
+        corr = numpy.fft.fftshift(corr)
+        self._img_correction = ndarray(corr, dtype='f32', space='cuda')
+        
+    def _get_lm(self, gridsize, gridres):
+        m,l = numpy.indices((gridsize,gridsize))
+        l,m = numpy.where(l > gridsize/2, gridsize-l, -l), numpy.where(m > gridsize/2, m-gridsize, m)
+        l,m = l.astype(numpy.float32)/gridsize/gridres, m.astype(numpy.float32)/gridsize/gridres
+        return l,m
+        
+    def _set_projection_kernels(self):
+        self._get_gridding_setup()
+        
+        # Set the sub and stacked grids
+        try:
+            del self._subgrid
+            del self._stcgrid
+        except AttributeError:
+            pass
+        self._subgrid = zeros((self._nplane,self._ntimechan,self._npol,self._gridsize,self._gridsize), 
+                              dtype=self._dtype, 
+                              space=self._space)
+        self._stcgrid = zeros((1,self._ntimechan,self._npol,self._gridsize,self._gridsize), 
+                              dtype=self._dtype, 
+                              space=self._space)
+        
+        # Set the w projection kernels
+        try:
+            del self._w_kernels
+        except AttributeError:
+            pass
+        w_kernels = numpy.zeros((self._nplane,self._gridsize,self._gridsize), dtype=self._dtype)
+        l,m = self._get_lm(self._gridsize, self._res)
+        theta = numpy.sqrt(1.0 + 0.0j - l**2 - m**2)
+        for i,avg_w in enumerate(self._midpoints):
+            sub_w_kernel = numpy.exp(2j*numpy.pi*avg_w*(1.0-theta)) / self._gridsize**2 / self._gridsize**2
+            w_kernels[i,:,:] = sub_w_kernel
+        self._w_kernels = ndarray(w_kernels, space=self._space)
+        
+    def set_positions(self, positions):
+        _check( _bf.bfOrvilleSetPositions(self.obj, 
+                                          asarray(positions).as_BFarray()) )
+        self._set_projection_kernels(force=True)
+        
+    def set_weights(self, weights):
+        weights = numpy.fft.fftshift(weights)
+        _check( _bf.bfOrvilleSetWeights(self.obj, 
+                                       asarray(weights).as_BFarray()) )
+        
+    def set_kernel(self, kernel):
+        self._kernel = kernel
+        _check( _bf.bfOrvilleSetKernel(self.obj, 
+                                       asarray(kernel).as_BFarray()) )
+        self._set_image_correction_kernel()
+        
+    def execute(self, idata, odata):
+        # TODO: Work out how to integrate CUDA stream
+        # Zero out the subgrids
+        memset_array(self._subgrid, 0)
+        self._stcgrid = self._stcgrid.reshape(1,self._ntimechan,self._npol,self._gridsize,self._gridsize)
+        
+        # Grid
+        _check( _bf.bfOrvilleExecute(self.obj,
+                                     asarray(idata).as_BFarray(),
+                                     asarray(self._subgrid).as_BFarray()) )
+        junk = self._subgrid.copy(space='system')
+        print junk.max()
+        
+        if self._subgrid.shape[0] > 1:
+            # W project
+            ## FFT
+            self._subgrid = self._subgrid.reshape(-1,self._gridsize,self._gridsize)
+            try:
+                self._fft.execute(self._subgrid, self._subgrid, inverse=True)
+            except AttributeError:
+                self._fft = Fft()
+                self._fft.init(self._subgrid, self._subgrid, axes=(1,2))
+                self._fft.execute(self._subgrid, self._subgrid, inverse=True)
+            self._subgrid = self._subgrid.reshape(self._nplane,self._ntimechan,self._npol,self._gridsize,self._gridsize)
+                
+            ## Project
+            map('a(w,c,p,i,j) *= b(w,i,j)', 
+                {'a':self._subgrid, 'b':self._w_kernels}, 
+                axis_names=('w','c','p','i','j'), shape=self._subgrid.shape)
+            
+            ## IFFT
+            self._subgrid = self._subgrid.reshape(-1,self._gridsize,self._gridsize)
+            self._fft.execute(self._subgrid, self._subgrid, inverse=False)
+            self._subgrid = self._subgrid.reshape(self._nplane,self._ntimechan,self._npol,self._gridsize,self._gridsize)
+            junk = self._subgrid.copy(space='system')
+            print 'post-W', junk.max()
+            
+            # Stack
+            reduce(self._subgrid, self._stcgrid, op='sum')
+            
+        else:
+            copy_array(self._stcgrid, self._subgrid)
+            
+        # (u,v) plane -> image
+        ## IFFT
+        self._stcgrid = self._stcgrid.reshape(-1,self._gridsize,self._gridsize)
+        try:
+            self._fft.execute(self._stcgrid, self._stcgrid, inverse=True)
+        except AttributeError:
+            self._fft = Fft()
+            self._fft.init(self._stcgrid, self._stcgrid, axes=(1,2))
+            self._fft.execute(self._stcgrid, self._stcgrid, inverse=True)
+        junk = self._stcgrid.copy(space='system')
+        print 'post-IFFT', junk.max()
+        
+        ## Correct for the kernel
+        map('img(c,i,j) /= corr(i)*corr(j)',
+            {'img':self._stcgrid, 'corr':self._img_correction},
+            axis_names=('c','i','j'), shape=self._stcgrid.shape)
+        junk = self._stcgrid.copy(space='system')
+        print 'post-corr', junk.max()
+        
+        # Shift and accumulate onto odata
+        oshape = odata.shape
+        odata = odata.reshape(-1,odata.shape[-2],odata.shape[-1])
+        padding = self._gridsize - self._origsize
+        offset = padding/2# - padding%2
+        map("""
+            auto k = i + {offset} - {gridsize}/2;
+            auto l = j + {offset} - {gridsize}/2;
+            if( k < 0 ) k += {gridsize};
+            if( l < 0 ) l += {gridsize};
+            odata(c,i,j) += idata(c,k,l).real;
+            """.format(gridsize=self._gridsize, offset=offset), 
+            {'odata':odata, 'idata':self._stcgrid}, 
+            axis_names=('c','i','j'), shape=odata.shape)
+        odata = odata.reshape(oshape)
+        
+        return odata
diff --git a/src/bifrost/orville.h b/src/bifrost/orville.h
new file mode 100644
index 000000000..dc9bf5c26
--- /dev/null
+++ b/src/bifrost/orville.h
@@ -0,0 +1,53 @@
+#ifndef BF_ORVILLE_H_INCLUDE_GUARD_
+#define BF_ORVILLE_INCLUDE_GUARD_
+
+// Bifrost Includes
+#include <bifrost/common.h>
+#include <bifrost/memory.h>
+#include <bifrost/array.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct BForville_impl* BForville;
+
+BFstatus bfOrvilleCreate(BForville* plan);
+BFstatus bfOrvilleInit(BForville       plan,
+                       BFarray const* positions,
+                       BFarray const* weights,
+                       BFarray const* kernel1D,
+                       BFsize         gridsize,
+                       double         gridres,
+                       double         gridwres,
+                       BFsize         oversample,
+                       BFbool         polmajor,
+                       BFspace        space,
+                       void*          plan_storage,
+                       BFsize*        plan_storage_size);
+BFstatus bfOrvilleSetStream(BForville  plan,
+                           void const* stream);
+BFstatus bfOrvilleGetStream(BForville plan,
+                            void*     stream);
+BFstatus bfOrvilleSetPositions(BForville     plan, 
+                              BFarray const* positions);
+BFstatus bfOrvilleGetProjectionSetup(BForville      plan,
+                                     int*           ntimechan,
+                                     int*           gridsize,
+                                     int*           npol,
+                                     int*           nplane,
+                                     BFarray const* midpoints);
+BFstatus bfOrvilleSetWeights(BForville      plan, 
+                             BFarray const* weights);
+BFstatus bfOrvilleSetKernel(BForville      plan, 
+                            BFarray const* kernel1D);
+BFstatus bfOrvilleExecute(BForville      plan,
+                          BFarray const* in,
+                          BFarray const* out);
+BFstatus bfOrvilleDestroy(BForville plan);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // BF_ORVILLE_H_INCLUDE_GUARD
diff --git a/src/bifrost/romein.h b/src/bifrost/romein.h
new file mode 100644
index 000000000..0f0cedc73
--- /dev/null
+++ b/src/bifrost/romein.h
@@ -0,0 +1,51 @@
+#ifndef BF_ROMEIN_H_INCLUDE_GUARD_
+#define BF_ROMEIN_H_INCLUDE_GUARD_
+
+// Bifrost Includes
+#include <bifrost/common.h>
+#include <bifrost/memory.h>
+#include <bifrost/array.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct BFromein_impl* BFromein;
+
+BFstatus bfRomeinCreate(BFromein* plan);
+BFstatus bfRomeinInit(BFromein       plan,
+                      BFarray const* positions,
+                      BFarray const* kernels,
+                      BFsize         ngrid,
+                      BFbool         polmajor);
+BFstatus bfRomeinSetStream(BFromein    plan,
+                           void const* stream);
+BFstatus bfRomeinSetPositions(BFromein       plan, 
+                              BFarray const* positions);
+BFstatus bfRomeinSetKernels(BFromein       plan, 
+                            BFarray const* kernels);
+BFstatus bfRomeinExecute(BFromein          plan,
+                         BFarray const* in,
+                         BFarray const* out);
+BFstatus bfRomeinDestroy(BFromein plan);
+
+/*****************************
+    Host Functions
+ *****************************/
+
+BFstatus romein_float(BFarray const* data,
+		      BFarray const* uvgrid,
+		      BFarray const* illum,
+		      BFarray const* data_xloc,
+		      BFarray const* data_yloc,
+		      BFarray const* data_zloc,
+		      int max_support,
+		      int grid_size,
+		      int data_size,
+		      int nbatch);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // BF_ROMEIN_H_INCLUDE_GUARD
diff --git a/src/orville.cu b/src/orville.cu
new file mode 100644
index 000000000..c4abbae79
--- /dev/null
+++ b/src/orville.cu
@@ -0,0 +1,1095 @@
+/*
+ * Copyright (c) 2018, The Bifrost Authors. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * * Neither the name of The Bifrost Authors nor the names of its
+ *   contributors may be used to endorse or promote products derived
+ *   from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* 
+
+Implements the Romein convolutional algorithm onto a GPU using CUDA. 
+
+*/
+#include <iostream>
+#include <bifrost/orville.h>
+#include "romein_kernels.cuh"
+
+#include "assert.hpp"
+#include "trace.hpp"
+#include "utils.hpp"
+#include "workspace.hpp"
+#include "cuda.hpp"
+#include "cuda/stream.hpp"
+
+#include "Complex.hpp"
+
+#include <thrust/device_vector.h>
+
+#define MAX_W_PLANES  128
+
+struct __attribute__((aligned(1))) nibble2 {
+    // Yikes!  This is dicey since the packing order is implementation dependent!  
+    signed char y:4, x:4;
+};
+
+struct __attribute__((aligned(1))) blenib2 {
+    // Yikes!  This is dicey since the packing order is implementation dependent!
+    signed char x:4, y:4;
+};
+
+
+template<typename RealType>
+__host__ __device__
+inline Complex<RealType> Complexfcma(Complex<RealType> x, Complex<RealType> y, Complex<RealType> d) {
+    RealType real_res;
+    RealType imag_res;
+    
+    real_res = (x.x *  y.x) + d.x;
+    imag_res = (x.x *  y.y) + d.y;
+            
+    real_res =  (x.y * y.y) + real_res;  
+    imag_res = -(x.y * y.x) + imag_res;          
+     
+    return Complex<RealType>(real_res, imag_res);
+}
+
+
+template<typename OutType>
+__device__
+inline OutType fetch_cf(cudaTextureObject_t tex,
+                        int                 idx) {
+    return OutType(0);
+}
+
+template<>
+__device__
+inline Complex32 fetch_cf<Complex32>(cudaTextureObject_t tex,
+                                     int                 idx) {
+    float2 p = tex1Dfetch<float2>(tex, idx);
+    return Complex32(p.x, p.y);
+}
+
+template<>
+__device__
+inline Complex64 fetch_cf<Complex64>(cudaTextureObject_t tex,
+                                     int                 idx) {
+    uint4 p = tex1Dfetch<uint4>(tex, idx);
+    return Complex64(__hiloint2double(p.x, p.y),
+                     __hiloint2double(p.z, p.w));
+}
+    
+
+template<typename OutType, typename IndexType>
+__device__
+inline OutType WeightAndInterp2D(OutType             w,
+                                 cudaTextureObject_t kernel_tex, 
+                                 IndexType           x, 
+                                 IndexType           y, 
+                                 int                 size) {
+    OutType kx0 = fetch_cf<OutType>(kernel_tex, (int) x);
+    OutType kx1 = fetch_cf<OutType>(kernel_tex, (int) x + 1);
+    OutType ky0 = fetch_cf<OutType>(kernel_tex, (int) y);
+    OutType ky1 = fetch_cf<OutType>(kernel_tex, (int) y + 1);
+    
+    OutType kx = ((int)x + 1 - x)*kx0 + (x - (int)x)*kx1;
+    OutType ky = ((int)y + 1 - y)*ky0 + (y - (int)y)*ky1;
+    
+    return w*kx*ky;
+}
+
+
+template<typename InType, typename OutType>
+__global__ void orville_kernel_spln(int                         nbaseline,
+                                    int                         npol,
+                                    int                         maxsupport, 
+                                    int                         oversample,
+                                    int                         gridsize, 
+                                    double                      gridres,
+                                    int                         nbatch,
+                                    int                         nplane,
+                                    const float* __restrict__   planes,
+                                    const float* __restrict__   x,
+                                    const float* __restrict__   y,
+                                    const float* __restrict__   z,
+                                    const OutType* __restrict__ weight,
+                                    cudaTextureObject_t         kernel_tex,
+                                    const InType* __restrict__  d_in,
+                                    OutType*                    d_out) {
+    int batch_no = blockIdx.x;
+    int pol_no = threadIdx.y;
+    int vi_s = batch_no*nbaseline*npol+pol_no;
+    int grid_s = batch_no*npol*gridsize*gridsize + pol_no*gridsize*gridsize;
+    
+    extern __shared__ float shared[];
+    
+    float* zplanes = shared;
+    
+    for(int i = threadIdx.x; i < nplane+1; i += blockDim.x){
+        zplanes[i] = planes[i];
+    }
+
+    __syncthreads();
+    
+    for(int i = threadIdx.x; i < maxsupport * maxsupport; i += blockDim.x) {
+        int myU = i % maxsupport;
+        int myV = i / maxsupport;
+        
+        int grid_point_u = myU;
+        int grid_point_v = myV;
+        int grid_point_p = 0;
+        OutType sum = OutType(0.0, 0.0);
+
+        int vi, plane, ci_s;
+        vi = plane = 0;
+        ci_s = (vi + vi_s) / npol;
+        while(plane < nplane) {
+            if( z[ci_s] >= zplanes[plane] && z[ci_s] < zplanes[plane+1] ) {
+                break;
+            }
+            plane++;
+        }
+        
+        for(vi = 0; vi < (nbaseline*npol); vi+=npol) {
+            ci_s = (vi + vi_s) / npol;
+            //if( x[ci_s]/gridres < -gridsize/2 ) continue;
+            //if( x[ci_s]/gridres > gridsize/2 ) continue;
+            //if( y[ci_s]/gridres < -gridsize/2 ) continue;
+            //if( y[ci_s]/gridres > gridsize/2 ) continue;
+            float xf = x[ci_s]/gridres + gridsize/2 - maxsupport/2 + (1 - maxsupport%2);
+            int xl = (int) round(xf);
+            float yf = y[ci_s]/gridres + gridsize/2 - maxsupport/2 + (1 - maxsupport%2);
+            int yl = (int) round(yf);
+            
+            // Find the correct z-plane to grid to
+            if( z[ci_s] < zplanes[plane] || z[ci_s] >= zplanes[plane+1] ) {
+                plane = 0;
+                while(plane < nplane) {
+                    if( z[ci_s] >= zplanes[plane] && z[ci_s] < zplanes[plane+1] ) {
+                        break;
+                    }
+                    plane++;
+                }
+            }
+            
+            // Determine convolution point. This is basically just an
+            // optimised way to calculate.
+            //int myConvU = myU - u;
+            //int myConvV = myV - v;
+            int myConvU = 0;
+            int myConvV = 0;
+            if( maxsupport > 1 ) {
+                myConvU = (xl - myU) % maxsupport;
+                myConvV = (yl - myV) % maxsupport;    
+                if (myConvU < 0) myConvU += maxsupport;
+                if (myConvV < 0) myConvV += maxsupport;
+            }
+            float myConvUO = myConvU*oversample - ((xf - xl)*oversample) + oversample/2 - (1 - oversample%2);
+            float myConvVO = myConvV*oversample - ((yf - yl)*oversample) + oversample/2 - (1 - oversample%2);
+            if (myConvUO < 0) myConvUO += maxsupport*oversample;
+            if (myConvVO < 0) myConvVO += maxsupport*oversample;
+            
+            // Determine grid point. Because of the above we know here that
+            //   myGridU % max_supp = myU
+            //   myGridV % max_supp = myV
+            int myGridU = xl + myConvU - gridsize/2;
+            int myGridV = yl + myConvV - gridsize/2;
+//             if( myGridU <= -gridsize/2 ) continue;
+//             if( myGridV <= -gridsize/2 ) continue;
+//             if( myGridU >= gridsize/2 ) continue;
+//             if( myGridU >= gridsize/2 ) continue;
+            //if( myGridU + gridsize/2 >= gridsize - maxsupport/2 ) continue;
+            //if( myGridV + gridsize/2 >= gridsize - maxsupport/2 ) continue;
+            if( myGridU < 0 ) myGridU += gridsize;
+            if( myGridV < 0 ) myGridV += gridsize;
+            
+            // Grid point changed?
+            if (myGridU == grid_point_u && myGridV == grid_point_v && plane == grid_point_p) {
+                // Nothin'
+            } else {
+                // Atomically add to grid. This is the bottleneck of this kernel.
+                if( grid_point_u >= 0 && grid_point_u < gridsize && \
+                    grid_point_v >= 0 && grid_point_v < gridsize ) {
+                    atomicAdd(&d_out[grid_point_p*nbatch*npol*gridsize*gridsize + grid_s + gridsize*grid_point_v + grid_point_u].x, sum.x);
+                    atomicAdd(&d_out[grid_point_p*nbatch*npol*gridsize*gridsize + grid_s + gridsize*grid_point_v + grid_point_u].y, sum.y);
+                }
+                // Switch to new point
+                sum = OutType(0.0, 0.0);
+                grid_point_u = myGridU;
+                grid_point_v = myGridV;
+                grid_point_p = plane;
+            }
+            
+            //TODO: Re-do the w-kernel/gcf for our data.
+            OutType px = WeightAndInterp2D(weight[vi+vi_s], kernel_tex, myConvVO, myConvUO, maxsupport * oversample);
+            // Sum up
+            InType temp = d_in[vi+vi_s];
+            OutType vi_v = OutType(temp.x, temp.y);
+            sum = Complexfcma(px, vi_v, sum);
+        }
+        
+        if( grid_point_u >= 0 && grid_point_u < gridsize && \
+            grid_point_v >= 0 && grid_point_v < gridsize ) {
+            atomicAdd(&d_out[grid_point_p*nbatch*npol*gridsize*gridsize + grid_s + gridsize*grid_point_v + grid_point_u].x, sum.x);
+            atomicAdd(&d_out[grid_point_p*nbatch*npol*gridsize*gridsize + grid_s + gridsize*grid_point_v + grid_point_u].y, sum.y);
+        }
+    }
+}
+
+
+template<typename InType, typename OutType>
+__global__ void orville_kernel_sloc(int                         nbaseline,
+                                    int                         npol,
+                                    int                         maxsupport, 
+                                    int                         oversample,
+                                    int                         gridsize, 
+                                    double                      gridres,
+                                    int                         nbatch,
+                                    int                         nplane,
+                                    const float* __restrict__   planes,
+                                    const float* __restrict__   x,
+                                    const float* __restrict__   y,
+                                    const float* __restrict__   z,
+                                    const OutType* __restrict__ weight,
+                                    cudaTextureObject_t         kernel_tex,
+                                    const InType* __restrict__  d_in,
+                                    OutType*                    d_out) {
+    int batch_no = blockIdx.x;
+    int pol_no = threadIdx.y;
+    int vi_s = batch_no*nbaseline*npol+pol_no;
+    int grid_s = batch_no*npol*gridsize*gridsize + pol_no*gridsize*gridsize;
+    
+    extern __shared__ float shared[];
+    
+    float* xdata = shared;
+    float* ydata = xdata + nbaseline * npol;
+    float* zdata = ydata + nbaseline * npol;
+
+    for(int i = threadIdx.x; i < nbaseline; i += blockDim.x){
+        xdata[i*npol + pol_no] = x[(vi_s + npol * i) / npol];
+        ydata[i*npol + pol_no] = y[(vi_s + npol * i) / npol];
+        zdata[i*npol + pol_no] = z[(vi_s + npol * i) / npol];
+    }
+
+    __syncthreads();
+    
+    for(int i = threadIdx.x; i < maxsupport * maxsupport; i += blockDim.x) {
+        int myU = i % maxsupport;
+        int myV = i / maxsupport;
+        
+        int grid_point_u = myU;
+        int grid_point_v = myV;
+        int grid_point_p = 0;
+        OutType sum = OutType(0.0, 0.0);
+
+        int vi, plane;
+        vi = plane = 0;
+        while(plane < nplane) {
+            if( z[vi+vi_s] >= planes[plane] && z[vi+vi_s] < planes[plane+1] ) {
+                break;
+            }
+            plane++;
+        }
+        
+        for(vi = 0; vi < (nbaseline*npol); vi+=npol) {
+            float xf = xdata[vi+pol_no]/gridres + gridsize/2 - maxsupport/2 + (1 - maxsupport%2);
+            int xl = (int) (xf);
+            float yf = ydata[vi+pol_no]/gridres + gridsize/2 - maxsupport/2 + (1 - maxsupport%2);
+            int yl = (int) (yf);
+            
+            // Find the correct z-plane to grid to
+            if( z[vi+pol_no] < planes[plane] || z[vi+pol_no] >= planes[plane+1] ) {
+                plane = 0;
+                while(plane < nplane) {
+                    if( zdata[vi+pol_no] >= planes[plane] && zdata[vi+pol_no] < planes[plane+1] ) {
+                        break;
+                    }
+                    plane++;
+                }
+            }
+            
+            // Determine convolution point. This is basically just an
+            // optimised way to calculate.
+            //int myConvU = myU - u;
+            //int myConvV = myV - v;
+            int myConvU = 0;
+            int myConvV = 0;
+            if( maxsupport > 1 ) {
+                myConvU = (xl - myU) % maxsupport;
+                myConvV = (yl - myV) % maxsupport;    
+                if (myConvU < 0) myConvU += maxsupport;
+                if (myConvV < 0) myConvV += maxsupport;
+            }
+            float myConvUO = myConvU*oversample - ((xf - xl)*oversample) + oversample/2 - (1 - oversample%2);
+            float myConvVO = myConvV*oversample - ((yf - yl)*oversample) + oversample/2 - (1 - oversample%2);
+            if (myConvUO < 0) myConvUO += maxsupport*oversample;
+            if (myConvVO < 0) myConvVO += maxsupport*oversample;
+            
+            // Determine grid point. Because of the above we know here that
+            //   myGridU % max_supp = myU
+            //   myGridV % max_supp = myV
+            int myGridU = xl + myConvU - gridsize/2;
+            int myGridV = yl + myConvV - gridsize/2;
+            if( myGridU < 0 ) myGridU += gridsize;
+            if( myGridV < 0 ) myGridV += gridsize;
+            
+            // Grid point changed?
+            if (myGridU == grid_point_u && myGridV == grid_point_v && plane == grid_point_p) {
+                // Nothin'
+            } else {
+                // Atomically add to grid. This is the bottleneck of this kernel.
+                if( grid_point_u >= 0 && grid_point_u < gridsize && \
+                    grid_point_v >= 0 && grid_point_v < gridsize ) {
+                    atomicAdd(&d_out[grid_point_p*nbatch*npol*gridsize*gridsize + grid_s + gridsize*grid_point_v + grid_point_u].x, sum.x);
+                    atomicAdd(&d_out[grid_point_p*nbatch*npol*gridsize*gridsize + grid_s + gridsize*grid_point_v + grid_point_u].y, sum.y);
+                }
+                // Switch to new point
+                sum = OutType(0.0, 0.0);
+                grid_point_u = myGridU;
+                grid_point_v = myGridV;
+                grid_point_p = plane;
+            }
+            
+            //TODO: Re-do the w-kernel/gcf for our data.
+            OutType px = WeightAndInterp2D(weight[vi+vi_s], kernel_tex, myConvVO, myConvUO, maxsupport * oversample);
+            // Sum up
+            InType temp = d_in[vi+vi_s];
+            OutType vi_v = OutType(temp.x, temp.y);
+            sum = Complexfcma(px, vi_v, sum);
+        }
+        
+        if( grid_point_u >= 0 && grid_point_u < gridsize && \
+            grid_point_v >= 0 && grid_point_v < gridsize ) {
+            atomicAdd(&d_out[grid_point_p*nbatch*npol*gridsize*gridsize + grid_s + gridsize*grid_point_v + grid_point_u].x, sum.x);
+            atomicAdd(&d_out[grid_point_p*nbatch*npol*gridsize*gridsize + grid_s + gridsize*grid_point_v + grid_point_u].y, sum.y);
+        }
+    }
+}
+
+
+template<typename InType, typename OutType>
+inline void launch_orville_kernel(int      nbaseline,
+                                  int      npol,
+                                  bool     polmajor,
+                                  int      maxsupport, 
+                                  int      oversample,
+                                  int      gridsize, 
+                                  double   gridres,
+                                  int      nbatch,
+                                  int      nplane,
+                                  float*   planes,
+                                  float*   xpos,
+                                  float*   ypos,
+                                  float*   zpos,
+                                  OutType* weight,
+                                  OutType* kernel,
+                                  InType*  d_in,
+                                  OutType* d_out,
+                                  cudaStream_t stream=0) {
+    //cout << "LAUNCH for " << nelement << endl;
+    dim3 block(8,1);
+    dim3 grid(nbatch*npol,1);
+    if( polmajor ) {
+        npol = 1;
+    } else {
+        block.y = npol;
+        grid.x = nbatch;
+    }
+    /*
+    cout << "  Block size is " << block.x << " by " << block.y << endl;
+    cout << "  Grid  size is " << grid.x << " by " << grid.y << endl;
+    */
+    
+    // Determine how to create the texture object
+    // NOTE:  Assumes some type of complex float
+    cudaChannelFormatKind channel_format = cudaChannelFormatKindFloat;
+    int dx = 32;
+    int dy = 32;
+    int dz = 0;
+    int dw = 0;
+    if( sizeof(OutType) == sizeof(Complex64) ) {
+        channel_format = cudaChannelFormatKindUnsigned;
+        dz = 32;
+        dw = 32;
+    }
+    
+    // Create texture object
+    cudaResourceDesc resDesc;
+    memset(&resDesc, 0, sizeof(resDesc));
+    resDesc.resType = cudaResourceTypeLinear;
+    resDesc.res.linear.devPtr = kernel;
+    resDesc.res.linear.desc.f = channel_format;
+    resDesc.res.linear.desc.x = dx;
+    resDesc.res.linear.desc.y = dy;
+    resDesc.res.linear.desc.z = dz;
+    resDesc.res.linear.desc.w = dw;
+    resDesc.res.linear.sizeInBytes = maxsupport*oversample*sizeof(OutType);
+    
+    cudaTextureDesc texDesc;
+    memset(&texDesc, 0, sizeof(texDesc));
+    texDesc.readMode = cudaReadModeElementType;
+    
+    cudaTextureObject_t kernel_tex;
+    BF_CHECK_CUDA_EXCEPTION(cudaCreateTextureObject(&kernel_tex, &resDesc, &texDesc, NULL),
+                            BF_STATUS_INTERNAL_ERROR);
+    
+    void* args[] = {&nbaseline,
+                    &npol,
+                    &maxsupport,
+                    &oversample,
+                    &gridsize, 
+                    &gridres,
+                    &nbatch,
+                    &nplane,
+                    &planes,
+                    &xpos,
+                    &ypos,
+                    &zpos,
+                    &weight,
+                    &kernel_tex,
+                    &d_in,
+                    &d_out};
+    size_t loc_size = 3 * nbaseline * npol * sizeof(float);
+    size_t shared_mem_size = 16384; //Just keep this vanilla for now
+    if(loc_size <= shared_mem_size) {
+        BF_CHECK_CUDA_EXCEPTION(cudaLaunchKernel((void*)orville_kernel_sloc<InType,OutType>,
+                                                 grid, block,
+                                                 &args[0], 3*nbaseline*npol*sizeof(float), stream),
+                                BF_STATUS_INTERNAL_ERROR);
+    } else {
+        BF_CHECK_CUDA_EXCEPTION(cudaLaunchKernel((void*)orville_kernel_spln<InType,OutType>,
+                                                 grid, block,
+                                                 &args[0], (nplane+1)*sizeof(float), stream),
+                                BF_STATUS_INTERNAL_ERROR);
+    }
+    
+    BF_CHECK_CUDA_EXCEPTION(cudaDestroyTextureObject(kernel_tex),
+                            BF_STATUS_INTERNAL_ERROR);
+}
+                   
+
+int compare_z_values(const void* a, const void* b) {
+    if( *(float*)a < *(float*)b ) return -1;
+    if( *(float*)a > *(float*)b ) return  1;
+    return 0;
+}
+
+template<typename InType>
+InType signed_sqrt(InType x) {
+    if( x < 0 ) {
+        return -sqrt(-x);
+    } else {
+        return sqrt(x);
+    }
+}
+
+class BForville_impl {
+    typedef int    IType;
+    typedef double FType;
+public: // HACK WAR for what looks like a bug in the CUDA 7.0 compiler
+    typedef float  DType;
+private:
+    IType        _ntimechan;
+    IType        _nbaseline;
+    IType        _npol;
+    bool         _polmajor;
+    IType        _maxsupport;
+    IType        _oversample;
+    IType        _gridsize;
+    FType        _gridres;
+    FType        _gridwres;
+    IType        _nxyz = 0;
+    float*       _x = NULL;
+    float*       _y = NULL;
+    float*       _z = NULL;
+    void*        _weight = NULL;
+    BFdtype      _tkernel = BF_DTYPE_INT_TYPE;
+    void*        _kernel = NULL;
+    IType        _nplane = 0;
+    float*       _planes = NULL;
+    float*       _midpoints = NULL;
+    IType        _plan_stride;
+    Workspace    _plan_storage;
+    // TODO: Use something other than Thrust
+    thrust::device_vector<char> _dv_plan_storage;
+    cudaStream_t _stream;
+public:
+    BForville_impl() : _ntimechan(1), _nbaseline(1), _npol(1), _polmajor(true), \
+                      _maxsupport(1), _oversample(1), _stream(g_cuda_stream) {}
+    inline IType ntimechan()  const { return _ntimechan;  }
+    inline IType nbaseline()  const { return _nbaseline;  }
+    inline IType npol()       const { return _npol;       }
+    inline bool polmajor()    const { return _polmajor;   }
+    inline IType maxsupport() const { return _maxsupport; }
+    inline IType oversample() const { return _oversample; }
+    inline IType gridsize()   const { return _gridsize;   }
+    inline FType gridres()    const { return _gridres;    }
+    inline FType grdiwres()   const { return _gridwres;   }
+    inline IType nxyz()       const { return _nxyz;       }
+    inline IType nplane()     const { return _nplane;     }
+    inline IType tkernel()    const { return _tkernel;    }
+    void init(IType ntimechan,
+              IType nbaseline,
+              IType npol,
+              bool  polmajor,
+              IType maxsupport, 
+              IType oversample,
+              IType gridsize,
+              FType gridres,
+              FType gridwres) {
+        BF_TRACE();
+        _ntimechan  = ntimechan;
+        _nbaseline  = nbaseline;
+        _npol       = npol;
+        _polmajor   = polmajor;
+        _maxsupport = maxsupport;
+        _oversample = oversample;
+        _gridsize   = gridsize + maxsupport;
+        // Find an optimal grid size that allows for at least 'maxsupport' padding
+        // NOTE:  "Optimal" here means that is can be factored into 2, 3, 5, and 7,
+        //        exclusively.
+        IType i, value;
+        bool found_opt_gridsize = false;
+        for(i=_gridsize; i<(IType) round(_gridsize*1.15); i++) {
+            value = i;
+            while( value > 1 ) {
+                if( value % 2 == 0 ) {
+                    value /= 2;
+                } else if( value % 3 == 0 ) {
+                    value /= 3;
+                } else if( value % 5 == 0 ) {
+                    value /= 5;
+                } else if( value % 7 == 0 ) {
+                    value /= 7;
+                } else {
+                    break;
+                }
+            }
+            if( value == 1 ) {
+                found_opt_gridsize = true;
+                _gridsize = i;
+                break;
+            }
+        }
+        BF_ASSERT_EXCEPTION(found_opt_gridsize, BF_STATUS_INVALID_SHAPE);
+        _gridres    = 1.0 / (2.0*_gridsize) / sin(gridres*M_PI/180.0/2.0);
+        _gridwres   = gridwres;
+    }
+    bool init_plan_storage(void* storage_ptr, BFsize* storage_size) {
+        BF_TRACE();
+        BF_TRACE_STREAM(_stream);
+        enum {
+            ALIGNMENT_BYTES = 512,
+            ALIGNMENT_ELMTS = ALIGNMENT_BYTES / sizeof(float)
+        };
+        Workspace workspace(ALIGNMENT_BYTES);
+        _plan_stride = round_up(MAX_W_PLANES+1, ALIGNMENT_ELMTS);
+        workspace.reserve(MAX_W_PLANES+1, &_planes);
+        workspace.reserve(MAX_W_PLANES+1, &_midpoints);
+        if( storage_size ) {
+            if( !storage_ptr ) {
+                // Return required storage size
+                *storage_size = workspace.size();
+                return false;
+            } else {
+                BF_ASSERT_EXCEPTION(*storage_size >= workspace.size(),
+                                    BF_STATUS_INSUFFICIENT_STORAGE);
+            }
+        } else {
+            // Auto-allocate storage
+            BF_ASSERT_EXCEPTION(!storage_ptr, BF_STATUS_INVALID_ARGUMENT);
+            _dv_plan_storage.resize(workspace.size());
+            storage_ptr = thrust::raw_pointer_cast(&_dv_plan_storage[0]);
+        }
+        workspace.commit(storage_ptr);
+        
+        this->reset_state();
+        return true;
+    }
+    void set_positions(BFarray const* positions) { 
+        BF_TRACE();
+        BF_TRACE_STREAM(_stream);
+        BF_ASSERT_EXCEPTION(positions->dtype == BF_DTYPE_F32, BF_STATUS_UNSUPPORTED_DTYPE);
+        
+        int npositions = positions->shape[1];
+        int nbl = positions->shape[positions->ndim-1];
+        int stride = positions->shape[1];
+        for(int i=2; i<positions->ndim-1; ++i) {
+            npositions *= positions->shape[i];
+            stride *= positions->shape[i];
+        }
+        stride *= positions->shape[positions->ndim-1];
+        _nxyz = npositions;
+        _x = (float *) positions->data;
+        _y = _x + stride;
+        _z = _y + stride;
+        
+        float* zsorted;
+        zsorted = (float*) malloc(npositions*nbl*sizeof(float));
+        
+        BF_CHECK_CUDA_EXCEPTION(cudaMemcpyAsync(zsorted, 
+                                                _z, 
+                                                npositions*nbl*sizeof(float), 
+                                                cudaMemcpyDeviceToHost,
+                                                _stream),
+                                BF_STATUS_MEM_OP_FAILED);
+        
+        BF_CHECK_CUDA_EXCEPTION(cudaStreamSynchronize(_stream),
+                                BF_STATUS_DEVICE_ERROR );
+        
+        qsort(zsorted, npositions*nbl, sizeof(float), &compare_z_values);
+        /*
+        std::cout << npositions*nbl << " z values range from " << *(zsorted+0) << " to " << *(zsorted+npositions-1) << std::endl;
+        */
+        
+        _nplane = 1;
+        int count = 1;
+        float* planes;
+        float* midpoints;
+        planes = (float*) malloc((MAX_W_PLANES+1)*sizeof(float));
+        midpoints = (float*) malloc((MAX_W_PLANES+1)*sizeof(float));
+        *(planes + 0) = *(zsorted + 0);
+        *(midpoints + 0) = *(zsorted + 0);
+        for(int i=1; i<npositions*nbl; i++) {
+            if( signed_sqrt(*(zsorted + i)) > signed_sqrt(*(planes + _nplane - 1)) + _gridwres ) {
+                *(midpoints + _nplane - 1) /= count;
+                count = 0;
+                
+                _nplane++;
+                BF_ASSERT_EXCEPTION(_nplane < MAX_W_PLANES, BF_STATUS_INTERNAL_ERROR);
+                *(planes + _nplane - 1) = *(zsorted + i);
+                *(midpoints + _nplane - 1) = *(zsorted + i);
+                count++;
+            } else {
+                *(midpoints + _nplane - 1) += *(zsorted + i);
+                count++;
+            }
+        }
+        *(planes + _nplane) = *(zsorted + npositions*nbl - 1) + _gridwres;
+        *(midpoints + _nplane - 1) += *(zsorted + npositions*nbl - 1);
+        count++;
+        *(midpoints + _nplane - 1) /= count;
+        
+        /* std::cout << "Planes are:" << std::endl;
+        for(int i=0; i<_nplane; i++) {
+            std::cout << "  " << i << ": " << *(planes + i) << " to " << *(planes + i + 1) \
+                      << " with " << *(midpoints + i) << std::endl;
+        } */
+        
+        BF_CHECK_CUDA_EXCEPTION(cudaMemcpyAsync(_planes, 
+                                                planes, 
+                                                (MAX_W_PLANES+1)*sizeof(float), 
+                                                cudaMemcpyHostToDevice,
+                                                _stream),
+                                BF_STATUS_MEM_OP_FAILED);
+        BF_CHECK_CUDA_EXCEPTION(cudaMemcpyAsync(_midpoints, 
+                                                midpoints, 
+                                                (MAX_W_PLANES+1)*sizeof(float), 
+                                                cudaMemcpyHostToDevice,
+                                                _stream),
+                                BF_STATUS_MEM_OP_FAILED);
+        
+        BF_CHECK_CUDA_EXCEPTION(cudaStreamSynchronize(_stream),
+                                BF_STATUS_DEVICE_ERROR );
+        
+        free(zsorted);
+        free(planes);
+        free(midpoints);
+    }
+    void get_midpoints(BFarray const* midpoints) {
+        BF_CHECK_CUDA_EXCEPTION(cudaMemcpyAsync(midpoints->data,
+                                                _midpoints,
+                                                _nplane*sizeof(float),
+                                                cudaMemcpyDeviceToHost,
+                                                _stream),
+                                BF_STATUS_MEM_OP_FAILED);
+        
+        BF_CHECK_CUDA_EXCEPTION( cudaStreamSynchronize(_stream),
+                                 BF_STATUS_DEVICE_ERROR );
+        
+        BF_CHECK_CUDA_EXCEPTION(cudaGetLastError(), BF_STATUS_INTERNAL_ERROR);
+    }
+    void set_weights(BFarray const* weights) {
+        BF_TRACE();
+        BF_TRACE_STREAM(_stream);
+        BF_ASSERT_EXCEPTION(weights->dtype == BF_DTYPE_CF32 \
+                                              || BF_DTYPE_CF64, BF_STATUS_UNSUPPORTED_DTYPE);
+        
+        if( _kernel == NULL ) {
+            _tkernel = weights->dtype;
+        }
+        _weight = (void*) weights->data;
+    }
+    void set_kernel(BFarray const* kernel) {
+        BF_TRACE();
+        BF_TRACE_STREAM(_stream);
+        BF_ASSERT_EXCEPTION(kernel->dtype == BF_DTYPE_CF32 \
+                                             || BF_DTYPE_CF64, BF_STATUS_UNSUPPORTED_DTYPE);
+        
+        if( _weight == NULL ) {
+            _tkernel = kernel->dtype;
+        }
+        _kernel = (void*) kernel->data;
+    }
+    void reset_state() {
+        BF_ASSERT_EXCEPTION(_planes != NULL,  BF_STATUS_INVALID_STATE);
+        
+        BF_CHECK_CUDA_EXCEPTION(cudaGetLastError(), BF_STATUS_INTERNAL_ERROR);
+        
+        // Reset the state
+        _nplane = 0;
+        BF_CHECK_CUDA_EXCEPTION( cudaMemsetAsync(_planes,
+                                                0,
+                                                sizeof(float)*(MAX_W_PLANES+1),
+                                                _stream),
+                                BF_STATUS_MEM_OP_FAILED );
+        BF_CHECK_CUDA_EXCEPTION( cudaMemsetAsync(_midpoints,
+                                                0,
+                                                sizeof(float)*(MAX_W_PLANES+1),
+                                                _stream),
+                                BF_STATUS_MEM_OP_FAILED );
+        
+        BF_CHECK_CUDA_EXCEPTION( cudaStreamSynchronize(_stream),
+                                 BF_STATUS_DEVICE_ERROR );
+        
+        BF_CHECK_CUDA_EXCEPTION(cudaGetLastError(), BF_STATUS_INTERNAL_ERROR);
+    }
+    void execute(BFarray const* in, BFarray const* out) {
+        BF_TRACE();
+        BF_TRACE_STREAM(_stream);
+        BF_ASSERT_EXCEPTION(_x != NULL,      BF_STATUS_INVALID_STATE);
+        BF_ASSERT_EXCEPTION(_y != NULL,      BF_STATUS_INVALID_STATE);
+        BF_ASSERT_EXCEPTION(_z != NULL,      BF_STATUS_INVALID_STATE);
+        BF_ASSERT_EXCEPTION(_weight != NULL, BF_STATUS_INVALID_STATE);
+        BF_ASSERT_EXCEPTION(_kernel != NULL, BF_STATUS_INVALID_STATE);
+        BF_ASSERT_EXCEPTION(out->dtype == BF_DTYPE_CF32 \
+                                          || BF_DTYPE_CF64, BF_STATUS_UNSUPPORTED_DTYPE);
+        
+        BF_CHECK_CUDA_EXCEPTION(cudaGetLastError(), BF_STATUS_INTERNAL_ERROR);
+        
+        int nbatch = in->shape[0];
+        
+#define LAUNCH_ORVILLE_KERNEL(IterType,OterType) \
+        launch_orville_kernel(_nbaseline, _npol, _polmajor, _maxsupport, _oversample, \
+                             _gridsize, _gridres, nbatch, _nplane, _planes, \
+                             _x, _y, _z, (OterType)_weight, (OterType)_kernel, \
+                             (IterType)in->data, (OterType)out->data, \
+                             _stream)
+        
+        switch( in->dtype ) {
+            case BF_DTYPE_CI4:
+                if( in->big_endian ) {
+                    switch( out->dtype ) {
+                        case BF_DTYPE_CF32: LAUNCH_ORVILLE_KERNEL(nibble2*, Complex32*);  break;
+                        case BF_DTYPE_CF64: LAUNCH_ORVILLE_KERNEL(nibble2*, Complex64*);  break;
+                        default: BF_ASSERT_EXCEPTION(false, BF_STATUS_UNSUPPORTED_DTYPE);
+                    };
+                } else {
+                    switch( out->dtype ) {
+                        case BF_DTYPE_CF32: LAUNCH_ORVILLE_KERNEL(blenib2*, Complex32*);  break;
+                        case BF_DTYPE_CF64: LAUNCH_ORVILLE_KERNEL(blenib2*, Complex64*);  break;
+                        default: BF_ASSERT_EXCEPTION(false, BF_STATUS_UNSUPPORTED_DTYPE);
+                    };
+                }
+                break;
+            case BF_DTYPE_CI8:
+                switch( out->dtype ) {
+                    case BF_DTYPE_CF32: LAUNCH_ORVILLE_KERNEL(char2*, Complex32*);  break;
+                    case BF_DTYPE_CF64: LAUNCH_ORVILLE_KERNEL(char2*, Complex64*);  break;
+                    default: BF_ASSERT_EXCEPTION(false, BF_STATUS_UNSUPPORTED_DTYPE);
+                };
+                break;
+            case BF_DTYPE_CI16:
+                switch( out->dtype ) {
+                    case BF_DTYPE_CF32: LAUNCH_ORVILLE_KERNEL(short2*, Complex32*); break;
+                    case BF_DTYPE_CF64: LAUNCH_ORVILLE_KERNEL(short2*, Complex64*); break;
+                    default: BF_ASSERT_EXCEPTION(false, BF_STATUS_UNSUPPORTED_DTYPE);
+                }
+                break;
+            case BF_DTYPE_CI32:
+                switch( out->dtype ) {
+                    case BF_DTYPE_CF32: LAUNCH_ORVILLE_KERNEL(int2*, Complex32*); break;
+                    case BF_DTYPE_CF64: LAUNCH_ORVILLE_KERNEL(int2*, Complex64*); break;
+                    default: BF_ASSERT_EXCEPTION(false, BF_STATUS_UNSUPPORTED_DTYPE);
+                }
+                break;
+            case BF_DTYPE_CI64:
+                switch( out->dtype ) {
+                    case BF_DTYPE_CF32: LAUNCH_ORVILLE_KERNEL(long2*, Complex32*); break;
+                    case BF_DTYPE_CF64: LAUNCH_ORVILLE_KERNEL(long2*, Complex64*); break;
+                    default: BF_ASSERT_EXCEPTION(false, BF_STATUS_UNSUPPORTED_DTYPE);
+                }
+                break;
+            case BF_DTYPE_CF32:
+                switch( out->dtype ) {
+                    case BF_DTYPE_CF32: LAUNCH_ORVILLE_KERNEL(float2*, Complex32*);   break;
+                    case BF_DTYPE_CF64: LAUNCH_ORVILLE_KERNEL(float2*, Complex64*);   break;
+                    default: BF_ASSERT_EXCEPTION(false, BF_STATUS_UNSUPPORTED_DTYPE);
+                }
+                break;
+            case BF_DTYPE_CF64:
+                switch( out->dtype ) {
+                    case BF_DTYPE_CF32: LAUNCH_ORVILLE_KERNEL(double2*, Complex32*);  break;
+                    case BF_DTYPE_CF64: LAUNCH_ORVILLE_KERNEL(double2*, Complex64*);  break;
+                    default: BF_ASSERT_EXCEPTION(false, BF_STATUS_UNSUPPORTED_DTYPE);
+                }
+                break;
+            default: BF_ASSERT_EXCEPTION(false, BF_STATUS_UNSUPPORTED_DTYPE);
+        }
+#undef LAUNCH_ORVILLE_KERNEL
+        
+        BF_CHECK_CUDA_EXCEPTION(cudaGetLastError(), BF_STATUS_INTERNAL_ERROR);
+    }
+    cudaStream_t get_stream() {
+        return _stream;
+    }
+    void set_stream(cudaStream_t stream) {
+        _stream = stream;
+    }
+};
+
+BFstatus bfOrvilleCreate(BForville* plan_ptr) {
+    BF_TRACE();
+    BF_ASSERT(plan_ptr, BF_STATUS_INVALID_POINTER);
+    BF_TRY_RETURN_ELSE(*plan_ptr = new BForville_impl(),
+                       *plan_ptr = 0);
+}
+
+BFstatus bfOrvilleInit(BForville      plan,
+                       BFarray const* positions,
+                       BFarray const* weights,
+                       BFarray const* kernel1D,
+                       BFsize         gridsize,
+                       double         gridres,
+                       double         gridwres,
+                       BFsize         oversample,
+                       BFbool         polmajor,
+                       BFspace        space,
+                       void*          plan_storage,
+                       BFsize*        plan_storage_size) {
+    BF_TRACE();
+    BF_ASSERT(plan, BF_STATUS_INVALID_HANDLE);
+    BF_ASSERT(positions,                                BF_STATUS_INVALID_POINTER);
+    BF_ASSERT(positions->ndim >= 3,                     BF_STATUS_INVALID_SHAPE);
+    BF_ASSERT(positions->shape[0] == 3, BF_STATUS_INVALID_SHAPE);
+    BF_ASSERT(space_accessible_from(positions->space, BF_SPACE_CUDA), BF_STATUS_INVALID_SPACE);
+    BF_ASSERT(weights,                                BF_STATUS_INVALID_POINTER);
+    BF_ASSERT(weights->ndim >= 3,                     BF_STATUS_INVALID_SHAPE);
+    BF_ASSERT(space_accessible_from(weights->space, BF_SPACE_CUDA), BF_STATUS_INVALID_SPACE);
+    BF_ASSERT(kernel1D,                              BF_STATUS_INVALID_POINTER);
+    BF_ASSERT(kernel1D->ndim == 1,                   BF_STATUS_INVALID_SHAPE);
+    BF_ASSERT(space_accessible_from(kernel1D->space, BF_SPACE_CUDA), BF_STATUS_INVALID_SPACE);
+    BF_ASSERT(space_accessible_from(space, BF_SPACE_CUDA),
+              BF_STATUS_UNSUPPORTED_SPACE);
+    
+    // Discover the dimensions of the positions/weights/kernel.
+    int npositions, nweights,nbaseline, npol, maxsupport;
+    npositions = positions->shape[1];
+    for(int i=2; i<positions->ndim-1; ++i) {
+        npositions *= positions->shape[i];
+    }
+    
+    nweights = weights->shape[0];
+    for(int i=1; i<weights->ndim-2; ++i) {
+        nweights *= weights->shape[i];
+    }
+    if( polmajor ) {
+         npol = weights->shape[weights->ndim-2];
+         nbaseline = weights->shape[weights->ndim-1];
+    } else {
+        nbaseline = weights->shape[weights->ndim-2];
+        npol = weights->shape[weights->ndim-1];
+    }
+    
+    std::cout << "npositions: " << npositions << std::endl;
+    std::cout << "nbaseline:  " << nbaseline << std::endl;
+    std::cout << "npol:       " << npol << std::endl;
+    std::cout << "nweights:   " << nweights << std::endl;
+    
+    maxsupport = kernel1D->shape[kernel1D->ndim-1] / oversample;
+    
+    // Validate
+    BF_ASSERT(npositions == nweights, BF_STATUS_INVALID_SHAPE);
+    BF_ASSERT(positions->shape[positions->ndim-1] == nbaseline, BF_STATUS_INVALID_SHAPE);
+    BF_ASSERT(maxsupport % 2 == 1, BF_STATUS_UNSUPPORTED_SHAPE);
+//     BF_ASSERT(oversample % 2 == 1, BF_STATUS_UNSUPPORTED_SHAPE);
+    
+    BF_TRY(plan->init(npositions, nbaseline, npol, polmajor, maxsupport, oversample, gridsize, gridres, gridwres));
+    BF_TRY(plan->init_plan_storage(plan_storage, plan_storage_size));
+    BF_TRY(plan->set_positions(positions));
+    BF_TRY(plan->set_weights(weights));
+    BF_TRY_RETURN(plan->set_kernel(kernel1D));
+}
+BFstatus bfOrvilleGetStream(BForville plan,
+                            void*     stream) {
+    BF_TRACE();
+    BF_ASSERT(plan, BF_STATUS_INVALID_HANDLE);
+    BF_ASSERT(stream, BF_STATUS_INVALID_POINTER);
+    BF_TRY_RETURN(*(cudaStream_t*)stream = plan->get_stream());
+}
+BFstatus bfOrvilleSetStream(BForville    plan,
+                            void const* stream) {
+    BF_TRACE();
+    BF_ASSERT(plan, BF_STATUS_INVALID_HANDLE);
+    BF_ASSERT(stream, BF_STATUS_INVALID_POINTER);
+    BF_TRY_RETURN(plan->set_stream(*(cudaStream_t*)stream));
+}
+BFstatus bfOrvilleSetPositions(BForville      plan,
+                               BFarray const* positions) {
+    BF_ASSERT(plan, BF_STATUS_INVALID_HANDLE);
+    BF_ASSERT(positions,            BF_STATUS_INVALID_POINTER);
+    BF_ASSERT(positions->ndim >= 3, BF_STATUS_INVALID_SHAPE  );
+    BF_ASSERT(positions->shape[positions->ndim-2] == plan->nbaseline(), BF_STATUS_INVALID_SHAPE  );
+    BF_ASSERT(positions->shape[0] == 3,                 BF_STATUS_INVALID_SHAPE  );
+    BF_ASSERT(space_accessible_from(positions->space,   BF_SPACE_CUDA), BF_STATUS_INVALID_SPACE);
+    
+    BF_TRY_RETURN(plan->set_positions(positions));
+}
+BFstatus bfOrvilleGetProjectionSetup(BForville plan,
+                                     int* ntimechan,
+                                     int* gridsize,
+                                     int* npol,
+                                     int* nplane,
+                                     BFarray const* midpoints) {
+    BF_ASSERT(plan, BF_STATUS_INVALID_HANDLE);
+    BF_ASSERT(midpoints,            BF_STATUS_INVALID_POINTER);
+    BF_ASSERT(midpoints->ndim == 1, BF_STATUS_INVALID_SHAPE  );
+    BF_ASSERT(midpoints->shape[0] >= plan->nplane(), BF_STATUS_INVALID_SHAPE);
+    
+    *ntimechan = plan->ntimechan();
+    *gridsize = plan->gridsize();
+    *npol = plan->npol();
+    *nplane = plan->nplane();
+    
+    BF_TRY_RETURN(plan->get_midpoints(midpoints));
+}
+BFstatus bfOrvilleSetWeights(BForville      plan,
+                             BFarray const* weights) {
+    BF_ASSERT(plan, BF_STATUS_INVALID_HANDLE);
+    BF_ASSERT(weights,            BF_STATUS_INVALID_POINTER);
+    BF_ASSERT(weights->ndim >= 3, BF_STATUS_INVALID_SHAPE  );
+    if( plan->polmajor() ) {
+        BF_ASSERT(weights->shape[weights->ndim-2] == plan->npol(),      BF_STATUS_INVALID_SHAPE  );
+        BF_ASSERT(weights->shape[weights->ndim-1] == plan->nbaseline(), BF_STATUS_INVALID_SHAPE  );
+    } else {
+        BF_ASSERT(weights->shape[weights->ndim-2] == plan->nbaseline(), BF_STATUS_INVALID_SHAPE  );
+        BF_ASSERT(weights->shape[weights->ndim-1] == plan->npol(),      BF_STATUS_INVALID_SHAPE  );
+    }
+    BF_ASSERT(space_accessible_from(weights->space,   BF_SPACE_CUDA), BF_STATUS_INVALID_SPACE);
+    
+    if( plan->tkernel() != BF_DTYPE_INT_TYPE) {
+        BF_ASSERT(weights->dtype == plan->tkernel(),    BF_STATUS_UNSUPPORTED_DTYPE);
+    }
+    
+    BF_TRY_RETURN(plan->set_weights(weights));
+}
+BFstatus bfOrvilleSetKernel(BForville      plan, 
+                            BFarray const* kernel1D) {
+    BF_ASSERT(plan, BF_STATUS_INVALID_HANDLE);
+    BF_ASSERT(kernel1D,            BF_STATUS_INVALID_POINTER);
+    BF_ASSERT(kernel1D->ndim == 1, BF_STATUS_INVALID_SHAPE  );
+    BF_ASSERT(kernel1D->shape[kernel1D->ndim-1] == plan->maxsupport()*plan->oversample(), \
+              BF_STATUS_INVALID_SHAPE  );
+    BF_ASSERT(space_accessible_from(kernel1D->space, BF_SPACE_CUDA), BF_STATUS_INVALID_SPACE);
+    
+    if( plan->tkernel() != BF_DTYPE_INT_TYPE) {
+        BF_ASSERT(kernel1D->dtype == plan->tkernel(),    BF_STATUS_UNSUPPORTED_DTYPE);
+    }
+    
+    BF_TRY_RETURN(plan->set_kernel(kernel1D));
+}
+BFstatus bfOrvilleExecute(BForville      plan,
+                          BFarray const* in,
+                          BFarray const* out) {
+    BF_TRACE();
+    BF_ASSERT(plan, BF_STATUS_INVALID_HANDLE);
+    BF_ASSERT(in,   BF_STATUS_INVALID_POINTER);
+    BF_ASSERT(out,  BF_STATUS_INVALID_POINTER);
+    BF_ASSERT( in->ndim >= 3,          BF_STATUS_INVALID_SHAPE);
+    BF_ASSERT(out->ndim == in->ndim+2, BF_STATUS_INVALID_SHAPE);
+    
+    BFarray in_flattened;
+    if( in->ndim > 3 ) {
+        // Keep the last two dim but attempt to flatten all others
+        unsigned long keep_dims_mask = padded_dims_mask(in);
+        keep_dims_mask |= 0x1 << (in->ndim-1);
+        keep_dims_mask |= 0x1 << (in->ndim-2);
+        keep_dims_mask |= 0x1 << (in->ndim-3);
+        flatten(in,   &in_flattened, keep_dims_mask);
+        in  =  &in_flattened;
+        BF_ASSERT(in_flattened.ndim == 3, BF_STATUS_UNSUPPORTED_SHAPE);
+    }
+    /*
+    std::cout << "ndim = " << in->ndim << std::endl;
+    std::cout << "   0 = " << in->shape[0] << std::endl;
+    std::cout << "   1 = " << in->shape[1] << std::endl;
+    std::cout << "   2 = " << in->shape[2] << std::endl;
+    */
+    BF_ASSERT( in->shape[0] == plan->nxyz(),     BF_STATUS_INVALID_SHAPE);
+    if( plan->polmajor() ) {
+        BF_ASSERT( in->shape[1] == plan->npol(),      BF_STATUS_INVALID_SHAPE);
+        BF_ASSERT( in->shape[2] == plan->nbaseline(), BF_STATUS_INVALID_SHAPE);
+    } else {
+        BF_ASSERT( in->shape[1] == plan->nbaseline(), BF_STATUS_INVALID_SHAPE);
+        BF_ASSERT( in->shape[2] == plan->npol(),      BF_STATUS_INVALID_SHAPE);
+    }
+    
+    BFarray out_flattened;
+    if( out->ndim > 4 ) {
+        // Keep the last three dim but attempt to flatten all others
+        unsigned long keep_dims_mask = padded_dims_mask(out);
+        keep_dims_mask |= 0x1;
+        keep_dims_mask |= 0x1 << (out->ndim-1);
+        keep_dims_mask |= 0x1 << (out->ndim-2);
+        keep_dims_mask |= 0x1 << (out->ndim-3);
+        keep_dims_mask |= 0x1 << (out->ndim-4);
+        flatten(out,   &out_flattened, keep_dims_mask);
+        out  =  &out_flattened;
+        BF_ASSERT(out_flattened.ndim == 5, BF_STATUS_UNSUPPORTED_SHAPE);
+    }
+    /*
+    std::cout << "ndim = " << out->ndim << std::endl;
+    std::cout << "   0 = " << out->shape[0] << std::endl;
+    std::cout << "   1 = " << out->shape[1] << std::endl;
+    std::cout << "   2 = " << out->shape[2] << std::endl;
+    std::cout << "   3 = " << out->shape[3] << std::endl;
+    */
+    BF_ASSERT(out->shape[0] == plan->nplane(),   BF_STATUS_INVALID_SHAPE);
+    BF_ASSERT(out->shape[1] == plan->nxyz(),     BF_STATUS_INVALID_SHAPE);
+    BF_ASSERT(out->shape[2] == plan->npol(),     BF_STATUS_INVALID_SHAPE);
+    BF_ASSERT(out->shape[3] == plan->gridsize(), BF_STATUS_INVALID_SHAPE);
+    BF_ASSERT(out->shape[4] == plan->gridsize(), BF_STATUS_INVALID_SHAPE);
+    
+    BF_ASSERT(out->dtype == plan->tkernel(),    BF_STATUS_UNSUPPORTED_DTYPE);
+    
+    BF_ASSERT(space_accessible_from( in->space, BF_SPACE_CUDA), BF_STATUS_INVALID_SPACE);
+    BF_ASSERT(space_accessible_from(out->space, BF_SPACE_CUDA), BF_STATUS_INVALID_SPACE);
+    BF_TRY_RETURN(plan->execute(in, out));
+}
+
+BFstatus bfOrvilleDestroy(BForville plan) {
+    BF_TRACE();
+    BF_ASSERT(plan, BF_STATUS_INVALID_HANDLE);
+    delete plan;
+    return BF_STATUS_SUCCESS;
+}
diff --git a/src/romein_kernels.cuh b/src/romein_kernels.cuh
new file mode 100644
index 000000000..c89b09b72
--- /dev/null
+++ b/src/romein_kernels.cuh
@@ -0,0 +1,42 @@
+//CUDA Includes
+#include <cuComplex.h>
+#include "cuda.h"
+#include "cuda_runtime_api.h"
+#include "math.h"
+#include <thrust/random.h>
+
+//Romein Include
+#include "bifrost/romein.h"    
+
+/*****************************
+      Device Functions
+******************************/
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
+
+
+#else //Pre-pascal devices.
+
+__device__ inline double atomicAdd(double* address, double val)
+{
+    unsigned long long int* address_as_ull = (unsigned long long int*)address;
+    unsigned long long int old = *address_as_ull, assumed;
+    do {
+        assumed = old;
+        old = atomicCAS(address_as_ull, assumed,
+                __double_as_longlong(val + __longlong_as_double(assumed)));
+    } while (assumed != old);
+    return __longlong_as_double(old);
+}
+
+#endif
+
+__global__ void scatter_grid_kernel(cuComplex* fdata,
+				    cuComplex* uvgrid, //Our UV-Grid
+				    cuComplex* illum, // Illumination Pattern
+				    int* x,
+				    int* y,
+				    int* z,
+				    int max_support, //  Convolution size
+				    int grid_size,
+				    int data_size);
\ No newline at end of file

From a9378469c46426ac5fbf7a2c19080d3413cd74ef Mon Sep 17 00:00:00 2001
From: jaycedowell <jayce.dowell@gmail.com>
Date: Mon, 8 Jul 2019 11:23:23 -0600
Subject: [PATCH 02/11] Updated the Makefile to build orville.cu.

---
 src/Makefile | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index 793b613f9..6885bf86e 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -15,6 +15,8 @@ LIBBIFROST_OBJS = \
   address.o \
   udp_socket.o \
   udp_capture.o \
+  packet_writer.o \
+  disk_writer.o \
   udp_transmit.o \
   unpack.o \
   quantize.o \
@@ -33,7 +35,8 @@ ifndef NOCUDA
   reduce.o \
   fir.o \
   guantize.o \
-  gunpack.o
+  gunpack.o \
+  orville.o
 endif
 
 JIT_SOURCES ?= \

From 7aaaa76c9bd14c1170fb692171527abe1be7018c Mon Sep 17 00:00:00 2001
From: jaycedowell <jayce.dowell@gmail.com>
Date: Tue, 9 Jul 2019 16:09:12 -0600
Subject: [PATCH 03/11] Removed a few debugging statements from orville.py.

---
 python/bifrost/orville.py | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/python/bifrost/orville.py b/python/bifrost/orville.py
index 0bba31dfe..84fd79c7a 100644
--- a/python/bifrost/orville.py
+++ b/python/bifrost/orville.py
@@ -195,8 +195,6 @@ def execute(self, idata, odata):
         _check( _bf.bfOrvilleExecute(self.obj,
                                      asarray(idata).as_BFarray(),
                                      asarray(self._subgrid).as_BFarray()) )
-        junk = self._subgrid.copy(space='system')
-        print junk.max()
         
         if self._subgrid.shape[0] > 1:
             # W project
@@ -219,8 +217,6 @@ def execute(self, idata, odata):
             self._subgrid = self._subgrid.reshape(-1,self._gridsize,self._gridsize)
             self._fft.execute(self._subgrid, self._subgrid, inverse=False)
             self._subgrid = self._subgrid.reshape(self._nplane,self._ntimechan,self._npol,self._gridsize,self._gridsize)
-            junk = self._subgrid.copy(space='system')
-            print 'post-W', junk.max()
             
             # Stack
             reduce(self._subgrid, self._stcgrid, op='sum')
@@ -237,16 +233,12 @@ def execute(self, idata, odata):
             self._fft = Fft()
             self._fft.init(self._stcgrid, self._stcgrid, axes=(1,2))
             self._fft.execute(self._stcgrid, self._stcgrid, inverse=True)
-        junk = self._stcgrid.copy(space='system')
-        print 'post-IFFT', junk.max()
-        
+            
         ## Correct for the kernel
         map('img(c,i,j) /= corr(i)*corr(j)',
             {'img':self._stcgrid, 'corr':self._img_correction},
             axis_names=('c','i','j'), shape=self._stcgrid.shape)
-        junk = self._stcgrid.copy(space='system')
-        print 'post-corr', junk.max()
-        
+            
         # Shift and accumulate onto odata
         oshape = odata.shape
         odata = odata.reshape(-1,odata.shape[-2],odata.shape[-1])

From eec391a1a3f679b6546d74788d2bcf61735a9516 Mon Sep 17 00:00:00 2001
From: jaycedowell <jayce.dowell@gmail.com>
Date: Fri, 22 Nov 2019 10:30:32 -0700
Subject: [PATCH 04/11] Fixed a bug in orville.py that caused two different FFT
 plans to be treated as the same.

---
 python/bifrost/orville.py | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/python/bifrost/orville.py b/python/bifrost/orville.py
index 84fd79c7a..bf0e8e1c0 100644
--- a/python/bifrost/orville.py
+++ b/python/bifrost/orville.py
@@ -77,7 +77,8 @@ def __del__(self):
             del self._subgrid
             del self._stcgrid
             del self._w_kernels
-            del self._fft
+            del self._fft_im
+            del self._fft_wp
         except AttributeError:
             pass
         
@@ -127,10 +128,6 @@ def _set_image_correction_kernel(self, force=False):
         corr = numpy.fft.ifft(corr).real
         corr = numpy.fft.fftshift(corr)
         corr = corr[corr.size/2-self._gridsize/2-self._gridsize%2:corr.size/2+self._gridsize/2] * self._oversample
-        #import pylab
-        #pylab.plot(corr)
-        #pylab.plot(1/corr)
-        #pylab.show()
         corr = numpy.fft.fftshift(corr)
         self._img_correction = ndarray(corr, dtype='f32', space='cuda')
         
@@ -201,11 +198,11 @@ def execute(self, idata, odata):
             ## FFT
             self._subgrid = self._subgrid.reshape(-1,self._gridsize,self._gridsize)
             try:
-                self._fft.execute(self._subgrid, self._subgrid, inverse=True)
+                self._fft_wp.execute(self._subgrid, self._subgrid, inverse=True)
             except AttributeError:
-                self._fft = Fft()
-                self._fft.init(self._subgrid, self._subgrid, axes=(1,2))
-                self._fft.execute(self._subgrid, self._subgrid, inverse=True)
+                self._fft_wp = Fft()
+                self._fft_wp.init(self._subgrid, self._subgrid, axes=(1,2))
+                self._fft_wp.execute(self._subgrid, self._subgrid, inverse=True)
             self._subgrid = self._subgrid.reshape(self._nplane,self._ntimechan,self._npol,self._gridsize,self._gridsize)
                 
             ## Project
@@ -215,7 +212,7 @@ def execute(self, idata, odata):
             
             ## IFFT
             self._subgrid = self._subgrid.reshape(-1,self._gridsize,self._gridsize)
-            self._fft.execute(self._subgrid, self._subgrid, inverse=False)
+            self._fft_wp.execute(self._subgrid, self._subgrid, inverse=False)
             self._subgrid = self._subgrid.reshape(self._nplane,self._ntimechan,self._npol,self._gridsize,self._gridsize)
             
             # Stack
@@ -228,11 +225,11 @@ def execute(self, idata, odata):
         ## IFFT
         self._stcgrid = self._stcgrid.reshape(-1,self._gridsize,self._gridsize)
         try:
-            self._fft.execute(self._stcgrid, self._stcgrid, inverse=True)
+            self._fft_im.execute(self._stcgrid, self._stcgrid, inverse=True)
         except AttributeError:
-            self._fft = Fft()
-            self._fft.init(self._stcgrid, self._stcgrid, axes=(1,2))
-            self._fft.execute(self._stcgrid, self._stcgrid, inverse=True)
+            self._fft_im = Fft()
+            self._fft_im.init(self._stcgrid, self._stcgrid, axes=(1,2))
+            self._fft_im.execute(self._stcgrid, self._stcgrid, inverse=True)
             
         ## Correct for the kernel
         map('img(c,i,j) /= corr(i)*corr(j)',

From 95afddc3c02e67640ffb661eb6afde8863652a4d Mon Sep 17 00:00:00 2001
From: jaycedowell <jayce.dowell@gmail.com>
Date: Wed, 27 Nov 2019 09:34:49 -0700
Subject: [PATCH 05/11] Added a nearest neighbor kernel sampler as an
 alternative to the interpolation.  Cleaned up some of the orville.cu macro
 names.

---
 src/orville.cu | 48 +++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 37 insertions(+), 11 deletions(-)

diff --git a/src/orville.cu b/src/orville.cu
index c4abbae79..f606995db 100644
--- a/src/orville.cu
+++ b/src/orville.cu
@@ -46,7 +46,11 @@ Implements the Romein convolutional algorithm onto a GPU using CUDA.
 
 #include <thrust/device_vector.h>
 
-#define MAX_W_PLANES  128
+// Maximum number of w planes to allow
+#define ORVILLE_MAX_W_PLANES  128
+
+// Use bilinear interpolation for the gridding kernel rather than nearest neighbor
+#define ORVILLE_USE_KERNEL_INTERP
 
 struct __attribute__((aligned(1))) nibble2 {
     // Yikes!  This is dicey since the packing order is implementation dependent!  
@@ -119,6 +123,20 @@ inline OutType WeightAndInterp2D(OutType             w,
 }
 
 
+template<typename OutType, typename IndexType>
+__device__
+inline OutType WeightAndNearestNeighbor2D(OutType             w,
+                                          cudaTextureObject_t kernel_tex, 
+                                          IndexType           x, 
+                                          IndexType           y, 
+                                          int                 size) {
+    OutType kxnn = fetch_cf<OutType>(kernel_tex, (int) round(x));
+    OutType kynn = fetch_cf<OutType>(kernel_tex, (int) round(y));
+    
+    return w*knn*knn;
+}
+
+
 template<typename InType, typename OutType>
 __global__ void orville_kernel_spln(int                         nbaseline,
                                     int                         npol,
@@ -241,7 +259,11 @@ __global__ void orville_kernel_spln(int                         nbaseline,
             }
             
             //TODO: Re-do the w-kernel/gcf for our data.
+#ifdef ORVILLE_USE_KERNEL_INTERP
             OutType px = WeightAndInterp2D(weight[vi+vi_s], kernel_tex, myConvVO, myConvUO, maxsupport * oversample);
+#else
+            OutType px = WeightAndNearestNeighbor2D(weight[vi+vi_s], kernel_tex, myConvVO, myConvUO, maxsupport * oversample);
+#endif
             // Sum up
             InType temp = d_in[vi+vi_s];
             OutType vi_v = OutType(temp.x, temp.y);
@@ -371,7 +393,11 @@ __global__ void orville_kernel_sloc(int                         nbaseline,
             }
             
             //TODO: Re-do the w-kernel/gcf for our data.
+#ifdef ORVILLE_USE_KERNEL_INTERP
             OutType px = WeightAndInterp2D(weight[vi+vi_s], kernel_tex, myConvVO, myConvUO, maxsupport * oversample);
+#else
+            OutType px = WeightAndNearestNeighbor2D(weight[vi+vi_s], kernel_tex, myConvVO, myConvUO, maxsupport * oversample);
+#endif
             // Sum up
             InType temp = d_in[vi+vi_s];
             OutType vi_v = OutType(temp.x, temp.y);
@@ -603,9 +629,9 @@ public:
             ALIGNMENT_ELMTS = ALIGNMENT_BYTES / sizeof(float)
         };
         Workspace workspace(ALIGNMENT_BYTES);
-        _plan_stride = round_up(MAX_W_PLANES+1, ALIGNMENT_ELMTS);
-        workspace.reserve(MAX_W_PLANES+1, &_planes);
-        workspace.reserve(MAX_W_PLANES+1, &_midpoints);
+        _plan_stride = round_up(ORVILLE_MAX_W_PLANES+1, ALIGNMENT_ELMTS);
+        workspace.reserve(ORVILLE_MAX_W_PLANES+1, &_planes);
+        workspace.reserve(ORVILLE_MAX_W_PLANES+1, &_midpoints);
         if( storage_size ) {
             if( !storage_ptr ) {
                 // Return required storage size
@@ -666,8 +692,8 @@ public:
         int count = 1;
         float* planes;
         float* midpoints;
-        planes = (float*) malloc((MAX_W_PLANES+1)*sizeof(float));
-        midpoints = (float*) malloc((MAX_W_PLANES+1)*sizeof(float));
+        planes = (float*) malloc((ORVILLE_MAX_W_PLANES+1)*sizeof(float));
+        midpoints = (float*) malloc((ORVILLE_MAX_W_PLANES+1)*sizeof(float));
         *(planes + 0) = *(zsorted + 0);
         *(midpoints + 0) = *(zsorted + 0);
         for(int i=1; i<npositions*nbl; i++) {
@@ -676,7 +702,7 @@ public:
                 count = 0;
                 
                 _nplane++;
-                BF_ASSERT_EXCEPTION(_nplane < MAX_W_PLANES, BF_STATUS_INTERNAL_ERROR);
+                BF_ASSERT_EXCEPTION(_nplane < ORVILLE_MAX_W_PLANES, BF_STATUS_INTERNAL_ERROR);
                 *(planes + _nplane - 1) = *(zsorted + i);
                 *(midpoints + _nplane - 1) = *(zsorted + i);
                 count++;
@@ -698,13 +724,13 @@ public:
         
         BF_CHECK_CUDA_EXCEPTION(cudaMemcpyAsync(_planes, 
                                                 planes, 
-                                                (MAX_W_PLANES+1)*sizeof(float), 
+                                                (ORVILLE_MAX_W_PLANES+1)*sizeof(float), 
                                                 cudaMemcpyHostToDevice,
                                                 _stream),
                                 BF_STATUS_MEM_OP_FAILED);
         BF_CHECK_CUDA_EXCEPTION(cudaMemcpyAsync(_midpoints, 
                                                 midpoints, 
-                                                (MAX_W_PLANES+1)*sizeof(float), 
+                                                (ORVILLE_MAX_W_PLANES+1)*sizeof(float), 
                                                 cudaMemcpyHostToDevice,
                                                 _stream),
                                 BF_STATUS_MEM_OP_FAILED);
@@ -760,12 +786,12 @@ public:
         _nplane = 0;
         BF_CHECK_CUDA_EXCEPTION( cudaMemsetAsync(_planes,
                                                 0,
-                                                sizeof(float)*(MAX_W_PLANES+1),
+                                                sizeof(float)*(ORVILLE_MAX_W_PLANES+1),
                                                 _stream),
                                 BF_STATUS_MEM_OP_FAILED );
         BF_CHECK_CUDA_EXCEPTION( cudaMemsetAsync(_midpoints,
                                                 0,
-                                                sizeof(float)*(MAX_W_PLANES+1),
+                                                sizeof(float)*(ORVILLE_MAX_W_PLANES+1),
                                                 _stream),
                                 BF_STATUS_MEM_OP_FAILED );
         

From 43d937a6f436377088ca58e317d5e4338011ec49 Mon Sep 17 00:00:00 2001
From: jaycedowell <jayce.dowell@gmail.com>
Date: Wed, 27 Nov 2019 09:45:22 -0700
Subject: [PATCH 06/11] Fixed a typo.

---
 src/orville.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/orville.cu b/src/orville.cu
index f606995db..47388b586 100644
--- a/src/orville.cu
+++ b/src/orville.cu
@@ -133,7 +133,7 @@ inline OutType WeightAndNearestNeighbor2D(OutType             w,
     OutType kxnn = fetch_cf<OutType>(kernel_tex, (int) round(x));
     OutType kynn = fetch_cf<OutType>(kernel_tex, (int) round(y));
     
-    return w*knn*knn;
+    return w*kxnn*kynn;
 }
 
 

From af65bc3ba86578cfa1a6c7975c09468ec13c89ee Mon Sep 17 00:00:00 2001
From: jaycedowell <jayce.dowell@gmail.com>
Date: Wed, 27 Nov 2019 12:43:51 -0700
Subject: [PATCH 07/11] Merged the kernel correction and shift/accumulate map
 calls into one.

---
 python/bifrost/orville.py | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/python/bifrost/orville.py b/python/bifrost/orville.py
index bf0e8e1c0..2c525f87c 100644
--- a/python/bifrost/orville.py
+++ b/python/bifrost/orville.py
@@ -231,12 +231,28 @@ def execute(self, idata, odata):
             self._fft_im.init(self._stcgrid, self._stcgrid, axes=(1,2))
             self._fft_im.execute(self._stcgrid, self._stcgrid, inverse=True)
             
-        ## Correct for the kernel
-        map('img(c,i,j) /= corr(i)*corr(j)',
-            {'img':self._stcgrid, 'corr':self._img_correction},
-            axis_names=('c','i','j'), shape=self._stcgrid.shape)
-            
-        # Shift and accumulate onto odata
+        ### Correct for the kernel
+        #map('img(c,i,j) /= corr(i)*corr(j)',
+        #    {'img':self._stcgrid, 'corr':self._img_correction},
+        #    axis_names=('c','i','j'), shape=self._stcgrid.shape)
+        #    
+        ## Shift and accumulate onto odata
+        #oshape = odata.shape
+        #odata = odata.reshape(-1,odata.shape[-2],odata.shape[-1])
+        #padding = self._gridsize - self._origsize
+        #offset = padding/2# - padding%2
+        #map("""
+        #    auto k = i + {offset} - {gridsize}/2;
+        #    auto l = j + {offset} - {gridsize}/2;
+        #    if( k < 0 ) k += {gridsize};
+        #    if( l < 0 ) l += {gridsize};
+        #    odata(c,i,j) += idata(c,k,l).real;
+        #    """.format(gridsize=self._gridsize, offset=offset), 
+        #    {'odata':odata, 'idata':self._stcgrid}, 
+        #    axis_names=('c','i','j'), shape=odata.shape)
+        #odata = odata.reshape(oshape)
+        
+        # Correct for the kernel, shift, and accumulate onto odata
         oshape = odata.shape
         odata = odata.reshape(-1,odata.shape[-2],odata.shape[-1])
         padding = self._gridsize - self._origsize
@@ -246,9 +262,9 @@ def execute(self, idata, odata):
             auto l = j + {offset} - {gridsize}/2;
             if( k < 0 ) k += {gridsize};
             if( l < 0 ) l += {gridsize};
-            odata(c,i,j) += idata(c,k,l).real;
+            odata(c,i,j) += idata(c,k,l).real / (corr(k) * corr(l));
             """.format(gridsize=self._gridsize, offset=offset), 
-            {'odata':odata, 'idata':self._stcgrid}, 
+            {'odata':odata, 'idata':self._stcgrid, 'corr':self._img_correction}, 
             axis_names=('c','i','j'), shape=odata.shape)
         odata = odata.reshape(oshape)
         

From a4a47ba7c445ddb3268e8d516ab72f3975930267 Mon Sep 17 00:00:00 2001
From: jaycedowell <jayce.dowell@gmail.com>
Date: Wed, 2 Jun 2021 16:23:52 -0600
Subject: [PATCH 08/11] Python3 changes.

---
 python/bifrost/orville.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/python/bifrost/orville.py b/python/bifrost/orville.py
index 2c525f87c..5b2fcba0c 100644
--- a/python/bifrost/orville.py
+++ b/python/bifrost/orville.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018, The Bifrost Authors. All rights reserved.
+# Copyright (c) 2018-2021, The Bifrost Authors. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -24,8 +24,11 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-from libbifrost import _bf, _check, _get, BifrostObject, _string2space
-from ndarray import ndarray, zeros, asarray, memset_array, copy_array
+# Python2 compatibility
+from __future__ import absolute_import, print_function, division
+
+from bifrost.libbifrost import _bf, _check, _get, BifrostObject, _string2space
+from bifrost.ndarray import ndarray, zeros, asarray, memset_array, copy_array
 
 from bifrost.fft import Fft
 from bifrost.map import map
@@ -106,7 +109,7 @@ def _get_gridding_setup(self, force=False):
             self._npol = npol
             self._nplane = nplane
             self._midpoints = midpoints
-            print "Working with %i planes" % self._nplane
+            print("Working with %i planes" % self._nplane)
             
     def _set_image_correction_kernel(self, force=False):
         self._get_gridding_setup(force=force)
@@ -119,21 +122,21 @@ def _set_image_correction_kernel(self, force=False):
             
         # Get the new image correction kernel
         corr = numpy.zeros(self._gridsize*self._oversample, dtype=self._dtype)
-        c = corr.size/2
-        d = self._kernel.size/2
+        c = corr.size//2
+        d = self._kernel.size//2
         corr[c-d:c+d] = self._kernel.copy(space='system')
         corr[:c-d] = numpy.arange(c-d) * corr[c-d]/(c-d)
         corr[c+d:] = corr[c-d] - numpy.arange(c-d) * corr[c-d]/(c-d)
         corr = numpy.fft.ifftshift(corr)
         corr = numpy.fft.ifft(corr).real
         corr = numpy.fft.fftshift(corr)
-        corr = corr[corr.size/2-self._gridsize/2-self._gridsize%2:corr.size/2+self._gridsize/2] * self._oversample
+        corr = corr[corr.size//2-self._gridsize//2-self._gridsize%2:corr.size//2+self._gridsize//2] * self._oversample
         corr = numpy.fft.fftshift(corr)
         self._img_correction = ndarray(corr, dtype='f32', space='cuda')
         
     def _get_lm(self, gridsize, gridres):
         m,l = numpy.indices((gridsize,gridsize))
-        l,m = numpy.where(l > gridsize/2, gridsize-l, -l), numpy.where(m > gridsize/2, m-gridsize, m)
+        l,m = numpy.where(l > gridsize//2, gridsize-l, -l), numpy.where(m > gridsize//2, m-gridsize, m)
         l,m = l.astype(numpy.float32)/gridsize/gridres, m.astype(numpy.float32)/gridsize/gridres
         return l,m
         

From e28e88275910c6643e991375707fdfaeb6c6eda1 Mon Sep 17 00:00:00 2001
From: jaycedowell <jayce.dowell@gmail.com>
Date: Wed, 8 Jan 2025 09:23:21 -0700
Subject: [PATCH 09/11] Add support for uniform and Briggs weighting.

---
 python/bifrost/orville.py | 71 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 66 insertions(+), 5 deletions(-)

diff --git a/python/bifrost/orville.py b/python/bifrost/orville.py
index 5b2fcba0c..f1edd932c 100644
--- a/python/bifrost/orville.py
+++ b/python/bifrost/orville.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2021, The Bifrost Authors. All rights reserved.
+# Copyright (c) 2018-2025, The Bifrost Authors. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -28,7 +28,7 @@
 from __future__ import absolute_import, print_function, division
 
 from bifrost.libbifrost import _bf, _check, _get, BifrostObject, _string2space
-from bifrost.ndarray import ndarray, zeros, asarray, memset_array, copy_array
+from bifrost.ndarray import ndarray, zeros, empty_like, asarray, memset_array, copy_array
 
 from bifrost.fft import Fft
 from bifrost.map import map
@@ -41,7 +41,7 @@
 
 from bifrost.device import stream_synchronize as BFSync
 
-class Orville(BifrostObject):
+class Orville2(BifrostObject):
     def __init__(self):
         BifrostObject.__init__(self, _bf.bfOrvilleCreate, _bf.bfOrvilleDestroy)
     def init(self, positions, weights, kernel, gridsize, gridres, gridwres=0.5, oversample=1, polmajor=True, space='cuda'):
@@ -70,6 +70,7 @@ def init(self, positions, weights, kernel, gridsize, gridres, gridwres=0.5, over
         self._oversample = oversample
         self._dtype = kernel.dtype
         self._space = space
+        self._update_beam = True
         
         # Build the image correction kernel and the gridding kernels
         self._set_image_correction_kernel()
@@ -173,20 +174,45 @@ def set_positions(self, positions):
         _check( _bf.bfOrvilleSetPositions(self.obj, 
                                           asarray(positions).as_BFarray()) )
         self._set_projection_kernels(force=True)
+        self._update_beam = True
         
     def set_weights(self, weights):
         weights = numpy.fft.fftshift(weights)
         _check( _bf.bfOrvilleSetWeights(self.obj, 
                                        asarray(weights).as_BFarray()) )
+        self._update_beam = True
         
     def set_kernel(self, kernel):
         self._kernel = kernel
         _check( _bf.bfOrvilleSetKernel(self.obj, 
                                        asarray(kernel).as_BFarray()) )
         self._set_image_correction_kernel()
+        self._update_beam = True
         
-    def execute(self, idata, odata):
+    def execute(self, idata, odata, weighting='natural'):
         # TODO: Work out how to integrate CUDA stream
+        
+        if self._update_beam:
+            bidata = empty_like(idata)
+            map("bdata = Complex<float>(1,0)", {'bdata': bidata}, shape=bidata.shape)
+            self._raw_beam = zeros(shape=(1,self._ntimechan,self._npol,self._gridsize,self._gridsize),
+                                   dtype=self._stcgrid.dtype, space='cuda')
+            
+            self._update_beam = False
+            try:
+                self._raw_beam = self.execute(bidata, self._raw_beam, weighting='_raw_beam')
+            except Exception as e:
+                self._update_beam = True
+                raise e
+                
+            rb = self._raw_beam.reshape(1,self._ntimechan,self._npol,self._gridsize*self._gridsize)
+            self._wavg = zeros(shape=(1,self._ntimechan,self._npol,1),
+                               dtype='cf32', space='cuda')
+            reduce(rb, self._wavg, op='sum')
+            self._w2avg = zeros(shape=(1,self._ntimechan,self._npol,1),
+                               dtype='f32', space='cuda')
+            reduce(rb, self._w2avg, op='pwrsum')
+            
         # Zero out the subgrids
         memset_array(self._subgrid, 0)
         self._stcgrid = self._stcgrid.reshape(1,self._ntimechan,self._npol,self._gridsize,self._gridsize)
@@ -224,6 +250,41 @@ def execute(self, idata, odata):
         else:
             copy_array(self._stcgrid, self._subgrid)
             
+        # Apply the weighting
+        if weighting == '_raw_beam':
+            copy_array(odata, self._stcgrid)
+            return odata
+            
+        elif weighting == 'natural':
+            pass
+            
+        elif weighting == 'uniform':
+           map("""
+                auto weight = bdata(0,i,j,k,l).real;
+                if( weight != 0.0 ) {
+                  idata(0,i,j,k,l) = idata(0,i,j,k,l) / weight;
+                }
+                """,
+                {'idata': self._stcgrid, 'bdata': self._raw_beam},
+                axis_names=('i','j','k','l'), shape=self._stcgrid.shape[1:]
+               )
+                
+        elif weighting.startswith('briggs'):
+            R = float(weighting.split('@', 1)[1])
+            map("""
+                auto weight = bdata(0,i,j,k,l).real;
+                auto S2 = 25.0 * powf(10.0, -2*{R}) / (w2avg(0,i,j,0) / wavg(0,i,j,0));
+                if( weight != 0.0 ) {{
+                    idata(0,i,j,k,l) = idata(0,i,j,k,l) / (1.0 + weight * S2);
+                }}
+                """.format(R=R),
+                {'idata': self._stcgrid, 'bdata': self._raw_beam, 'wavg': self._wavg, 'w2avg': self._w2avg},
+                axis_names=('i','j','k','l'), shape=self._stcgrid.shape[1:]
+               )
+                
+        else:
+            raise ValueError("Unknown weighting '%s'" % weighting)
+            
         # (u,v) plane -> image
         ## IFFT
         self._stcgrid = self._stcgrid.reshape(-1,self._gridsize,self._gridsize)
@@ -259,7 +320,7 @@ def execute(self, idata, odata):
         oshape = odata.shape
         odata = odata.reshape(-1,odata.shape[-2],odata.shape[-1])
         padding = self._gridsize - self._origsize
-        offset = padding/2# - padding%2
+        offset = padding//2# - padding%2
         map("""
             auto k = i + {offset} - {gridsize}/2;
             auto l = j + {offset} - {gridsize}/2;

From 3e39288d8def86cf1d60c13cc34c89416a8888be Mon Sep 17 00:00:00 2001
From: jaycedowell <jayce.dowell@gmail.com>
Date: Wed, 8 Jan 2025 09:24:15 -0700
Subject: [PATCH 10/11] Orville2 -> Orville

---
 python/bifrost/orville.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/bifrost/orville.py b/python/bifrost/orville.py
index f1edd932c..272a22f96 100644
--- a/python/bifrost/orville.py
+++ b/python/bifrost/orville.py
@@ -41,7 +41,7 @@
 
 from bifrost.device import stream_synchronize as BFSync
 
-class Orville2(BifrostObject):
+class Orville(BifrostObject):
     def __init__(self):
         BifrostObject.__init__(self, _bf.bfOrvilleCreate, _bf.bfOrvilleDestroy)
     def init(self, positions, weights, kernel, gridsize, gridres, gridwres=0.5, oversample=1, polmajor=True, space='cuda'):

From e018626f085e9577071c171059209c612ea69f40 Mon Sep 17 00:00:00 2001
From: jaycedowell <jayce.dowell@gmail.com>
Date: Wed, 8 Jan 2025 09:57:03 -0700
Subject: [PATCH 11/11] Drop Py2 stuff and remove commented out code.

---
 python/bifrost/orville.py | 24 ------------------------
 1 file changed, 24 deletions(-)

diff --git a/python/bifrost/orville.py b/python/bifrost/orville.py
index 272a22f96..891a881a9 100644
--- a/python/bifrost/orville.py
+++ b/python/bifrost/orville.py
@@ -24,9 +24,6 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-# Python2 compatibility
-from __future__ import absolute_import, print_function, division
-
 from bifrost.libbifrost import _bf, _check, _get, BifrostObject, _string2space
 from bifrost.ndarray import ndarray, zeros, empty_like, asarray, memset_array, copy_array
 
@@ -295,27 +292,6 @@ def execute(self, idata, odata, weighting='natural'):
             self._fft_im.init(self._stcgrid, self._stcgrid, axes=(1,2))
             self._fft_im.execute(self._stcgrid, self._stcgrid, inverse=True)
             
-        ### Correct for the kernel
-        #map('img(c,i,j) /= corr(i)*corr(j)',
-        #    {'img':self._stcgrid, 'corr':self._img_correction},
-        #    axis_names=('c','i','j'), shape=self._stcgrid.shape)
-        #    
-        ## Shift and accumulate onto odata
-        #oshape = odata.shape
-        #odata = odata.reshape(-1,odata.shape[-2],odata.shape[-1])
-        #padding = self._gridsize - self._origsize
-        #offset = padding/2# - padding%2
-        #map("""
-        #    auto k = i + {offset} - {gridsize}/2;
-        #    auto l = j + {offset} - {gridsize}/2;
-        #    if( k < 0 ) k += {gridsize};
-        #    if( l < 0 ) l += {gridsize};
-        #    odata(c,i,j) += idata(c,k,l).real;
-        #    """.format(gridsize=self._gridsize, offset=offset), 
-        #    {'odata':odata, 'idata':self._stcgrid}, 
-        #    axis_names=('c','i','j'), shape=odata.shape)
-        #odata = odata.reshape(oshape)
-        
         # Correct for the kernel, shift, and accumulate onto odata
         oshape = odata.shape
         odata = odata.reshape(-1,odata.shape[-2],odata.shape[-1])