From 2c4a3cb2c3a9d6cc1d2d3c91ecd6274e471e4060 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Mon, 15 May 2017 14:43:26 -0400 Subject: [PATCH] Current State --- src/gpuarray/reduction.h | 91 +- src/gpuarray_reduction.c | 1827 ++++++++++++++++++++++++++------------ tests/check_reduction.c | 86 +- 3 files changed, 1309 insertions(+), 695 deletions(-) diff --git a/src/gpuarray/reduction.h b/src/gpuarray/reduction.h index 1db5664535..f6638c9a83 100644 --- a/src/gpuarray/reduction.h +++ b/src/gpuarray/reduction.h @@ -26,28 +26,26 @@ extern "C" { */ typedef enum _ga_reduce_op { - GA_REDUCE_SUM, /* + */ - GA_REDUCE_PROD, /* * */ - GA_REDUCE_PRODNZ, /* * (!=0) */ - GA_REDUCE_MIN, /* min() */ - GA_REDUCE_MAX, /* max() */ - GA_REDUCE_ARGMIN, /* argmin() */ - GA_REDUCE_ARGMAX, /* argmax() */ - GA_REDUCE_MINANDARGMIN, /* min(), argmin() */ - GA_REDUCE_MAXANDARGMAX, /* max(), argmax() */ - GA_REDUCE_AND, /* & */ - GA_REDUCE_OR, /* | */ - GA_REDUCE_XOR, /* ^ */ - GA_REDUCE_ALL, /* &&/all() */ - GA_REDUCE_ANY, /* ||/any() */ + GA_REDUCE_SUM, /* + */ + GA_REDUCE_PROD, /* * */ + GA_REDUCE_PRODNZ, /* * (!=0) */ + GA_REDUCE_MIN, /* min() */ + GA_REDUCE_MAX, /* max() */ + GA_REDUCE_ARGMIN, /* argmin() */ + GA_REDUCE_ARGMAX, /* argmax() */ + GA_REDUCE_MINANDARGMIN, /* min(), argmin() */ + GA_REDUCE_MAXANDARGMAX, /* max(), argmax() */ + GA_REDUCE_AND, /* & */ + GA_REDUCE_OR, /* | */ + GA_REDUCE_XOR, /* ^ */ + GA_REDUCE_ALL, /* &&/all() */ + GA_REDUCE_ANY, /* ||/any() */ } ga_reduce_op; /** - * @brief Compute a reduction sum (+), product (*), non-zero product (* != 0), - * min, max, argmin, argmax, min-and-argmin, max-and-argmax, and (&), - * or (|), xor (^), all (&&) or any (||) over a list of axes to reduce. + * @brief Compute a reduction over a list of axes to reduce. * * Returns one (in the case of min-and-argmin/max-and-argmax, two) destination * tensors. The destination tensor(s)' axes are a strict subset of the axes of the @@ -55,6 +53,7 @@ typedef enum _ga_reduce_op { * reduction is performed over these axes, which are then removed in the * destination. * + * @param [in] op The reduction operation to perform. * @param [out] dst The destination tensor. Has the same type as the source. * @param [out] dstArg For argument of minima/maxima operations. Has type int64. * @param [in] src The source tensor. @@ -81,64 +80,6 @@ typedef enum _ga_reduce_op { * code otherwise. 
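 *
 * Example (an illustrative sketch; the array names and shapes below are
 * placeholders, not part of this header): a max() reduction over axes 1
 * and 3 of a 4-D source tensor can be requested as
 *
 *     const unsigned reduxList[2] = {1, 3};
 *     int err = GpuArray_reduction(GA_REDUCE_MAX, &dst, NULL, &src,
 *                                  2, reduxList);
 *
 * where dst has the shape of src with axes 1 and 3 removed, and dstArg is
 * NULL because max() produces no argument tensor.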
*/ -GPUARRAY_PUBLIC int GpuArray_sum (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_prod (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_prodnz (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_min (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_max (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_argmin (GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_argmax (GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_minandargmin(GpuArray* dst, - GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dst, - GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_and (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_or (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_xor (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_all (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_any (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); GPUARRAY_PUBLIC int GpuArray_reduction (ga_reduce_op op, GpuArray* dst, GpuArray* dstArg, diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index c9ccae66c4..8144900613 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "gpuarray/config.h" #include #include @@ -33,47 +34,96 @@ /* Datatypes */ +/** + * @brief Axis Description. + */ + +struct axis_desc{ + int reduxNum; + unsigned isReduced : 1; + unsigned isHW : 1; + unsigned isSW : 1; + size_t warpLen; + size_t len; + ssize_t srcStride, srcOffset; + ssize_t dstStride, dstOffset; + ssize_t dstArgStride, dstArgOffset; + ssize_t tmpDstStride, tmpDstOffset; + ssize_t tmpDstArgStride, tmpDstArgOffset; +}; +typedef struct axis_desc axis_desc; + /** * Reduction Kernel Generator. - * - * The generator produces a kernel from one of two "code models": - * - Large - * - Small - * Which one is used depends on the size of the destination tensor and the - * number of reductions for each destination element. A destination tensor - * with more than SMALL_REDUX_THRESHOLD elements or more elements than - * reductions for each element will result in use of the large code model; - * Otherwise the small code model is used. - * - * - * LARGE CODE MODEL: - * - * In the large code model, each destination element is processed by a - * single thread. - * - * Each thread begins with an initial value in a register, reads from all - * source elements contributing to the reduction, computes the result and - * writes it to the destination element. 
- * - * A single kernel is generated that performs prescalar transformations, the - * reduction itself, postscalar transformations and the write to global memory. - * - * - * SMALL CODE MODEL: - * - * In the small code model, each destination element is processed by - * multiple threads. - * - * The destination tensor is first initialized with the initial value. Then, - * one several threads cooperate to perform the reduction atomically on each - * destination element. Lastly, postscalar transformations are applied - * in-place. - * - * Two or three kernels are generated: The initialization kernel, the main - * kernel that performs prescalar transformations and the reduction itself, and - * possibly also a postscalar transformation kernel when it is required. - * - * + * + * INTRO + * + * Generates the source code for a reduction kernel over arbitrarily-dimensioned, + * -shaped and -typed tensors. + * + * + * GOALS + * + * The generator has the following goals: + * + * 1. Maximizing the use of coalesced memory loads within a warp. + * 2. Maximizing the # of useful threads within a warp. + * 3. Maximizing the number of warps within a block. + * + * NOTE: It is possible to guarantee for any tensor problem of at least + * 2*WARP_SIZE in scale that either + * 1. All warp blocks in the X dimension have more than 50% threads + * active 100% of the time, or + * 2. The warp blocks in the X dimension have 100% threads active more + * than 50% of the time. + * + * 4. Ensuring there are no more blocks than are permitted by the warp + * configuration and 2nd-stage workspace size (if required). + * 5. Ensuring there are no more than 5 blocks per multiprocessor. + * 6. Minimizing the 2nd-stage workspace (if it is required). + * 7. Striding the 2nd-stage workspace for maximum convenience (if it is + * required). Make it contiguous. + * + * + * NOTES + * + * Information elements required to perform reduction. + * + * 1. Ndim, shape and dtype of src tensor + * 2. Ndim, shape and dtype of dst/dstArg tensors + * 3. GPU context + * 4. Number of processors + * 5. Warp size + * 6. Maximum size of block + * 7. Maximum size of block dimension X, Y, Z + * 8. Maximum size of grid + * 9. Maximum size of grid dimension X, Y, Z + * 10. Dtype and initializer of accumulator + * 11. Sorted src axes for contiguous memory accesses + * 12. Ndim, shape and dtype of flattened src tensor + * 13. Number of stages (1 or 2) + * 14. Ndim, shape and dtype of workspace tensor + * 15. Warp axes + * 16. Hardware axes + * 17. Software axes + * 18. Source code + * + * Rationale for dependencies: + * + * 1) Get the GPU context and its properties immediately, since an invalid + * context is a likely error and we want to fail fast. + * 2) The type and initializer of the accumulator should be determined after + * the context's properties have been retrieved since they provide + * information about the device's natively-supported types and operations. + * + * REFERENCES + * + * http://lpgpu.org/wp/wp-content/uploads/2013/05/poster_andresch_acaces2014.pdf + * + * + * + * + * * Kernel Template: * * The following kernel code template displays the code generated for the @@ -200,11 +250,41 @@ struct redux_ctx{ const int* reduxList; /* General. 
*/ + int nds; /* # Source dimensions */ + int ndr; /* # Reduced dimensions */ + int ndd; /* # Destination dimensions */ + int ndw; /* # Warp dimensions */ + int ndp; /* # Partial warp dimensions */ + int ndf; /* # Flattened source dimensions */ + int ndt; /* # Temporary workspace dimensions */ + int zeroAllAxes; /* # of zero-length axes in source tensor */ + int zeroRdxAxes; /* # of zero-length reduction axes in source tensor */ + size_t prodAllAxes; /* Product of length of all axes in source tensor */ + size_t prodRdxAxes; /* Product of length of all reduction axes in source tensor */ + size_t prodFreeAxes; /* Product of length of all free axes in source tensor */ + size_t prodWarpAxes; /* Number of active threads per warp. Strictly <= warpSize. */ + int splitWarpAxis;/* Index of the split warp axis within the source tensor's shape; -1 otherwise. */ + + gpucontext* gpuCtx; + unsigned numProcs; + size_t warpSize; + size_t maxLg; + size_t maxLs[MAX_HW_DIMS]; + size_t maxGg; + size_t maxGs[MAX_HW_DIMS]; + + axis_desc* xdSrc; + axis_desc* xdSrcFlat; + axis_desc* xdTmp; + + axis_desc** xdSrcPtrs; + + int numStages; + GpuArray* wsDst; GpuArray* wsDstArg; int* srcAxisList; size_t* dstDims; - gpucontext* gpuCtx; /* Source code Generator. */ int srcTypeCode; @@ -219,9 +299,6 @@ struct redux_ctx{ const char* accTypeStr; const char* initValT; const char* initValK; - int ndd; - int ndr; - int nds; int largeCodeModel; strb s; srcb srcGen; @@ -269,186 +346,133 @@ typedef struct redux_ctx redux_ctx; -/* Function prototypes */ -static int reduxGetSumInit (int typecode, const char** property); -static int reduxGetProdInit (int typecode, const char** property); -static int reduxGetMinInit (int typecode, const char** property); -static int reduxGetMaxInit (int typecode, const char** property); -static int reduxGetAndInit (int typecode, const char** property); -static int reduxGetOrInit (int typecode, const char** property); -static int axisInSet (int v, - const int* set, - size_t setLen, - size_t* where); -static void appendIdxes (strb* s, - const char* prologue, - const char* prefix, - int startIdx, - int endIdx, - const char* suffix, - const char* epilogue); -static int reduxCheckargs (redux_ctx* ctx); -static void reduxSelectTypes (redux_ctx* ctx); -static int reduxIsSmallCodeModel (redux_ctx* ctx); -static int reduxIsLargeCodeModel (redux_ctx* ctx); -static int reduxRequiresDst (redux_ctx* ctx); -static int reduxRequiresDstArg (redux_ctx* ctx); -static int reduxKernelRequiresDst (redux_ctx* ctx); -static int reduxKernelRequiresDstArg (redux_ctx* ctx); -static int reduxCanAppendHwAxis (redux_ctx* ctx, - int kernelType, - int axisType); -static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, - int kernelType, - int axisType); -static int reduxSelectHwAxes (redux_ctx* ctx); -static int reduxComputeAxisList (redux_ctx* ctx); -static int reduxGenSource (redux_ctx* ctx); -static void reduxAppendSource (redux_ctx* ctx); -static void reduxAppendTensorDeclArgs (redux_ctx* ctx, - const char* type, - const char* baseName); -static void reduxAppendTensorCallArgs (redux_ctx* ctx, - const char* baseName); -static void reduxAppendMacroDefs (redux_ctx* ctx); -static void reduxAppendTypedefs (redux_ctx* ctx); -static void reduxAppendGetInitValFns (redux_ctx* ctx); -static void reduxAppendWriteBackFn (redux_ctx* ctx); -static void reduxAppendReduxKernel (redux_ctx* ctx); -static void reduxAppendPrototype (redux_ctx* ctx); -static void reduxAppendIndexDeclarations (redux_ctx* ctx); -static void 
reduxAppendRangeCalculations (redux_ctx* ctx); -static void reduxAppendLoops (redux_ctx* ctx); -static void reduxAppendInitKernel (redux_ctx* ctx); -static void reduxAppendPostKernel (redux_ctx* ctx); -static int reduxCompile (redux_ctx* ctx); -static int reduxSchedule (redux_ctx* ctx); -static void reduxScheduleKernel (int ndims, - uint64_t* dims, - uint64_t warpSize, - uint64_t maxLg, - uint64_t* maxLs, - uint64_t maxGg, - uint64_t* maxGs, - uint64_t* bs, - uint64_t* gs, - uint64_t* cs); -static int reduxInvoke (redux_ctx* ctx); -static int reduxCleanup (redux_ctx* ctx, int ret); +/* Static Function prototypes */ +/* Utilities */ +static int reduxGetSumInit (int typecode, const char** property); +static int reduxGetProdInit (int typecode, const char** property); +static int reduxGetMinInit (int typecode, const char** property); +static int reduxGetMaxInit (int typecode, const char** property); +static int reduxGetAndInit (int typecode, const char** property); +static int reduxGetOrInit (int typecode, const char** property); +static int reduxSortFlatSensitive (const void* a, const void* b); +static int reduxSortFlatInsensitive (const void* a, const void* b); +static int reduxSortWarp (const void* a, const void* b); +static int axisInSet (int v, + const int* set, + size_t setLen, + size_t* where); +static void appendIdxes (strb* s, + const char* prologue, + const char* prefix, + int startIdx, + int endIdx, + const char* suffix, + const char* epilogue); + +/* Axis Description API */ +static void axisInit (axis_desc* axis, + ssize_t len, + ssize_t srcStride); +static void axisMarkReduced (axis_desc* axis, int reduxNum); +static void axisMarkWarp (axis_desc* axis, size_t partialSlice); +static int axisGetReduxNum (const axis_desc* axis); +static size_t axisGetLen (const axis_desc* axis); +static ssize_t axisGetSrcStride (const axis_desc* axis); +static size_t axisGetSrcAbsStride (const axis_desc* axis); +static ssize_t axisGetSrcOffset (const axis_desc* axis); +static ssize_t axisGetDstStride (const axis_desc* axis); +static size_t axisGetDstAbsStride (const axis_desc* axis); +static ssize_t axisGetDstOffset (const axis_desc* axis); +static ssize_t axisGetDstArgStride (const axis_desc* axis); +static size_t axisGetDstArgAbsStride (const axis_desc* axis); +static ssize_t axisGetDstArgOffset (const axis_desc* axis); +static int axisIsReduced (const axis_desc* axis); +static int axisIsWarp (const axis_desc* axis); +static int axisIsPartialWarp (const axis_desc* axis); + +/* Reduction Context API */ +/* Utilities */ +static int reduxRequiresDst (const redux_ctx* ctx); +static int reduxRequiresDstArg (const redux_ctx* ctx); +static int reduxKernelRequiresDst (const redux_ctx* ctx); +static int reduxKernelRequiresDstArg (const redux_ctx* ctx); +static int reduxIsSensitive (const redux_ctx* ctx); +static int reduxIsSmallCodeModel (const redux_ctx* ctx); +static int reduxIsLargeCodeModel (const redux_ctx* ctx); +static axis_desc* reduxGetSrcAxis (const redux_ctx* ctx, int i); +static axis_desc* reduxGetSrcSortAxis (const redux_ctx* ctx, int i); +static axis_desc* reduxGetSrcFlatAxis (const redux_ctx* ctx, int i); +static int reduxTryFlattenInto (const redux_ctx* ctx, + axis_desc* into, + const axis_desc* from); +static int reduxCanAppendHwAxis (redux_ctx* ctx, + int kernelType, + int axisType); +static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, + int kernelType, + int axisType); +/* Control Flow */ +static int reduxInit (redux_ctx* ctx); +static int reduxInferProperties (redux_ctx* ctx); 
+static int reduxFlattenSource (redux_ctx* ctx); +static int reduxSelectWarpAxes (redux_ctx* ctx); +static int reduxSelectNumStages (redux_ctx* ctx); +static int reduxSelectHwAxes (redux_ctx* ctx); +static int reduxComputeAxisList (redux_ctx* ctx); +static int reduxGenSource (redux_ctx* ctx); +static void reduxAppendSource (redux_ctx* ctx); +static void reduxAppendTensorDeclArgs (redux_ctx* ctx, + const char* type, + const char* baseName); +static void reduxAppendTensorCallArgs (redux_ctx* ctx, + const char* baseName); +static void reduxAppendMacroDefs (redux_ctx* ctx); +static void reduxAppendTypedefs (redux_ctx* ctx); +static void reduxAppendGetInitValFns (redux_ctx* ctx); +static void reduxAppendWriteBackFn (redux_ctx* ctx); +static void reduxAppendReduxKernel (redux_ctx* ctx); +static void reduxAppendPrototype (redux_ctx* ctx); +static void reduxAppendIndexDeclarations (redux_ctx* ctx); +static void reduxAppendRangeCalculations (redux_ctx* ctx); +static void reduxAppendLoops (redux_ctx* ctx); +static void reduxAppendInitKernel (redux_ctx* ctx); +static void reduxAppendPostKernel (redux_ctx* ctx); +static int reduxCompile (redux_ctx* ctx); +static int reduxSchedule (redux_ctx* ctx); +static void reduxScheduleKernel (int ndims, + uint64_t* dims, + uint64_t warpSize, + uint64_t maxLg, + uint64_t* maxLs, + uint64_t maxGg, + uint64_t* maxGs, + uint64_t* bs, + uint64_t* gs, + uint64_t* cs); +static int reduxInvoke (redux_ctx* ctx); +static int reduxCleanup (redux_ctx* ctx, int ret); +static int reduxCleanupMsg (redux_ctx* ctx, int ret, + const char* fmt, ...); /* Function implementation */ -GPUARRAY_PUBLIC int GpuArray_sum (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_SUM, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_prod (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_PROD, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_prodnz (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_PRODNZ, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_min (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_MIN, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_max (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_MAX, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_argmin (GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_ARGMIN, - NULL, dstArg, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_argmax (GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_ARGMAX, - NULL, dstArg, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_minandargmin(GpuArray* dst, - GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_MINANDARGMIN, - dst, dstArg, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dst, - GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return 
GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, - dst, dstArg, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_and (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_AND, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_or (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_OR, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_xor (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_XOR, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_all (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_ALL, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_any (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_ANY, - dst, NULL, src, reduxLen, reduxList); -} GPUARRAY_PUBLIC int GpuArray_reduction (ga_reduce_op op, GpuArray* dst, GpuArray* dstArg, const GpuArray* src, unsigned reduxLen, const unsigned* reduxList){ - redux_ctx ctxSTACK = {op, dst, dstArg, src, - (int)reduxLen, (const int*)reduxList}; - redux_ctx *ctx = &ctxSTACK; + redux_ctx ctxSTACK, *ctx = &ctxSTACK; + memset(ctx, 0, sizeof(*ctx)); - return reduxCheckargs(ctx); + ctx->op = op; + ctx->dst = dst; + ctx->dstArg = dstArg; + ctx->src = src; + ctx->reduxLen = reduxLen; + ctx->reduxList = (const int*)reduxList; + + return reduxInit(ctx); } /** @@ -462,7 +486,7 @@ GPUARRAY_PUBLIC int GpuArray_reduction (ga_reduce_op op, * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetSumInit (int typecode, const char** property){ +static int reduxGetSumInit (int typecode, const char** property){ if (typecode == GA_POINTER || typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; @@ -482,7 +506,7 @@ static int reduxGetSumInit (int typecode, const char** property) * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetProdInit (int typecode, const char** property){ +static int reduxGetProdInit (int typecode, const char** property){ if (typecode == GA_POINTER || typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; @@ -502,7 +526,7 @@ static int reduxGetProdInit (int typecode, const char** property) * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetMinInit (int typecode, const char** property){ +static int reduxGetMinInit (int typecode, const char** property){ switch (typecode){ case GA_BYTE2: case GA_BYTE3: @@ -592,7 +616,7 @@ static int reduxGetMinInit (int typecode, const char** property) * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetMaxInit (int typecode, const char** property){ +static int reduxGetMaxInit (int typecode, const char** property){ switch (typecode){ case GA_BOOL: *property = "1"; @@ -691,7 +715,7 @@ static int reduxGetMaxInit (int typecode, const char** property) * @return Zero if successful; Non-zero if the datatype is not supported. 
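 *
 * (Conceptually, the initializer is the identity element of the reduction;
 * for AND that is a value with every bit set, so AND-ing it with the first
 * real operand leaves that operand unchanged. The exact literal placed in
 * *property is type-dependent.)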
*/ -static int reduxGetAndInit (int typecode, const char** property){ +static int reduxGetAndInit (int typecode, const char** property){ if (typecode == GA_POINTER || typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; @@ -711,7 +735,7 @@ static int reduxGetAndInit (int typecode, const char** property) * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetOrInit (int typecode, const char** property){ +static int reduxGetOrInit (int typecode, const char** property){ if (typecode == GA_POINTER || typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; @@ -720,6 +744,110 @@ static int reduxGetOrInit (int typecode, const char** property) return GA_NO_ERROR; } +/** + * @brief Sort the axes into optimal order for flattening. + * + * Two orderings exist: "Sensitive" and "Insensitive", for reductions that are + * sensitive (or not) to indexing. + * + * In all cases: + * + * 1. Free axes are sorted before reduction axes. + * 2. Free axes are sorted by decreasing absolute stride. + * 3. then by increasing source axis number. + * + * In the sensitive case: + * + * 4. Reduction axes are sorted by their position in reduxList. + * + * In the insensitive case: + * + * 4. Reduction axes are sorted by decreasing absolute stride. + * 5. then by increasing source axis number. + */ + +static int reduxSortFlatInsensitive (const void* a, const void* b){ + const axis_desc* xda = (const axis_desc*)a; + const axis_desc* xdb = (const axis_desc*)b; + + if ( axisIsReduced(xda) && !axisIsReduced(xdb)){ + return +1; + }else if (!axisIsReduced(xda) && axisIsReduced(xdb)){ + return -1; + } + + if (axisGetSrcAbsStride(xda) < axisGetSrcAbsStride(xdb)){ + return +1; + }else if (axisGetSrcAbsStride(xda) > axisGetSrcAbsStride(xdb)){ + return -1; + } + + return 0; +} +static int reduxSortFlatSensitive (const void* a, const void* b){ + const axis_desc* xda = (const axis_desc*)a; + const axis_desc* xdb = (const axis_desc*)b; + + if ( axisIsReduced(xda) && !axisIsReduced(xdb)){ + return +1; + }else if (!axisIsReduced(xda) && axisIsReduced(xdb)){ + return -1; + } + + if (axisIsReduced(xda)){ + return axisGetReduxNum(xda) axisGetSrcAbsStride(xdb)){ + return -1; + } + + return 0; + } +} + +/** + * @brief Sort axes in preferred order for integration into warp. + * + * The axes with stride != 0 are sorted by lowest absolute + * stride. Picking the few axes with the lowest absolute stride (while + * keeping the product of their dimensions <= warpSize) should maximize + * memory bandwidth of the warp. + * + * The restriction stride != 0 is intended to avoid waste of memory + * bandwidth. Once a memory transaction is necessary, it typically operates at + * far greater granularity than just 32 bits (4 bytes). + * + * Sorting by absolute stride should result, in the case of a packed tensor, in + * the memory accesses being close to perfectly contiguous. 
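 *
 * Illustrative example: for a packed (C-contiguous) float32 tensor of shape
 * (64, 128), the strides are 512 and 4 bytes. Axis 1 sorts first because
 * its absolute stride (4) is smallest, making it the preferred warp axis;
 * an axis with stride 0 (e.g. a broadcast axis) sorts last, since repeated
 * loads along it buy no additional bandwidth.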
+ */ + +static int reduxSortWarp (const void* a, const void* b){ + const axis_desc* xda = *(const axis_desc* const *)a; + const axis_desc* xdb = *(const axis_desc* const *)b; + + if ( axisGetSrcStride(xda) && !axisGetSrcStride(xdb)){ + return -1; + }else if (!axisGetSrcStride(xda) && axisGetSrcStride(xdb)){ + return +1; + } + + if (axisGetSrcAbsStride(xda) < axisGetSrcAbsStride(xdb)){ + return -1; + }else if (axisGetSrcAbsStride(xda) > axisGetSrcAbsStride(xdb)){ + return +1; + } + + if ( axisIsReduced(xda) && !axisIsReduced(xdb)){ + return -1; + }else if (!axisIsReduced(xda) && axisIsReduced(xdb)){ + return +1; + } + + return 0; +} + /** * @brief Check whether axis numbered v is already in the given set of axes. * @@ -730,10 +858,10 @@ static int reduxGetOrInit (int typecode, const char** property) * @return Non-zero if the set is non-empty and v is in it; Zero otherwise. */ -static int axisInSet (int v, - const int* set, - size_t setLen, - size_t* where){ +static int axisInSet (int v, + const int* set, + size_t setLen, + size_t* where){ size_t i; for (i=0;ireduxNum = -1; + axis->warpLen = 0; + axis->len = len; + + axis->srcStride = srcStride; + axis->srcOffset = 0; + + axis->dstStride = 0; + axis->dstOffset = 0; + + axis->dstArgStride = 0; + axis->dstArgOffset = 0; + + axis->tmpDstStride = 0; + axis->tmpDstOffset = 0; + + axis->tmpDstArgStride = 0; + axis->tmpDstArgOffset = 0; +} + /** - * @brief Check the sanity of the arguments in agreement with the - * documentation for GpuArray_reduction(). + * @brief Mark axis as reduction axis, with position reduxNum in the axis list. + */ + +static void axisMarkReduced (axis_desc* axis, int reduxNum){ + axis->isReduced = 1; + axis->reduxNum = reduxNum; +} + +/** + * @brief Mark axis as warp axis. + */ + +static void axisMarkWarp (axis_desc* axis, size_t warpLen){ + axis->warpLen = warpLen; +} + +/** + * @brief Get properties of an axis. + */ + +static int axisGetReduxNum (const axis_desc* axis){ + return axis->reduxNum; +} +static size_t axisGetLen (const axis_desc* axis){ + return axis->len; +} +static ssize_t axisGetSrcStride (const axis_desc* axis){ + return axisGetLen(axis) > 1 ? axis->srcStride : 0; +} +static size_t axisGetSrcAbsStride (const axis_desc* axis){ + return axisGetSrcStride(axis)<0 ? -(size_t)axisGetSrcStride(axis): + +(size_t)axisGetSrcStride(axis); +} +static ssize_t axisGetSrcOffset (const axis_desc* axis){ + return axis->srcOffset; +} +static ssize_t axisGetDstStride (const axis_desc* axis){ + return axisGetLen(axis) > 1 ? axis->dstStride : 0; +} +static size_t axisGetDstAbsStride (const axis_desc* axis){ + return axisGetDstStride(axis)<0 ? -(size_t)axisGetDstStride(axis): + +(size_t)axisGetDstStride(axis); +} +static ssize_t axisGetDstOffset (const axis_desc* axis){ + return axis->dstOffset; +} +static ssize_t axisGetDstArgStride (const axis_desc* axis){ + return axisGetLen(axis) > 1 ? axis->dstArgStride : 0; +} +static size_t axisGetDstArgAbsStride (const axis_desc* axis){ + return axisGetDstArgStride(axis)<0 ? 
-(size_t)axisGetDstArgStride(axis): + +(size_t)axisGetDstArgStride(axis); +} +static ssize_t axisGetDstArgOffset (const axis_desc* axis){ + return axis->dstArgOffset; +} +static int axisIsReduced (const axis_desc* axis){ + return axis->isReduced; +} +static int axisIsWarp (const axis_desc* axis){ + return !!axis->warpLen; +} +static int axisIsPartialWarp (const axis_desc* axis){ + return axis->warpLen > 0 && axis->warpLen != axis->len; +} + +/** + * @brief Returns whether the reduction interface requires a dst argument. + */ + +static int reduxRequiresDst (const redux_ctx* ctx){ + switch (ctx->op){ + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: + return 0; + default: + return 1; + } +} + +/** + * @brief Returns whether the reduction interface requires a dstArg argument. + */ + +static int reduxRequiresDstArg (const redux_ctx* ctx){ + switch (ctx->op){ + case GA_REDUCE_MINANDARGMIN: + case GA_REDUCE_MAXANDARGMAX: + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: + return 1; + default: + return 0; + } +} + +/** + * @brief Returns whether the generated kernel internally requires a dst + * argument. * - * Also initialize certain parts of the context, allocate memory - * buffers and fail out if at any point the environment gives us - * a problem. + * This is semantically subtly different from reduxHasDst(). The main + * difference is in the implementation of the GA_REDUCE_ARGMIN/ARGMAX + * reductions; Either *might* require a dst buffer, which will have to be + * allocated, even though it will be discared. + */ + +static int reduxKernelRequiresDst (const redux_ctx* ctx){ + switch (ctx->op){ + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: + return reduxIsSmallCodeModel(ctx); + default: + return 1; + } +} + +/** + * @brief Returns whether the generated kernel internally requires a dstArg + * argument. * - * @return GA_INVALID_ERROR if arguments invalid; GA_NO_MEMORY if out of - * memory, GA_NO_ERROR otherwise. + * This is semantically subtly different from reduxHasDstArg(), since it asks + * whether the reduction, even though it does not accept a dstArg argument, + * still requires a dstArg internally. + */ + +static int reduxKernelRequiresDstArg (const redux_ctx* ctx){ + /** + * At present there exists no reduction whose implementation requires + * a dstArg but whose interface does not. + * + * E.g. the max() and min() reductions do NOT currently require a temporary + * buffer for indexes, and will not in the foreseeable future. + */ + + return reduxRequiresDstArg(ctx); +} + +/** + * @brief Returns whether the reduction is sensitive. + * + * A reduction is sensitive when its output satisfies at least one of the + * following conditions: + * + * - It depends on the exact order of axes in the reduxList + * - It depends on exact signs of the strides of axes in the reduxList + * + * Such sensitivity may prevent a flattening of contiguous axes even when it + * would have been otherwise permitted. + * + * For instance, ARGMIN/ARGMAX have this sensitivity, because the dstArg + * tensor's contents are flattened coordinates into the source tensor, and + * the flattening order is precisely reduxList. Permuting it would thus produce + * incorrect output. Moreover, if the strides of a reduction axis were to be + * reversed for the purpose of flattening the axis into another, the computed + * coordinate would again be incorrect. + * + * + * TL;DR: Reduction is sensitive if + * reduce(x, axis=axisList) != reduce(x, axis=axisList[::-1]) + * or + * reduce(x) != reduce(x[::-1]) + * . 
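 *
 * Concrete illustration: an argmax over a 2-D tensor with reduxList = {0, 1}
 * encodes the winning position as a flattened coordinate built in the order
 * (axis 0, axis 1); with reduxList = {1, 0} the same position is encoded in
 * the order (axis 1, axis 0), which is in general a different integer. A
 * plain sum() has no such dependence and is insensitive.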
+ */ + +static int reduxIsSensitive (const redux_ctx* ctx){ + switch (ctx->op){ + case GA_REDUCE_MINANDARGMIN: + case GA_REDUCE_MAXANDARGMAX: + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: + return 1; + default: + return 0; + } +} + +/** + * @brief Returns whether we are using the small code model or not. + */ + +static int reduxIsSmallCodeModel (const redux_ctx* ctx){ + return !reduxIsLargeCodeModel(ctx); +} + +/** + * @brief Returns whether we are using the large code model or not. + */ + +static int reduxIsLargeCodeModel (const redux_ctx* ctx){ + return ctx->largeCodeModel; +} + +/** + * @brief Get description of source axis with given number. + */ + +static axis_desc* reduxGetSrcAxis (const redux_ctx* ctx, int i){ + return &ctx->xdSrc[i]; +} + +/** + * @brief Get description of source axis with given number in sort-order. */ -static int reduxCheckargs (redux_ctx* ctx){ - int i, j, ret, retT, retK; - unsigned numProcs; - size_t localSize; - size_t dstNumElem = 1, reduxPerElem = 1; +static axis_desc* reduxGetSrcSortAxis (const redux_ctx* ctx, int i){ + return ctx->xdSrcPtrs[i]; +} + +/** + * @brief Get description of flattened source axis with given number. + */ + +static axis_desc* reduxGetSrcFlatAxis (const redux_ctx* ctx, int i){ + return &ctx->xdSrcFlat[i]; +} + +/** + * @brief Attempt to flatten an axis `from` into an axis `into`. + * + * An axis can be considered for flattening into the previous one if ALL of + * the following conditions hold: + * + * 1. The product of the previous axis' length by its stride exactly + * matches the current axis' stride. + * 2. Both axes are reduced. + * + * For reductions where axis order matters (e.g. those that compute + * indices, like argmax/argmin), ALL of the following additional conditions + * must hold: + * + * 3. The sign of the strides must match. + * 4. The axis numbers must follow consecutively in the reduction list + * (this is ensured by the "sensitive" sort order) + * + * @return Non-zero if flattening attempt successful; Zero otherwise. + */ + +static int reduxTryFlattenInto (const redux_ctx* ctx, + axis_desc* into, + const axis_desc* from){ + int signSrc = 0, signDst = 0, signDstArg = 0, + reverseSrc = 0, reverseDst = 0, reverseDstArg = 0; + + if (axisIsReduced (into) != axisIsReduced (from) || + axisGetSrcAbsStride (into) != axisGetSrcAbsStride (from)*axisGetLen(from)){ + return 0; + } + + if (reduxRequiresDst(ctx) && + axisGetDstAbsStride (into) != axisGetDstAbsStride (from)*axisGetLen(from)){ + return 0; + } + + if (reduxRequiresDstArg(ctx) && + axisGetDstArgAbsStride(into) != axisGetDstArgAbsStride(from)*axisGetLen(from)){ + return 0; + } + + signSrc = (axisGetSrcStride (into)^axisGetSrcStride (from)) < 0; + signDst = (axisGetDstStride (into)^axisGetDstStride (from)) < 0; + signDstArg = (axisGetDstArgStride(into)^axisGetDstArgStride(from)) < 0; + reverseSrc = signSrc; + reverseDst = signDst && reduxRequiresDst (ctx); + reverseDstArg = signDstArg && reduxRequiresDstArg(ctx); + + if (reduxIsSensitive(ctx)){ + if(reverseSrc || reverseDst || reverseDstArg){ + return 0; + } + } + + if (reduxRequiresDst (ctx) && + reduxRequiresDstArg(ctx) && + reverseDst != reverseDstArg){ + /* Either both, or neither, of dst and dstArg must require reversal. 
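		   Otherwise the flattened dst and dstArg axes would traverse
		   their elements in opposite orders, and element i of dst would
		   no longer correspond to element i of dstArg.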
*/ + return 0; + } + + if (reverseSrc){ + into->srcOffset += (ssize_t)(axisGetLen(from)-1)*axisGetSrcStride(from); + into->srcStride = -axisGetSrcStride (from); + }else{ + into->srcStride = axisGetSrcStride (from); + } + + if (reverseDst){ + into->dstOffset += (ssize_t)(axisGetLen(from)-1)*axisGetDstStride(from); + into->dstStride = -axisGetDstStride (from); + }else{ + into->dstStride = axisGetDstStride (from); + } + + if (reverseDstArg){ + into->dstArgOffset += (ssize_t)(axisGetLen(from)-1)*axisGetDstArgStride(from); + into->dstArgStride = -axisGetDstArgStride(from); + }else{ + into->dstArgStride = axisGetDstArgStride(from); + } + + into->srcOffset += axisGetSrcOffset (from); + into->dstOffset += axisGetDstOffset (from); + into->dstArgOffset += axisGetDstArgOffset(from); + into->len *= axisGetLen (from); + + return 1; +} + +/** + * @brief Check whether we can add another reduction axis or free axis + * to the hardware axis list for either the primary or secondary kernel. + */ + +static int reduxCanAppendHwAxis (redux_ctx* ctx, + int kernelType, + int axisType){ + int kernelNdh = kernelType == KERNEL_PRIMARY ? ctx->pri.ndh : ctx->aux.ndh; + int kernelNdhr = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhr : ctx->aux.ndhr; + int kernelNdhd = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhd : ctx->aux.ndhd; + + if (kernelNdh >= MAX_HW_DIMS){ + return 0; + }else{ + return axisType == AXIS_REDUX ? kernelNdhr < ctx->ndr: + kernelNdhd < ctx->ndd; + } +} + +/** + * @brief Append the largest reduction axis or free axis that isn't yet + * in the hardware axis list for either the primary or secondary kernel + * into said hardware axis list. + */ + +static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, + int kernelType, + int axisType){ + int maxI = 0, i, isInHwList, isInReduxList, isInDesiredList, isLargestSoFar; + int* hwAxisList, * ndh, * ndhr, * ndhd; + size_t v, maxV = 0; + + /* Get pointers to the correct kernel's variables */ + hwAxisList = kernelType == KERNEL_PRIMARY ? ctx->pri.axisList: + ctx->aux.axisList; + ndh = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndh: + &ctx->aux.ndh; + ndhr = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndhr: + &ctx->aux.ndhr; + ndhd = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndhd: + &ctx->aux.ndhd; + + /* Find */ + for (i=0;inds;i++){ + isInHwList = axisInSet(i, hwAxisList, *ndh, 0); + isInReduxList = axisInSet(i, ctx->reduxList, ctx->ndr, 0); + isInDesiredList = axisType == AXIS_REDUX ? isInReduxList: + !isInReduxList; + v = ctx->src->dimensions[i]; + isLargestSoFar = v >= maxV; + + if (!isInHwList && isInDesiredList && isLargestSoFar){ + maxV = v; + maxI = i; + } + } + + /* Append */ + hwAxisList[(*ndh)++] = maxI; + if (axisType == AXIS_REDUX){ + (*ndhr)++; + }else{ + (*ndhd)++; + } +} + +/** + * @brief Initialize the context. + * + * After this function, calling reduxCleanup() becomes safe. + */ + +static int reduxInit (redux_ctx* ctx){ + int i; /** * We initialize certain parts of the context. 
@@ -813,15 +1338,16 @@ static int reduxCheckargs (redux_ctx* ctx){ ctx->srcTypeStr = ctx->dstTypeStr = ctx->dstArgTypeStr = ctx->accTypeStr = ctx->idxTypeStr = NULL; - ctx->initValK = NULL; - ctx->pri.ndh = ctx->aux.ndh = 0; - ctx->pri.ndhd = ctx->aux.ndhd = 0; - ctx->pri.ndhr = ctx->aux.ndhr = 0; + ctx->initValK = NULL; ctx->sourceCode = NULL; - ctx->sourceCodeLen = 0; ctx->errorString0 = NULL; ctx->errorString1 = NULL; ctx->errorString2 = NULL; + + ctx->splitWarpAxis = -1; + ctx->numStages = 1; + ctx->prodWarpAxes = 1; + ctx->prodAllAxes = ctx->prodRdxAxes = ctx->prodFreeAxes = 1; strb_init(&ctx->s); srcbInit (&ctx->srcGen, &ctx->s); @@ -835,219 +1361,134 @@ static int reduxCheckargs (redux_ctx* ctx){ ctx->srcStepsGD = ctx->srcSizeGD = ctx->dstStepsGD = ctx->dstArgStepsGD = ctx->pri.chunkSizeGD = ctx->aux.chunkSizeGD = NULL; - /* *** IT IS NOW SAFE TO CALL reduxCleanup() *** */ - - /* Insane src, reduxLen, dst or dstArg? */ - if (!ctx->src || - (reduxRequiresDst (ctx) && !ctx->dst) || - (reduxRequiresDstArg(ctx) && !ctx->dstArg) || - (ctx->src->nd <= 0) || - (ctx->reduxLen <= 0) || - (ctx->src->nd < (unsigned)ctx->reduxLen) || - (ctx->dst && ctx->dst->nd +ctx->reduxLen != ctx->src->nd) || - (ctx->dstArg && ctx->dstArg->nd+ctx->reduxLen != ctx->src->nd) ){ - return reduxCleanup(ctx, GA_INVALID_ERROR); - } - - - /* Insane or duplicate list entry? */ - for (i=0;ireduxLen;i++){ - if (ctx->reduxList[i] < 0 || - ctx->reduxList[i] >= (int)ctx->src->nd || - axisInSet(ctx->reduxList[i], ctx->reduxList, i, 0)){ - return reduxCleanup(ctx, GA_INVALID_ERROR); - } - } + return reduxInferProperties(ctx); +} +/** + * @brief Begin inferring the properties of the reduction. + */ - /* GPU context non-existent? */ - ctx->gpuCtx = GpuArray_context(ctx->src); - if (!ctx->gpuCtx){ - return reduxCleanup(ctx, GA_INVALID_ERROR); - } +static int reduxInferProperties (redux_ctx* ctx){ + axis_desc* a; + int i, j, retT, retK; + size_t d; - /* Unknown type? */ - reduxSelectTypes(ctx); - if (!ctx->srcTypeStr || !ctx->dstTypeStr || !ctx->dstArgTypeStr || - !ctx->accTypeStr){ - return reduxCleanup(ctx, GA_INVALID_ERROR); + /* Source code buffer preallocation failed? */ + if (strb_ensure(&ctx->s, 4*1024) != 0){ + return reduxCleanupMsg(ctx, GA_MEMORY_ERROR, + "Could not preallocate source code buffer!\n"); } - /* Determine initializer, and error out if reduction unsupported. 
*/ - switch (ctx->op){ - case GA_REDUCE_SUM: - retT = reduxGetSumInit (ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetSumInit (ctx->accTypeCode, &ctx->initValK); - break; - case GA_REDUCE_PRODNZ: - case GA_REDUCE_PROD: - retT = reduxGetProdInit(ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetProdInit(ctx->accTypeCode, &ctx->initValK); - break; - case GA_REDUCE_MINANDARGMIN: - case GA_REDUCE_ARGMIN: - case GA_REDUCE_MIN: - retT = reduxGetMinInit (ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetMinInit (ctx->accTypeCode, &ctx->initValK); - break; - case GA_REDUCE_MAXANDARGMAX: - case GA_REDUCE_ARGMAX: - case GA_REDUCE_MAX: - retT = reduxGetMaxInit (ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetMaxInit (ctx->accTypeCode, &ctx->initValK); - break; - case GA_REDUCE_ALL: - case GA_REDUCE_AND: - retT = reduxGetAndInit (ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetAndInit (ctx->accTypeCode, &ctx->initValK); - break; - case GA_REDUCE_ANY: - case GA_REDUCE_XOR: - case GA_REDUCE_OR: - retT = reduxGetOrInit (ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetOrInit (ctx->accTypeCode, &ctx->initValK); - break; - default: - retT = GA_UNSUPPORTED_ERROR; - retK = GA_UNSUPPORTED_ERROR; - } - if (retT != GA_NO_ERROR){ - return reduxCleanup(ctx, retT); - } - if (retK != GA_NO_ERROR){ - return reduxCleanup(ctx, retK); + /* Insane src, reduxLen, dst or dstArg? */ + if (!ctx->src){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "src is NULL!\n"); + }else if (ctx->src->nd <= 0){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "src has less than 1 dimensions!\n"); + }else if (ctx->reduxLen <= 0){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "List of dimensions to be reduced is empty!\n"); + }else if (ctx->src->nd < (unsigned)ctx->reduxLen){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "src has fewer dimensions than there are dimensions to reduce!\n"); + }else if (reduxRequiresDst (ctx) && !ctx->dst){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "dst is NULL, but reduction requires it!\n"); + }else if (reduxRequiresDstArg(ctx) && !ctx->dstArg){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "dstArg is NULL, but reduction requires it!\n"); + }else if (ctx->dst && ctx->dst->nd +ctx->reduxLen != ctx->src->nd){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "dst is of incorrect dimensionality for this reduction!\n"); + }else if (ctx->dstArg && ctx->dstArg->nd+ctx->reduxLen != ctx->src->nd){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "dstArg is of incorrect dimensionality for this reduction!\n"); } - - - /** - * We initialize some more parts of the context, using the guarantees - * we now have about the sanity of the arguments. - */ - ctx->nds = ctx->src->nd; ctx->ndr = ctx->reduxLen; ctx->ndd = ctx->nds - ctx->ndr; - strb_ensure(&ctx->s, 3*1024); - - - /** - * And make a few small dynamic memory allocations for the benefit of the - * rest of the code, allowing error checking to happen early and fail fast. - */ - - ctx->srcAxisList = malloc(ctx->nds * sizeof(unsigned)); - ctx->dstDims = malloc(ctx->ndd * sizeof(size_t)); - if (!ctx->srcAxisList || - !ctx->dstDims ){ - return reduxCleanup(ctx, GA_MEMORY_ERROR); + ctx->ndw = 0; + ctx->ndp = 0; + ctx->ndf = 0; + ctx->ndt = ctx->ndd + 1; + + /* Insane reduxList? */ + for (i=0;indr;i++){ + j = ctx->reduxList[i]; + if (j < -ctx->nds || j >= ctx->nds){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "Insane axis number %d! Should be [%d, %d)!\n", + j, -ctx->nds, ctx->nds); + } + j = j<0 ? 
ctx->nds+j : j; + d = ctx->src->dimensions[j]; + ctx->zeroRdxAxes += !d; + ctx->prodRdxAxes *= d?d:1; } /** - * Query device for approximate total level of parallelism. If destination - * tensor is so big it can keep all threads busy on individual elements, - * use large code model; Otherwise use small code model, where threads will - * have to cooperate. - * - * - Large (Destination tensor >= SMALL_REDUX_THRESHOLD elements, or - * destination tensor size >= # of reductions per destination - * tensor element): - * All destination elements have their own thread. - * - Small (otherwise): - * Multiple threads cooperate on a single destination element. + * Insane shape? + * + * The source tensor is allowed to be empty (its shape may contain 0s). + * However, all axes that are of length 0 must be reduction axes. + * + * The reason for this is that a reduction cannot store any output into an + * empty destination tensor (whose dimensions are the free axes), because + * it has 0 space. The operation cannot then fulfill its contract. + * + * On the other hand, when some or all reduction axes of a tensor are of + * length 0, the reduction can be interpreted as initializing the + * destination tensor to the identity value of the operation. For lack of a + * better idea, the destination argument tensor can then be zeroed. */ - ret = gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_NUMPROCS, &numProcs); - if (ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); + for (i=0;inds;i++){ + d = ctx->src->dimensions[i]; + ctx->zeroAllAxes += !d; + ctx->prodAllAxes *= d?d:1; } - ret = gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_MAXLSIZE, &localSize); - if (ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); + if (ctx->zeroAllAxes != ctx->zeroRdxAxes){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "Source tensor has length-0 dimensions that are not reduced!"); } + ctx->prodFreeAxes = ctx->prodAllAxes/ctx->prodRdxAxes; - for (i=j=0;inds;i++){ - if (axisInSet(i, ctx->reduxList, ctx->nds, NULL)){ - reduxPerElem *= ctx->src->dimensions[i]; - }else{ - dstNumElem *= ctx->src->dimensions[i]; - ctx->dstDims[j++] = ctx->src->dimensions[i];; - } - } - ctx->largeCodeModel = dstNumElem >= numProcs*localSize || - dstNumElem >= reduxPerElem - || 1;/* BUG: Erase when small code model implemented. */ /** - * *** IT IS NOW SAFE TO CALL: *** - * - reduxIsLargeModel() - * - reduxIsSmallModel() - * - reduxKernelRequiresDst() - * - reduxKernelRequiresDstArg() + * GPU context non-existent, or cannot read its properties? */ - - /** - * Allocate workspaces. - * - * Certain reductions may require a workspace that isn't provided by the user. - * For instance, **when using the small code model**, argmin/argmax require - * a dst buffer, but the user didn't supply one (as he would have for - * maxandargmax/minandargmin). We must allocate and deallocate it ourselves. - * - * Otherwise we use the user-supplied buffers. 
- */ - - if (!reduxRequiresDst (ctx) && reduxKernelRequiresDst(ctx)){ - ctx->wsDst = malloc(sizeof(*ctx->wsDst)); - if (!ctx->wsDst){ - return reduxCleanup(ctx, GA_MEMORY_ERROR); - } - - ret = GpuArray_empty(ctx->wsDst, ctx->gpuCtx, ctx->dstTypeCode, - ctx->ndd, ctx->dstDims, GA_C_ORDER); - if(ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); - } - }else{ - ctx->wsDst = ctx->dst; - } - if (!reduxRequiresDstArg(ctx) && reduxKernelRequiresDstArg(ctx)){ - ctx->wsDstArg = malloc(sizeof(*ctx->wsDstArg)); - if (!ctx->wsDstArg){ - return reduxCleanup(ctx, GA_MEMORY_ERROR); - } - - ret = GpuArray_empty(ctx->wsDstArg, ctx->gpuCtx, ctx->dstArgTypeCode, - ctx->ndd, ctx->dstDims, GA_C_ORDER); - if(ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); - } - }else{ - ctx->wsDstArg = ctx->dstArg; + ctx->gpuCtx = GpuArray_context(ctx->src); + if (!ctx->gpuCtx || + gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_NUMPROCS, &ctx->numProcs) != GA_NO_ERROR || + gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_MAXLSIZE, &ctx->maxLg) != GA_NO_ERROR || + gpudata_property(ctx->src->data, GA_CTX_PROP_MAXLSIZE0, &ctx->maxLs[0]) != GA_NO_ERROR || + gpudata_property(ctx->src->data, GA_CTX_PROP_MAXLSIZE1, &ctx->maxLs[1]) != GA_NO_ERROR || + gpudata_property(ctx->src->data, GA_CTX_PROP_MAXLSIZE2, &ctx->maxLs[2]) != GA_NO_ERROR || + gpudata_property(ctx->src->data, GA_CTX_PROP_MAXGSIZE, &ctx->maxGg) != GA_NO_ERROR || + gpudata_property(ctx->src->data, GA_CTX_PROP_MAXGSIZE0, &ctx->maxGs[0]) != GA_NO_ERROR || + gpudata_property(ctx->src->data, GA_CTX_PROP_MAXGSIZE1, &ctx->maxGs[1]) != GA_NO_ERROR || + gpudata_property(ctx->src->data, GA_CTX_PROP_MAXGSIZE2, &ctx->maxGs[2]) != GA_NO_ERROR ){ + /* gpukernel_property(ctx->kernel.k, GA_KERNEL_PROP_PREFLSIZE, &warpSize); */ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "Error obtaining one or more properties from GPU context!\n"); } + ctx->warpSize = 32; + /** + * Type management. + * + * - Deal with the various typecodes. + * - Determine initializer and error out if reduction unsupported on that + * datatype. + */ - return reduxSelectHwAxes(ctx); -} - -/** - * @brief Select types for the reduction kernel's implementation. - * - * There are 5 types of relevance: - * - Source (S=Source) - * - Destination (T=Target) - * - Destination Argument (A=Arg) - * - Index (X=indeX) - * - Accumulator (K=aKKumulator/reduction) - */ - -static void reduxSelectTypes (redux_ctx* ctx){ - /* Deal with the various typecodes. 
*/ ctx->srcTypeCode = ctx->src->typecode; ctx->dstTypeCode = ctx->srcTypeCode; ctx->dstArgTypeCode = GA_SSIZE; @@ -1059,179 +1500,330 @@ static void reduxSelectTypes (redux_ctx* ctx){ case GA_HALF2: ctx->accTypeCode = GA_FLOAT2; break; - case GA_HALF4: - ctx->accTypeCode = GA_FLOAT4; + case GA_HALF4: + ctx->accTypeCode = GA_FLOAT4; + break; + case GA_HALF8: + ctx->accTypeCode = GA_FLOAT8; + break; + case GA_HALF16: + ctx->accTypeCode = GA_FLOAT16; + break; + default: + ctx->accTypeCode = ctx->srcTypeCode; + } + ctx->srcTypeStr = gpuarray_get_type(ctx->srcTypeCode) ->cluda_name; + ctx->dstTypeStr = gpuarray_get_type(ctx->dstTypeCode) ->cluda_name; + ctx->dstArgTypeStr = gpuarray_get_type(ctx->dstArgTypeCode)->cluda_name; + ctx->idxTypeStr = gpuarray_get_type(ctx->idxTypeCode) ->cluda_name; + ctx->accTypeStr = gpuarray_get_type(ctx->accTypeCode) ->cluda_name; + if (!ctx->srcTypeStr || + !ctx->dstTypeStr || + !ctx->dstArgTypeStr || + !ctx->idxTypeStr || + !ctx->accTypeStr ){ + return reduxCleanup(ctx, GA_INVALID_ERROR); + } + switch (ctx->op){ + case GA_REDUCE_SUM: + retT = reduxGetSumInit (ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetSumInit (ctx->accTypeCode, &ctx->initValK); + break; + case GA_REDUCE_PRODNZ: + case GA_REDUCE_PROD: + retT = reduxGetProdInit(ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetProdInit(ctx->accTypeCode, &ctx->initValK); + break; + case GA_REDUCE_MINANDARGMIN: + case GA_REDUCE_ARGMIN: + case GA_REDUCE_MIN: + retT = reduxGetMinInit (ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetMinInit (ctx->accTypeCode, &ctx->initValK); + break; + case GA_REDUCE_MAXANDARGMAX: + case GA_REDUCE_ARGMAX: + case GA_REDUCE_MAX: + retT = reduxGetMaxInit (ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetMaxInit (ctx->accTypeCode, &ctx->initValK); break; - case GA_HALF8: - ctx->accTypeCode = GA_FLOAT8; + case GA_REDUCE_ALL: + case GA_REDUCE_AND: + retT = reduxGetAndInit (ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetAndInit (ctx->accTypeCode, &ctx->initValK); break; - case GA_HALF16: - ctx->accTypeCode = GA_FLOAT16; + case GA_REDUCE_ANY: + case GA_REDUCE_XOR: + case GA_REDUCE_OR: + retT = reduxGetOrInit (ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetOrInit (ctx->accTypeCode, &ctx->initValK); break; default: - ctx->accTypeCode = ctx->srcTypeCode; + retT = GA_UNSUPPORTED_ERROR; + retK = GA_UNSUPPORTED_ERROR; + } + if (retT != GA_NO_ERROR){ + return reduxCleanupMsg(ctx, retT, + "Problem selecting types to be used in reduction!\n"); + } + if (retK != GA_NO_ERROR){ + return reduxCleanupMsg(ctx, retK, + "Problem selecting types to be used in reduction!\n"); } - /* Get the string version as well. */ - ctx->srcTypeStr = gpuarray_get_type(ctx->srcTypeCode) ->cluda_name; - ctx->dstTypeStr = gpuarray_get_type(ctx->dstTypeCode) ->cluda_name; - ctx->dstArgTypeStr = gpuarray_get_type(ctx->dstArgTypeCode)->cluda_name; - ctx->idxTypeStr = gpuarray_get_type(ctx->idxTypeCode) ->cluda_name; - ctx->accTypeStr = gpuarray_get_type(ctx->accTypeCode) ->cluda_name; -} - -/** - * @brief Returns whether we are using the small code model or not. - */ -static int reduxIsSmallCodeModel (redux_ctx* ctx){ - return !reduxIsLargeCodeModel(ctx); -} + /** + * Allocate and construct source-tensor axis-description lists. + * + * While constructing the descriptions of each axis, verify that: + * + * 1. reduxLen has no duplicates. + * 2. dst and/or dstArg's dimensions match src's dimensions, stripped of + * the reduction axes. 
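 *
 * For example (illustrative shapes): a src of shape (5, 3, 7) reduced over
 * reduxList = {1} requires dst and, when applicable, dstArg to have shape
 * (5, 7).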
+ */ -/** - * @brief Returns whether we are using the large code model or not. - */ + ctx->xdSrc = calloc(ctx->nds, sizeof(*ctx->xdSrc)); + ctx->xdSrcPtrs = calloc(ctx->nds, sizeof(*ctx->xdSrcPtrs)); + ctx->xdSrcFlat = calloc(ctx->nds, sizeof(*ctx->xdSrcFlat)); + ctx->xdTmp = calloc(ctx->ndt, sizeof(*ctx->xdTmp)); + if (!ctx->xdSrc || !ctx->xdSrcPtrs || !ctx->xdSrcFlat || !ctx->xdTmp){ + return reduxCleanup(ctx, GA_MEMORY_ERROR); + } + for (i=0;inds;i++){ + axisInit(&ctx->xdSrc[i], + ctx->src->dimensions[i], + ctx->src->strides[i]); + } + for (i=0;indr;i++){ + j = ctx->reduxList[i]; + j = j<0 ? ctx->nds+j : j; + a = reduxGetSrcAxis(ctx, j); + if (axisIsReduced(a)){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "Axis %d appears multiple times in the " + "reduction axis list!\n", + j); + } + axisMarkReduced(a, i); + } + for (i=j=0;inds;i++){ + axis_desc* a = reduxGetSrcAxis(ctx, i); + size_t srcLen = axisGetLen(a), dstLen, dstArgLen; + + if (axisIsReduced(a)){continue;} + if (reduxRequiresDst(ctx)){ + dstLen = ctx->dst->dimensions[j]; + + if(srcLen != dstLen){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "Source axis %d has length %zu, but " + "corresponding destination axis %d has length %zu!\n", + i, srcLen, j, dstLen); + } + + a->dstStride = ctx->dst->strides[j]; + } + if (reduxRequiresDstArg(ctx)){ + dstArgLen = ctx->dstArg->dimensions[j]; + + if(srcLen != dstArgLen){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "Source axis %d has length %zu, but " + "corresponding destination-argument axis %d has length %zu!\n", + i, srcLen, j, dstArgLen); + } + + a->dstArgStride = ctx->dstArg->strides[j]; + } + + j++; + } -static int reduxIsLargeCodeModel (redux_ctx* ctx){ - return ctx->largeCodeModel; -} -/** - * @brief Returns whether the reduction interface requires a dst argument. - */ + /** + * Begin flattening the source tensor. + */ -static int reduxRequiresDst (redux_ctx* ctx){ - switch (ctx->op){ - case GA_REDUCE_ARGMIN: - case GA_REDUCE_ARGMAX: - return 0; - default: - return 1; - } + return reduxFlattenSource(ctx); } /** - * @brief Returns whether the reduction interface requires a dstArg argument. + * @brief Flatten the source tensor as much as is practical. + * + * This makes the axis lengths as long as possible and the tensor itself as + * contiguous as possible. */ -static int reduxRequiresDstArg (redux_ctx* ctx){ - switch (ctx->op){ - case GA_REDUCE_MINANDARGMIN: - case GA_REDUCE_MAXANDARGMAX: - case GA_REDUCE_ARGMIN: - case GA_REDUCE_ARGMAX: - return 1; - default: - return 0; +static int reduxFlattenSource (redux_ctx* ctx){ + axis_desc* axis, *flatAxis, *sortAxis; + int i, j, isSensitive; + + /** + * Copy source axis descriptions list to flattened source axis description + * list, in preparation for attempts at flattening. + */ + + memcpy(ctx->xdSrcFlat, ctx->xdSrc, ctx->nds*sizeof(*ctx->xdSrcFlat)); + ctx->ndf = ctx->nds; + + /** + * Pass 1: Flatten out 0-length dimensions. We already know that + * + * a) There are no 0-length free dimensions, because that + * constitutes an invalid input, and + * b) How many 0-length reduction dimensions there are, because + * we counted them in the error-checking code. + * + * So if there are any 0-length axes, we can delete all reduction axes and + * replace them with a single one. 
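 *
 * Illustrative example: a src of shape (4, 0, 5) reduced over axes {1, 2}
 * flattens to shape (4, 0): the free axis of length 4 is kept, and the two
 * reduction axes collapse into a single zero-length reduction axis.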
+ */ + + if (ctx->zeroRdxAxes > 0){ + for (i=j=0;indf;i++){ + axis = reduxGetSrcFlatAxis(ctx, i); + + if (!axisIsReduced(axis)){ + *reduxGetSrcFlatAxis(ctx, j++) = *axis; + } + } + + axisInit (reduxGetSrcFlatAxis(ctx, j), 0, 0); + axisMarkReduced(reduxGetSrcFlatAxis(ctx, j), 0); + j++; + ctx->ndf = j; + } + + /** + * Pass 2: Flatten out 1-length dimensions, since they can always be + * ignored; They are always indexed at [0]. + */ + + for (i=j=0;indf;i++){ + axis = reduxGetSrcFlatAxis(ctx, i); + + if (axisGetLen(axis) != 1){ + *reduxGetSrcFlatAxis(ctx, j++) = *axis; + } + } + ctx->ndf = j; + + /** + * Pass 3: Flatten out continuous dimensions, where strides and sensitivity + * allows it. + */ + + isSensitive = reduxIsSensitive(ctx); + + qsort(ctx->xdSrcFlat, ctx->ndf, sizeof(*ctx->xdSrcFlat), + isSensitive ? reduxSortFlatSensitive : reduxSortFlatInsensitive); + + for (i=j=1;indf;i++){ + flatAxis = reduxGetSrcFlatAxis(ctx, j-1); + sortAxis = reduxGetSrcFlatAxis(ctx, i); + + if (!reduxTryFlattenInto(ctx, flatAxis, sortAxis)){ + *reduxGetSrcFlatAxis(ctx, j++) = *sortAxis; + } } + ctx->ndf = j; + + return reduxSelectWarpAxes(ctx); } /** - * @brief Returns whether the generated kernel internally requires a dst - * argument. - * - * This is semantically subtly different from reduxHasDst(). The main - * difference is in the implementation of the GA_REDUCE_ARGMIN/ARGMAX - * reductions; Either *might* require a dst buffer, which will have to be - * allocated, even though it will be discared. + * @brief Select the warp axes in such a way as to maximize memory bandwidth. */ -static int reduxKernelRequiresDst (redux_ctx* ctx){ - switch (ctx->op){ - case GA_REDUCE_ARGMIN: - case GA_REDUCE_ARGMAX: - return reduxIsSmallCodeModel(ctx); - default: - return 1; - } -} +static int reduxSelectWarpAxes (redux_ctx* ctx){ + axis_desc* a; + int i; + size_t aL; -/** - * @brief Returns whether the generated kernel internally requires a dstArg - * argument. - * - * This is semantically subtly different from reduxHasDstArg(), since it asks - * whether the reduction, even though it does not accept a dstArg argument, - * still requires a dstArg internally. - */ -static int reduxKernelRequiresDstArg (redux_ctx* ctx){ /** - * At present there exists no reduction whose implementation requires - * a dstArg but whose interface does not. - * - * E.g. the max() and min() reductions do NOT currently require a temporary - * buffer for indexes, and will not in the foreseeable future. + * NOTE: At this point it is possible for there to be no axes + * (ctx->ndf == 0), but this will only occur if all axes of the original + * tensor were length-1 (i.e., if this was a scalar masquerading as a + * multidimensional tensor). + * + * We check for this case and simulate a 1-dimensional, 1-length tensor. */ - return reduxRequiresDstArg(ctx); -} + if(ctx->ndf == 0){ + axisInit (reduxGetSrcFlatAxis(ctx, ctx->ndf), 1, 0); + axisMarkReduced(reduxGetSrcFlatAxis(ctx, ctx->ndf), 0); + ctx->ndf = 1; + } -/** - * @brief Check whether we can add another reduction axis or free axis - * to the hardware axis list for either the primary or secondary kernel. - */ -static int reduxCanAppendHwAxis (redux_ctx* ctx, - int kernelType, - int axisType){ - int kernelNdh = kernelType == KERNEL_PRIMARY ? ctx->pri.ndh : ctx->aux.ndh; - int kernelNdhr = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhr : ctx->aux.ndhr; - int kernelNdhd = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhd : ctx->aux.ndhd; + /** + * Select Warp Axes. 
 
 /**
- * @brief Returns whether the generated kernel internally requires a dst
- *        argument.
- *
- * This is semantically subtly different from reduxHasDst(). The main
- * difference is in the implementation of the GA_REDUCE_ARGMIN/ARGMAX
- * reductions; Either *might* require a dst buffer, which will have to be
- * allocated, even though it will be discared.
+ * @brief Select the warp axes in such a way as to maximize memory bandwidth.
  */
 
-static int        reduxKernelRequiresDst   (redux_ctx*  ctx){
-	switch (ctx->op){
-		case GA_REDUCE_ARGMIN:
-		case GA_REDUCE_ARGMAX:
-		  return reduxIsSmallCodeModel(ctx);
-		default:
-		  return 1;
-	}
-}
+static int        reduxSelectWarpAxes      (redux_ctx*  ctx){
+	axis_desc* a;
+	int        i;
+	size_t     aL;
 
-/**
- * @brief Returns whether the generated kernel internally requires a dstArg
- *        argument.
- *
- * This is semantically subtly different from reduxHasDstArg(), since it asks
- * whether the reduction, even though it does not accept a dstArg argument,
- * still requires a dstArg internally.
- */
 
-static int        reduxKernelRequiresDstArg(redux_ctx*  ctx){
 	/**
-	 * At present there exists no reduction whose implementation requires
-	 * a dstArg but whose interface does not.
-	 *
-	 * E.g. the max() and min() reductions do NOT currently require a temporary
-	 * buffer for indexes, and will not in the foreseeable future.
+	 * NOTE: At this point it is possible for there to be no axes
+	 * (ctx->ndf == 0), but this will only occur if all axes of the original
+	 * tensor were length-1 (i.e., if this was a scalar masquerading as a
+	 * multidimensional tensor).
+	 *
+	 * We check for this case and simulate a 1-dimensional, 1-length tensor.
 	 */
 
-	return reduxRequiresDstArg(ctx);
-}
+	if (ctx->ndf == 0){
+		axisInit       (reduxGetSrcFlatAxis(ctx, ctx->ndf), 1, 0);
+		axisMarkReduced(reduxGetSrcFlatAxis(ctx, ctx->ndf), 0);
+		ctx->ndf = 1;
+	}
 
-/**
- * @brief Check whether we can add another reduction axis or free axis
- *        to the hardware axis list for either the primary or secondary kernel.
- */
 
-static int        reduxCanAppendHwAxis     (redux_ctx*  ctx,
-                                            int         kernelType,
-                                            int         axisType){
-	int kernelNdh  = kernelType == KERNEL_PRIMARY ? ctx->pri.ndh  : ctx->aux.ndh;
-	int kernelNdhr = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhr : ctx->aux.ndhr;
-	int kernelNdhd = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhd : ctx->aux.ndhd;
 
+	/**
+	 * Select Warp Axes.
+	 *
+	 * Using a particular heuristic order (*), sort the axis list by
+	 * suitability for belonging to the warp. Then pick the first few axes,
+	 * until the product of their lengths exceeds the warp size.
+	 *
+	 * (*) See documentation of value-comparison function.
+	 */
 
-	if (kernelNdh >= MAX_HW_DIMS){
-		return 0;
-	}else{
-		return axisType == AXIS_REDUX ? kernelNdhr < ctx->ndr:
-		                                kernelNdhd < ctx->ndd;
+	for (i=0;i<ctx->ndf;i++){
+		ctx->xdSrcPtrs[i] = reduxGetSrcFlatAxis(ctx, i);
 	}
-}
 
-/**
- * @brief Append the largest reduction axis or free axis that isn't yet
- *        in the hardware axis list for either the primary or secondary kernel
- *        into said hardware axis list.
- */
+	qsort(ctx->xdSrcPtrs, ctx->ndf, sizeof(*ctx->xdSrcPtrs), reduxSortWarp);
 
-static void       reduxAppendLargestAxisToHwList(redux_ctx* ctx,
-                                            int         kernelType,
-                                            int         axisType){
-	int    maxI = 0, i, isInHwList, isInReduxList, isInDesiredList, isLargestSoFar;
-	int*   hwAxisList, * ndh, * ndhr, * ndhd;
-	size_t v, maxV = 0;
+	for (i=0;i<ctx->ndf;i++){
+		a  = reduxGetSrcSortAxis(ctx, i);
+		aL = axisGetLen(a);
+		if (aL <= 1){break;}
+
+		ctx->prodWarpAxes *= aL;
+		if (ctx->prodWarpAxes <= ctx->warpSize){
+			axisMarkWarp(a, aL);
+			ctx->ndw++;
+		}else{
+			/**
+			 * The product of warp lengths just exceeded warpSize. We backtrack
+			 * by undoing the multiplication by aL. We then check whether we
+			 * can "split" this axis by extracting at least a factor of 2 into
+			 * warpLen. If yes, we mark it as the (only) warp axis that is
+			 * split by setting its warpLen to something neither 0 nor len.
+			 */
+
+			ctx->prodWarpAxes /= aL;
+			aL = ctx->warpSize/ctx->prodWarpAxes;
+			if (aL >= 2){
+				axisMarkWarp(a, aL);
+				ctx->prodWarpAxes *= aL;
+				ctx->splitWarpAxis = i;
+				ctx->ndw++;
+				ctx->ndp++;
+			}
+			break;
+		}
+	}
 
-	/* Get pointers to the correct kernel's variables */
-	hwAxisList = kernelType == KERNEL_PRIMARY ? ctx->pri.axisList:
-	                                            ctx->aux.axisList;
-	ndh        = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndh:
-	                                            &ctx->aux.ndh;
-	ndhr       = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndhr:
-	                                            &ctx->aux.ndhr;
-	ndhd       = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndhd:
-	                                            &ctx->aux.ndhd;
 
-	/* Find */
-	for (i=0;i<ctx->nds;i++){
-		isInHwList      = axisInSet(i, hwAxisList,     *ndh,     0);
-		isInReduxList   = axisInSet(i, ctx->reduxList, ctx->ndr, 0);
-		isInDesiredList = axisType == AXIS_REDUX ? isInReduxList:
-		                                           !isInReduxList;
-		v               = ctx->src->dimensions[i];
-		isLargestSoFar  = v >= maxV;
+	return reduxSelectNumStages(ctx);
 }
 
-		if (!isInHwList && isInDesiredList && isLargestSoFar){
-			maxV = v;
-			maxI = i;
-		}
-	}
+/**
+ * @brief Select the number of stages of the reduction.
+ *
+ * This depends a lot on the GPU and the specific size of the reduction.
+ */
 
-	/* Append */
-	hwAxisList[(*ndh)++] = maxI;
-	if (axisType == AXIS_REDUX){
-		(*ndhr)++;
+static int        reduxSelectNumStages     (redux_ctx*  ctx){
+	size_t parallelism = 2 * ctx->numProcs * ctx->maxLg;
+
+	if (ctx->zeroRdxAxes                     || /* Reduction is empty? */
+	    ctx->prodFreeAxes > ctx->prodRdxAxes || /* Large # of destination elements? */
+	    ctx->prodFreeAxes > parallelism      ){ /* # of destination elements large enough to fill available parallelism? */
+		ctx->numStages = 1;
 	}else{
-		(*ndhd)++;
+		ctx->numStages = 2;
 	}
+
+	return reduxSelectHwAxes(ctx);
 }
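The splitting arithmetic in reduxSelectWarpAxes() above can be seen in
isolation with a small sketch (assumed values; this is not code from the
patch). With a warp size of 32 and sorted candidate axis lengths 4, 3 and 10,
the first two axes fit entirely (product 12), the third does not, and only a
factor of 32/12 = 2 of it is pulled into the warp, for a final product of 24.

#include <stddef.h>
#include <stdio.h>

/* Sketch of the accumulate-then-split logic: whole axes are taken while the
 * running product stays within warpSize; the first axis that overflows is
 * split if at least a factor of 2 can still be extracted.                    */
static size_t pickWarpLengths(const size_t* len, int n, size_t warpSize){
	size_t prod = 1, take;
	int    i;

	for (i=0;i<n;i++){
		if (len[i] <= 1){break;}
		if (prod*len[i] <= warpSize){
			prod *= len[i];            /* whole axis joins the warp        */
		}else{
			take = warpSize/prod;      /* largest extractable factor       */
			if (take >= 2){
				prod *= take;          /* axis i is split: warpLen = take  */
			}
			break;
		}
	}
	return prod;
}

int main(void){
	const size_t len[3] = {4, 3, 10};
	printf("prodWarpAxes = %zu\n", pickWarpLengths(len, 3, 32));   /* 24 */
	return 0;
}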
 
 /**
@@ -1254,7 +1846,67 @@ static void reduxAppendLargestAxisToHwList(redux_ctx* ctx,
  * largest free axes are selected.
  */
-static int        reduxSelectHwAxes        (redux_ctx*  ctx){
+static int        reduxSelectHwAxes        (redux_ctx*  ctx){
+	int ret;
+
+	ctx->srcAxisList = malloc(ctx->nds * sizeof(unsigned));
+	ctx->dstDims     = malloc(ctx->ndd * sizeof(size_t));
+	if (!ctx->srcAxisList ||
+	    !ctx->dstDims    ){
+		return reduxCleanup(ctx, GA_MEMORY_ERROR);
+	}
+
+	ctx->largeCodeModel = 1; /* BUG: Erase when small code model fixed. */
+	/**
+	 * *** IT IS NOW SAFE TO CALL: ***
+	 *       - reduxIsLargeCodeModel()
+	 *       - reduxIsSmallCodeModel()
+	 *       - reduxKernelRequiresDst()
+	 *       - reduxKernelRequiresDstArg()
+	 */
+
+
+	/**
+	 * Allocate workspaces.
+	 *
+	 * Certain reductions may require a workspace that isn't provided by the user.
+	 * For instance, **when using the small code model**, argmin/argmax require
+	 * a dst buffer, but the user didn't supply one (as they would have for
+	 * maxandargmax/minandargmin). We must allocate and deallocate it ourselves.
+	 *
+	 * Otherwise we use the user-supplied buffers.
+	 */
+
+	if (!reduxRequiresDst (ctx) && reduxKernelRequiresDst(ctx)){
+		ctx->wsDst    = malloc(sizeof(*ctx->wsDst));
+		if (!ctx->wsDst){
+			return reduxCleanup(ctx, GA_MEMORY_ERROR);
+		}
+
+		ret = GpuArray_empty(ctx->wsDst, ctx->gpuCtx, ctx->dstTypeCode,
+		                     ctx->ndd, ctx->dstDims, GA_C_ORDER);
+		if (ret != GA_NO_ERROR){
+			return reduxCleanup(ctx, ret);
+		}
+	}else{
+		ctx->wsDst    = ctx->dst;
+	}
+	if (!reduxRequiresDstArg(ctx) && reduxKernelRequiresDstArg(ctx)){
+		ctx->wsDstArg = malloc(sizeof(*ctx->wsDstArg));
+		if (!ctx->wsDstArg){
+			return reduxCleanup(ctx, GA_MEMORY_ERROR);
+		}
+
+		ret = GpuArray_empty(ctx->wsDstArg, ctx->gpuCtx, ctx->dstArgTypeCode,
+		                     ctx->ndd, ctx->dstDims, GA_C_ORDER);
+		if (ret != GA_NO_ERROR){
+			return reduxCleanup(ctx, ret);
+		}
+	}else{
+		ctx->wsDstArg = ctx->dstArg;
+	}
+
+
 	if (reduxIsLargeCodeModel(ctx)){
 		while (reduxCanAppendHwAxis        (ctx, KERNEL_PRIMARY, AXIS_FREE)){
 			reduxAppendLargestAxisToHwList(ctx, KERNEL_PRIMARY, AXIS_FREE);
@@ -1346,8 +1998,8 @@ static void reduxAppendTensorCallArgs (redux_ctx* ctx,
 static void       reduxAppendMacroDefs     (redux_ctx*  ctx){
 	int i;
 
-	srcbAppends    (&ctx->srcGen, "#define FOROVER(idx)    for(i##idx = i##idx##Start; i##idx < i##idx##End; i##idx++)\n");
-	srcbAppends    (&ctx->srcGen, "#define ESCAPE(idx)     if(i##idx >= i##idx##Dim){continue;}\n");
+	srcbAppends    (&ctx->srcGen, "#define FOROVER(idx)    for (i##idx = i##idx##Start; i##idx < i##idx##End; i##idx++)\n");
+	srcbAppends    (&ctx->srcGen, "#define ESCAPE(idx)     if (i##idx >= i##idx##Dim){continue;}\n");
 
 	/* srcVal indexer */
 	srcbAppends    (&ctx->srcGen, "#define srcVal          (*(const GLOBAL_MEM S*)(");
@@ -1471,10 +2123,10 @@ static void reduxAppendPrototype (redux_ctx* ctx){
 	reduxAppendTensorDeclArgs(ctx, "S", "src");
 	srcbAppendElemf(&ctx->srcGen, "const GLOBAL_MEM X* srcSize");
 	srcbAppendElemf(&ctx->srcGen, "const GLOBAL_MEM X* chunkSize");
-	if(reduxKernelRequiresDst(ctx)){
+	if (reduxKernelRequiresDst(ctx)){
 		reduxAppendTensorDeclArgs(ctx, "T", "dst");
 	}
-	if(reduxKernelRequiresDstArg(ctx)){
+	if (reduxKernelRequiresDstArg(ctx)){
 		reduxAppendTensorDeclArgs(ctx, "A", "dstArg");
 	}
 	srcbEndList    (&ctx->srcGen);
@@ -1519,12 +2171,12 @@ static void reduxAppendRangeCalculations (redux_ctx* ctx){
 	for (i=0;i<ctx->nds;i++){
 		strb_appendf(&ctx->s, "\ti%dSStep  = srcSteps[%d];\n", i, ctx->srcAxisList[i]);
 	}
-	if(reduxKernelRequiresDst(ctx)){
+	if (reduxKernelRequiresDst(ctx)){
 		for (i=0;i<ctx->ndd;i++){
 			strb_appendf(&ctx->s, "\ti%dDStep  = dstSteps[%d];\n", i, i);
 		}
 	}
-	if(reduxKernelRequiresDstArg(ctx)){
+	if (reduxKernelRequiresDstArg(ctx)){
 		for (i=0;i<ctx->ndd;i++){
 			strb_appendf(&ctx->s, "\ti%dAStep  = 
dstArgSteps[%d];\n", i, i); } @@ -1614,14 +2266,14 @@ static void reduxAppendLoops (redux_ctx* ctx){ case GA_REDUCE_ARGMIN: case GA_REDUCE_MINANDARGMIN: srcbAppends(&ctx->srcGen, "\t\t\trdxK = min(rdxK, k);\n" - "\t\t\tif(rdxK == k){\n" + "\t\t\tif (rdxK == k){\n" "\t\t\t\trdxA = rdxIdx;\n" "\t\t\t}\n"); break; case GA_REDUCE_ARGMAX: case GA_REDUCE_MAXANDARGMAX: srcbAppends(&ctx->srcGen, "\t\t\trdxK = max(rdxK, k);\n" - "\t\t\tif(rdxK == k){\n" + "\t\t\tif (rdxK == k){\n" "\t\t\t\trdxA = rdxIdx;\n" "\t\t\t}\n"); break; @@ -2087,14 +2739,18 @@ static int reduxInvoke (redux_ctx* ctx){ * Cleanup */ -static int reduxCleanup (redux_ctx* ctx, int ret){ +static int reduxCleanup (redux_ctx* ctx, int ret){ if (ctx->dst != ctx->wsDst){ - GpuArray_clear(ctx->wsDst); + if(ctx->wsDst){ + GpuArray_clear(ctx->wsDst); + } free(ctx->wsDst); ctx->wsDst = NULL; } if (ctx->dstArg != ctx->wsDstArg){ - GpuArray_clear(ctx->wsDstArg); + if(ctx->wsDstArg){ + GpuArray_clear(ctx->wsDstArg); + } free(ctx->wsDstArg); ctx->wsDstArg = NULL; } @@ -2124,3 +2780,20 @@ static int reduxCleanup (redux_ctx* ctx, int ret){ return ret; } + +static int reduxCleanupMsg (redux_ctx* ctx, int ret, + const char* fmt, ...){ +#if DEBUG + FILE* fp = stderr; + + va_list ap; + va_start(ap, fmt); + vfprintf(fp, fmt, ap); + va_end(ap); + fflush(fp); +#else + (void)fmt; +#endif + + return reduxCleanup(ctx, ret); +} diff --git a/tests/check_reduction.c b/tests/check_reduction.c index b4e919fcf9..94d2aac8ff 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -113,7 +113,7 @@ START_TEST(test_maxandargmax_reduction){ ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_maxandargmax(&gaMax, &gaArgmax, &gaSrc, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, &gaMax, &gaArgmax, &gaSrc, 2, reduxList)); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax) *dims[1], &gaMax)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*dims[1], &gaArgmax)); @@ -205,7 +205,7 @@ START_TEST(test_maxandargmax_idxtranspose){ ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_maxandargmax(&gaMax, &gaArgmax, &gaSrc, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, &gaMax, &gaArgmax, &gaSrc, 2, reduxList)); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax) *rdxProdDims, &gaMax)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*rdxProdDims, &gaArgmax)); @@ -294,7 +294,7 @@ START_TEST(test_maxandargmax_veryhighrank){ ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_maxandargmax(&gaMax, &gaArgmax, &gaSrc, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, &gaMax, &gaArgmax, &gaSrc, 4, reduxList)); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax) *rdxProdDims, &gaMax)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*rdxProdDims, &gaArgmax)); @@ -393,7 +393,7 @@ START_TEST(test_maxandargmax_alldimsreduced){ ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_maxandargmax(&gaMax, &gaArgmax, &gaSrc, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, &gaMax, &gaArgmax, &gaSrc, 3, reduxList)); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax), &gaMax)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax), &gaArgmax)); @@ -481,7 +481,7 @@ START_TEST(test_minandargmin_reduction){ ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_minandargmin(&gaMin, &gaArgmin, &gaSrc, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MINANDARGMIN, &gaMin, &gaArgmin, &gaSrc, 2, reduxList)); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin) *dims[1], &gaMin)); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin)*dims[1], &gaArgmin)); @@ -570,7 +570,7 @@ START_TEST(test_minandargmin_veryhighrank){ ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_minandargmin(&gaMin, &gaArgmin, &gaSrc, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MINANDARGMIN, &gaMin, &gaArgmin, &gaSrc, 4, reduxList)); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin) *rdxProdDims, &gaMin)); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin)*rdxProdDims, &gaArgmin)); @@ -669,7 +669,7 @@ START_TEST(test_minandargmin_alldimsreduced){ ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_minandargmin(&gaMin, &gaArgmin, &gaSrc, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MINANDARGMIN, &gaMin, &gaArgmin, &gaSrc, 3, reduxList)); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin), &gaMin)); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin), &gaArgmin)); @@ -754,7 +754,7 @@ START_TEST(test_argmax_reduction){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_argmax(&gaArgmax, &gaSrc, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMAX, NULL, &gaArgmax, &gaSrc, 2, reduxList)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*dims[1], &gaArgmax)); @@ -836,7 +836,7 @@ START_TEST(test_argmax_veryhighrank){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_argmax(&gaArgmax, &gaSrc, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMAX, NULL, &gaArgmax, &gaSrc, 4, reduxList)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*rdxProdDims, &gaArgmax)); @@ -929,7 +929,7 @@ START_TEST(test_argmax_alldimsreduced){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_argmax(&gaArgmax, &gaSrc, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMAX, NULL, &gaArgmax, &gaSrc, 3, reduxList)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax), &gaArgmax)); @@ -1011,7 +1011,7 @@ START_TEST(test_argmin_reduction){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_argmin(&gaArgmin, &gaSrc, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMIN, NULL, &gaArgmin, &gaSrc, 2, reduxList)); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin)*dims[1], &gaArgmin)); @@ -1093,7 +1093,7 @@ START_TEST(test_argmin_veryhighrank){ 
ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_argmin(&gaArgmin, &gaSrc, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMIN, NULL, &gaArgmin, &gaSrc, 4, reduxList)); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin)*rdxProdDims, &gaArgmin)); @@ -1186,7 +1186,7 @@ START_TEST(test_argmin_alldimsreduced){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_argmin(&gaArgmin, &gaSrc, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMIN, NULL, &gaArgmin, &gaSrc, 3, reduxList)); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin), &gaArgmin)); @@ -1265,7 +1265,7 @@ START_TEST(test_max_reduction){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_max(&gaMax, &gaSrc, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAX, &gaMax, NULL, &gaSrc, 2, reduxList)); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax) *dims[1], &gaMax)); @@ -1343,7 +1343,7 @@ START_TEST(test_max_veryhighrank){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_max(&gaMax, &gaSrc, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAX, &gaMax, NULL, &gaSrc, 4, reduxList)); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax) *rdxProdDims, &gaMax)); @@ -1431,7 +1431,7 @@ START_TEST(test_max_alldimsreduced){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_max(&gaMax, &gaSrc, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAX, &gaMax, NULL, &gaSrc, 3, reduxList)); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax), &gaMax)); @@ -1507,7 +1507,7 @@ START_TEST(test_min_reduction){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_min(&gaMin, &gaSrc, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MIN, &gaMin, NULL, &gaSrc, 2, reduxList)); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin) *dims[1], &gaMin)); @@ -1585,7 +1585,7 @@ START_TEST(test_min_veryhighrank){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_min(&gaMin, &gaSrc, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MIN, &gaMin, NULL, &gaSrc, 4, reduxList)); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin) *rdxProdDims, &gaMin)); @@ -1673,7 +1673,7 @@ START_TEST(test_min_alldimsreduced){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_min(&gaMin, &gaSrc, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MIN, &gaMin, NULL, &gaSrc, 3, reduxList)); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin), &gaMin)); @@ -1750,7 +1750,7 @@ START_TEST(test_sum_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_sum (&gaD, &gaS, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_SUM, &gaD, NULL, &gaS, 2, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -1826,7 +1826,7 @@ START_TEST(test_sum_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_sum (&gaD, &gaS, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_SUM, &gaD, NULL, &gaS, 4, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -1912,7 +1912,7 @@ START_TEST(test_sum_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_sum (&gaD, &gaS, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_SUM, &gaD, NULL, &gaS, 3, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -1986,7 +1986,7 @@ START_TEST(test_prod_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_prod (&gaD, &gaS, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_PROD, &gaD, NULL, &gaS, 2, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -2062,7 +2062,7 @@ START_TEST(test_prod_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_prod (&gaD, &gaS, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_PROD, &gaD, NULL, &gaS, 4, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -2148,7 +2148,7 @@ START_TEST(test_prod_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_prod (&gaD, &gaS, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_PROD, &gaD, NULL, &gaS, 3, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -2225,7 +2225,7 @@ START_TEST(test_prodnz_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_prodnz(&gaD, &gaS, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_PRODNZ, &gaD, NULL, &gaS, 2, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -2304,7 +2304,7 @@ START_TEST(test_prodnz_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_prodnz(&gaD, &gaS, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_PRODNZ, &gaD, NULL, &gaS, 4, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -2393,7 +2393,7 @@ START_TEST(test_prodnz_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_prodnz(&gaD, &gaS, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_PRODNZ, &gaD, NULL, &gaS, 3, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -2475,7 +2475,7 @@ START_TEST(test_and_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); - ga_assert_ok(GpuArray_and (&gaD, &gaS, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_AND, &gaD, NULL, &gaS, 2, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -2559,7 +2559,7 @@ START_TEST(test_and_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_and (&gaD, &gaS, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_AND, &gaD, NULL, &gaS, 4, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -2653,7 +2653,7 @@ START_TEST(test_and_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_and (&gaD, &gaS, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_AND, &gaD, NULL, &gaS, 3, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -2735,7 +2735,7 @@ START_TEST(test_or_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); - ga_assert_ok(GpuArray_or (&gaD, &gaS, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_OR, &gaD, NULL, &gaS, 2, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -2819,7 +2819,7 @@ START_TEST(test_or_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_or (&gaD, &gaS, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_OR, &gaD, NULL, &gaS, 4, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -2913,7 +2913,7 @@ START_TEST(test_or_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_or (&gaD, &gaS, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_OR, &gaD, NULL, &gaS, 3, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -2991,7 +2991,7 @@ START_TEST(test_xor_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); - ga_assert_ok(GpuArray_xor (&gaD, &gaS, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_XOR, &gaD, NULL, &gaS, 2, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -3071,7 +3071,7 @@ START_TEST(test_xor_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_xor (&gaD, &gaS, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_XOR, &gaD, NULL, &gaS, 4, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -3161,7 +3161,7 @@ START_TEST(test_xor_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_xor (&gaD, &gaS, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_XOR, &gaD, NULL, &gaS, 3, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -3239,7 +3239,7 @@ START_TEST(test_any_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); - ga_assert_ok(GpuArray_any (&gaD, &gaS, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ANY, &gaD, NULL, &gaS, 2, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -3319,7 +3319,7 @@ START_TEST(test_any_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_any (&gaD, &gaS, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ANY, &gaD, NULL, &gaS, 4, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -3409,7 +3409,7 @@ START_TEST(test_any_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_any (&gaD, &gaS, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ANY, &gaD, NULL, &gaS, 3, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -3487,7 +3487,7 @@ START_TEST(test_all_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); - ga_assert_ok(GpuArray_all (&gaD, &gaS, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ALL, &gaD, NULL, &gaS, 2, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -3567,7 +3567,7 @@ START_TEST(test_all_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_all (&gaD, &gaS, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ALL, &gaD, NULL, &gaS, 4, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -3657,7 +3657,7 @@ START_TEST(test_all_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_all (&gaD, &gaS, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ALL, &gaD, NULL, &gaS, 3, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD));
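
All of the call-site changes above funnel through the single GpuArray_reduction()
entry point. A minimal usage sketch, mirroring the tests (gaMax, gaArgmax, gaSum
and gaSrc are assumed to be already-initialised GpuArrays, and ga_assert_ok() is
the test suite's error-checking wrapper):

/* Reduce axes 1 and 2, keeping both the maxima and their argument indices. */
const unsigned reduxList[] = {1, 2};

ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAXANDARGMAX,
                                &gaMax,    /* dst    */
                                &gaArgmax, /* dstArg */
                                &gaSrc, 2, reduxList));

/* Reductions with a single output pass NULL for the unused argument. */
ga_assert_ok(GpuArray_reduction(GA_REDUCE_SUM,
                                &gaSum, NULL,
                                &gaSrc, 2, reduxList));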