diff --git a/src/gpuarray/array.h b/src/gpuarray/array.h index 966290be1d..bacf78ad64 100644 --- a/src/gpuarray/array.h +++ b/src/gpuarray/array.h @@ -123,6 +123,27 @@ typedef enum _ga_order { GA_F_ORDER=1 } ga_order; +/** + * Supported array reduction operations. + */ + +typedef enum _ga_reduce_op { + GA_REDUCE_SUM, /* + */ + GA_REDUCE_PROD, /* * */ + GA_REDUCE_PRODNZ, /* * (!=0) */ + GA_REDUCE_MIN, /* min() */ + GA_REDUCE_MAX, /* max() */ + GA_REDUCE_ARGMIN, /* argmin() */ + GA_REDUCE_ARGMAX, /* argmax() */ + GA_REDUCE_MINANDARGMIN, /* min(), argmin() */ + GA_REDUCE_MAXANDARGMAX, /* max(), argmax() */ + GA_REDUCE_AND, /* & */ + GA_REDUCE_OR, /* | */ + GA_REDUCE_XOR, /* ^ */ + GA_REDUCE_ALL, /* &&/all() */ + GA_REDUCE_ANY, /* ||/any() */ +} ga_reduce_op; + /** * Checks if all the specified flags are set. * @@ -614,26 +635,31 @@ GPUARRAY_PUBLIC void GpuArray_fprintf(FILE *fd, const GpuArray *a); GPUARRAY_PUBLIC int GpuArray_fdump(FILE *fd, const GpuArray *a); + /** - * @brief Computes simultaneously the maxima and the arguments of maxima over - * specified axes of the tensor. + * @brief Compute a reduction sum (+), product (*), non-zero product (* != 0), + * min, max, argmin, argmax, min-and-argmin, max-and-argmax, and (&), + * or (|), xor (^), all (&&) or any (||) over a list of axes to reduce. * - * Returns two tensors of identical shape. Both tensors' axes are a subset of - * the axes of the original tensor. The axes to be reduced are specified by - * the caller, and the maxima and arguments of maxima are computed over them. + * Returns one (in the case of min-and-argmin/max-and-argmax, two) destination + * tensors. The destination tensor(s)' axes are a strict subset of the axes of the + * source tensor. The axes to be reduced are specified by the caller, and the + * reduction is performed over these axes, which are then removed in the + * destination. * - * @param [out] dstMax The resulting tensor of maxima - * @param [out] dstArgmax the resulting tensor of arguments at maxima + * @param [out] dst The destination tensor. Has the same type as the source. + * @param [out] dstArg For argument of minima/maxima operations. Has type int64. * @param [in] src The source tensor. * @param [in] reduxLen The number of axes reduced. Must be >= 1 and * <= src->nd. * @param [in] reduxList A list of integers of length reduxLen, indicating * the axes to be reduced. The order of the axes - * matters for dstArgmax index calculations. All - * entries in the list must be unique, >= 0 and - * < src->nd. + * matters for dstArg index calculations (GpuArray_argmin, + * GpuArray_argmax, GpuArray_minandargmin, + * GpuArray_maxandargmax). All entries in the list must be + * unique, >= 0 and < src->nd. * - * For example, if a 5D-tensor is reduced with an axis + * For example, if a 5D-tensor is max-reduced with an axis * list of [3,4,1], then reduxLen shall be 3, and the * index calculation in every point shall take the form * @@ -647,11 +673,74 @@ GPUARRAY_PUBLIC int GpuArray_fdump(FILE *fd, const GpuArray *a); * code otherwise. 
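+ *
+ * As a usage sketch (allocation and error checking elided), one maximum and
+ * its argument are computed per row of a 2D matrix by reducing over axis 1:
+ *
+ *   unsigned axes[1] = {1};
+ *   GpuArray max, argmax;  // 1D, length mat.dimensions[0], pre-allocated
+ *   GpuArray_maxandargmax(&max, &argmax, &mat, 1, axes);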
*/ -GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dstMax, - GpuArray* dstArgmax, +GPUARRAY_PUBLIC int GpuArray_sum (GpuArray* dst, const GpuArray* src, unsigned reduxLen, const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_prod (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_prodnz (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_min (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_max (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_argmin (GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_argmax (GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_minandargmin(GpuArray* dst, + GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dst, + GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_and (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_or (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_xor (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_all (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_any (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_reduction (ga_reduce_op op, + GpuArray* dst, + GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); + + + + #ifdef __cplusplus } diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 12eedb24a9..d1253c9873 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -21,106 +21,613 @@ #include "util/integerfactoring.h" +/* Defines */ +#define MAX_HW_DIMS 3 + + + /* Datatypes */ -struct maxandargmax_ctx{ + +/** + * Reduction Kernel Generator. + * + * The generator produces a kernel from one of two "code models": + * - Large + * - Small + * Which one is used depends on the size of the destination tensor and the + * number of reductions for each destination element. A destination tensor + * with more than SMALL_REDUX_THRESHOLD elements or more elements than + * reductions for each element will result in use of the large code model; + * Otherwise the small code model is used. + * + * + * LARGE CODE MODEL: + * + * In the large code model, each destination element is processed by a + * single thread. + * + * Each thread begins with an initial value in a register, reads from all + * source elements contributing to the reduction, computes the result and + * writes it to the destination element. + * + * A single kernel is generated that performs prescalar transformations, the + * reduction itself, postscalar transformations and the write to global memory. + * + * + * SMALL CODE MODEL: + * + * In the small code model, each destination element is processed by + * multiple threads. 
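+ * (For instance, summing a 16 x 1048576 tensor over axis 1 leaves only 16
+ * destination elements; one thread per element would leave nearly the whole
+ * device idle, so in the small code model many threads share each of the 16
+ * reductions instead.)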
+ * + * The destination tensor is first initialized with the initial value. Then, + * one or several threads cooperate to perform the reduction atomically on each + * destination element. Lastly, postscalar transformations are applied + * in-place. + * + * Two or three kernels are generated: The initialization kernel, the main + * kernel that performs prescalar transformations and the reduction itself, and + * possibly also a postscalar transformation kernel when it is required. + * + * + * Kernel Template: + * + * The following kernel code template displays the code generated for the + * small code model. For the large code model, no pre/postRedux() kernels + * are generated (since their functionality is incorporated within the main + * redux() kernel), and no atomicRedux() function needs to be generated because + * writes to global memory are unconditional and not contended. + * + * + * //Includes + * #include + * #include + * #include + * + * + * //Typedefs: + * typedef float T; + * typedef int64_t X; + * + * + * //Initializer (in case the initial T cannot be expressed as a literal) + * static T getInitVal(void){ + * return ... + * } + * + * + * //Reduce a value into the global memory destination. + * static void atomicRedux(GLOBAL_MEM T* dst, T val){ + * ... + * } + * + * + * //Load data from source and apply pre-operations. + * static T loadVal(X i0, X i1, ..., X iN, + * const GLOBAL_MEM T* src, + * const GLOBAL_MEM X* srcSteps, + * ...?){ + * return ... + * } + * + * + * //Initialization kernel. + * KERNEL void preRedux(const GLOBAL_MEM X* srcSize, + * const GLOBAL_MEM X* chunkSize, + * GLOBAL_MEM T* dst, + * const X dstOff, + * const GLOBAL_MEM X* dstSteps){ + * //OFFSETS + * dst += dstOff; + * + * //Initialize + * dst[...] = getInitVal(); + * } + * + * + * //Reduction Kernel. + * KERNEL void redux(const GLOBAL_MEM T* src, + * const X srcOff, + * const GLOBAL_MEM X* srcSteps, + * const GLOBAL_MEM X* srcSize, + * const GLOBAL_MEM X* chunkSize, + * GLOBAL_MEM T* dst, + * const X dstOff, + * const GLOBAL_MEM X* dstSteps, + * GLOBAL_MEM X* dstArg, + * const X dstArgOff, + * const GLOBAL_MEM X* dstArgSteps){ + * //OFFSETS + * src += srcOff + * dst += dstOff + * dstArg += dstArgOff + * + * //Declare Indices + * //Compute Ranges + * + * //Define macros + * //Outer Loops + * //Inner Loops + * //Undefine macros + * } + * + * + * //Post-scalar kernel. + * KERNEL void postRedux(const GLOBAL_MEM X* srcSize, + * const GLOBAL_MEM X* chunkSize, + * GLOBAL_MEM T* dst, + * const X dstOff, + * const GLOBAL_MEM X* dstSteps){ + * //OFFSETS + * dst += dstOff; + * + * //Apply the postscalar transformation in-place + * dst[...] = ...; + * } + * + * + * Initial Reduction Values + * +--------------+-----+-----+---------+---------+-----+-----+-----+-----+-----+ + * | Type\Op | + | * | max | min | & | | | ^ | && | || | + * +--------------+-----+-----+---------+---------+-----+-----+-----+-----+-----+ + * | signed int | 0 | 1 | INT_MIN | INT_MAX | ~0 | 0 | 0 | ~0 | 0 | + * | unsigned int | 0 | 1 | 0 | ~0 | ~0 | 0 | 0 | ~0 | 0 | + * | floating | 0.0 | 1.0 | NAN | NAN | | | | | | + * +--------------+-----+-----+---------+---------+-----+-----+-----+-----+-----+ + */ + +struct redux_ctx{ /* Function Arguments. */ - GpuArray* dstMax; - GpuArray* dstArgmax; + ga_reduce_op op; + GpuArray* dst; + GpuArray* dstArg; const GpuArray* src; int reduxLen; const int* reduxList; /* General. */ - int ret; int* axisList; gpucontext* gpuCtx; /* Source code Generator.
*/ - const char* dstMaxType; - const char* dstArgmaxType; + int srcTypeCode; + int dstTypeCode; + int dstArgTypeCode; + int idxTypeCode; + int accTypeCode; + const char* srcTypeStr; + const char* dstTypeStr; + const char* dstArgTypeStr; + const char* idxTypeStr; + const char* accTypeStr; + const char* initVal; int ndd; int ndr; int nds; int ndh; + int ndhd; + int ndhr; + int largeCodeModel; strb s; char* sourceCode; + GpuKernel preKernel; GpuKernel kernel; + GpuKernel postKernel; /* Scheduler */ - int hwAxisList[3]; - size_t blockSize [3]; - size_t gridSize [3]; - size_t chunkSize [3]; + int hwAxisList[MAX_HW_DIMS]; + size_t blockSize [MAX_HW_DIMS]; + size_t gridSize [MAX_HW_DIMS]; + size_t chunkSize [MAX_HW_DIMS]; /* Invoker */ gpudata* srcStepsGD; gpudata* srcSizeGD; gpudata* chunkSizeGD; - gpudata* dstMaxStepsGD; - gpudata* dstArgmaxStepsGD; + gpudata* dstStepsGD; + gpudata* dstArgStepsGD; }; -typedef struct maxandargmax_ctx maxandargmax_ctx; +typedef struct redux_ctx redux_ctx; /* Function prototypes */ -static int axisInSet (int v, - const int* set, - size_t setLen, - size_t* where); -static void appendIdxes (strb* s, - const char* prologue, - const char* prefix, - int startIdx, - int endIdx, - const char* suffix, - const char* epilogue); -static int maxandargmaxCheckargs (maxandargmax_ctx* ctx); -static int maxandargmaxSelectHwAxes (maxandargmax_ctx* ctx); -static int maxandargmaxGenSource (maxandargmax_ctx* ctx); -static void maxandargmaxAppendKernel (maxandargmax_ctx* ctx); -static void maxandargmaxAppendTypedefs (maxandargmax_ctx* ctx); -static void maxandargmaxAppendPrototype (maxandargmax_ctx* ctx); -static void maxandargmaxAppendOffsets (maxandargmax_ctx* ctx); -static void maxandargmaxAppendIndexDeclarations(maxandargmax_ctx* ctx); -static void maxandargmaxAppendRangeCalculations(maxandargmax_ctx* ctx); -static void maxandargmaxAppendLoops (maxandargmax_ctx* ctx); -static void maxandargmaxAppendLoopMacroDefs (maxandargmax_ctx* ctx); -static void maxandargmaxAppendLoopOuter (maxandargmax_ctx* ctx); -static void maxandargmaxAppendLoopInner (maxandargmax_ctx* ctx); -static void maxandargmaxAppendLoopMacroUndefs (maxandargmax_ctx* ctx); -static void maxandargmaxComputeAxisList (maxandargmax_ctx* ctx); -static int maxandargmaxCompile (maxandargmax_ctx* ctx); -static int maxandargmaxSchedule (maxandargmax_ctx* ctx); -static int maxandargmaxInvoke (maxandargmax_ctx* ctx); -static int maxandargmaxCleanup (maxandargmax_ctx* ctx); +static int reduxGetSumInit (int typecode, const char** property); +static int reduxGetProdInit (int typecode, const char** property); +static int reduxGetMinInit (int typecode, const char** property); +static int reduxGetMaxInit (int typecode, const char** property); +static int reduxGetAndInit (int typecode, const char** property); +static int reduxGetOrInit (int typecode, const char** property); +static int axisInSet (int v, + const int* set, + size_t setLen, + size_t* where); +static void appendIdxes (strb* s, + const char* prologue, + const char* prefix, + int startIdx, + int endIdx, + const char* suffix, + const char* epilogue); +static int reduxCheckargs (redux_ctx* ctx); +static void reduxSelectTypes (redux_ctx* ctx); +static int reduxSelectModel (redux_ctx* ctx); +static int reduxIsSmallCodeModel (redux_ctx* ctx); +static int reduxIsLargeCodeModel (redux_ctx* ctx); +static int reduxHasDst (redux_ctx* ctx); +static int reduxHasDstArg (redux_ctx* ctx); +static int reduxKernelRequiresDst (redux_ctx* ctx); +static int reduxKernelRequiresDstArg (redux_ctx* 
ctx); +static int reduxCanAppendHwAxis (redux_ctx* ctx, int wantReductionAxis); +static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, int wantReductionAxis); +static int reduxSelectHwAxes (redux_ctx* ctx); +static int reduxComputeAxisList (redux_ctx* ctx); +static int reduxGenSource (redux_ctx* ctx); +static void reduxAppendSource (redux_ctx* ctx); +static void reduxAppendIncludes (redux_ctx* ctx); +static void reduxAppendTypedefs (redux_ctx* ctx); +static void reduxAppendFuncGetInitVal (redux_ctx* ctx); +static void reduxAppendFuncLoadVal (redux_ctx* ctx); +static void reduxAppendFuncReduxVal (redux_ctx* ctx); +static void reduxAppendFuncPreKernel (redux_ctx* ctx); +static void reduxAppendFuncKernel (redux_ctx* ctx); +static void reduxAppendFuncPostKernel (redux_ctx* ctx); +static void reduxAppendPrototype (redux_ctx* ctx); +static void reduxAppendOffsets (redux_ctx* ctx); +static void reduxAppendIndexDeclarations (redux_ctx* ctx); +static void reduxAppendRangeCalculations (redux_ctx* ctx); +static void reduxAppendLoops (redux_ctx* ctx); +static void reduxAppendLoopMacroDefs (redux_ctx* ctx); +static void reduxAppendLoopOuter (redux_ctx* ctx); +static void reduxAppendLoopInner (redux_ctx* ctx); +static void reduxAppendLoopMacroUndefs (redux_ctx* ctx); +static int reduxCompileLarge (redux_ctx* ctx); +static int reduxCompileSmall (redux_ctx* ctx); +static int reduxScheduleLarge (redux_ctx* ctx); +static int reduxInvokeLarge (redux_ctx* ctx); +static int reduxCleanup (redux_ctx* ctx, int ret); /* Function implementation */ -GPUARRAY_PUBLIC int GpuArray_maxandargmax (GpuArray* dstMax, - GpuArray* dstArgmax, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - maxandargmax_ctx ctxSTACK = {0}; - maxandargmax_ctx *ctx = &ctxSTACK; - - ctxSTACK.dstMax = dstMax; - ctxSTACK.dstArgmax = dstArgmax; - ctxSTACK.src = src; - ctxSTACK.reduxLen = (int)reduxLen; - ctxSTACK.reduxList = (const int*)reduxList; - - if(maxandargmaxCheckargs (ctx) == GA_NO_ERROR && - maxandargmaxSelectHwAxes(ctx) == GA_NO_ERROR && - maxandargmaxGenSource (ctx) == GA_NO_ERROR && - maxandargmaxCompile (ctx) == GA_NO_ERROR && - maxandargmaxSchedule (ctx) == GA_NO_ERROR && - maxandargmaxInvoke (ctx) == GA_NO_ERROR){ - return maxandargmaxCleanup(ctx); - }else{ - return maxandargmaxCleanup(ctx); +GPUARRAY_PUBLIC int GpuArray_sum (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_SUM, + dst, NULL, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_prod (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_PROD, + dst, NULL, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_prodnz (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_PRODNZ, + dst, NULL, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_min (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_MIN, + dst, NULL, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_max (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_MAX, + dst, NULL, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_argmin (GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_ARGMIN, 
+ NULL, dstArg, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_argmax (GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_ARGMAX, + NULL, dstArg, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_minandargmin(GpuArray* dst, + GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_MINANDARGMIN, + dst, dstArg, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dst, + GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, + dst, dstArg, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_and (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_AND, + dst, NULL, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_or (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_OR, + dst, NULL, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_xor (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_XOR, + dst, NULL, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_all (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_ALL, + dst, NULL, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_any (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_ANY, + dst, NULL, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_reduction (ga_reduce_op op, + GpuArray* dst, + GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + redux_ctx ctxSTACK = {op, dst, dstArg, src, + (int)reduxLen, (const int*)reduxList}, + *ctx = &ctxSTACK; + + return reduxCheckargs(ctx); +} + +/** + * @brief Get an expression representing a suitable initialization value for + * the given datatype and a sum-reduction operation. + * + * @param [in] typecode Typecode of the type whose initializer is to be + * requested. + * @param [out] property A pointer to a string. On return it will be set to + * the initializer expression. + * @return Zero if successful; Non-zero if the datatype is not supported. + */ + +static int reduxGetSumInit (int typecode, const char** property){ + if(typecode == GA_POINTER || + typecode == GA_BUFFER){ + return GA_UNSUPPORTED_ERROR; + } + *property = "0"; + return GA_NO_ERROR; +} + +/** + * @brief Get an expression representing a suitable initialization value for + * the given datatype and a prod-reduction operation. + * + * @param [in] typecode Typecode of the type whose initializer is to be + * requested. + * @param [out] property A pointer to a string. On return it will be set to + * the initializer expression. + * @return Zero if successful; Non-zero if the datatype is not supported. + */ + +static int reduxGetProdInit (int typecode, const char** property){ + if(typecode == GA_POINTER || + typecode == GA_BUFFER){ + return GA_UNSUPPORTED_ERROR; + } + *property = "1"; + return GA_NO_ERROR; +} + +/** + * @brief Get an expression representing a suitable initialization value for + * the given datatype and a max-reduction operation. 
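+ *        (The type's minimum value is the identity element of max(): for
+ *        every x, max(x, MIN) == x. It is therefore the safe seed for a
+ *        max-reduction, just as the maximum value returned by
+ *        reduxGetMaxInit() below seeds a min-reduction.)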
+ * + * @param [in] typecode Typecode of the type whose initializer is to be + * requested. + * @param [out] property A pointer to a string. On return it will be set to + * the initializer expression. + * @return Zero if successful; Non-zero if the datatype is not supported. + */ + +static int reduxGetMinInit (int typecode, const char** property){ + switch(typecode){ + case GA_BYTE2: + case GA_BYTE3: + case GA_BYTE4: + case GA_BYTE8: + case GA_BYTE16: + case GA_BYTE: *property = "SCHAR_MIN"; break; + case GA_SHORT2: + case GA_SHORT3: + case GA_SHORT4: + case GA_SHORT8: + case GA_SHORT16: + case GA_SHORT: *property = "SHRT_MIN"; break; + case GA_INT2: + case GA_INT3: + case GA_INT4: + case GA_INT8: + case GA_INT16: + case GA_INT: *property = "INT_MIN"; break; + case GA_LONG2: + case GA_LONG3: + case GA_LONG4: + case GA_LONG8: + case GA_LONG16: + case GA_LONG: *property = "LONG_MIN"; break; + case GA_LONGLONG: *property = "LLONG_MIN"; break; + case GA_BOOL: + case GA_UBYTE2: + case GA_UBYTE3: + case GA_UBYTE4: + case GA_UBYTE8: + case GA_UBYTE16: + case GA_UBYTE: + case GA_USHORT2: + case GA_USHORT3: + case GA_USHORT4: + case GA_USHORT8: + case GA_USHORT16: + case GA_USHORT: + case GA_UINT2: + case GA_UINT3: + case GA_UINT4: + case GA_UINT8: + case GA_UINT16: + case GA_UINT: + case GA_ULONG2: + case GA_ULONG3: + case GA_ULONG4: + case GA_ULONG8: + case GA_ULONG16: + case GA_ULONG: + case GA_ULONGLONG: + case GA_SIZE: *property = "0"; break; + case GA_HALF: + case GA_FLOAT: + case GA_DOUBLE: + case GA_QUAD: *property = "NAN"; break; + default: return GA_UNSUPPORTED_ERROR; + } + + return GA_NO_ERROR; +} + +/** + * @brief Get an expression representing a suitable initialization value for + * the given datatype and a min-reduction operation. + * + * @param [in] typecode Typecode of the type whose initializer is to be + * requested. + * @param [out] property A pointer to a string. On return it will be set to + * the initializer expression. + * @return Zero if successful; Non-zero if the datatype is not supported. 
+ */ + +static int reduxGetMaxInit (int typecode, const char** property){ + switch(typecode){ + case GA_BOOL: *property = "1"; break; + case GA_BYTE2: + case GA_BYTE3: + case GA_BYTE4: + case GA_BYTE8: + case GA_BYTE16: + case GA_BYTE: *property = "SCHAR_MAX"; break; + case GA_UBYTE2: + case GA_UBYTE3: + case GA_UBYTE4: + case GA_UBYTE8: + case GA_UBYTE16: + case GA_UBYTE: *property = "UCHAR_MAX"; break; + case GA_SHORT2: + case GA_SHORT3: + case GA_SHORT4: + case GA_SHORT8: + case GA_SHORT16: + case GA_SHORT: *property = "SHRT_MAX"; break; + case GA_USHORT2: + case GA_USHORT3: + case GA_USHORT4: + case GA_USHORT8: + case GA_USHORT16: + case GA_USHORT: *property = "USHRT_MAX"; break; + case GA_INT2: + case GA_INT3: + case GA_INT4: + case GA_INT8: + case GA_INT16: + case GA_INT: *property = "INT_MAX"; break; + case GA_UINT2: + case GA_UINT3: + case GA_UINT4: + case GA_UINT8: + case GA_UINT16: + case GA_UINT: *property = "UINT_MAX"; break; + case GA_LONG2: + case GA_LONG3: + case GA_LONG4: + case GA_LONG8: + case GA_LONG16: + case GA_LONG: *property = "LONG_MAX"; break; + case GA_ULONG2: + case GA_ULONG3: + case GA_ULONG4: + case GA_ULONG8: + case GA_ULONG16: + case GA_ULONG: *property = "ULONG_MAX"; break; + case GA_LONGLONG: *property = "LLONG_MAX"; break; + case GA_ULONGLONG: *property = "ULLONG_MAX"; break; + case GA_HALF: + case GA_FLOAT: + case GA_DOUBLE: + case GA_QUAD: *property = "NAN"; break; + default: return GA_UNSUPPORTED_ERROR; + } + + return GA_NO_ERROR; +} + +/** + * @brief Get an expression representing a suitable initialization value for + * the given datatype and an and-reduction operation. + * + * @param [in] typecode Typecode of the type whose initializer is to be + * requested. + * @param [out] property A pointer to a string. On return it will be set to + * the initializer expression. + * @return Zero if successful; Non-zero if the datatype is not supported. + */ + +static int reduxGetAndInit (int typecode, const char** property){ + if(typecode == GA_POINTER || + typecode == GA_BUFFER){ + return GA_UNSUPPORTED_ERROR; + } + *property = "~0"; + return GA_NO_ERROR; +} + +/** + * @brief Get an expression representing a suitable initialization value for + * the given datatype and an or-reduction operation. + * + * @param [in] typecode Typecode of the type whose initializer is to be + * requested. + * @param [out] property A pointer to a string. On return it will be set to + * the initializer expression. + * @return Zero if successful; Non-zero if the datatype is not supported. + */ + +static int reduxGetOrInit (int typecode, const char** property){ + if(typecode == GA_POINTER || + typecode == GA_BUFFER){ + return GA_UNSUPPORTED_ERROR; } + *property = "0"; + return GA_NO_ERROR; } /** @@ -133,10 +640,10 @@ GPUARRAY_PUBLIC int GpuArray_maxandargmax (GpuArray* dstMax, * @return Non-zero if the set is non-empty and v is in it; Zero otherwise.
*/ -static int axisInSet (int v, - const int* set, - size_t setLen, - size_t* where){ +static int axisInSet (int v, + const int* set, + size_t setLen, + size_t* where){ size_t i; for(i=0;i<setLen;i++){ if(set[i] == v){ if(where){*where = i;} return 1; } } return 0; } -static int maxandargmaxCheckargs (maxandargmax_ctx* ctx){ - int i; +static int reduxCheckargs (redux_ctx* ctx){ + int i, ret; - ctx->ret = GA_NO_ERROR; ctx->axisList = NULL; ctx->gpuCtx = NULL; - ctx->dstMaxType = ctx->dstArgmaxType = NULL; + ctx->srcTypeStr = ctx->dstTypeStr = ctx->dstArgTypeStr = + ctx->accTypeStr = ctx->idxTypeStr = NULL; + ctx->initVal = NULL; ctx->ndh = 0; + ctx->ndhd = 0; + ctx->ndhr = 0; ctx->sourceCode = NULL; + ctx->s = INIT_STRB; - ctx->hwAxisList[0] = ctx->hwAxisList[1] = ctx->hwAxisList[2] = 0; - ctx->blockSize [0] = ctx->blockSize [1] = ctx->blockSize [2] = 1; - ctx->gridSize [0] = ctx->gridSize [1] = ctx->gridSize [2] = 1; - ctx->chunkSize [0] = ctx->chunkSize [1] = ctx->chunkSize [2] = 1; + for(i=0;i<MAX_HW_DIMS;i++){ + ctx->hwAxisList[i] = 0; + ctx->blockSize [i] = 1; + ctx->gridSize [i] = 1; + ctx->chunkSize [i] = 1; + } - ctx->srcStepsGD = ctx->srcSizeGD = ctx->chunkSizeGD = - ctx->dstMaxStepsGD = ctx->dstArgmaxStepsGD = NULL; + ctx->srcStepsGD = ctx->srcSizeGD = ctx->chunkSizeGD = + ctx->dstStepsGD = ctx->dstArgStepsGD = NULL; + /* *** IT IS NOW SAFE TO CALL reduxCleanup() *** */ - /* Insane src or reduxLen? */ - if(!ctx->dstMax || !ctx->dstArgmax || !ctx->src || ctx->src->nd == 0 || - ctx->reduxLen == 0 || ctx->reduxLen > (int)ctx->src->nd){ - return ctx->ret=GA_INVALID_ERROR; + /* Insane src, reduxLen, dst or dstArg? */ + if(!ctx->src || ctx->src->nd <= 0 || ctx->reduxLen == 0 || + ctx->reduxLen > (int)ctx->src->nd){ + return reduxCleanup(ctx, GA_INVALID_ERROR); + } + if((reduxHasDst (ctx) && !ctx->dst) || + (reduxHasDstArg(ctx) && !ctx->dstArg)){ + return reduxCleanup(ctx, GA_INVALID_ERROR); } + /* Insane or duplicate list entry? */ for(i=0;i<ctx->reduxLen;i++){ if(ctx->reduxList[i] < 0 || ctx->reduxList[i] >= (int)ctx->src->nd || axisInSet(ctx->reduxList[i], ctx->reduxList, i, 0)){ - return ctx->ret=GA_INVALID_ERROR; + return reduxCleanup(ctx, GA_INVALID_ERROR); } } - /* Unknown type? */ - ctx->dstMaxType = gpuarray_get_type(ctx->src->typecode)->cluda_name; - ctx->dstArgmaxType = gpuarray_get_type(GA_SSIZE) ->cluda_name; - if(!ctx->dstMaxType || !ctx->dstArgmaxType){ - return ctx->ret=GA_INVALID_ERROR; - } /* GPU context non-existent? */ - ctx->gpuCtx = GpuArray_context(ctx->src); + ctx->gpuCtx = GpuArray_context(ctx->src); if(!ctx->gpuCtx){ - return ctx->ret=GA_INVALID_ERROR; + return reduxCleanup(ctx, GA_INVALID_ERROR); + } + + + /* Unknown type? */ + reduxSelectTypes(ctx); + if(!ctx->srcTypeStr || !ctx->dstTypeStr || !ctx->dstArgTypeStr || + !ctx->accTypeStr){ + return reduxCleanup(ctx, GA_INVALID_ERROR); + } + + + /* Determine initializer, and error out if reduction unsupported.
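+       Note the crossover: per the Initial Reduction Values table at the top
+       of this file, min() seeds from the type's maximum value and max() from
+       its minimum, i.e. from the identity element of the respective operation.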
*/ + switch(ctx->op){ + case GA_REDUCE_SUM: ret = reduxGetSumInit (ctx->accTypeCode, &ctx->initVal); break; + case GA_REDUCE_PRODNZ: + case GA_REDUCE_PROD: ret = reduxGetProdInit(ctx->accTypeCode, &ctx->initVal); break; + case GA_REDUCE_MINANDARGMIN: + case GA_REDUCE_ARGMIN: + case GA_REDUCE_MIN: ret = reduxGetMaxInit (ctx->accTypeCode, &ctx->initVal); break; + case GA_REDUCE_MAXANDARGMAX: + case GA_REDUCE_ARGMAX: + case GA_REDUCE_MAX: ret = reduxGetMinInit (ctx->accTypeCode, &ctx->initVal); break; + case GA_REDUCE_ALL: + case GA_REDUCE_AND: ret = reduxGetAndInit (ctx->accTypeCode, &ctx->initVal); break; + case GA_REDUCE_ANY: + case GA_REDUCE_XOR: + case GA_REDUCE_OR: ret = reduxGetOrInit (ctx->accTypeCode, &ctx->initVal); break; + default: ret = GA_UNSUPPORTED_ERROR; break; + } + if(ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); } @@ -256,113 +801,492 @@ static int maxandargmaxCheckargs (maxandargmax_ctx* ctx){ ctx->nds = ctx->src->nd; ctx->ndr = ctx->reduxLen; ctx->ndd = ctx->nds - ctx->ndr; + strb_ensure(&ctx->s, 5*1024); + - return ctx->ret; + + return reduxSelectModel(ctx); } /** - * @brief Select which axes (up to 3) will be assigned to hardware - * dimensions. + * @brief Select types for the reduction kernel's implementation. + * + * There are 5 types of relevance: + * - Source (S=Source) + * - Destination (T=Target) + * - Destination Argument (A=Arg) + * - Index (X=indeX) + * - Accumulator (K=aKKumulator/reduction) */ -static int maxandargmaxSelectHwAxes (maxandargmax_ctx* ctx){ - int i, j, maxI = 0; - size_t maxV; +static void reduxSelectTypes (redux_ctx* ctx){ + /* Deal with the various typecodes. */ + ctx->srcTypeCode = ctx->src->typecode; + ctx->dstTypeCode = ctx->srcTypeCode; + ctx->dstArgTypeCode = GA_SSIZE; + ctx->idxTypeCode = GA_SSIZE; + switch(ctx->srcTypeCode){ + case GA_HALF: ctx->accTypeCode = GA_FLOAT; break; + case GA_HALF2: ctx->accTypeCode = GA_FLOAT2; break; + case GA_HALF4: ctx->accTypeCode = GA_FLOAT4; break; + case GA_HALF8: ctx->accTypeCode = GA_FLOAT8; break; + case GA_HALF16: ctx->accTypeCode = GA_FLOAT16; break; + default: ctx->accTypeCode = ctx->srcTypeCode; break; + } + + /* Get the string version as well. */ + ctx->srcTypeStr = gpuarray_get_type(ctx->srcTypeCode) ->cluda_name; + ctx->dstTypeStr = gpuarray_get_type(ctx->dstTypeCode) ->cluda_name; + ctx->dstArgTypeStr = gpuarray_get_type(ctx->dstArgTypeCode)->cluda_name; + ctx->idxTypeStr = gpuarray_get_type(ctx->idxTypeCode) ->cluda_name; + ctx->accTypeStr = gpuarray_get_type(ctx->accTypeCode) ->cluda_name; +} + +/** + * @brief Select which code model will be used: + * + * - Large (Destination tensor >= SMALL_REDUX_THRESHOLD elements, or + * destination tensor size >= # of reductions per destination + * tensor element): + * All destination elements have their own thread. + * - Small (otherwise): + * Multiple threads cooperate on a single destination element. + */ + +static int reduxSelectModel (redux_ctx* ctx){ + int i, ret; + unsigned numProcs; + size_t localSize; + size_t dstNumElem = 1, reduxPerElem = 1; - ctx->ndh = ctx->ndd<3 ? ctx->ndd : 3; /** - * The ctx->hwAxisLen largest axes are selected and assigned in - * descending order to X, Y, Z. + * Query device for approximate total level of parallelism. If destination + * tensor is so big it can keep all threads busy on individual elements, + * use large code model; Otherwise use small code model, where threads will + * have to cooperate.
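+	 *
+	 * For example (hypothetical figures): with numProcs = 16 and localSize =
+	 * 1024, reducing a (1024, 1024, 64) tensor over axis 2 yields dstNumElem =
+	 * 1048576 >= 16*1024 and selects the large code model, whereas reducing it
+	 * over axes 0 and 1 yields only 64 destination elements and would select
+	 * the small code model.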
*/ + + ret = gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_NUMPROCS, &numProcs); + if(ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } + ret = gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_MAXLSIZE, &localSize); + if(ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } - for(i=0;i<ctx->ndh;i++){ - maxV = 0; - - for(j=0;j<ctx->nds;j++){ - if(!axisInSet(j, ctx->hwAxisList, i, 0) && - !axisInSet(j, ctx->reduxList, ctx->ndr, 0) && - ctx->src->dimensions[j] >= maxV){ - maxV = ctx->src->dimensions[j]; - maxI = j; - } + + /** + * Compute #elems in dst and # reductions per dst element. + */ + + for(i=0;i<ctx->nds;i++){ + if(axisInSet(i, ctx->reduxList, ctx->ndr, NULL)){ + reduxPerElem *= ctx->src->dimensions[i]; + }else{ + dstNumElem *= ctx->src->dimensions[i]; } + } + ctx->largeCodeModel = dstNumElem >= numProcs*localSize || + dstNumElem >= reduxPerElem + || 1;/* BUG: Erase when small code model implemented. */ + /** + * *** IT IS NOW SAFE TO CALL: *** + * - reduxIsLargeCodeModel() + * - reduxIsSmallCodeModel() + * - reduxKernelRequiresDst() + * - reduxKernelRequiresDstArg() + */ + + + return reduxSelectHwAxes(ctx); +} + +/** + * @brief Returns whether we are using the small code model or not. + */ - ctx->hwAxisList[i] = maxI; +static int reduxIsSmallCodeModel (redux_ctx* ctx){ + return !reduxIsLargeCodeModel(ctx); +} + +/** + * @brief Returns whether we are using the large code model or not. + */ + +static int reduxIsLargeCodeModel (redux_ctx* ctx){ + return ctx->largeCodeModel; +} + +/** + * @brief Returns whether the reduction interface requires a dst argument. + */ + +static int reduxHasDst (redux_ctx* ctx){ + switch(ctx->op){ + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: return 0; + default: return 1; } +} + +/** + * @brief Returns whether the reduction interface requires a dstArg argument. + */ - return ctx->ret=GA_NO_ERROR; +static int reduxHasDstArg (redux_ctx* ctx){ + switch(ctx->op){ + case GA_REDUCE_MINANDARGMIN: + case GA_REDUCE_MAXANDARGMAX: + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: return 1; + default: return 0; + } } /** - * @brief Generate the kernel code for MaxAndArgmax. - * - * @return GA_MEMORY_ERROR if not enough memory left; GA_NO_ERROR otherwise. + * @brief Returns whether the generated kernel internally requires a dst + * argument. + * + * This is semantically subtly different from reduxHasDst(). The main + * difference is in the implementation of the GA_REDUCE_ARGMIN/ARGMAX + * reductions; either *might* require a dst buffer, which will have to be + * allocated, even though it will be discarded. */ -static int maxandargmaxGenSource (maxandargmax_ctx* ctx){ - /* Compute internal axis remapping. */ +static int reduxKernelRequiresDst (redux_ctx* ctx){ + switch(ctx->op){ + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: return reduxIsSmallCodeModel(ctx); + default: return 1; + } +} + +/** + * @brief Returns whether the generated kernel internally requires a dstArg + * argument. + * + * This is semantically subtly different from reduxHasDstArg(), since it asks + * whether the reduction, even though it does not accept a dstArg argument, + * still requires a dstArg internally. + */ + +static int reduxKernelRequiresDstArg (redux_ctx* ctx){ + /** + * At present there exists no reduction whose implementation requires + * a dstArg but whose interface does not. + * + * E.g. the max() and min() reductions do NOT currently require a temporary + * buffer for indexes, and will not in the foreseeable future.
+ */ + + return reduxHasDstArg(ctx); +} + +/** + * @brief Check whether we can add another reduction axis + * (wantReductionAxis=1) or destination axis (wantReductionAxis=0) to + * the hardware axis list. + */ + +static int reduxCanAppendHwAxis (redux_ctx* ctx, int wantReductionAxis){ + if(ctx->ndh >= MAX_HW_DIMS){ + return 0; + }else{ + return wantReductionAxis ? ctx->ndhr < ctx->ndr: + ctx->ndhd < ctx->ndd; + } +} + +/** + * @brief Append the largest reduction axis (wantReductionAxis=1) or + * destination axis (wantReductionAxis=0) that isn't yet in the hardware + * axis list into said hardware axis list. + */ + +static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, int wantReductionAxis){ + int maxI = 0, i, isInHwList, isInReduxList, isInDesiredList, isLargestSoFar; + size_t maxV = 0; + + /* Find */ + for(i=0;i<ctx->nds;i++){ + isInHwList = axisInSet(i, ctx->hwAxisList, ctx->ndh, 0); + isInReduxList = axisInSet(i, ctx->reduxList, ctx->ndr, 0); + isInDesiredList = wantReductionAxis ? isInReduxList : !isInReduxList; + isLargestSoFar = ctx->src->dimensions[i] >= maxV; + + if(!isInHwList && isInDesiredList && isLargestSoFar){ + maxV = ctx->src->dimensions[i]; + maxI = i; + } + } + + /* Append */ + ctx->hwAxisList[ctx->ndh++] = maxI; + if(wantReductionAxis){ + ctx->ndhr++; + }else{ + ctx->ndhd++; + } +} + +/** + * @brief Select which axes (up to MAX_HW_DIMS) will be assigned to hardware + * dimensions. + * + * For the "large" code model: The up-to-MAX_HW_DIMS largest destination tensor + * dimensions are selected. + * For the "small" code model: Up to MAX_HW_DIMS reduction dimensions (largest- + * to-smallest) are selected. If fewer than + * MAX_HW_DIMS dimensions have been selected, + * destination tensor dimensions are selected until + * MAX_HW_DIMS total dimensions are selected, or no + * destination tensor dimensions are left. + */ + +static int reduxSelectHwAxes (redux_ctx* ctx){ + if(reduxIsSmallCodeModel(ctx)){ + while(reduxCanAppendHwAxis(ctx, 1)){ + reduxAppendLargestAxisToHwList(ctx, 1); + } + } + + while(reduxCanAppendHwAxis(ctx, 0)){ + reduxAppendLargestAxisToHwList(ctx, 0); + } + + return reduxComputeAxisList(ctx); +} + +/** + * @brief Compute the axis list. + * + * The axis list describes the mapping between the kernel's nested loops and + * their accompanying indices (i0*, i1*, ..., in*) on one hand, and the axes + * of the source tensor on the other. The first axis in the list corresponds + * to the outermost loop and the last axis in the list to the innermost. + * + * The first ctx->ndd axes correspond to the outer loops that iterate over + * each destination element. The last ctx->ndr axes correspond to the inner + * loops that iterate over the dimensions of elements that are to be reduced. + * + * @return GA_MEMORY_ERROR if allocating the list failed; Otherwise, returns + * GA_NO_ERROR. + */ + +static int reduxComputeAxisList (redux_ctx* ctx){ + int i, f=0; + + ctx->axisList = malloc(ctx->nds * sizeof(*ctx->axisList)); + if(!ctx->axisList){ - return ctx->ret=GA_MEMORY_ERROR; + return reduxCleanup(ctx, GA_MEMORY_ERROR); } - maxandargmaxComputeAxisList(ctx); - /* Generate kernel proper. */ - strb_ensure(&ctx->s, 5*1024); - maxandargmaxAppendKernel(ctx); - free(ctx->axisList); - ctx->axisList = NULL; + + for(i=0;i<ctx->nds;i++){ + if(!axisInSet(i, ctx->reduxList, ctx->ndr, 0)){ + ctx->axisList[f++] = i; + } + } + memcpy(&ctx->axisList[f], ctx->reduxList, ctx->ndr * sizeof(*ctx->reduxList)); + + + return reduxGenSource(ctx); +} + +/** + * @brief Generate the kernel code for the reduction.
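+ *
+ * The source is assembled by reduxAppendSource() in a fixed order: includes,
+ * typedefs, getInitVal(), loadVal(), reduxVal(), then the pre-, main and
+ * post-kernels (the pre- and post-kernel emitters are still empty stubs,
+ * pending the small code model).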
+ * + * @return GA_MEMORY_ERROR if not enough memory left; GA_NO_ERROR otherwise. + */ + +static int reduxGenSource (redux_ctx* ctx){ + reduxAppendSource(ctx); ctx->sourceCode = strb_cstr(&ctx->s); if(!ctx->sourceCode){ - return ctx->ret=GA_MEMORY_ERROR; + return reduxCleanup(ctx, GA_MEMORY_ERROR); } - - /* Return it. */ - return ctx->ret=GA_NO_ERROR; + + return reduxIsLargeCodeModel(ctx) ? reduxCompileLarge(ctx): + reduxCompileSmall(ctx); } -static void maxandargmaxAppendKernel (maxandargmax_ctx* ctx){ - maxandargmaxAppendTypedefs (ctx); - maxandargmaxAppendPrototype (ctx); - strb_appends (&ctx->s, "{\n"); - maxandargmaxAppendOffsets (ctx); - maxandargmaxAppendIndexDeclarations(ctx); - maxandargmaxAppendRangeCalculations(ctx); - maxandargmaxAppendLoops (ctx); - strb_appends (&ctx->s, "}\n"); +static void reduxAppendSource (redux_ctx* ctx){ + reduxAppendIncludes (ctx); + reduxAppendTypedefs (ctx); + reduxAppendFuncGetInitVal (ctx); + reduxAppendFuncLoadVal (ctx); + reduxAppendFuncReduxVal (ctx); + reduxAppendFuncPreKernel (ctx); + reduxAppendFuncKernel (ctx); + reduxAppendFuncPostKernel (ctx); } -static void maxandargmaxAppendTypedefs (maxandargmax_ctx* ctx){ +static void reduxAppendIncludes (redux_ctx* ctx){ + strb_appends(&ctx->s, "/* Includes */\n"); + strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "\n"); +} +static void reduxAppendTypedefs (redux_ctx* ctx){ strb_appends(&ctx->s, "/* Typedefs */\n"); - strb_appendf(&ctx->s, "typedef %s T;/* The type of the array being processed. */\n", ctx->dstMaxType); - strb_appendf(&ctx->s, "typedef %s X;/* Index type: signed 32/64-bit. */\n", ctx->dstArgmaxType); + strb_appendf(&ctx->s, "typedef %s S;/* The type of the source array. */\n", ctx->srcTypeStr); + strb_appendf(&ctx->s, "typedef %s T;/* The type of the destination array. */\n", ctx->dstTypeStr); + strb_appendf(&ctx->s, "typedef %s A;/* The type of the destination argument array. */\n", ctx->dstArgTypeStr); + strb_appendf(&ctx->s, "typedef %s X;/* The type of the indices: signed 32/64-bit. */\n", ctx->idxTypeStr); + strb_appendf(&ctx->s, "typedef %s K;/* The type of the accumulator variable. 
*/\n", ctx->accTypeStr); + strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "\n"); +} +static void reduxAppendFuncGetInitVal (redux_ctx* ctx){ + strb_appends(&ctx->s, "/**\n"); + strb_appends(&ctx->s, " * Initial value function.\n"); + strb_appends(&ctx->s, " */\n"); + strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "WITHIN_KERNEL K getInitVal(void){\n"); + strb_appendf(&ctx->s, "\treturn (%s);\n", ctx->initVal); + strb_appends(&ctx->s, "}\n"); + strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "\n"); +} +static void reduxAppendFuncLoadVal (redux_ctx* ctx){ + int i; + + strb_appends(&ctx->s, "/**\n"); + strb_appends(&ctx->s, " * Multidimensional source element loader.\n"); + strb_appends(&ctx->s, " *\n"); + strb_appends(&ctx->s, " * Also implements prescalar transformations if any.\n"); + strb_appends(&ctx->s, " */\n"); + strb_appends(&ctx->s, "\n"); + appendIdxes (&ctx->s, "WITHIN_KERNEL K loadVal(", "X i", 0, ctx->nds, "", ""); + if(ctx->nds > 0){ + strb_appends(&ctx->s, ", "); + } + strb_appends(&ctx->s, "const GLOBAL_MEM S* src, const GLOBAL_MEM X* srcSteps){\n"); + strb_appends(&ctx->s, "\tS v = (*(const GLOBAL_MEM S*)((const GLOBAL_MEM char*)src + "); + for(i=0;inds;i++){ + strb_appendf(&ctx->s, "i%d*srcSteps[%d] + \\\n\t ", i, ctx->axisList[i]); + } + strb_appends(&ctx->s, "0));\n"); + strb_appends(&ctx->s, "\treturn v;\n"); + strb_appends(&ctx->s, "}\n"); + strb_appends(&ctx->s, "\n"); strb_appends(&ctx->s, "\n"); strb_appends(&ctx->s, "\n"); +} +static void reduxAppendFuncReduxVal (redux_ctx* ctx){ + int i, anyArgsEmitted = 0; + + /* Function Signature. */ + strb_appends(&ctx->s, "/**\n"); + strb_appends(&ctx->s, " * Global memory value reduction function.\n"); + strb_appends(&ctx->s, " *\n"); + strb_appends(&ctx->s, " * Responsible for either:\n"); + strb_appends(&ctx->s, " * 1) Safe writeback of final value to memory, or\n"); + strb_appends(&ctx->s, " * 2) Safe atomic reduction of partial value into memory.\n"); + strb_appends(&ctx->s, " */\n"); + strb_appends(&ctx->s, "\n"); + appendIdxes (&ctx->s, "WITHIN_KERNEL void reduxVal(", "X i", 0, ctx->ndd, "", ""); + anyArgsEmitted = ctx->ndd>0; + if(anyArgsEmitted){ + strb_appends(&ctx->s, ", "); + } + if(reduxKernelRequiresDst (ctx)){ + anyArgsEmitted = 1; + strb_appends(&ctx->s, "GLOBAL_MEM T* dst, const GLOBAL_MEM X* dstSteps, K v"); + } + if(anyArgsEmitted){ + strb_appends(&ctx->s, ", "); + } + if(reduxKernelRequiresDstArg(ctx)){ + anyArgsEmitted = 1; + strb_appends(&ctx->s, "GLOBAL_MEM A* dstArg, const GLOBAL_MEM X* dstArgSteps, X i"); + } + strb_appends(&ctx->s, "){\n"); + + + /* Post-scalar transformations go here. */ + + + /* Write to memory. */ + if(reduxIsLargeCodeModel(ctx)){ + /* Large code model. Easy: just write out the data, since it's safe. */ + if(reduxKernelRequiresDst (ctx)){ + strb_appends(&ctx->s, "\t(*(GLOBAL_MEM T*)((GLOBAL_MEM char*)dst + "); + for(i=0;indd;i++){ + strb_appendf(&ctx->s, "i%d*dstSteps[%d] +\n\t ", i, i); + } + strb_appends(&ctx->s, "0)) = v;\n"); + } + if(reduxKernelRequiresDstArg(ctx)){ + strb_appends(&ctx->s, "\t(*(GLOBAL_MEM A*)((GLOBAL_MEM char*)dstArg + "); + for(i=0;indd;i++){ + strb_appendf(&ctx->s, "i%d*dstArgSteps[%d] +\n\t ", i, i); + } + strb_appends(&ctx->s, "0)) = i;\n"); + } + }else{ + /* BUG: Implement the atomic reduction, one or two CAS loops. 
*/ + if ( reduxKernelRequiresDst (ctx) && !reduxKernelRequiresDstArg(ctx)){ + + }else if(!reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ + + }else if( reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ + + } + } + + /* Close off function. */ + strb_appends(&ctx->s, "}\n"); + strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "\n"); +} +static void reduxAppendFuncPreKernel (redux_ctx* ctx){ + +} +static void reduxAppendFuncKernel (redux_ctx* ctx){ + reduxAppendPrototype (ctx); + strb_appends (&ctx->s, "{\n"); + reduxAppendOffsets (ctx); + reduxAppendIndexDeclarations(ctx); + reduxAppendRangeCalculations(ctx); + reduxAppendLoops (ctx); + strb_appends (&ctx->s, "}\n"); +} +static void reduxAppendFuncPostKernel (redux_ctx* ctx){ + +} +static void reduxAppendPrototype (redux_ctx* ctx){ + strb_appends(&ctx->s, "/**\n"); + strb_appends(&ctx->s, " * Reduction Kernel.\n"); + strb_appends(&ctx->s, " *\n"); + strb_appends(&ctx->s, " * Implements actual reduction operation.\n"); + strb_appends(&ctx->s, " */\n"); strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "KERNEL void redux(const GLOBAL_MEM S* src,\n"); + strb_appends(&ctx->s, " const X srcOff,\n"); + strb_appends(&ctx->s, " const GLOBAL_MEM X* srcSteps,\n"); + strb_appends(&ctx->s, " const GLOBAL_MEM X* srcSize,\n"); + strb_appends(&ctx->s, " const GLOBAL_MEM X* chunkSize,\n"); + strb_appends(&ctx->s, " GLOBAL_MEM T* dst,\n"); + strb_appends(&ctx->s, " const X dstOff,\n"); + strb_appends(&ctx->s, " const GLOBAL_MEM X* dstSteps,\n"); + strb_appends(&ctx->s, " GLOBAL_MEM A* dstArg,\n"); + strb_appends(&ctx->s, " const X dstArgOff,\n"); + strb_appends(&ctx->s, " const GLOBAL_MEM X* dstArgSteps)"); } -static void maxandargmaxAppendPrototype (maxandargmax_ctx* ctx){ - strb_appends(&ctx->s, "KERNEL void maxandargmax(const GLOBAL_MEM T* src,\n"); - strb_appends(&ctx->s, " const X srcOff,\n"); - strb_appends(&ctx->s, " const GLOBAL_MEM X* srcSteps,\n"); - strb_appends(&ctx->s, " const GLOBAL_MEM X* srcSize,\n"); - strb_appends(&ctx->s, " const GLOBAL_MEM X* chunkSize,\n"); - strb_appends(&ctx->s, " GLOBAL_MEM T* dstMax,\n"); - strb_appends(&ctx->s, " const X dstMaxOff,\n"); - strb_appends(&ctx->s, " const GLOBAL_MEM X* dstMaxSteps,\n"); - strb_appends(&ctx->s, " GLOBAL_MEM X* dstArgmax,\n"); - strb_appends(&ctx->s, " const X dstArgmaxOff,\n"); - strb_appends(&ctx->s, " const GLOBAL_MEM X* dstArgmaxSteps)"); -} -static void maxandargmaxAppendOffsets (maxandargmax_ctx* ctx){ +static void reduxAppendOffsets (redux_ctx* ctx){ strb_appends(&ctx->s, "\t/* Add offsets */\n"); - strb_appends(&ctx->s, "\tsrc = (const GLOBAL_MEM T*)((const GLOBAL_MEM char*)src + srcOff);\n"); - strb_appends(&ctx->s, "\tdstMax = (GLOBAL_MEM T*) ((GLOBAL_MEM char*) dstMax + dstMaxOff);\n"); - strb_appends(&ctx->s, "\tdstArgmax = (GLOBAL_MEM X*) ((GLOBAL_MEM char*) dstArgmax + dstArgmaxOff);\n"); + strb_appends(&ctx->s, "\tsrc = (const GLOBAL_MEM T*)((const GLOBAL_MEM char*)src + srcOff);\n"); + strb_appends(&ctx->s, "\tdst = (GLOBAL_MEM T*) ((GLOBAL_MEM char*) dst + dstOff);\n"); + strb_appends(&ctx->s, "\tdstArg = (GLOBAL_MEM X*) ((GLOBAL_MEM char*) dstArg + dstArgOff);\n"); strb_appends(&ctx->s, "\t\n"); strb_appends(&ctx->s, "\t\n"); } -static void maxandargmaxAppendIndexDeclarations(maxandargmax_ctx* ctx){ +static void reduxAppendIndexDeclarations (redux_ctx* ctx){ int i; - strb_appends(&ctx->s, "\t/* GPU kernel coordinates. Always 3D. */\n"); + strb_appends(&ctx->s, "\t/* GPU kernel coordinates. 
Always 3D in OpenCL/CUDA. */\n"); strb_appends(&ctx->s, "\tX bi0 = GID_0, bi1 = GID_1, bi2 = GID_2;\n"); strb_appends(&ctx->s, "\tX bd0 = LDIM_0, bd1 = LDIM_1, bd2 = LDIM_2;\n"); @@ -392,7 +1316,7 @@ static void maxandargmaxAppendIndexDeclarations(maxandargmax_ctx* ctx){ strb_appends(&ctx->s, "\t\n"); strb_appends(&ctx->s, "\t\n"); } -static void maxandargmaxAppendRangeCalculations(maxandargmax_ctx* ctx){ +static void reduxAppendRangeCalculations (redux_ctx* ctx){ size_t hwDim; int i; @@ -406,10 +1330,10 @@ static void maxandargmaxAppendRangeCalculations(maxandargmax_ctx* ctx){ strb_appendf(&ctx->s, "\ti%dSStep = srcSteps[%d];\n", i, ctx->axisList[i]); } for(i=0;indd;i++){ - strb_appendf(&ctx->s, "\ti%dMStep = dstMaxSteps[%d];\n", i, i); + strb_appendf(&ctx->s, "\ti%dMStep = dstSteps[%d];\n", i, i); } for(i=0;indd;i++){ - strb_appendf(&ctx->s, "\ti%dAStep = dstArgmaxSteps[%d];\n", i, i); + strb_appendf(&ctx->s, "\ti%dAStep = dstArgSteps[%d];\n", i, i); } for(i=ctx->nds-1;i>=ctx->ndd;i--){ /** @@ -425,7 +1349,7 @@ static void maxandargmaxAppendRangeCalculations(maxandargmax_ctx* ctx){ } for(i=0;inds;i++){ /** - * Up to 3 dimensions get to rely on hardware loops. + * Up to MAX_HW_DIMS dimensions get to rely on hardware loops. * The others, if any, have to use software looping beginning at 0. */ @@ -437,7 +1361,7 @@ static void maxandargmaxAppendRangeCalculations(maxandargmax_ctx* ctx){ } for(i=0;inds;i++){ /** - * Up to 3 dimensions get to rely on hardware loops. + * Up to MAX_HW_DIMS dimensions get to rely on hardware loops. * The others, if any, have to use software looping beginning at 0. */ @@ -451,17 +1375,17 @@ static void maxandargmaxAppendRangeCalculations(maxandargmax_ctx* ctx){ strb_appends(&ctx->s, "\t\n"); strb_appends(&ctx->s, "\t\n"); } -static void maxandargmaxAppendLoops (maxandargmax_ctx* ctx){ +static void reduxAppendLoops (redux_ctx* ctx){ strb_appends(&ctx->s, "\t/**\n"); strb_appends(&ctx->s, "\t * FREE LOOPS.\n"); strb_appends(&ctx->s, "\t */\n"); strb_appends(&ctx->s, "\t\n"); - maxandargmaxAppendLoopMacroDefs (ctx); - maxandargmaxAppendLoopOuter (ctx); - maxandargmaxAppendLoopMacroUndefs(ctx); + reduxAppendLoopMacroDefs (ctx); + reduxAppendLoopOuter (ctx); + reduxAppendLoopMacroUndefs(ctx); } -static void maxandargmaxAppendLoopMacroDefs (maxandargmax_ctx* ctx){ +static void reduxAppendLoopMacroDefs (redux_ctx* ctx){ int i; /** @@ -476,16 +1400,6 @@ static void maxandargmaxAppendLoopMacroDefs (maxandargmax_ctx* ctx){ strb_appends(&ctx->s, "#define ESCAPE(idx) if(i##idx >= i##idx##Dim){continue;}\n"); - /** - * SRCINDEXER Macro - */ - - appendIdxes (&ctx->s, "#define SRCINDEXER(", "i", 0, ctx->nds, "", ") (*(GLOBAL_MEM T*)((GLOBAL_MEM char*)src + "); - for(i=0;inds;i++){ - strb_appendf(&ctx->s, "i%d*i%dSStep + \\\n ", i, i); - } - strb_appends(&ctx->s, "0))\n"); - /** * RDXINDEXER Macro */ @@ -495,28 +1409,8 @@ static void maxandargmaxAppendLoopMacroDefs (maxandargmax_ctx* ctx){ strb_appendf(&ctx->s, "i%d*i%dPDim + \\\n ", i, i); } strb_appends(&ctx->s, "0)\n"); - - /** - * DSTMINDEXER Macro - */ - - appendIdxes (&ctx->s, "#define DSTMINDEXER(", "i", 0, ctx->ndd, "", ") (*(GLOBAL_MEM T*)((GLOBAL_MEM char*)dstMax + "); - for(i=0;indd;i++){ - strb_appendf(&ctx->s, "i%d*i%dMStep + \\\n ", i, i); - } - strb_appends(&ctx->s, "0))\n"); - - /** - * DSTAINDEXER Macro - */ - - appendIdxes (&ctx->s, "#define DSTAINDEXER(", "i", 0, ctx->ndd, "", ") (*(GLOBAL_MEM X*)((GLOBAL_MEM char*)dstArgmax + "); - for(i=0;indd;i++){ - strb_appendf(&ctx->s, "i%d*i%dAStep + \\\n ", i, i); - } - 
strb_appends(&ctx->s, "0))\n"); } -static void maxandargmaxAppendLoopOuter (maxandargmax_ctx* ctx){ +static void reduxAppendLoopOuter (redux_ctx* ctx){ int i; /** @@ -531,7 +1425,7 @@ static void maxandargmaxAppendLoopOuter (maxandargmax_ctx* ctx){ * Inner Loop Generation */ - maxandargmaxAppendLoopInner(ctx); + reduxAppendLoopInner(ctx); /** * Outer Loop Trailer Generation @@ -541,87 +1435,111 @@ static void maxandargmaxAppendLoopOuter (maxandargmax_ctx* ctx){ strb_appends(&ctx->s, "\t}\n"); } } -static void maxandargmaxAppendLoopInner (maxandargmax_ctx* ctx){ +static void reduxAppendLoopInner (redux_ctx* ctx){ int i; /** * Inner Loop Prologue */ - strb_appends(&ctx->s, "\t/**\n"); - strb_appends(&ctx->s, "\t * Reduction initialization.\n"); - strb_appends(&ctx->s, "\t */\n"); - strb_appends(&ctx->s, "\t\n"); - - appendIdxes (&ctx->s, "\tT maxV = SRCINDEXER(", "i", 0, ctx->ndd, "", ""); - if(ctx->ndd && ctx->ndr){strb_appends(&ctx->s, ",");} - appendIdxes (&ctx->s, "", "i", ctx->ndd, ctx->nds, "Start", ");\n"); - - appendIdxes (&ctx->s, "\tX maxI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "Start", ");\n"); - - strb_appends(&ctx->s, "\t\n"); - strb_appends(&ctx->s, "\t/**\n"); - strb_appends(&ctx->s, "\t * REDUCTION LOOPS.\n"); - strb_appends(&ctx->s, "\t */\n"); - strb_appends(&ctx->s, "\t\n"); + strb_appends(&ctx->s, "\t\t/**\n"); + strb_appends(&ctx->s, "\t\t * Reduction initialization.\n"); + strb_appends(&ctx->s, "\t\t */\n"); + strb_appends(&ctx->s, "\t\t\n"); + strb_appends(&ctx->s, "\t\tK rdxV = getInitVal();\n"); + strb_appends(&ctx->s, "\t\tX argI = 0;\n"); + strb_appends(&ctx->s, "\t\t\n"); + strb_appends(&ctx->s, "\t\t/**\n"); + strb_appends(&ctx->s, "\t\t * REDUCTION LOOPS.\n"); + strb_appends(&ctx->s, "\t\t */\n"); + strb_appends(&ctx->s, "\t\t\n"); /** * Inner Loop Header Generation */ for(i=ctx->ndd;inds;i++){ - strb_appendf(&ctx->s, "\tFOROVER(%d){ESCAPE(%d)\n", i, i); + strb_appendf(&ctx->s, "\t\tFOROVER(%d){ESCAPE(%d)\n", i, i); } /** * Inner Loop Body Generation */ - appendIdxes (&ctx->s, "\tT V = SRCINDEXER(", "i", 0, ctx->nds, "", ");\n"); - strb_appends(&ctx->s, "\t\n"); - strb_appends(&ctx->s, "\tif(V > maxV){\n"); - strb_appends(&ctx->s, "\t\tmaxV = V;\n"); - appendIdxes (&ctx->s, "\t\tmaxI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ");\n"); - strb_appends(&ctx->s, "\t}\n"); + appendIdxes (&ctx->s, "\t\t\tK v = loadVal(", "i", 0, ctx->nds, "", ""); + if(ctx->nds > 0){ + strb_appends(&ctx->s, ", "); + } + strb_appends(&ctx->s, "src, srcSteps);\n"); + strb_appends(&ctx->s, "\t\t\t\n"); + switch(ctx->op){ + case GA_REDUCE_SUM: strb_appends(&ctx->s, "\t\t\trdxV += v;\n"); break; + case GA_REDUCE_PROD: strb_appends(&ctx->s, "\t\t\trdxV *= v;\n"); break; + case GA_REDUCE_PRODNZ: strb_appends(&ctx->s, "\t\t\trdxV *= v==0 ? 
getInitVal() : v;\n"); break; + case GA_REDUCE_MIN: strb_appends(&ctx->s, "\t\t\trdxV = min(rdxV, v);\n"); break; + case GA_REDUCE_MAX: strb_appends(&ctx->s, "\t\t\trdxV = max(rdxV, v);\n"); break; + case GA_REDUCE_ARGMIN: + case GA_REDUCE_MINANDARGMIN: + strb_appends(&ctx->s, "\t\t\trdxV = min(rdxV, v);\n"); + strb_appends(&ctx->s, "\t\t\tif(v == rdxV){\n"); + appendIdxes (&ctx->s, "\t\t\t\targI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ");\n"); + strb_appends(&ctx->s, "\t\t\t}\n"); + break; + case GA_REDUCE_ARGMAX: + case GA_REDUCE_MAXANDARGMAX: + strb_appends(&ctx->s, "\t\t\trdxV = max(rdxV, v);\n"); + strb_appends(&ctx->s, "\t\t\tif(v == rdxV){\n"); + appendIdxes (&ctx->s, "\t\t\t\targI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ");\n"); + strb_appends(&ctx->s, "\t\t\t}\n"); + break; + case GA_REDUCE_AND: strb_appends(&ctx->s, "\t\t\trdxV &= v;\n"); break; + case GA_REDUCE_OR: strb_appends(&ctx->s, "\t\t\trdxV |= v;\n"); break; + case GA_REDUCE_XOR: strb_appends(&ctx->s, "\t\t\trdxV ^= v;\n"); break; + case GA_REDUCE_ALL: strb_appends(&ctx->s, "\t\t\trdxV = rdxV && v;\n"); break; + case GA_REDUCE_ANY: strb_appends(&ctx->s, "\t\t\trdxV = rdxV || v;\n"); break; + } /** * Inner Loop Trailer Generation */ for(i=ctx->ndd;inds;i++){ - strb_appends(&ctx->s, "\t}\n"); + strb_appends(&ctx->s, "\t\t}\n"); } - strb_appends(&ctx->s, "\t\n"); + strb_appends(&ctx->s, "\t\t\n"); /** * Inner Loop Epilogue Generation */ - strb_appends(&ctx->s, "\t/**\n"); - strb_appends(&ctx->s, "\t * Destination writeback.\n"); - strb_appends(&ctx->s, "\t */\n"); - strb_appends(&ctx->s, "\t\n"); - appendIdxes (&ctx->s, "\tDSTMINDEXER(", "i", 0, ctx->ndd, "", ") = maxV;\n"); - appendIdxes (&ctx->s, "\tDSTAINDEXER(", "i", 0, ctx->ndd, "", ") = maxI;\n"); + strb_appends(&ctx->s, "\t\t/**\n"); + strb_appends(&ctx->s, "\t\t * Destination writeback.\n"); + strb_appends(&ctx->s, "\t\t */\n"); + strb_appends(&ctx->s, "\t\t\n"); + if ( reduxKernelRequiresDst (ctx) && !reduxKernelRequiresDstArg(ctx)){ + appendIdxes (&ctx->s, "\t\treduxVal(", "i", 0, ctx->ndd, "", ""); + if(ctx->ndd > 0){ + strb_appends(&ctx->s, ", "); + } + strb_appends(&ctx->s, "dst, dstSteps, rdxV);\n"); + }else if(!reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ + appendIdxes (&ctx->s, "\t\treduxVal(", "i", 0, ctx->ndd, "", ""); + if(ctx->ndd > 0){ + strb_appends(&ctx->s, ", "); + } + strb_appends(&ctx->s, "dstArg, dstArgSteps, argI);\n"); + }else if( reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ + appendIdxes (&ctx->s, "\t\treduxVal(", "i", 0, ctx->ndd, "", ""); + if(ctx->ndd > 0){ + strb_appends(&ctx->s, ", "); + } + strb_appends(&ctx->s, "dst, dstSteps, rdxV, dstArg, dstArgSteps, argI);\n"); + } } -static void maxandargmaxAppendLoopMacroUndefs (maxandargmax_ctx* ctx){ +static void reduxAppendLoopMacroUndefs (redux_ctx* ctx){ strb_appends(&ctx->s, "#undef FOROVER\n"); strb_appends(&ctx->s, "#undef ESCAPE\n"); - strb_appends(&ctx->s, "#undef SRCINDEXER\n"); strb_appends(&ctx->s, "#undef RDXINDEXER\n"); - strb_appends(&ctx->s, "#undef DSTMINDEXER\n"); - strb_appends(&ctx->s, "#undef DSTAINDEXER\n"); -} -static void maxandargmaxComputeAxisList (maxandargmax_ctx* ctx){ - int i, f=0; - - for(i=0;inds;i++){ - if(axisInSet(i, ctx->reduxList, ctx->ndr, 0)){ - continue; - } - ctx->axisList[f++] = i; - } - memcpy(&ctx->axisList[f], ctx->reduxList, ctx->ndr * sizeof(*ctx->reduxList)); } /** @@ -630,59 +1548,65 @@ static void maxandargmaxComputeAxisList (maxandargmax_ctx* ctx){ * @return */ -static int maxandargmaxCompile 
 
 /**
@@ -630,59 +1548,65 @@ static void maxandargmaxComputeAxisList (maxandargmax_ctx* ctx){
  * @return
  */
 
-static int   maxandargmaxCompile              (maxandargmax_ctx* ctx){
+static int   reduxCompileLarge                (redux_ctx* ctx){
 	const int    ARG_TYPECODES[]   = {
 	    GA_BUFFER, /* src */
 	    GA_SIZE,   /* srcOff */
 	    GA_BUFFER, /* srcSteps */
 	    GA_BUFFER, /* srcSize */
 	    GA_BUFFER, /* chnkSize */
-	    GA_BUFFER, /* dstMax */
-	    GA_SIZE,   /* dstMaxOff */
-	    GA_BUFFER, /* dstMaxSteps */
-	    GA_BUFFER, /* dstArgmax */
-	    GA_SIZE,   /* dstArgmaxOff */
-	    GA_BUFFER  /* dstArgmaxSteps */
+	    GA_BUFFER, /* dst */
+	    GA_SIZE,   /* dstOff */
+	    GA_BUFFER, /* dstSteps */
+	    GA_BUFFER, /* dstArg */
+	    GA_SIZE,   /* dstArgOff */
+	    GA_BUFFER  /* dstArgSteps */
 	};
 	const size_t ARG_TYPECODES_LEN = sizeof(ARG_TYPECODES)/sizeof(*ARG_TYPECODES);
-	const char*  SRCS[1];
-
-	SRCS[0] = ctx->sourceCode;
-
-	ctx->ret = GpuKernel_init(&ctx->kernel,
+	const char*  SRCS[1]     = {ctx->sourceCode};
+	const size_t SRC_LENS[1] = {strlen(ctx->sourceCode)};
+	const size_t SRCS_LEN    = sizeof(SRCS)/sizeof(*SRCS);
+
+	int ret = GpuKernel_init(&ctx->kernel,
 	                          ctx->gpuCtx,
-	                          1,
+	                          SRCS_LEN,
 	                          SRCS,
-	                          NULL,
-	                          "maxandargmax",
+	                          SRC_LENS,
+	                          "redux",
 	                          ARG_TYPECODES_LEN,
 	                          ARG_TYPECODES,
 	                          GA_USE_CLUDA,
 	                          (char**)0);
-	free(ctx->sourceCode);
-	ctx->sourceCode = NULL;
-
-	return ctx->ret;
+	if(ret != GA_NO_ERROR){
+		return reduxCleanup(ctx, ret);
+	}else{
+		return reduxScheduleLarge(ctx);
+	}
+}
+static int   reduxCompileSmall                (redux_ctx* ctx){
+	/* BUG: Implement small code model. */
+	return reduxCompileLarge(ctx);
 }
 
 /**
  * Compute a good thread block size / grid size / software chunk size for Nvidia.
  */
 
-static int   maxandargmaxSchedule             (maxandargmax_ctx* ctx){
+static int   reduxScheduleLarge               (redux_ctx* ctx){
 	int            i;
 	size_t         warpMod;
 	size_t         bestWarpMod  = 1;
 	unsigned       bestWarpAxis = 0;
 	uint64_t       maxLg;
-	uint64_t       maxLs[3];
+	uint64_t       maxLs [MAX_HW_DIMS];
 	uint64_t       maxGg;
-	uint64_t       maxGs[3];
-	uint64_t       dims [3];
-	double         slack[3];
-	ga_factor_list factBS[3];
-	ga_factor_list factGS[3];
-	ga_factor_list factCS[3];
+	uint64_t       maxGs [MAX_HW_DIMS];
+	uint64_t       dims  [MAX_HW_DIMS];
+	double         slack [MAX_HW_DIMS];
+	ga_factor_list factBS[MAX_HW_DIMS];
+	ga_factor_list factGS[MAX_HW_DIMS];
+	ga_factor_list factCS[MAX_HW_DIMS];
 
 
 	/**
@@ -771,76 +1695,78 @@ static int maxandargmaxSchedule (maxandargmax_ctx* ctx){
 	}
 
 	/* Return. */
-	return ctx->ret=GA_NO_ERROR;
+	return reduxInvokeLarge(ctx);
 }
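/* Editor's sketch (hypothetical usage, not part of the patch): driving the
 * new public entry points from host code. `src` is assumed to be an
 * initialized 3D GpuArray; dst and dstArg are assumed pre-allocated with the
 * non-reduced shape, dst with src's type and dstArg with int64, as the
 * header documentation requires. */
#include <gpuarray/array.h>

static int example_maxandargmax(GpuArray *dst, GpuArray *dstArg,
                                const GpuArray *src){
	const unsigned reduxList[] = {0, 2};   /* reduce axes 0 and 2 */

	/* Equivalent to GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, ...). */
	return GpuArray_maxandargmax(dst, dstArg, src, 2, reduxList);
}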
 
 /**
  * Invoke the kernel.
  */
 
-static int   maxandargmaxInvoke               (maxandargmax_ctx* ctx){
+static int   reduxInvokeLarge                 (redux_ctx* ctx){
 	void* args[11];
+	int   ret;
 
 	/**
 	 * Argument Marshalling. This is the grossest thing in here.
 	 */
 
-	const int flags       = GA_BUFFER_READ_ONLY|GA_BUFFER_INIT;
-	ctx->srcStepsGD       = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t),
-	                                      ctx->src->strides, flags, 0);
-	ctx->srcSizeGD        = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t),
-	                                      ctx->src->dimensions, flags, 0);
-	ctx->chunkSizeGD      = gpudata_alloc(ctx->gpuCtx, ctx->ndh * sizeof(size_t),
-	                                      ctx->chunkSize, flags, 0);
-	ctx->dstMaxStepsGD    = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t),
-	                                      ctx->dstMax->strides, flags, 0);
-	ctx->dstArgmaxStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t),
-	                                      ctx->dstArgmax->strides, flags, 0);
+	const int flags    = GA_BUFFER_READ_ONLY|GA_BUFFER_INIT;
+	ctx->srcStepsGD    = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t),
+	                                   ctx->src->strides, flags, 0);
+	ctx->srcSizeGD     = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t),
+	                                   ctx->src->dimensions, flags, 0);
+	ctx->chunkSizeGD   = gpudata_alloc(ctx->gpuCtx, ctx->ndh * sizeof(size_t),
+	                                   ctx->chunkSize, flags, 0);
+	ctx->dstStepsGD    = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t),
+	                                   ctx->dst->strides, flags, 0);
+	ctx->dstArgStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t),
+	                                   ctx->dstArg->strides, flags, 0);
 	args[ 0] = (void*) ctx->src->data;
 	args[ 1] = (void*)&ctx->src->offset;
 	args[ 2] = (void*) ctx->srcStepsGD;
 	args[ 3] = (void*) ctx->srcSizeGD;
 	args[ 4] = (void*) ctx->chunkSizeGD;
-	args[ 5] = (void*) ctx->dstMax->data;
-	args[ 6] = (void*)&ctx->dstMax->offset;
-	args[ 7] = (void*) ctx->dstMaxStepsGD;
-	args[ 8] = (void*) ctx->dstArgmax->data;
-	args[ 9] = (void*)&ctx->dstArgmax->offset;
-	args[10] = (void*) ctx->dstArgmaxStepsGD;
-
-	if(ctx->srcStepsGD       &&
-	   ctx->srcSizeGD        &&
-	   ctx->chunkSizeGD      &&
-	   ctx->dstMaxStepsGD    &&
-	   ctx->dstArgmaxStepsGD){
-		ctx->ret = GpuKernel_call(&ctx->kernel,
-		                          ctx->ndh>0 ? ctx->ndh : 1,
-		                          ctx->gridSize,
-		                          ctx->blockSize,
-		                          0,
-		                          args);
+	args[ 5] = (void*) ctx->dst->data;
+	args[ 6] = (void*)&ctx->dst->offset;
+	args[ 7] = (void*) ctx->dstStepsGD;
+	args[ 8] = (void*) ctx->dstArg->data;
+	args[ 9] = (void*)&ctx->dstArg->offset;
+	args[10] = (void*) ctx->dstArgStepsGD;
+
+	if(ctx->srcStepsGD    &&
+	   ctx->srcSizeGD     &&
+	   ctx->chunkSizeGD   &&
+	   ctx->dstStepsGD    &&
+	   ctx->dstArgStepsGD){
+		ret = GpuKernel_call(&ctx->kernel,
+		                     ctx->ndh>0 ? ctx->ndh : 1,
+		                     ctx->gridSize,
+		                     ctx->blockSize,
+		                     0,
+		                     args);
+		return reduxCleanup(ctx, ret);
 	}else{
-		ctx->ret = GA_MEMORY_ERROR;
+		return reduxCleanup(ctx, GA_MEMORY_ERROR);
 	}
-
-	gpudata_release(ctx->srcStepsGD);
-	gpudata_release(ctx->srcSizeGD);
-	gpudata_release(ctx->chunkSizeGD);
-	gpudata_release(ctx->dstMaxStepsGD);
-	gpudata_release(ctx->dstArgmaxStepsGD);
-
-	return ctx->ret;
 }
 
 /**
  * Cleanup
  */
 
-static int   maxandargmaxCleanup              (maxandargmax_ctx* ctx){
+static int   reduxCleanup                     (redux_ctx* ctx, int ret){
 	free(ctx->axisList);
 	free(ctx->sourceCode);
-	ctx->axisList   = NULL;
-	ctx->sourceCode = NULL;
+	ctx->axisList    = NULL;
+	ctx->sourceCode  = NULL;
+
+	gpudata_release(ctx->srcStepsGD);
+	gpudata_release(ctx->srcSizeGD);
+	gpudata_release(ctx->chunkSizeGD);
+	gpudata_release(ctx->dstStepsGD);
+	gpudata_release(ctx->dstArgStepsGD);
+	ctx->srcStepsGD  = ctx->srcSizeGD     = ctx->chunkSizeGD =
+	ctx->dstStepsGD  = ctx->dstArgStepsGD = NULL;
 
-	return ctx->ret;
+	return ret;
 }
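/* Editor's sketch (assumption): a plausible CPU reference against which the
 * new minandargmin test below could check results, for dims {32,50,79}
 * reduced over axes {0,2}. Per the header docs, the flattened reduction
 * index for axis list [0,2] is i0*dims[2] + i2; the `<=` comparison
 * reproduces the kernel's last-index tie-breaking. */
#include <math.h>
#include <stddef.h>

static void ref_minandargmin(const float *pSrc, const size_t dims[3],
                             float *pMin, size_t *pArgmin){
	size_t i, j, k;
	for(j = 0; j < dims[1]; j++){
		float  m = +HUGE_VALF;
		size_t a = 0;
		for(i = 0; i < dims[0]; i++){
			for(k = 0; k < dims[2]; k++){
				float v = pSrc[(i*dims[1] + j)*dims[2] + k];
				if(v <= m){
					m = v;
					a = i*dims[2] + k;
				}
			}
		}
		pMin[j]    = m;
		pArgmin[j] = a;
	}
}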
diff --git a/tests/check_reduction.c b/tests/check_reduction.c
index 5138e5c02d..3bcfaea54e 100644
--- a/tests/check_reduction.c
+++ b/tests/check_reduction.c
@@ -67,7 +67,7 @@ static double pcgRand01(void){
  * Test cases.
  */
 
-START_TEST(test_reduction){
+START_TEST(test_maxandargmax_reduction){
 	pcgSeed(1);
 
 	/**
@@ -155,7 +155,7 @@ START_TEST(test_reduction){
 	GpuArray_clear(&gaArgmax);
 }END_TEST
-START_TEST(test_idxtranspose){
+START_TEST(test_maxandargmax_idxtranspose){
 	pcgSeed(1);
 
 	/**
@@ -247,7 +247,7 @@ START_TEST(test_idxtranspose){
 	GpuArray_clear(&gaArgmax);
 }END_TEST
-START_TEST(test_veryhighrank){
+START_TEST(test_maxandargmax_veryhighrank){
 	pcgSeed(1);
 
 	/**
@@ -348,7 +348,7 @@ START_TEST(test_veryhighrank){
 	GpuArray_clear(&gaArgmax);
 }END_TEST
-START_TEST(test_alldimsreduced){
+START_TEST(test_maxandargmax_alldimsreduced){
 	pcgSeed(1);
 
 	/**
@@ -435,16 +435,193 @@ START_TEST(test_alldimsreduced){
 	GpuArray_clear(&gaArgmax);
 }END_TEST
+START_TEST(test_minandargmin_reduction){
+	pcgSeed(1);
+
+	/**
+	 * Here we test a reduction of a random 3D tensor over its first and
+	 * third dimensions.
+	 */
+
+	size_t i,j,k;
+	size_t dims[3]  = {32,50,79};
+	size_t prodDims = dims[0]*dims[1]*dims[2];
+	const unsigned reduxList[] = {0,2};
+
+	float*  pSrc    = calloc(1, sizeof(*pSrc)    * dims[0]*dims[1]*dims[2]);
+	float*  pMin    = calloc(1, sizeof(*pMin)    * dims[1]);
+	size_t* pArgmin = calloc(1, sizeof(*pArgmin) * dims[1]);
+
+	ck_assert_ptr_ne(pSrc,    NULL);
+	ck_assert_ptr_ne(pMin,    NULL);
+	ck_assert_ptr_ne(pArgmin, NULL);
+
+
+	/**
+	 * Initialize source data.
+	 */
+
+	for(i=0;i