From 2c4a3cb2c3a9d6cc1d2d3c91ecd6274e471e4060 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Mon, 15 May 2017 14:43:26 -0400 Subject: [PATCH] Current State --- src/gpuarray/reduction.h | 91 +- src/gpuarray_reduction.c | 1827 ++++++++++++++++++++++++++------------ tests/check_reduction.c | 86 +- 3 files changed, 1309 insertions(+), 695 deletions(-) diff --git a/src/gpuarray/reduction.h b/src/gpuarray/reduction.h index 1db5664535..f6638c9a83 100644 --- a/src/gpuarray/reduction.h +++ b/src/gpuarray/reduction.h @@ -26,28 +26,26 @@ extern "C" { */ typedef enum _ga_reduce_op { - GA_REDUCE_SUM, /* + */ - GA_REDUCE_PROD, /* * */ - GA_REDUCE_PRODNZ, /* * (!=0) */ - GA_REDUCE_MIN, /* min() */ - GA_REDUCE_MAX, /* max() */ - GA_REDUCE_ARGMIN, /* argmin() */ - GA_REDUCE_ARGMAX, /* argmax() */ - GA_REDUCE_MINANDARGMIN, /* min(), argmin() */ - GA_REDUCE_MAXANDARGMAX, /* max(), argmax() */ - GA_REDUCE_AND, /* & */ - GA_REDUCE_OR, /* | */ - GA_REDUCE_XOR, /* ^ */ - GA_REDUCE_ALL, /* &&/all() */ - GA_REDUCE_ANY, /* ||/any() */ + GA_REDUCE_SUM, /* + */ + GA_REDUCE_PROD, /* * */ + GA_REDUCE_PRODNZ, /* * (!=0) */ + GA_REDUCE_MIN, /* min() */ + GA_REDUCE_MAX, /* max() */ + GA_REDUCE_ARGMIN, /* argmin() */ + GA_REDUCE_ARGMAX, /* argmax() */ + GA_REDUCE_MINANDARGMIN, /* min(), argmin() */ + GA_REDUCE_MAXANDARGMAX, /* max(), argmax() */ + GA_REDUCE_AND, /* & */ + GA_REDUCE_OR, /* | */ + GA_REDUCE_XOR, /* ^ */ + GA_REDUCE_ALL, /* &&/all() */ + GA_REDUCE_ANY, /* ||/any() */ } ga_reduce_op; /** - * @brief Compute a reduction sum (+), product (*), non-zero product (* != 0), - * min, max, argmin, argmax, min-and-argmin, max-and-argmax, and (&), - * or (|), xor (^), all (&&) or any (||) over a list of axes to reduce. + * @brief Compute a reduction over a list of axes to reduce. * * Returns one (in the case of min-and-argmin/max-and-argmax, two) destination * tensors. The destination tensor(s)' axes are a strict subset of the axes of the @@ -55,6 +53,7 @@ typedef enum _ga_reduce_op { * reduction is performed over these axes, which are then removed in the * destination. * + * @param [in] op The reduction operation to perform. * @param [out] dst The destination tensor. Has the same type as the source. * @param [out] dstArg For argument of minima/maxima operations. Has type int64. * @param [in] src The source tensor. @@ -81,64 +80,6 @@ typedef enum _ga_reduce_op { * code otherwise. 
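 *
 * Example (an illustrative sketch; the array names and shapes below are
 * placeholders, not part of this header): a max() reduction over axes 1
 * and 3 of a 4-D source tensor can be requested as
 *
 *     const unsigned reduxList[2] = {1, 3};
 *     int err = GpuArray_reduction(GA_REDUCE_MAX, &dst, NULL, &src,
 *                                  2, reduxList);
 *
 * where dst has the shape of src with axes 1 and 3 removed, and dstArg is
 * NULL because max() produces no argument tensor.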
*/ -GPUARRAY_PUBLIC int GpuArray_sum (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_prod (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_prodnz (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_min (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_max (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_argmin (GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_argmax (GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_minandargmin(GpuArray* dst, - GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dst, - GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_and (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_or (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_xor (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_all (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_any (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); GPUARRAY_PUBLIC int GpuArray_reduction (ga_reduce_op op, GpuArray* dst, GpuArray* dstArg, diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index c9ccae66c4..8144900613 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "gpuarray/config.h" #include #include @@ -33,47 +34,96 @@ /* Datatypes */ +/** + * @brief Axis Description. + */ + +struct axis_desc{ + int reduxNum; + unsigned isReduced : 1; + unsigned isHW : 1; + unsigned isSW : 1; + size_t warpLen; + size_t len; + ssize_t srcStride, srcOffset; + ssize_t dstStride, dstOffset; + ssize_t dstArgStride, dstArgOffset; + ssize_t tmpDstStride, tmpDstOffset; + ssize_t tmpDstArgStride, tmpDstArgOffset; +}; +typedef struct axis_desc axis_desc; + /** * Reduction Kernel Generator. - * - * The generator produces a kernel from one of two "code models": - * - Large - * - Small - * Which one is used depends on the size of the destination tensor and the - * number of reductions for each destination element. A destination tensor - * with more than SMALL_REDUX_THRESHOLD elements or more elements than - * reductions for each element will result in use of the large code model; - * Otherwise the small code model is used. - * - * - * LARGE CODE MODEL: - * - * In the large code model, each destination element is processed by a - * single thread. - * - * Each thread begins with an initial value in a register, reads from all - * source elements contributing to the reduction, computes the result and - * writes it to the destination element. 
- * - * A single kernel is generated that performs prescalar transformations, the - * reduction itself, postscalar transformations and the write to global memory. - * - * - * SMALL CODE MODEL: - * - * In the small code model, each destination element is processed by - * multiple threads. - * - * The destination tensor is first initialized with the initial value. Then, - * one several threads cooperate to perform the reduction atomically on each - * destination element. Lastly, postscalar transformations are applied - * in-place. - * - * Two or three kernels are generated: The initialization kernel, the main - * kernel that performs prescalar transformations and the reduction itself, and - * possibly also a postscalar transformation kernel when it is required. - * - * + * + * INTRO + * + * Generates the source code for a reduction kernel over arbitrarily-dimensioned, + * -shaped and -typed tensors. + * + * + * GOALS + * + * The generator has the following goals: + * + * 1. Maximizing the use of coalesced memory loads within a warp. + * 2. Maximizing the # of useful threads within a warp. + * 3. Maximizing the number of warps within a block. + * + * NOTE: It is possible to guarantee for any tensor problem of at least + * 2*WARP_SIZE in scale that either + * 1. All warp blocks in the X dimension have more than 50% threads + * active 100% of the time, or + * 2. The warp blocks in the X dimension have 100% threads active more + * than 50% of the time. + * + * 4. Ensuring there are no more blocks than are permitted by the warp + * configuration and 2nd-stage workspace size (if required). + * 5. Ensuring there are no more than 5 blocks per multiprocessor. + * 6. Minimizing the 2nd-stage workspace (if it is required). + * 7. Striding the 2nd-stage workspace for maximum convenience (if it is + * required). Make it contiguous. + * + * + * NOTES + * + * Information elements required to perform reduction. + * + * 1. Ndim, shape and dtype of src tensor + * 2. Ndim, shape and dtype of dst/dstArg tensors + * 3. GPU context + * 4. Number of processors + * 5. Warp size + * 6. Maximum size of block + * 7. Maximum size of block dimension X, Y, Z + * 8. Maximum size of grid + * 9. Maximum size of grid dimension X, Y, Z + * 10. Dtype and initializer of accumulator + * 11. Sorted src axes for contiguous memory accesses + * 12. Ndim, shape and dtype of flattened src tensor + * 13. Number of stages (1 or 2) + * 14. Ndim, shape and dtype of workspace tensor + * 15. Warp axes + * 16. Hardware axes + * 17. Software axes + * 18. Source code + * + * Rationale for dependencies: + * + * 1) Get the GPU context and its properties immediately, since an invalid + * context is a likely error and we want to fail fast. + * 2) The type and initializer of the accumulator should be determined after + * the context's properties have been retrieved since they provide + * information about the device's natively-supported types and operations. + * + * REFERENCES + * + * http://lpgpu.org/wp/wp-content/uploads/2013/05/poster_andresch_acaces2014.pdf + * + * + * + * + * * Kernel Template: * * The following kernel code template displays the code generated for the @@ -200,11 +250,41 @@ struct redux_ctx{ const int* reduxList; /* General. 
*/ + int nds; /* # Source dimensions */ + int ndr; /* # Reduced dimensions */ + int ndd; /* # Destination dimensions */ + int ndw; /* # Warp dimensions */ + int ndp; /* # Partial warp dimensions */ + int ndf; /* # Flattened source dimensions */ + int ndt; /* # Temporary workspace dimensions */ + int zeroAllAxes; /* # of zero-length axes in source tensor */ + int zeroRdxAxes; /* # of zero-length reduction axes in source tensor */ + size_t prodAllAxes; /* Product of length of all axes in source tensor */ + size_t prodRdxAxes; /* Product of length of all reduction axes in source tensor */ + size_t prodFreeAxes; /* Product of length of all free axes in source tensor */ + size_t prodWarpAxes; /* Number of active threads per warp. Strictly <= warpSize. */ + int splitWarpAxis;/* Index of the split warp axis within the source tensor's shape; -1 otherwise. */ + + gpucontext* gpuCtx; + unsigned numProcs; + size_t warpSize; + size_t maxLg; + size_t maxLs[MAX_HW_DIMS]; + size_t maxGg; + size_t maxGs[MAX_HW_DIMS]; + + axis_desc* xdSrc; + axis_desc* xdSrcFlat; + axis_desc* xdTmp; + + axis_desc** xdSrcPtrs; + + int numStages; + GpuArray* wsDst; GpuArray* wsDstArg; int* srcAxisList; size_t* dstDims; - gpucontext* gpuCtx; /* Source code Generator. */ int srcTypeCode; @@ -219,9 +299,6 @@ struct redux_ctx{ const char* accTypeStr; const char* initValT; const char* initValK; - int ndd; - int ndr; - int nds; int largeCodeModel; strb s; srcb srcGen; @@ -269,186 +346,133 @@ typedef struct redux_ctx redux_ctx; -/* Function prototypes */ -static int reduxGetSumInit (int typecode, const char** property); -static int reduxGetProdInit (int typecode, const char** property); -static int reduxGetMinInit (int typecode, const char** property); -static int reduxGetMaxInit (int typecode, const char** property); -static int reduxGetAndInit (int typecode, const char** property); -static int reduxGetOrInit (int typecode, const char** property); -static int axisInSet (int v, - const int* set, - size_t setLen, - size_t* where); -static void appendIdxes (strb* s, - const char* prologue, - const char* prefix, - int startIdx, - int endIdx, - const char* suffix, - const char* epilogue); -static int reduxCheckargs (redux_ctx* ctx); -static void reduxSelectTypes (redux_ctx* ctx); -static int reduxIsSmallCodeModel (redux_ctx* ctx); -static int reduxIsLargeCodeModel (redux_ctx* ctx); -static int reduxRequiresDst (redux_ctx* ctx); -static int reduxRequiresDstArg (redux_ctx* ctx); -static int reduxKernelRequiresDst (redux_ctx* ctx); -static int reduxKernelRequiresDstArg (redux_ctx* ctx); -static int reduxCanAppendHwAxis (redux_ctx* ctx, - int kernelType, - int axisType); -static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, - int kernelType, - int axisType); -static int reduxSelectHwAxes (redux_ctx* ctx); -static int reduxComputeAxisList (redux_ctx* ctx); -static int reduxGenSource (redux_ctx* ctx); -static void reduxAppendSource (redux_ctx* ctx); -static void reduxAppendTensorDeclArgs (redux_ctx* ctx, - const char* type, - const char* baseName); -static void reduxAppendTensorCallArgs (redux_ctx* ctx, - const char* baseName); -static void reduxAppendMacroDefs (redux_ctx* ctx); -static void reduxAppendTypedefs (redux_ctx* ctx); -static void reduxAppendGetInitValFns (redux_ctx* ctx); -static void reduxAppendWriteBackFn (redux_ctx* ctx); -static void reduxAppendReduxKernel (redux_ctx* ctx); -static void reduxAppendPrototype (redux_ctx* ctx); -static void reduxAppendIndexDeclarations (redux_ctx* ctx); -static void 
reduxAppendRangeCalculations (redux_ctx* ctx); -static void reduxAppendLoops (redux_ctx* ctx); -static void reduxAppendInitKernel (redux_ctx* ctx); -static void reduxAppendPostKernel (redux_ctx* ctx); -static int reduxCompile (redux_ctx* ctx); -static int reduxSchedule (redux_ctx* ctx); -static void reduxScheduleKernel (int ndims, - uint64_t* dims, - uint64_t warpSize, - uint64_t maxLg, - uint64_t* maxLs, - uint64_t maxGg, - uint64_t* maxGs, - uint64_t* bs, - uint64_t* gs, - uint64_t* cs); -static int reduxInvoke (redux_ctx* ctx); -static int reduxCleanup (redux_ctx* ctx, int ret); +/* Static Function prototypes */ +/* Utilities */ +static int reduxGetSumInit (int typecode, const char** property); +static int reduxGetProdInit (int typecode, const char** property); +static int reduxGetMinInit (int typecode, const char** property); +static int reduxGetMaxInit (int typecode, const char** property); +static int reduxGetAndInit (int typecode, const char** property); +static int reduxGetOrInit (int typecode, const char** property); +static int reduxSortFlatSensitive (const void* a, const void* b); +static int reduxSortFlatInsensitive (const void* a, const void* b); +static int reduxSortWarp (const void* a, const void* b); +static int axisInSet (int v, + const int* set, + size_t setLen, + size_t* where); +static void appendIdxes (strb* s, + const char* prologue, + const char* prefix, + int startIdx, + int endIdx, + const char* suffix, + const char* epilogue); + +/* Axis Description API */ +static void axisInit (axis_desc* axis, + ssize_t len, + ssize_t srcStride); +static void axisMarkReduced (axis_desc* axis, int reduxNum); +static void axisMarkWarp (axis_desc* axis, size_t partialSlice); +static int axisGetReduxNum (const axis_desc* axis); +static size_t axisGetLen (const axis_desc* axis); +static ssize_t axisGetSrcStride (const axis_desc* axis); +static size_t axisGetSrcAbsStride (const axis_desc* axis); +static ssize_t axisGetSrcOffset (const axis_desc* axis); +static ssize_t axisGetDstStride (const axis_desc* axis); +static size_t axisGetDstAbsStride (const axis_desc* axis); +static ssize_t axisGetDstOffset (const axis_desc* axis); +static ssize_t axisGetDstArgStride (const axis_desc* axis); +static size_t axisGetDstArgAbsStride (const axis_desc* axis); +static ssize_t axisGetDstArgOffset (const axis_desc* axis); +static int axisIsReduced (const axis_desc* axis); +static int axisIsWarp (const axis_desc* axis); +static int axisIsPartialWarp (const axis_desc* axis); + +/* Reduction Context API */ +/* Utilities */ +static int reduxRequiresDst (const redux_ctx* ctx); +static int reduxRequiresDstArg (const redux_ctx* ctx); +static int reduxKernelRequiresDst (const redux_ctx* ctx); +static int reduxKernelRequiresDstArg (const redux_ctx* ctx); +static int reduxIsSensitive (const redux_ctx* ctx); +static int reduxIsSmallCodeModel (const redux_ctx* ctx); +static int reduxIsLargeCodeModel (const redux_ctx* ctx); +static axis_desc* reduxGetSrcAxis (const redux_ctx* ctx, int i); +static axis_desc* reduxGetSrcSortAxis (const redux_ctx* ctx, int i); +static axis_desc* reduxGetSrcFlatAxis (const redux_ctx* ctx, int i); +static int reduxTryFlattenInto (const redux_ctx* ctx, + axis_desc* into, + const axis_desc* from); +static int reduxCanAppendHwAxis (redux_ctx* ctx, + int kernelType, + int axisType); +static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, + int kernelType, + int axisType); +/* Control Flow */ +static int reduxInit (redux_ctx* ctx); +static int reduxInferProperties (redux_ctx* ctx); 
+static int reduxFlattenSource (redux_ctx* ctx); +static int reduxSelectWarpAxes (redux_ctx* ctx); +static int reduxSelectNumStages (redux_ctx* ctx); +static int reduxSelectHwAxes (redux_ctx* ctx); +static int reduxComputeAxisList (redux_ctx* ctx); +static int reduxGenSource (redux_ctx* ctx); +static void reduxAppendSource (redux_ctx* ctx); +static void reduxAppendTensorDeclArgs (redux_ctx* ctx, + const char* type, + const char* baseName); +static void reduxAppendTensorCallArgs (redux_ctx* ctx, + const char* baseName); +static void reduxAppendMacroDefs (redux_ctx* ctx); +static void reduxAppendTypedefs (redux_ctx* ctx); +static void reduxAppendGetInitValFns (redux_ctx* ctx); +static void reduxAppendWriteBackFn (redux_ctx* ctx); +static void reduxAppendReduxKernel (redux_ctx* ctx); +static void reduxAppendPrototype (redux_ctx* ctx); +static void reduxAppendIndexDeclarations (redux_ctx* ctx); +static void reduxAppendRangeCalculations (redux_ctx* ctx); +static void reduxAppendLoops (redux_ctx* ctx); +static void reduxAppendInitKernel (redux_ctx* ctx); +static void reduxAppendPostKernel (redux_ctx* ctx); +static int reduxCompile (redux_ctx* ctx); +static int reduxSchedule (redux_ctx* ctx); +static void reduxScheduleKernel (int ndims, + uint64_t* dims, + uint64_t warpSize, + uint64_t maxLg, + uint64_t* maxLs, + uint64_t maxGg, + uint64_t* maxGs, + uint64_t* bs, + uint64_t* gs, + uint64_t* cs); +static int reduxInvoke (redux_ctx* ctx); +static int reduxCleanup (redux_ctx* ctx, int ret); +static int reduxCleanupMsg (redux_ctx* ctx, int ret, + const char* fmt, ...); /* Function implementation */ -GPUARRAY_PUBLIC int GpuArray_sum (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_SUM, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_prod (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_PROD, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_prodnz (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_PRODNZ, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_min (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_MIN, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_max (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_MAX, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_argmin (GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_ARGMIN, - NULL, dstArg, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_argmax (GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_ARGMAX, - NULL, dstArg, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_minandargmin(GpuArray* dst, - GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_MINANDARGMIN, - dst, dstArg, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dst, - GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return 
GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, - dst, dstArg, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_and (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_AND, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_or (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_OR, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_xor (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_XOR, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_all (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_ALL, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_any (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_ANY, - dst, NULL, src, reduxLen, reduxList); -} GPUARRAY_PUBLIC int GpuArray_reduction (ga_reduce_op op, GpuArray* dst, GpuArray* dstArg, const GpuArray* src, unsigned reduxLen, const unsigned* reduxList){ - redux_ctx ctxSTACK = {op, dst, dstArg, src, - (int)reduxLen, (const int*)reduxList}; - redux_ctx *ctx = &ctxSTACK; + redux_ctx ctxSTACK, *ctx = &ctxSTACK; + memset(ctx, 0, sizeof(*ctx)); - return reduxCheckargs(ctx); + ctx->op = op; + ctx->dst = dst; + ctx->dstArg = dstArg; + ctx->src = src; + ctx->reduxLen = reduxLen; + ctx->reduxList = (const int*)reduxList; + + return reduxInit(ctx); } /** @@ -462,7 +486,7 @@ GPUARRAY_PUBLIC int GpuArray_reduction (ga_reduce_op op, * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetSumInit (int typecode, const char** property){ +static int reduxGetSumInit (int typecode, const char** property){ if (typecode == GA_POINTER || typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; @@ -482,7 +506,7 @@ static int reduxGetSumInit (int typecode, const char** property) * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetProdInit (int typecode, const char** property){ +static int reduxGetProdInit (int typecode, const char** property){ if (typecode == GA_POINTER || typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; @@ -502,7 +526,7 @@ static int reduxGetProdInit (int typecode, const char** property) * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetMinInit (int typecode, const char** property){ +static int reduxGetMinInit (int typecode, const char** property){ switch (typecode){ case GA_BYTE2: case GA_BYTE3: @@ -592,7 +616,7 @@ static int reduxGetMinInit (int typecode, const char** property) * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetMaxInit (int typecode, const char** property){ +static int reduxGetMaxInit (int typecode, const char** property){ switch (typecode){ case GA_BOOL: *property = "1"; @@ -691,7 +715,7 @@ static int reduxGetMaxInit (int typecode, const char** property) * @return Zero if successful; Non-zero if the datatype is not supported. 
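 *
 * (Conceptually, the initializer is the identity element of the reduction;
 * for AND that is a value with every bit set, so AND-ing it with the first
 * real operand leaves that operand unchanged. The exact literal placed in
 * *property is type-dependent.)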
*/ -static int reduxGetAndInit (int typecode, const char** property){ +static int reduxGetAndInit (int typecode, const char** property){ if (typecode == GA_POINTER || typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; @@ -711,7 +735,7 @@ static int reduxGetAndInit (int typecode, const char** property) * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetOrInit (int typecode, const char** property){ +static int reduxGetOrInit (int typecode, const char** property){ if (typecode == GA_POINTER || typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; @@ -720,6 +744,110 @@ static int reduxGetOrInit (int typecode, const char** property) return GA_NO_ERROR; } +/** + * @brief Sort the axes into optimal order for flattening. + * + * Two orderings exist: "Sensitive" and "Insensitive", for reductions that are + * sensitive (or not) to indexing. + * + * In all cases: + * + * 1. Free axes are sorted before reduction axes. + * 2. Free axes are sorted by decreasing absolute stride. + * 3. then by increasing source axis number. + * + * In the sensitive case: + * + * 4. Reduction axes are sorted by their position in reduxList. + * + * In the insensitive case: + * + * 4. Reduction axes are sorted by decreasing absolute stride. + * 5. then by increasing source axis number. + */ + +static int reduxSortFlatInsensitive (const void* a, const void* b){ + const axis_desc* xda = (const axis_desc*)a; + const axis_desc* xdb = (const axis_desc*)b; + + if ( axisIsReduced(xda) && !axisIsReduced(xdb)){ + return +1; + }else if (!axisIsReduced(xda) && axisIsReduced(xdb)){ + return -1; + } + + if (axisGetSrcAbsStride(xda) < axisGetSrcAbsStride(xdb)){ + return +1; + }else if (axisGetSrcAbsStride(xda) > axisGetSrcAbsStride(xdb)){ + return -1; + } + + return 0; +} +static int reduxSortFlatSensitive (const void* a, const void* b){ + const axis_desc* xda = (const axis_desc*)a; + const axis_desc* xdb = (const axis_desc*)b; + + if ( axisIsReduced(xda) && !axisIsReduced(xdb)){ + return +1; + }else if (!axisIsReduced(xda) && axisIsReduced(xdb)){ + return -1; + } + + if (axisIsReduced(xda)){ + return axisGetReduxNum(xda) axisGetSrcAbsStride(xdb)){ + return -1; + } + + return 0; + } +} + +/** + * @brief Sort axes in preferred order for integration into warp. + * + * The axes with stride != 0 are sorted by lowest absolute + * stride. Picking the few axes with the lowest absolute stride (while + * keeping the product of their dimensions <= warpSize) should maximize + * memory bandwidth of the warp. + * + * The restriction stride != 0 is intended to avoid waste of memory + * bandwidth. Once a memory transaction is necessary, it typically operates at + * far greater granularity than just 32 bits (4 bytes). + * + * Sorting by absolute stride should result, in the case of a packed tensor, in + * the memory accesses being close to perfectly contiguous. 
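 *
 * Illustrative example: for a packed (C-contiguous) float32 tensor of shape
 * (64, 128), the strides are 512 and 4 bytes. Axis 1 sorts first because
 * its absolute stride (4) is smallest, making it the preferred warp axis;
 * an axis with stride 0 (e.g. a broadcast axis) sorts last, since repeated
 * loads along it buy no additional bandwidth.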
+ */ + +static int reduxSortWarp (const void* a, const void* b){ + const axis_desc* xda = *(const axis_desc* const *)a; + const axis_desc* xdb = *(const axis_desc* const *)b; + + if ( axisGetSrcStride(xda) && !axisGetSrcStride(xdb)){ + return -1; + }else if (!axisGetSrcStride(xda) && axisGetSrcStride(xdb)){ + return +1; + } + + if (axisGetSrcAbsStride(xda) < axisGetSrcAbsStride(xdb)){ + return -1; + }else if (axisGetSrcAbsStride(xda) > axisGetSrcAbsStride(xdb)){ + return +1; + } + + if ( axisIsReduced(xda) && !axisIsReduced(xdb)){ + return -1; + }else if (!axisIsReduced(xda) && axisIsReduced(xdb)){ + return +1; + } + + return 0; +} + /** * @brief Check whether axis numbered v is already in the given set of axes. * @@ -730,10 +858,10 @@ static int reduxGetOrInit (int typecode, const char** property) * @return Non-zero if the set is non-empty and v is in it; Zero otherwise. */ -static int axisInSet (int v, - const int* set, - size_t setLen, - size_t* where){ +static int axisInSet (int v, + const int* set, + size_t setLen, + size_t* where){ size_t i; for (i=0;ireduxNum = -1; + axis->warpLen = 0; + axis->len = len; + + axis->srcStride = srcStride; + axis->srcOffset = 0; + + axis->dstStride = 0; + axis->dstOffset = 0; + + axis->dstArgStride = 0; + axis->dstArgOffset = 0; + + axis->tmpDstStride = 0; + axis->tmpDstOffset = 0; + + axis->tmpDstArgStride = 0; + axis->tmpDstArgOffset = 0; +} + /** - * @brief Check the sanity of the arguments in agreement with the - * documentation for GpuArray_reduction(). + * @brief Mark axis as reduction axis, with position reduxNum in the axis list. + */ + +static void axisMarkReduced (axis_desc* axis, int reduxNum){ + axis->isReduced = 1; + axis->reduxNum = reduxNum; +} + +/** + * @brief Mark axis as warp axis. + */ + +static void axisMarkWarp (axis_desc* axis, size_t warpLen){ + axis->warpLen = warpLen; +} + +/** + * @brief Get properties of an axis. + */ + +static int axisGetReduxNum (const axis_desc* axis){ + return axis->reduxNum; +} +static size_t axisGetLen (const axis_desc* axis){ + return axis->len; +} +static ssize_t axisGetSrcStride (const axis_desc* axis){ + return axisGetLen(axis) > 1 ? axis->srcStride : 0; +} +static size_t axisGetSrcAbsStride (const axis_desc* axis){ + return axisGetSrcStride(axis)<0 ? -(size_t)axisGetSrcStride(axis): + +(size_t)axisGetSrcStride(axis); +} +static ssize_t axisGetSrcOffset (const axis_desc* axis){ + return axis->srcOffset; +} +static ssize_t axisGetDstStride (const axis_desc* axis){ + return axisGetLen(axis) > 1 ? axis->dstStride : 0; +} +static size_t axisGetDstAbsStride (const axis_desc* axis){ + return axisGetDstStride(axis)<0 ? -(size_t)axisGetDstStride(axis): + +(size_t)axisGetDstStride(axis); +} +static ssize_t axisGetDstOffset (const axis_desc* axis){ + return axis->dstOffset; +} +static ssize_t axisGetDstArgStride (const axis_desc* axis){ + return axisGetLen(axis) > 1 ? axis->dstArgStride : 0; +} +static size_t axisGetDstArgAbsStride (const axis_desc* axis){ + return axisGetDstArgStride(axis)<0 ? 
-(size_t)axisGetDstArgStride(axis): + +(size_t)axisGetDstArgStride(axis); +} +static ssize_t axisGetDstArgOffset (const axis_desc* axis){ + return axis->dstArgOffset; +} +static int axisIsReduced (const axis_desc* axis){ + return axis->isReduced; +} +static int axisIsWarp (const axis_desc* axis){ + return !!axis->warpLen; +} +static int axisIsPartialWarp (const axis_desc* axis){ + return axis->warpLen > 0 && axis->warpLen != axis->len; +} + +/** + * @brief Returns whether the reduction interface requires a dst argument. + */ + +static int reduxRequiresDst (const redux_ctx* ctx){ + switch (ctx->op){ + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: + return 0; + default: + return 1; + } +} + +/** + * @brief Returns whether the reduction interface requires a dstArg argument. + */ + +static int reduxRequiresDstArg (const redux_ctx* ctx){ + switch (ctx->op){ + case GA_REDUCE_MINANDARGMIN: + case GA_REDUCE_MAXANDARGMAX: + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: + return 1; + default: + return 0; + } +} + +/** + * @brief Returns whether the generated kernel internally requires a dst + * argument. * - * Also initialize certain parts of the context, allocate memory - * buffers and fail out if at any point the environment gives us - * a problem. + * This is semantically subtly different from reduxHasDst(). The main + * difference is in the implementation of the GA_REDUCE_ARGMIN/ARGMAX + * reductions; Either *might* require a dst buffer, which will have to be + * allocated, even though it will be discared. + */ + +static int reduxKernelRequiresDst (const redux_ctx* ctx){ + switch (ctx->op){ + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: + return reduxIsSmallCodeModel(ctx); + default: + return 1; + } +} + +/** + * @brief Returns whether the generated kernel internally requires a dstArg + * argument. * - * @return GA_INVALID_ERROR if arguments invalid; GA_NO_MEMORY if out of - * memory, GA_NO_ERROR otherwise. + * This is semantically subtly different from reduxHasDstArg(), since it asks + * whether the reduction, even though it does not accept a dstArg argument, + * still requires a dstArg internally. + */ + +static int reduxKernelRequiresDstArg (const redux_ctx* ctx){ + /** + * At present there exists no reduction whose implementation requires + * a dstArg but whose interface does not. + * + * E.g. the max() and min() reductions do NOT currently require a temporary + * buffer for indexes, and will not in the foreseeable future. + */ + + return reduxRequiresDstArg(ctx); +} + +/** + * @brief Returns whether the reduction is sensitive. + * + * A reduction is sensitive when its output satisfies at least one of the + * following conditions: + * + * - It depends on the exact order of axes in the reduxList + * - It depends on exact signs of the strides of axes in the reduxList + * + * Such sensitivity may prevent a flattening of contiguous axes even when it + * would have been otherwise permitted. + * + * For instance, ARGMIN/ARGMAX have this sensitivity, because the dstArg + * tensor's contents are flattened coordinates into the source tensor, and + * the flattening order is precisely reduxList. Permuting it would thus produce + * incorrect output. Moreover, if the strides of a reduction axis were to be + * reversed for the purpose of flattening the axis into another, the computed + * coordinate would again be incorrect. + * + * + * TL;DR: Reduction is sensitive if + * reduce(x, axis=axisList) != reduce(x, axis=axisList[::-1]) + * or + * reduce(x) != reduce(x[::-1]) + * . 
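 *
 * Concrete illustration: an argmax over a 2-D tensor with reduxList = {0, 1}
 * encodes the winning position as a flattened coordinate built in the order
 * (axis 0, axis 1); with reduxList = {1, 0} the same position is encoded in
 * the order (axis 1, axis 0), which is in general a different integer. A
 * plain sum() has no such dependence and is insensitive.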
+ */ + +static int reduxIsSensitive (const redux_ctx* ctx){ + switch (ctx->op){ + case GA_REDUCE_MINANDARGMIN: + case GA_REDUCE_MAXANDARGMAX: + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: + return 1; + default: + return 0; + } +} + +/** + * @brief Returns whether we are using the small code model or not. + */ + +static int reduxIsSmallCodeModel (const redux_ctx* ctx){ + return !reduxIsLargeCodeModel(ctx); +} + +/** + * @brief Returns whether we are using the large code model or not. + */ + +static int reduxIsLargeCodeModel (const redux_ctx* ctx){ + return ctx->largeCodeModel; +} + +/** + * @brief Get description of source axis with given number. + */ + +static axis_desc* reduxGetSrcAxis (const redux_ctx* ctx, int i){ + return &ctx->xdSrc[i]; +} + +/** + * @brief Get description of source axis with given number in sort-order. */ -static int reduxCheckargs (redux_ctx* ctx){ - int i, j, ret, retT, retK; - unsigned numProcs; - size_t localSize; - size_t dstNumElem = 1, reduxPerElem = 1; +static axis_desc* reduxGetSrcSortAxis (const redux_ctx* ctx, int i){ + return ctx->xdSrcPtrs[i]; +} + +/** + * @brief Get description of flattened source axis with given number. + */ + +static axis_desc* reduxGetSrcFlatAxis (const redux_ctx* ctx, int i){ + return &ctx->xdSrcFlat[i]; +} + +/** + * @brief Attempt to flatten an axis `from` into an axis `into`. + * + * An axis can be considered for flattening into the previous one if ALL of + * the following conditions hold: + * + * 1. The product of the previous axis' length by its stride exactly + * matches the current axis' stride. + * 2. Both axes are reduced. + * + * For reductions where axis order matters (e.g. those that compute + * indices, like argmax/argmin), ALL of the following additional conditions + * must hold: + * + * 3. The sign of the strides must match. + * 4. The axis numbers must follow consecutively in the reduction list + * (this is ensured by the "sensitive" sort order) + * + * @return Non-zero if flattening attempt successful; Zero otherwise. + */ + +static int reduxTryFlattenInto (const redux_ctx* ctx, + axis_desc* into, + const axis_desc* from){ + int signSrc = 0, signDst = 0, signDstArg = 0, + reverseSrc = 0, reverseDst = 0, reverseDstArg = 0; + + if (axisIsReduced (into) != axisIsReduced (from) || + axisGetSrcAbsStride (into) != axisGetSrcAbsStride (from)*axisGetLen(from)){ + return 0; + } + + if (reduxRequiresDst(ctx) && + axisGetDstAbsStride (into) != axisGetDstAbsStride (from)*axisGetLen(from)){ + return 0; + } + + if (reduxRequiresDstArg(ctx) && + axisGetDstArgAbsStride(into) != axisGetDstArgAbsStride(from)*axisGetLen(from)){ + return 0; + } + + signSrc = (axisGetSrcStride (into)^axisGetSrcStride (from)) < 0; + signDst = (axisGetDstStride (into)^axisGetDstStride (from)) < 0; + signDstArg = (axisGetDstArgStride(into)^axisGetDstArgStride(from)) < 0; + reverseSrc = signSrc; + reverseDst = signDst && reduxRequiresDst (ctx); + reverseDstArg = signDstArg && reduxRequiresDstArg(ctx); + + if (reduxIsSensitive(ctx)){ + if(reverseSrc || reverseDst || reverseDstArg){ + return 0; + } + } + + if (reduxRequiresDst (ctx) && + reduxRequiresDstArg(ctx) && + reverseDst != reverseDstArg){ + /* Either both, or neither, of dst and dstArg must require reversal. 
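		   Otherwise the flattened dst and dstArg axes would traverse
		   their elements in opposite orders, and element i of dst would
		   no longer correspond to element i of dstArg.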
*/ + return 0; + } + + if (reverseSrc){ + into->srcOffset += (ssize_t)(axisGetLen(from)-1)*axisGetSrcStride(from); + into->srcStride = -axisGetSrcStride (from); + }else{ + into->srcStride = axisGetSrcStride (from); + } + + if (reverseDst){ + into->dstOffset += (ssize_t)(axisGetLen(from)-1)*axisGetDstStride(from); + into->dstStride = -axisGetDstStride (from); + }else{ + into->dstStride = axisGetDstStride (from); + } + + if (reverseDstArg){ + into->dstArgOffset += (ssize_t)(axisGetLen(from)-1)*axisGetDstArgStride(from); + into->dstArgStride = -axisGetDstArgStride(from); + }else{ + into->dstArgStride = axisGetDstArgStride(from); + } + + into->srcOffset += axisGetSrcOffset (from); + into->dstOffset += axisGetDstOffset (from); + into->dstArgOffset += axisGetDstArgOffset(from); + into->len *= axisGetLen (from); + + return 1; +} + +/** + * @brief Check whether we can add another reduction axis or free axis + * to the hardware axis list for either the primary or secondary kernel. + */ + +static int reduxCanAppendHwAxis (redux_ctx* ctx, + int kernelType, + int axisType){ + int kernelNdh = kernelType == KERNEL_PRIMARY ? ctx->pri.ndh : ctx->aux.ndh; + int kernelNdhr = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhr : ctx->aux.ndhr; + int kernelNdhd = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhd : ctx->aux.ndhd; + + if (kernelNdh >= MAX_HW_DIMS){ + return 0; + }else{ + return axisType == AXIS_REDUX ? kernelNdhr < ctx->ndr: + kernelNdhd < ctx->ndd; + } +} + +/** + * @brief Append the largest reduction axis or free axis that isn't yet + * in the hardware axis list for either the primary or secondary kernel + * into said hardware axis list. + */ + +static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, + int kernelType, + int axisType){ + int maxI = 0, i, isInHwList, isInReduxList, isInDesiredList, isLargestSoFar; + int* hwAxisList, * ndh, * ndhr, * ndhd; + size_t v, maxV = 0; + + /* Get pointers to the correct kernel's variables */ + hwAxisList = kernelType == KERNEL_PRIMARY ? ctx->pri.axisList: + ctx->aux.axisList; + ndh = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndh: + &ctx->aux.ndh; + ndhr = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndhr: + &ctx->aux.ndhr; + ndhd = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndhd: + &ctx->aux.ndhd; + + /* Find */ + for (i=0;inds;i++){ + isInHwList = axisInSet(i, hwAxisList, *ndh, 0); + isInReduxList = axisInSet(i, ctx->reduxList, ctx->ndr, 0); + isInDesiredList = axisType == AXIS_REDUX ? isInReduxList: + !isInReduxList; + v = ctx->src->dimensions[i]; + isLargestSoFar = v >= maxV; + + if (!isInHwList && isInDesiredList && isLargestSoFar){ + maxV = v; + maxI = i; + } + } + + /* Append */ + hwAxisList[(*ndh)++] = maxI; + if (axisType == AXIS_REDUX){ + (*ndhr)++; + }else{ + (*ndhd)++; + } +} + +/** + * @brief Initialize the context. + * + * After this function, calling reduxCleanup() becomes safe. + */ + +static int reduxInit (redux_ctx* ctx){ + int i; /** * We initialize certain parts of the context. 
@@ -813,15 +1338,16 @@ static int reduxCheckargs (redux_ctx* ctx){ ctx->srcTypeStr = ctx->dstTypeStr = ctx->dstArgTypeStr = ctx->accTypeStr = ctx->idxTypeStr = NULL; - ctx->initValK = NULL; - ctx->pri.ndh = ctx->aux.ndh = 0; - ctx->pri.ndhd = ctx->aux.ndhd = 0; - ctx->pri.ndhr = ctx->aux.ndhr = 0; + ctx->initValK = NULL; ctx->sourceCode = NULL; - ctx->sourceCodeLen = 0; ctx->errorString0 = NULL; ctx->errorString1 = NULL; ctx->errorString2 = NULL; + + ctx->splitWarpAxis = -1; + ctx->numStages = 1; + ctx->prodWarpAxes = 1; + ctx->prodAllAxes = ctx->prodRdxAxes = ctx->prodFreeAxes = 1; strb_init(&ctx->s); srcbInit (&ctx->srcGen, &ctx->s); @@ -835,219 +1361,134 @@ static int reduxCheckargs (redux_ctx* ctx){ ctx->srcStepsGD = ctx->srcSizeGD = ctx->dstStepsGD = ctx->dstArgStepsGD = ctx->pri.chunkSizeGD = ctx->aux.chunkSizeGD = NULL; - /* *** IT IS NOW SAFE TO CALL reduxCleanup() *** */ - - /* Insane src, reduxLen, dst or dstArg? */ - if (!ctx->src || - (reduxRequiresDst (ctx) && !ctx->dst) || - (reduxRequiresDstArg(ctx) && !ctx->dstArg) || - (ctx->src->nd <= 0) || - (ctx->reduxLen <= 0) || - (ctx->src->nd < (unsigned)ctx->reduxLen) || - (ctx->dst && ctx->dst->nd +ctx->reduxLen != ctx->src->nd) || - (ctx->dstArg && ctx->dstArg->nd+ctx->reduxLen != ctx->src->nd) ){ - return reduxCleanup(ctx, GA_INVALID_ERROR); - } - - - /* Insane or duplicate list entry? */ - for (i=0;ireduxLen;i++){ - if (ctx->reduxList[i] < 0 || - ctx->reduxList[i] >= (int)ctx->src->nd || - axisInSet(ctx->reduxList[i], ctx->reduxList, i, 0)){ - return reduxCleanup(ctx, GA_INVALID_ERROR); - } - } + return reduxInferProperties(ctx); +} +/** + * @brief Begin inferring the properties of the reduction. + */ - /* GPU context non-existent? */ - ctx->gpuCtx = GpuArray_context(ctx->src); - if (!ctx->gpuCtx){ - return reduxCleanup(ctx, GA_INVALID_ERROR); - } +static int reduxInferProperties (redux_ctx* ctx){ + axis_desc* a; + int i, j, retT, retK; + size_t d; - /* Unknown type? */ - reduxSelectTypes(ctx); - if (!ctx->srcTypeStr || !ctx->dstTypeStr || !ctx->dstArgTypeStr || - !ctx->accTypeStr){ - return reduxCleanup(ctx, GA_INVALID_ERROR); + /* Source code buffer preallocation failed? */ + if (strb_ensure(&ctx->s, 4*1024) != 0){ + return reduxCleanupMsg(ctx, GA_MEMORY_ERROR, + "Could not preallocate source code buffer!\n"); } - /* Determine initializer, and error out if reduction unsupported. 
*/ - switch (ctx->op){ - case GA_REDUCE_SUM: - retT = reduxGetSumInit (ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetSumInit (ctx->accTypeCode, &ctx->initValK); - break; - case GA_REDUCE_PRODNZ: - case GA_REDUCE_PROD: - retT = reduxGetProdInit(ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetProdInit(ctx->accTypeCode, &ctx->initValK); - break; - case GA_REDUCE_MINANDARGMIN: - case GA_REDUCE_ARGMIN: - case GA_REDUCE_MIN: - retT = reduxGetMinInit (ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetMinInit (ctx->accTypeCode, &ctx->initValK); - break; - case GA_REDUCE_MAXANDARGMAX: - case GA_REDUCE_ARGMAX: - case GA_REDUCE_MAX: - retT = reduxGetMaxInit (ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetMaxInit (ctx->accTypeCode, &ctx->initValK); - break; - case GA_REDUCE_ALL: - case GA_REDUCE_AND: - retT = reduxGetAndInit (ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetAndInit (ctx->accTypeCode, &ctx->initValK); - break; - case GA_REDUCE_ANY: - case GA_REDUCE_XOR: - case GA_REDUCE_OR: - retT = reduxGetOrInit (ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetOrInit (ctx->accTypeCode, &ctx->initValK); - break; - default: - retT = GA_UNSUPPORTED_ERROR; - retK = GA_UNSUPPORTED_ERROR; - } - if (retT != GA_NO_ERROR){ - return reduxCleanup(ctx, retT); - } - if (retK != GA_NO_ERROR){ - return reduxCleanup(ctx, retK); + /* Insane src, reduxLen, dst or dstArg? */ + if (!ctx->src){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "src is NULL!\n"); + }else if (ctx->src->nd <= 0){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "src has less than 1 dimensions!\n"); + }else if (ctx->reduxLen <= 0){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "List of dimensions to be reduced is empty!\n"); + }else if (ctx->src->nd < (unsigned)ctx->reduxLen){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "src has fewer dimensions than there are dimensions to reduce!\n"); + }else if (reduxRequiresDst (ctx) && !ctx->dst){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "dst is NULL, but reduction requires it!\n"); + }else if (reduxRequiresDstArg(ctx) && !ctx->dstArg){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "dstArg is NULL, but reduction requires it!\n"); + }else if (ctx->dst && ctx->dst->nd +ctx->reduxLen != ctx->src->nd){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "dst is of incorrect dimensionality for this reduction!\n"); + }else if (ctx->dstArg && ctx->dstArg->nd+ctx->reduxLen != ctx->src->nd){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "dstArg is of incorrect dimensionality for this reduction!\n"); } - - - /** - * We initialize some more parts of the context, using the guarantees - * we now have about the sanity of the arguments. - */ - ctx->nds = ctx->src->nd; ctx->ndr = ctx->reduxLen; ctx->ndd = ctx->nds - ctx->ndr; - strb_ensure(&ctx->s, 3*1024); - - - /** - * And make a few small dynamic memory allocations for the benefit of the - * rest of the code, allowing error checking to happen early and fail fast. - */ - - ctx->srcAxisList = malloc(ctx->nds * sizeof(unsigned)); - ctx->dstDims = malloc(ctx->ndd * sizeof(size_t)); - if (!ctx->srcAxisList || - !ctx->dstDims ){ - return reduxCleanup(ctx, GA_MEMORY_ERROR); + ctx->ndw = 0; + ctx->ndp = 0; + ctx->ndf = 0; + ctx->ndt = ctx->ndd + 1; + + /* Insane reduxList? */ + for (i=0;indr;i++){ + j = ctx->reduxList[i]; + if (j < -ctx->nds || j >= ctx->nds){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "Insane axis number %d! Should be [%d, %d)!\n", + j, -ctx->nds, ctx->nds); + } + j = j<0 ? 
ctx->nds+j : j; + d = ctx->src->dimensions[j]; + ctx->zeroRdxAxes += !d; + ctx->prodRdxAxes *= d?d:1; } /** - * Query device for approximate total level of parallelism. If destination - * tensor is so big it can keep all threads busy on individual elements, - * use large code model; Otherwise use small code model, where threads will - * have to cooperate. - * - * - Large (Destination tensor >= SMALL_REDUX_THRESHOLD elements, or - * destination tensor size >= # of reductions per destination - * tensor element): - * All destination elements have their own thread. - * - Small (otherwise): - * Multiple threads cooperate on a single destination element. + * Insane shape? + * + * The source tensor is allowed to be empty (its shape may contain 0s). + * However, all axes that are of length 0 must be reduction axes. + * + * The reason for this is that a reduction cannot store any output into an + * empty destination tensor (whose dimensions are the free axes), because + * it has 0 space. The operation cannot then fulfill its contract. + * + * On the other hand, when some or all reduction axes of a tensor are of + * length 0, the reduction can be interpreted as initializing the + * destination tensor to the identity value of the operation. For lack of a + * better idea, the destination argument tensor can then be zeroed. */ - ret = gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_NUMPROCS, &numProcs); - if (ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); + for (i=0;inds;i++){ + d = ctx->src->dimensions[i]; + ctx->zeroAllAxes += !d; + ctx->prodAllAxes *= d?d:1; } - ret = gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_MAXLSIZE, &localSize); - if (ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); + if (ctx->zeroAllAxes != ctx->zeroRdxAxes){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "Source tensor has length-0 dimensions that are not reduced!"); } + ctx->prodFreeAxes = ctx->prodAllAxes/ctx->prodRdxAxes; - for (i=j=0;inds;i++){ - if (axisInSet(i, ctx->reduxList, ctx->nds, NULL)){ - reduxPerElem *= ctx->src->dimensions[i]; - }else{ - dstNumElem *= ctx->src->dimensions[i]; - ctx->dstDims[j++] = ctx->src->dimensions[i];; - } - } - ctx->largeCodeModel = dstNumElem >= numProcs*localSize || - dstNumElem >= reduxPerElem - || 1;/* BUG: Erase when small code model implemented. */ /** - * *** IT IS NOW SAFE TO CALL: *** - * - reduxIsLargeModel() - * - reduxIsSmallModel() - * - reduxKernelRequiresDst() - * - reduxKernelRequiresDstArg() + * GPU context non-existent, or cannot read its properties? */ - - /** - * Allocate workspaces. - * - * Certain reductions may require a workspace that isn't provided by the user. - * For instance, **when using the small code model**, argmin/argmax require - * a dst buffer, but the user didn't supply one (as he would have for - * maxandargmax/minandargmin). We must allocate and deallocate it ourselves. - * - * Otherwise we use the user-supplied buffers. 
- */ - - if (!reduxRequiresDst (ctx) && reduxKernelRequiresDst(ctx)){ - ctx->wsDst = malloc(sizeof(*ctx->wsDst)); - if (!ctx->wsDst){ - return reduxCleanup(ctx, GA_MEMORY_ERROR); - } - - ret = GpuArray_empty(ctx->wsDst, ctx->gpuCtx, ctx->dstTypeCode, - ctx->ndd, ctx->dstDims, GA_C_ORDER); - if(ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); - } - }else{ - ctx->wsDst = ctx->dst; - } - if (!reduxRequiresDstArg(ctx) && reduxKernelRequiresDstArg(ctx)){ - ctx->wsDstArg = malloc(sizeof(*ctx->wsDstArg)); - if (!ctx->wsDstArg){ - return reduxCleanup(ctx, GA_MEMORY_ERROR); - } - - ret = GpuArray_empty(ctx->wsDstArg, ctx->gpuCtx, ctx->dstArgTypeCode, - ctx->ndd, ctx->dstDims, GA_C_ORDER); - if(ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); - } - }else{ - ctx->wsDstArg = ctx->dstArg; + ctx->gpuCtx = GpuArray_context(ctx->src); + if (!ctx->gpuCtx || + gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_NUMPROCS, &ctx->numProcs) != GA_NO_ERROR || + gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_MAXLSIZE, &ctx->maxLg) != GA_NO_ERROR || + gpudata_property(ctx->src->data, GA_CTX_PROP_MAXLSIZE0, &ctx->maxLs[0]) != GA_NO_ERROR || + gpudata_property(ctx->src->data, GA_CTX_PROP_MAXLSIZE1, &ctx->maxLs[1]) != GA_NO_ERROR || + gpudata_property(ctx->src->data, GA_CTX_PROP_MAXLSIZE2, &ctx->maxLs[2]) != GA_NO_ERROR || + gpudata_property(ctx->src->data, GA_CTX_PROP_MAXGSIZE, &ctx->maxGg) != GA_NO_ERROR || + gpudata_property(ctx->src->data, GA_CTX_PROP_MAXGSIZE0, &ctx->maxGs[0]) != GA_NO_ERROR || + gpudata_property(ctx->src->data, GA_CTX_PROP_MAXGSIZE1, &ctx->maxGs[1]) != GA_NO_ERROR || + gpudata_property(ctx->src->data, GA_CTX_PROP_MAXGSIZE2, &ctx->maxGs[2]) != GA_NO_ERROR ){ + /* gpukernel_property(ctx->kernel.k, GA_KERNEL_PROP_PREFLSIZE, &warpSize); */ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "Error obtaining one or more properties from GPU context!\n"); } + ctx->warpSize = 32; + /** + * Type management. + * + * - Deal with the various typecodes. + * - Determine initializer and error out if reduction unsupported on that + * datatype. + */ - return reduxSelectHwAxes(ctx); -} - -/** - * @brief Select types for the reduction kernel's implementation. - * - * There are 5 types of relevance: - * - Source (S=Source) - * - Destination (T=Target) - * - Destination Argument (A=Arg) - * - Index (X=indeX) - * - Accumulator (K=aKKumulator/reduction) - */ - -static void reduxSelectTypes (redux_ctx* ctx){ - /* Deal with the various typecodes. 
*/ ctx->srcTypeCode = ctx->src->typecode; ctx->dstTypeCode = ctx->srcTypeCode; ctx->dstArgTypeCode = GA_SSIZE; @@ -1059,179 +1500,330 @@ static void reduxSelectTypes (redux_ctx* ctx){ case GA_HALF2: ctx->accTypeCode = GA_FLOAT2; break; - case GA_HALF4: - ctx->accTypeCode = GA_FLOAT4; + case GA_HALF4: + ctx->accTypeCode = GA_FLOAT4; + break; + case GA_HALF8: + ctx->accTypeCode = GA_FLOAT8; + break; + case GA_HALF16: + ctx->accTypeCode = GA_FLOAT16; + break; + default: + ctx->accTypeCode = ctx->srcTypeCode; + } + ctx->srcTypeStr = gpuarray_get_type(ctx->srcTypeCode) ->cluda_name; + ctx->dstTypeStr = gpuarray_get_type(ctx->dstTypeCode) ->cluda_name; + ctx->dstArgTypeStr = gpuarray_get_type(ctx->dstArgTypeCode)->cluda_name; + ctx->idxTypeStr = gpuarray_get_type(ctx->idxTypeCode) ->cluda_name; + ctx->accTypeStr = gpuarray_get_type(ctx->accTypeCode) ->cluda_name; + if (!ctx->srcTypeStr || + !ctx->dstTypeStr || + !ctx->dstArgTypeStr || + !ctx->idxTypeStr || + !ctx->accTypeStr ){ + return reduxCleanup(ctx, GA_INVALID_ERROR); + } + switch (ctx->op){ + case GA_REDUCE_SUM: + retT = reduxGetSumInit (ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetSumInit (ctx->accTypeCode, &ctx->initValK); + break; + case GA_REDUCE_PRODNZ: + case GA_REDUCE_PROD: + retT = reduxGetProdInit(ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetProdInit(ctx->accTypeCode, &ctx->initValK); + break; + case GA_REDUCE_MINANDARGMIN: + case GA_REDUCE_ARGMIN: + case GA_REDUCE_MIN: + retT = reduxGetMinInit (ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetMinInit (ctx->accTypeCode, &ctx->initValK); + break; + case GA_REDUCE_MAXANDARGMAX: + case GA_REDUCE_ARGMAX: + case GA_REDUCE_MAX: + retT = reduxGetMaxInit (ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetMaxInit (ctx->accTypeCode, &ctx->initValK); break; - case GA_HALF8: - ctx->accTypeCode = GA_FLOAT8; + case GA_REDUCE_ALL: + case GA_REDUCE_AND: + retT = reduxGetAndInit (ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetAndInit (ctx->accTypeCode, &ctx->initValK); break; - case GA_HALF16: - ctx->accTypeCode = GA_FLOAT16; + case GA_REDUCE_ANY: + case GA_REDUCE_XOR: + case GA_REDUCE_OR: + retT = reduxGetOrInit (ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetOrInit (ctx->accTypeCode, &ctx->initValK); break; default: - ctx->accTypeCode = ctx->srcTypeCode; + retT = GA_UNSUPPORTED_ERROR; + retK = GA_UNSUPPORTED_ERROR; + } + if (retT != GA_NO_ERROR){ + return reduxCleanupMsg(ctx, retT, + "Problem selecting types to be used in reduction!\n"); + } + if (retK != GA_NO_ERROR){ + return reduxCleanupMsg(ctx, retK, + "Problem selecting types to be used in reduction!\n"); } - /* Get the string version as well. */ - ctx->srcTypeStr = gpuarray_get_type(ctx->srcTypeCode) ->cluda_name; - ctx->dstTypeStr = gpuarray_get_type(ctx->dstTypeCode) ->cluda_name; - ctx->dstArgTypeStr = gpuarray_get_type(ctx->dstArgTypeCode)->cluda_name; - ctx->idxTypeStr = gpuarray_get_type(ctx->idxTypeCode) ->cluda_name; - ctx->accTypeStr = gpuarray_get_type(ctx->accTypeCode) ->cluda_name; -} - -/** - * @brief Returns whether we are using the small code model or not. - */ -static int reduxIsSmallCodeModel (redux_ctx* ctx){ - return !reduxIsLargeCodeModel(ctx); -} + /** + * Allocate and construct source-tensor axis-description lists. + * + * While constructing the descriptions of each axis, verify that: + * + * 1. reduxLen has no duplicates. + * 2. dst and/or dstArg's dimensions match src's dimensions, stripped of + * the reduction axes. 
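 *
 * For example (illustrative shapes): a src of shape (5, 3, 7) reduced over
 * reduxList = {1} requires dst and, when applicable, dstArg to have shape
 * (5, 7).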
+ */ -/** - * @brief Returns whether we are using the large code model or not. - */ + ctx->xdSrc = calloc(ctx->nds, sizeof(*ctx->xdSrc)); + ctx->xdSrcPtrs = calloc(ctx->nds, sizeof(*ctx->xdSrcPtrs)); + ctx->xdSrcFlat = calloc(ctx->nds, sizeof(*ctx->xdSrcFlat)); + ctx->xdTmp = calloc(ctx->ndt, sizeof(*ctx->xdTmp)); + if (!ctx->xdSrc || !ctx->xdSrcPtrs || !ctx->xdSrcFlat || !ctx->xdTmp){ + return reduxCleanup(ctx, GA_MEMORY_ERROR); + } + for (i=0;inds;i++){ + axisInit(&ctx->xdSrc[i], + ctx->src->dimensions[i], + ctx->src->strides[i]); + } + for (i=0;indr;i++){ + j = ctx->reduxList[i]; + j = j<0 ? ctx->nds+j : j; + a = reduxGetSrcAxis(ctx, j); + if (axisIsReduced(a)){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "Axis %d appears multiple times in the " + "reduction axis list!\n", + j); + } + axisMarkReduced(a, i); + } + for (i=j=0;inds;i++){ + axis_desc* a = reduxGetSrcAxis(ctx, i); + size_t srcLen = axisGetLen(a), dstLen, dstArgLen; + + if (axisIsReduced(a)){continue;} + if (reduxRequiresDst(ctx)){ + dstLen = ctx->dst->dimensions[j]; + + if(srcLen != dstLen){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "Source axis %d has length %zu, but " + "corresponding destination axis %d has length %zu!\n", + i, srcLen, j, dstLen); + } + + a->dstStride = ctx->dst->strides[j]; + } + if (reduxRequiresDstArg(ctx)){ + dstArgLen = ctx->dstArg->dimensions[j]; + + if(srcLen != dstArgLen){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "Source axis %d has length %zu, but " + "corresponding destination-argument axis %d has length %zu!\n", + i, srcLen, j, dstArgLen); + } + + a->dstArgStride = ctx->dstArg->strides[j]; + } + + j++; + } -static int reduxIsLargeCodeModel (redux_ctx* ctx){ - return ctx->largeCodeModel; -} -/** - * @brief Returns whether the reduction interface requires a dst argument. - */ + /** + * Begin flattening the source tensor. + */ -static int reduxRequiresDst (redux_ctx* ctx){ - switch (ctx->op){ - case GA_REDUCE_ARGMIN: - case GA_REDUCE_ARGMAX: - return 0; - default: - return 1; - } + return reduxFlattenSource(ctx); } /** - * @brief Returns whether the reduction interface requires a dstArg argument. + * @brief Flatten the source tensor as much as is practical. + * + * This makes the axis lengths as long as possible and the tensor itself as + * contiguous as possible. */ -static int reduxRequiresDstArg (redux_ctx* ctx){ - switch (ctx->op){ - case GA_REDUCE_MINANDARGMIN: - case GA_REDUCE_MAXANDARGMAX: - case GA_REDUCE_ARGMIN: - case GA_REDUCE_ARGMAX: - return 1; - default: - return 0; +static int reduxFlattenSource (redux_ctx* ctx){ + axis_desc* axis, *flatAxis, *sortAxis; + int i, j, isSensitive; + + /** + * Copy source axis descriptions list to flattened source axis description + * list, in preparation for attempts at flattening. + */ + + memcpy(ctx->xdSrcFlat, ctx->xdSrc, ctx->nds*sizeof(*ctx->xdSrcFlat)); + ctx->ndf = ctx->nds; + + /** + * Pass 1: Flatten out 0-length dimensions. We already know that + * + * a) There are no 0-length free dimensions, because that + * constitutes an invalid input, and + * b) How many 0-length reduction dimensions there are, because + * we counted them in the error-checking code. + * + * So if there are any 0-length axes, we can delete all reduction axes and + * replace them with a single one. 
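 *
 * Illustrative example: a src of shape (4, 0, 5) reduced over axes {1, 2}
 * flattens to shape (4, 0): the free axis of length 4 is kept, and the two
 * reduction axes collapse into a single zero-length reduction axis.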
+ */ + + if (ctx->zeroRdxAxes > 0){ + for (i=j=0;indf;i++){ + axis = reduxGetSrcFlatAxis(ctx, i); + + if (!axisIsReduced(axis)){ + *reduxGetSrcFlatAxis(ctx, j++) = *axis; + } + } + + axisInit (reduxGetSrcFlatAxis(ctx, j), 0, 0); + axisMarkReduced(reduxGetSrcFlatAxis(ctx, j), 0); + j++; + ctx->ndf = j; + } + + /** + * Pass 2: Flatten out 1-length dimensions, since they can always be + * ignored; They are always indexed at [0]. + */ + + for (i=j=0;indf;i++){ + axis = reduxGetSrcFlatAxis(ctx, i); + + if (axisGetLen(axis) != 1){ + *reduxGetSrcFlatAxis(ctx, j++) = *axis; + } + } + ctx->ndf = j; + + /** + * Pass 3: Flatten out continuous dimensions, where strides and sensitivity + * allows it. + */ + + isSensitive = reduxIsSensitive(ctx); + + qsort(ctx->xdSrcFlat, ctx->ndf, sizeof(*ctx->xdSrcFlat), + isSensitive ? reduxSortFlatSensitive : reduxSortFlatInsensitive); + + for (i=j=1;indf;i++){ + flatAxis = reduxGetSrcFlatAxis(ctx, j-1); + sortAxis = reduxGetSrcFlatAxis(ctx, i); + + if (!reduxTryFlattenInto(ctx, flatAxis, sortAxis)){ + *reduxGetSrcFlatAxis(ctx, j++) = *sortAxis; + } } + ctx->ndf = j; + + return reduxSelectWarpAxes(ctx); } /** - * @brief Returns whether the generated kernel internally requires a dst - * argument. - * - * This is semantically subtly different from reduxHasDst(). The main - * difference is in the implementation of the GA_REDUCE_ARGMIN/ARGMAX - * reductions; Either *might* require a dst buffer, which will have to be - * allocated, even though it will be discared. + * @brief Select the warp axes in such a way as to maximize memory bandwidth. */ -static int reduxKernelRequiresDst (redux_ctx* ctx){ - switch (ctx->op){ - case GA_REDUCE_ARGMIN: - case GA_REDUCE_ARGMAX: - return reduxIsSmallCodeModel(ctx); - default: - return 1; - } -} +static int reduxSelectWarpAxes (redux_ctx* ctx){ + axis_desc* a; + int i; + size_t aL; -/** - * @brief Returns whether the generated kernel internally requires a dstArg - * argument. - * - * This is semantically subtly different from reduxHasDstArg(), since it asks - * whether the reduction, even though it does not accept a dstArg argument, - * still requires a dstArg internally. - */ -static int reduxKernelRequiresDstArg (redux_ctx* ctx){ /** - * At present there exists no reduction whose implementation requires - * a dstArg but whose interface does not. - * - * E.g. the max() and min() reductions do NOT currently require a temporary - * buffer for indexes, and will not in the foreseeable future. + * NOTE: At this point it is possible for there to be no axes + * (ctx->ndf == 0), but this will only occur if all axes of the original + * tensor were length-1 (i.e., if this was a scalar masquerading as a + * multidimensional tensor). + * + * We check for this case and simulate a 1-dimensional, 1-length tensor. */ - return reduxRequiresDstArg(ctx); -} + if(ctx->ndf == 0){ + axisInit (reduxGetSrcFlatAxis(ctx, ctx->ndf), 1, 0); + axisMarkReduced(reduxGetSrcFlatAxis(ctx, ctx->ndf), 0); + ctx->ndf = 1; + } -/** - * @brief Check whether we can add another reduction axis or free axis - * to the hardware axis list for either the primary or secondary kernel. - */ -static int reduxCanAppendHwAxis (redux_ctx* ctx, - int kernelType, - int axisType){ - int kernelNdh = kernelType == KERNEL_PRIMARY ? ctx->pri.ndh : ctx->aux.ndh; - int kernelNdhr = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhr : ctx->aux.ndhr; - int kernelNdhd = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhd : ctx->aux.ndhd; + /** + * Select Warp Axes. 
 
 /**
- * @brief Returns whether the generated kernel internally requires a dst
- *        argument.
- *
- * This is semantically subtly different from reduxHasDst(). The main
- * difference is in the implementation of the GA_REDUCE_ARGMIN/ARGMAX
- * reductions; Either *might* require a dst buffer, which will have to be
- * allocated, even though it will be discared.
+ * @brief Select the warp axes in such a way as to maximize memory bandwidth.
  */
 
-static int        reduxKernelRequiresDst   (redux_ctx*  ctx){
-	switch (ctx->op){
-		case GA_REDUCE_ARGMIN:
-		case GA_REDUCE_ARGMAX:
-		  return reduxIsSmallCodeModel(ctx);
-		default:
-		  return 1;
-	}
-}
+static int        reduxSelectWarpAxes      (redux_ctx*  ctx){
+	axis_desc* a;
+	int        i;
+	size_t     aL;
 
-/**
- * @brief Returns whether the generated kernel internally requires a dstArg
- *        argument.
- *
- * This is semantically subtly different from reduxHasDstArg(), since it asks
- * whether the reduction, even though it does not accept a dstArg argument,
- * still requires a dstArg internally.
- */
 
-static int        reduxKernelRequiresDstArg(redux_ctx*  ctx){
 	/**
-	 * At present there exists no reduction whose implementation requires
-	 * a dstArg but whose interface does not.
-	 *
-	 * E.g. the max() and min() reductions do NOT currently require a temporary
-	 * buffer for indexes, and will not in the foreseeable future.
+	 * NOTE: At this point it is possible for there to be no axes
+	 * (ctx->ndf == 0), but this will only occur if all axes of the original
+	 * tensor were length-1 (i.e., if this was a scalar masquerading as a
+	 * multidimensional tensor).
+	 *
+	 * We check for this case and simulate a 1-dimensional, 1-length tensor.
 	 */
 
-	return reduxRequiresDstArg(ctx);
-}
+	if (ctx->ndf == 0){
+		axisInit       (reduxGetSrcFlatAxis(ctx, ctx->ndf), 1, 0);
+		axisMarkReduced(reduxGetSrcFlatAxis(ctx, ctx->ndf), 0);
+		ctx->ndf = 1;
+	}
 
-/**
- * @brief Check whether we can add another reduction axis or free axis
- *        to the hardware axis list for either the primary or secondary kernel.
- */
 
-static int        reduxCanAppendHwAxis     (redux_ctx*  ctx,
-                                            int         kernelType,
-                                            int         axisType){
-	int kernelNdh  = kernelType == KERNEL_PRIMARY ? ctx->pri.ndh  : ctx->aux.ndh;
-	int kernelNdhr = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhr : ctx->aux.ndhr;
-	int kernelNdhd = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhd : ctx->aux.ndhd;
 
+	/**
+	 * Select Warp Axes.
+	 *
+	 * Using a particular heuristic order (*), sort the axis list by
+	 * suitability for belonging to the warp. Then pick the first few axes,
+	 * until the product of their lengths exceeds the warp size.
+	 *
+	 * (*) See documentation of value-comparison function.
+	 */
 
-	if (kernelNdh >= MAX_HW_DIMS){
-		return 0;
-	}else{
-		return axisType == AXIS_REDUX ? kernelNdhr < ctx->ndr:
-		                                kernelNdhd < ctx->ndd;
+	for (i=0;i<ctx->ndf;i++){
+		ctx->xdSrcPtrs[i] = reduxGetSrcFlatAxis(ctx, i);
 	}
-}
 
-/**
- * @brief Append the largest reduction axis or free axis that isn't yet
- *        in the hardware axis list for either the primary or secondary kernel
- *        into said hardware axis list.
- */
+	qsort(ctx->xdSrcPtrs, ctx->ndf, sizeof(*ctx->xdSrcPtrs), reduxSortWarp);
 
-static void       reduxAppendLargestAxisToHwList(redux_ctx* ctx,
-                                            int         kernelType,
-                                            int         axisType){
-	int    maxI = 0, i, isInHwList, isInReduxList, isInDesiredList, isLargestSoFar;
-	int*   hwAxisList, * ndh, * ndhr, * ndhd;
-	size_t v, maxV = 0;
+	for (i=0;i<ctx->ndf;i++){
+		a  = reduxGetSrcSortAxis(ctx, i);
+		aL = axisGetLen(a);
+		if (aL <= 1){break;}
+
+		ctx->prodWarpAxes *= aL;
+		if (ctx->prodWarpAxes <= ctx->warpSize){
+			axisMarkWarp(a, aL);
+			ctx->ndw++;
+		}else{
+			/**
+			 * The product of warp lengths just exceeded warpSize. We backtrack
+			 * by undoing the multiplication by aL. We then check whether we
+			 * can "split" this axis by extracting at least a factor of 2 into
+			 * warpLen. If yes, we mark it as the (only) warp axis that is
+			 * split by setting its warpLen to something neither 0 nor len.
+			 */
+
+			ctx->prodWarpAxes /= aL;
+			aL = ctx->warpSize/ctx->prodWarpAxes;
+			if (aL >= 2){
+				axisMarkWarp(a, aL);
+				ctx->prodWarpAxes *= aL;
+				ctx->splitWarpAxis = i;
+				ctx->ndw++;
+				ctx->ndp++;
+			}
+			break;
+		}
+	}
 
-	/* Get pointers to the correct kernel's variables */
-	hwAxisList = kernelType == KERNEL_PRIMARY ? ctx->pri.axisList:
-	                                            ctx->aux.axisList;
-	ndh        = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndh:
-	                                            &ctx->aux.ndh;
-	ndhr       = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndhr:
-	                                            &ctx->aux.ndhr;
-	ndhd       = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndhd:
-	                                            &ctx->aux.ndhd;
 
-	/* Find */
-	for (i=0;i<ctx->nds;i++){
-		isInHwList      = axisInSet(i, hwAxisList,     *ndh,     0);
-		isInReduxList   = axisInSet(i, ctx->reduxList, ctx->ndr, 0);
-		isInDesiredList = axisType == AXIS_REDUX ? isInReduxList:
-		                                           !isInReduxList;
-		v               = ctx->src->dimensions[i];
-		isLargestSoFar  = v >= maxV;
+	return reduxSelectNumStages(ctx);
 }
 
-		if (!isInHwList && isInDesiredList && isLargestSoFar){
-			maxV = v;
-			maxI = i;
-		}
-	}
+/**
+ * @brief Select the number of stages of the reduction.
+ *
+ * This depends a lot on the GPU and the specific size of the reduction.
+ */
 
-	/* Append */
-	hwAxisList[(*ndh)++] = maxI;
-	if (axisType == AXIS_REDUX){
-		(*ndhr)++;
+static int        reduxSelectNumStages     (redux_ctx*  ctx){
+	size_t parallelism = 2 * ctx->numProcs * ctx->maxLg;
+
+	if (ctx->zeroRdxAxes                     || /* Reduction is empty? */
+	    ctx->prodFreeAxes > ctx->prodRdxAxes || /* Large # of destination elements? */
+	    ctx->prodFreeAxes > parallelism      ){ /* # of destination elements large enough to fill available parallelism? */
+		ctx->numStages = 1;
 	}else{
-		(*ndhd)++;
+		ctx->numStages = 2;
 	}
+
+	return reduxSelectHwAxes(ctx);
 }
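The splitting arithmetic in reduxSelectWarpAxes() above can be seen in
isolation with a small sketch (assumed values; this is not code from the
patch). With a warp size of 32 and sorted candidate axis lengths 4, 3 and 10,
the first two axes fit entirely (product 12), the third does not, and only a
factor of 32/12 = 2 of it is pulled into the warp, for a final product of 24.

#include <stddef.h>
#include <stdio.h>

/* Sketch of the accumulate-then-split logic: whole axes are taken while the
 * running product stays within warpSize; the first axis that overflows is
 * split if at least a factor of 2 can still be extracted.                    */
static size_t pickWarpLengths(const size_t* len, int n, size_t warpSize){
	size_t prod = 1, take;
	int    i;

	for (i=0;i<n;i++){
		if (len[i] <= 1){break;}
		if (prod*len[i] <= warpSize){
			prod *= len[i];            /* whole axis joins the warp        */
		}else{
			take = warpSize/prod;      /* largest extractable factor       */
			if (take >= 2){
				prod *= take;          /* axis i is split: warpLen = take  */
			}
			break;
		}
	}
	return prod;
}

int main(void){
	const size_t len[3] = {4, 3, 10};
	printf("prodWarpAxes = %zu\n", pickWarpLengths(len, 3, 32));   /* 24 */
	return 0;
}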
 
 /**
@@ -1254,7 +1846,67 @@ static void reduxAppendLargestAxisToHwList(redux_ctx* ctx,
  * largest free axes are selected.
  */
-static int        reduxSelectHwAxes        (redux_ctx*  ctx){
+static int        reduxSelectHwAxes        (redux_ctx*  ctx){
+	int ret;
+
+	ctx->srcAxisList = malloc(ctx->nds * sizeof(unsigned));
+	ctx->dstDims     = malloc(ctx->ndd * sizeof(size_t));
+	if (!ctx->srcAxisList ||
+	    !ctx->dstDims    ){
+		return reduxCleanup(ctx, GA_MEMORY_ERROR);
+	}
+
+	ctx->largeCodeModel = 1; /* BUG: Erase when small code model fixed. */
+	/**
+	 * *** IT IS NOW SAFE TO CALL: ***
+	 *       - reduxIsLargeCodeModel()
+	 *       - reduxIsSmallCodeModel()
+	 *       - reduxKernelRequiresDst()
+	 *       - reduxKernelRequiresDstArg()
+	 */
+
+
+	/**
+	 * Allocate workspaces.
+	 *
+	 * Certain reductions may require a workspace that isn't provided by the user.
+	 * For instance, **when using the small code model**, argmin/argmax require
+	 * a dst buffer, but the user didn't supply one (as they would have for
+	 * maxandargmax/minandargmin). We must allocate and deallocate it ourselves.
+	 *
+	 * Otherwise we use the user-supplied buffers.
+	 */
+
+	if (!reduxRequiresDst (ctx) && reduxKernelRequiresDst(ctx)){
+		ctx->wsDst    = malloc(sizeof(*ctx->wsDst));
+		if (!ctx->wsDst){
+			return reduxCleanup(ctx, GA_MEMORY_ERROR);
+		}
+
+		ret = GpuArray_empty(ctx->wsDst, ctx->gpuCtx, ctx->dstTypeCode,
+		                     ctx->ndd, ctx->dstDims, GA_C_ORDER);
+		if (ret != GA_NO_ERROR){
+			return reduxCleanup(ctx, ret);
+		}
+	}else{
+		ctx->wsDst    = ctx->dst;
+	}
+	if (!reduxRequiresDstArg(ctx) && reduxKernelRequiresDstArg(ctx)){
+		ctx->wsDstArg = malloc(sizeof(*ctx->wsDstArg));
+		if (!ctx->wsDstArg){
+			return reduxCleanup(ctx, GA_MEMORY_ERROR);
+		}
+
+		ret = GpuArray_empty(ctx->wsDstArg, ctx->gpuCtx, ctx->dstArgTypeCode,
+		                     ctx->ndd, ctx->dstDims, GA_C_ORDER);
+		if (ret != GA_NO_ERROR){
+			return reduxCleanup(ctx, ret);
+		}
+	}else{
+		ctx->wsDstArg = ctx->dstArg;
+	}
+
+
 	if (reduxIsLargeCodeModel(ctx)){
 		while (reduxCanAppendHwAxis        (ctx, KERNEL_PRIMARY, AXIS_FREE)){
 			reduxAppendLargestAxisToHwList(ctx, KERNEL_PRIMARY, AXIS_FREE);
@@ -1346,8 +1998,8 @@ static void reduxAppendTensorCallArgs (redux_ctx* ctx,
 static void       reduxAppendMacroDefs     (redux_ctx*  ctx){
 	int i;
 
-	srcbAppends    (&ctx->srcGen, "#define FOROVER(idx)    for(i##idx = i##idx##Start; i##idx < i##idx##End; i##idx++)\n");
-	srcbAppends    (&ctx->srcGen, "#define ESCAPE(idx)     if(i##idx >= i##idx##Dim){continue;}\n");
+	srcbAppends    (&ctx->srcGen, "#define FOROVER(idx)    for (i##idx = i##idx##Start; i##idx < i##idx##End; i##idx++)\n");
+	srcbAppends    (&ctx->srcGen, "#define ESCAPE(idx)     if (i##idx >= i##idx##Dim){continue;}\n");
 
 	/* srcVal indexer */
 	srcbAppends    (&ctx->srcGen, "#define srcVal          (*(const GLOBAL_MEM S*)(");
@@ -1471,10 +2123,10 @@ static void reduxAppendPrototype (redux_ctx* ctx){
 	reduxAppendTensorDeclArgs(ctx, "S", "src");
 	srcbAppendElemf(&ctx->srcGen, "const GLOBAL_MEM X* srcSize");
 	srcbAppendElemf(&ctx->srcGen, "const GLOBAL_MEM X* chunkSize");
-	if(reduxKernelRequiresDst(ctx)){
+	if (reduxKernelRequiresDst(ctx)){
 		reduxAppendTensorDeclArgs(ctx, "T", "dst");
 	}
-	if(reduxKernelRequiresDstArg(ctx)){
+	if (reduxKernelRequiresDstArg(ctx)){
 		reduxAppendTensorDeclArgs(ctx, "A", "dstArg");
 	}
 	srcbEndList    (&ctx->srcGen);
@@ -1519,12 +2171,12 @@ static void reduxAppendRangeCalculations (redux_ctx* ctx){
 	for (i=0;i<ctx->nds;i++){
 		strb_appendf(&ctx->s, "\ti%dSStep  = srcSteps[%d];\n", i, ctx->srcAxisList[i]);
 	}
-	if(reduxKernelRequiresDst(ctx)){
+	if (reduxKernelRequiresDst(ctx)){
 		for (i=0;i<ctx->ndd;i++){
 			strb_appendf(&ctx->s, "\ti%dDStep  = dstSteps[%d];\n", i, i);
 		}
 	}
-	if(reduxKernelRequiresDstArg(ctx)){
+	if (reduxKernelRequiresDstArg(ctx)){
 		for (i=0;i<ctx->ndd;i++){
 			strb_appendf(&ctx->s, "\ti%dAStep  = 
dstArgSteps[%d];\n", i, i); } @@ -1614,14 +2266,14 @@ static void reduxAppendLoops (redux_ctx* ctx){ case GA_REDUCE_ARGMIN: case GA_REDUCE_MINANDARGMIN: srcbAppends(&ctx->srcGen, "\t\t\trdxK = min(rdxK, k);\n" - "\t\t\tif(rdxK == k){\n" + "\t\t\tif (rdxK == k){\n" "\t\t\t\trdxA = rdxIdx;\n" "\t\t\t}\n"); break; case GA_REDUCE_ARGMAX: case GA_REDUCE_MAXANDARGMAX: srcbAppends(&ctx->srcGen, "\t\t\trdxK = max(rdxK, k);\n" - "\t\t\tif(rdxK == k){\n" + "\t\t\tif (rdxK == k){\n" "\t\t\t\trdxA = rdxIdx;\n" "\t\t\t}\n"); break; @@ -2087,14 +2739,18 @@ static int reduxInvoke (redux_ctx* ctx){ * Cleanup */ -static int reduxCleanup (redux_ctx* ctx, int ret){ +static int reduxCleanup (redux_ctx* ctx, int ret){ if (ctx->dst != ctx->wsDst){ - GpuArray_clear(ctx->wsDst); + if(ctx->wsDst){ + GpuArray_clear(ctx->wsDst); + } free(ctx->wsDst); ctx->wsDst = NULL; } if (ctx->dstArg != ctx->wsDstArg){ - GpuArray_clear(ctx->wsDstArg); + if(ctx->wsDstArg){ + GpuArray_clear(ctx->wsDstArg); + } free(ctx->wsDstArg); ctx->wsDstArg = NULL; } @@ -2124,3 +2780,20 @@ static int reduxCleanup (redux_ctx* ctx, int ret){ return ret; } + +static int reduxCleanupMsg (redux_ctx* ctx, int ret, + const char* fmt, ...){ +#if DEBUG + FILE* fp = stderr; + + va_list ap; + va_start(ap, fmt); + vfprintf(fp, fmt, ap); + va_end(ap); + fflush(fp); +#else + (void)fmt; +#endif + + return reduxCleanup(ctx, ret); +} diff --git a/tests/check_reduction.c b/tests/check_reduction.c index b4e919fcf9..94d2aac8ff 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -113,7 +113,7 @@ START_TEST(test_maxandargmax_reduction){ ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_maxandargmax(&gaMax, &gaArgmax, &gaSrc, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, &gaMax, &gaArgmax, &gaSrc, 2, reduxList)); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax) *dims[1], &gaMax)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*dims[1], &gaArgmax)); @@ -205,7 +205,7 @@ START_TEST(test_maxandargmax_idxtranspose){ ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_maxandargmax(&gaMax, &gaArgmax, &gaSrc, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, &gaMax, &gaArgmax, &gaSrc, 2, reduxList)); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax) *rdxProdDims, &gaMax)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*rdxProdDims, &gaArgmax)); @@ -294,7 +294,7 @@ START_TEST(test_maxandargmax_veryhighrank){ ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_maxandargmax(&gaMax, &gaArgmax, &gaSrc, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, &gaMax, &gaArgmax, &gaSrc, 4, reduxList)); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax) *rdxProdDims, &gaMax)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*rdxProdDims, &gaArgmax)); @@ -393,7 +393,7 @@ START_TEST(test_maxandargmax_alldimsreduced){ ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_maxandargmax(&gaMax, &gaArgmax, &gaSrc, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, &gaMax, &gaArgmax, &gaSrc, 3, reduxList)); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax), &gaMax)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax), &gaArgmax)); @@ -481,7 +481,7 @@ START_TEST(test_minandargmin_reduction){ ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_minandargmin(&gaMin, &gaArgmin, &gaSrc, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MINANDARGMIN, &gaMin, &gaArgmin, &gaSrc, 2, reduxList)); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin) *dims[1], &gaMin)); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin)*dims[1], &gaArgmin)); @@ -570,7 +570,7 @@ START_TEST(test_minandargmin_veryhighrank){ ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_minandargmin(&gaMin, &gaArgmin, &gaSrc, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MINANDARGMIN, &gaMin, &gaArgmin, &gaSrc, 4, reduxList)); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin) *rdxProdDims, &gaMin)); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin)*rdxProdDims, &gaArgmin)); @@ -669,7 +669,7 @@ START_TEST(test_minandargmin_alldimsreduced){ ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_minandargmin(&gaMin, &gaArgmin, &gaSrc, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MINANDARGMIN, &gaMin, &gaArgmin, &gaSrc, 3, reduxList)); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin), &gaMin)); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin), &gaArgmin)); @@ -754,7 +754,7 @@ START_TEST(test_argmax_reduction){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_argmax(&gaArgmax, &gaSrc, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMAX, NULL, &gaArgmax, &gaSrc, 2, reduxList)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*dims[1], &gaArgmax)); @@ -836,7 +836,7 @@ START_TEST(test_argmax_veryhighrank){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_argmax(&gaArgmax, &gaSrc, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMAX, NULL, &gaArgmax, &gaSrc, 4, reduxList)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*rdxProdDims, &gaArgmax)); @@ -929,7 +929,7 @@ START_TEST(test_argmax_alldimsreduced){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_argmax(&gaArgmax, &gaSrc, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMAX, NULL, &gaArgmax, &gaSrc, 3, reduxList)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax), &gaArgmax)); @@ -1011,7 +1011,7 @@ START_TEST(test_argmin_reduction){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_argmin(&gaArgmin, &gaSrc, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMIN, NULL, &gaArgmin, &gaSrc, 2, reduxList)); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin)*dims[1], &gaArgmin)); @@ -1093,7 +1093,7 @@ START_TEST(test_argmin_veryhighrank){ 
ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_argmin(&gaArgmin, &gaSrc, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMIN, NULL, &gaArgmin, &gaSrc, 4, reduxList)); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin)*rdxProdDims, &gaArgmin)); @@ -1186,7 +1186,7 @@ START_TEST(test_argmin_alldimsreduced){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_argmin(&gaArgmin, &gaSrc, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMIN, NULL, &gaArgmin, &gaSrc, 3, reduxList)); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin), &gaArgmin)); @@ -1265,7 +1265,7 @@ START_TEST(test_max_reduction){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_max(&gaMax, &gaSrc, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAX, &gaMax, NULL, &gaSrc, 2, reduxList)); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax) *dims[1], &gaMax)); @@ -1343,7 +1343,7 @@ START_TEST(test_max_veryhighrank){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_max(&gaMax, &gaSrc, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAX, &gaMax, NULL, &gaSrc, 4, reduxList)); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax) *rdxProdDims, &gaMax)); @@ -1431,7 +1431,7 @@ START_TEST(test_max_alldimsreduced){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_max(&gaMax, &gaSrc, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAX, &gaMax, NULL, &gaSrc, 3, reduxList)); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax), &gaMax)); @@ -1507,7 +1507,7 @@ START_TEST(test_min_reduction){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_min(&gaMin, &gaSrc, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MIN, &gaMin, NULL, &gaSrc, 2, reduxList)); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin) *dims[1], &gaMin)); @@ -1585,7 +1585,7 @@ START_TEST(test_min_veryhighrank){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_min(&gaMin, &gaSrc, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MIN, &gaMin, NULL, &gaSrc, 4, reduxList)); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin) *rdxProdDims, &gaMin)); @@ -1673,7 +1673,7 @@ START_TEST(test_min_alldimsreduced){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_min(&gaMin, &gaSrc, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MIN, &gaMin, NULL, &gaSrc, 3, reduxList)); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin), &gaMin)); @@ -1750,7 +1750,7 @@ START_TEST(test_sum_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_sum (&gaD, &gaS, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_SUM, &gaD, NULL, &gaS, 2, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -1826,7 +1826,7 @@ START_TEST(test_sum_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_sum (&gaD, &gaS, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_SUM, &gaD, NULL, &gaS, 4, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -1912,7 +1912,7 @@ START_TEST(test_sum_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_sum (&gaD, &gaS, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_SUM, &gaD, NULL, &gaS, 3, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -1986,7 +1986,7 @@ START_TEST(test_prod_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_prod (&gaD, &gaS, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_PROD, &gaD, NULL, &gaS, 2, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -2062,7 +2062,7 @@ START_TEST(test_prod_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_prod (&gaD, &gaS, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_PROD, &gaD, NULL, &gaS, 4, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -2148,7 +2148,7 @@ START_TEST(test_prod_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_prod (&gaD, &gaS, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_PROD, &gaD, NULL, &gaS, 3, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -2225,7 +2225,7 @@ START_TEST(test_prodnz_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_prodnz(&gaD, &gaS, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_PRODNZ, &gaD, NULL, &gaS, 2, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -2304,7 +2304,7 @@ START_TEST(test_prodnz_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_prodnz(&gaD, &gaS, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_PRODNZ, &gaD, NULL, &gaS, 4, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -2393,7 +2393,7 @@ START_TEST(test_prodnz_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_prodnz(&gaD, &gaS, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_PRODNZ, &gaD, NULL, &gaS, 3, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -2475,7 +2475,7 @@ START_TEST(test_and_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); - ga_assert_ok(GpuArray_and (&gaD, &gaS, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_AND, &gaD, NULL, &gaS, 2, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -2559,7 +2559,7 @@ START_TEST(test_and_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_and (&gaD, &gaS, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_AND, &gaD, NULL, &gaS, 4, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -2653,7 +2653,7 @@ START_TEST(test_and_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_and (&gaD, &gaS, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_AND, &gaD, NULL, &gaS, 3, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -2735,7 +2735,7 @@ START_TEST(test_or_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); - ga_assert_ok(GpuArray_or (&gaD, &gaS, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_OR, &gaD, NULL, &gaS, 2, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -2819,7 +2819,7 @@ START_TEST(test_or_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_or (&gaD, &gaS, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_OR, &gaD, NULL, &gaS, 4, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -2913,7 +2913,7 @@ START_TEST(test_or_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_or (&gaD, &gaS, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_OR, &gaD, NULL, &gaS, 3, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -2991,7 +2991,7 @@ START_TEST(test_xor_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); - ga_assert_ok(GpuArray_xor (&gaD, &gaS, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_XOR, &gaD, NULL, &gaS, 2, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -3071,7 +3071,7 @@ START_TEST(test_xor_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_xor (&gaD, &gaS, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_XOR, &gaD, NULL, &gaS, 4, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -3161,7 +3161,7 @@ START_TEST(test_xor_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_xor (&gaD, &gaS, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_XOR, &gaD, NULL, &gaS, 3, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -3239,7 +3239,7 @@ START_TEST(test_any_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); - ga_assert_ok(GpuArray_any (&gaD, &gaS, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ANY, &gaD, NULL, &gaS, 2, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -3319,7 +3319,7 @@ START_TEST(test_any_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_any (&gaD, &gaS, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ANY, &gaD, NULL, &gaS, 4, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -3409,7 +3409,7 @@ START_TEST(test_any_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_any (&gaD, &gaS, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ANY, &gaD, NULL, &gaS, 3, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -3487,7 +3487,7 @@ START_TEST(test_all_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); - ga_assert_ok(GpuArray_all (&gaD, &gaS, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ALL, &gaD, NULL, &gaS, 2, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -3567,7 +3567,7 @@ START_TEST(test_all_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_all (&gaD, &gaS, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ALL, &gaD, NULL, &gaS, 4, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -3657,7 +3657,7 @@ START_TEST(test_all_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_all (&gaD, &gaS, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ALL, &gaD, NULL, &gaS, 3, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD));
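
All of the call-site changes above funnel through the single GpuArray_reduction()
entry point. A minimal usage sketch, mirroring the tests (gaMax, gaArgmax, gaSum
and gaSrc are assumed to be already-initialised GpuArrays, and ga_assert_ok() is
the test suite's error-checking wrapper):

/* Reduce axes 1 and 2, keeping both the maxima and their argument indices. */
const unsigned reduxList[] = {1, 2};

ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAXANDARGMAX,
                                &gaMax,    /* dst    */
                                &gaArgmax, /* dstArg */
                                &gaSrc, 2, reduxList));

/* Reductions with a single output pass NULL for the unused argument. */
ga_assert_ok(GpuArray_reduction(GA_REDUCE_SUM,
                                &gaSum, NULL,
                                &gaSrc, 2, reduxList));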