Skip to content

Commit

Permalink
arm_compute v17.05
Browse files Browse the repository at this point in the history
  • Loading branch information
AnthonyBarbier committed May 4, 2017
1 parent c772c0b commit 46d5927
Show file tree
Hide file tree
Showing 1,489 changed files with 10,801 additions and 10,899 deletions.
21 changes: 0 additions & 21 deletions LICENSE

This file was deleted.

12 changes: 10 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,17 @@

Please report issues here: https://github.com/ARM-software/ComputeLibrary/issues

Documentation available here: [v17.04](https://arm-software.github.io/ComputeLibrary/v17.04/) [v17.03.1](https://arm-software.github.io/ComputeLibrary/v17.03.1/)
Documentation available here:

Binaries available here: [v17.04](https://github.com/ARM-software/ComputeLibrary/releases/download/v17.04/arm_compute-v17.04-bin.tar.gz) [v17.03.1](https://github.com/ARM-software/ComputeLibrary/releases/download/v17.03.1/arm_compute-v17.03.1-bin.tar.gz)
- [v17.05](https://arm-software.github.io/ComputeLibrary/v17.05/)
- [v17.04](https://arm-software.github.io/ComputeLibrary/v17.04/)
- [v17.03.1](https://arm-software.github.io/ComputeLibrary/v17.03.1/)

Binaries available here:

- [v17.05](https://github.com/ARM-software/ComputeLibrary/releases/download/v17.05/arm_compute-v17.05-bin.tar.gz)
- [v17.04](https://github.com/ARM-software/ComputeLibrary/releases/download/v17.04/arm_compute-v17.04-bin.tar.gz)
- [v17.03.1](https://github.com/ARM-software/ComputeLibrary/releases/download/v17.03.1/arm_compute-v17.03.1-bin.tar.gz)

Support: developer@arm.com

Expand Down
30 changes: 29 additions & 1 deletion SConstruct
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,32 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

SConscript('sconscript', variant_dir='build', duplicate=0)
import os

# Command-line build options. "scons" names the options file handed to Variables();
# values given on the command line are registered through the same object.
vars = Variables("scons")
vars.AddVariables(
    # --- Build flavour ---
    BoolVariable("debug", "Debug", False),
    BoolVariable("asserts", "Enable asserts (this flag is forced to 1 for debug=1)", False),

    # --- Target selection ---
    EnumVariable(
        "arch", "Target Architecture", "armv7a",
        allowed_values=("armv7a", "arm64-v8a", "arm64-v8.2-a", "x86_32", "x86_64")
    ),
    EnumVariable(
        "os", "Target OS", "linux",
        allowed_values=("linux", "android", "bare_metal")
    ),
    EnumVariable(
        "build", "Build type", "cross_compile",
        allowed_values=("native", "cross_compile")
    ),

    # --- Compiler behaviour ---
    BoolVariable("Werror", "Enable/disable the -Werror compilation flag", True),

    # --- Backends and packaging ---
    BoolVariable("opencl", "Enable OpenCL support", True),
    BoolVariable("neon", "Enable Neon support", False),
    BoolVariable("embed_kernels", "Embed OpenCL kernels in library binary", False),
    BoolVariable("set_soname", "Set the library's soname and shlibversion (requires SCons 2.4 or above)", False),

    # --- Scheduler backends ---
    BoolVariable("openmp", "Enable OpenMP backend", False),
    BoolVariable("cppthreads", "Enable C++11 threads backend", True),

    # --- Output location and free-form flags ---
    PathVariable("build_dir", "Specify sub-folder for the build", ".", PathVariable.PathIsDirCreate),
    ("extra_cxx_flags", "Extra CXX flags to be appended to the build command", "")
)

# Construction environment shared with the sub-scripts: POSIX platform, the build
# options declared in `vars`, and the invoking process environment passed through ENV.
env = Environment(platform='posix', variables = vars, ENV = os.environ)

# Make `scons --help` list every option registered in `vars`.
Help(vars.GenerateHelpText(env))

# Publish both objects so that SConscript files can Import() them.
Export('vars')
Export('env')

# Only read the library build script when an actual build is requested (not for --help).
# The variant directory is build/<build_dir>/arm_compute; duplicate=0 asks SCons not to
# copy the sources into it.
if not GetOption("help"):
    SConscript('sconscript', variant_dir='#build/%s/arm_compute' % env['build_dir'], duplicate=0)
3 changes: 3 additions & 0 deletions arm_compute/core/CL/ICLTensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ class ICLTensor : public ITensor
/** Default constructor (defined out of line) */
ICLTensor();
/** Copy construction is disabled */
ICLTensor(const ICLTensor &) = delete;
/** Copy assignment is disabled */
ICLTensor &operator=(const ICLTensor &) = delete;
/** Move construction uses the compiler-generated implementation */
ICLTensor(ICLTensor &&) = default;
/** Move assignment uses the compiler-generated implementation */
ICLTensor &operator=(ICLTensor &&) = default;
/** Virtual destructor, so that objects can be destroyed through a pointer to this interface */
virtual ~ICLTensor() = default;

/** Interface to be implemented by the child class to return a reference to the OpenCL buffer containing the image's data.
*
Expand Down
5 changes: 5 additions & 0 deletions arm_compute/core/CL/OpenCL.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,9 @@ namespace cl
{
static const NDRange Range_128_1 = NDRange(128, 1);
}

namespace arm_compute
{
/** Query OpenCL availability.
 *
 * NOTE(review): only the declaration is visible in this header. Presumably this reports
 * whether an OpenCL runtime can be used on the current system - confirm against the definition.
 *
 * @return A boolean; presumably true when OpenCL is usable (TODO confirm).
 */
bool opencl_is_available();
}
#endif /* __ARM_COMPUTE_OPENCL_H__ */
4 changes: 2 additions & 2 deletions arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ class ICLTensor;
* \end{array} \right)
* @f]
*
* After this operation, the output matrix will have the following shape: [ height * 4, width / 4 ]
* After this operation, the output matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ]
*/
class CLGEMMInterleave4x4Kernel : public ICLKernel
{
Expand All @@ -64,7 +64,7 @@ class CLGEMMInterleave4x4Kernel : public ICLKernel
CLGEMMInterleave4x4Kernel &operator=(CLGEMMInterleave4x4Kernel &&) = default;
/** Initialise the kernel's input and output.
*
* @param[in] input Input tensor. Data types supported: U8/F16/F32
* @param[in] input Input tensor. Data types supported: U8/S8/U16/S16/F16/U32/S32/F32
* @param[out] output Output tensor. Data type supported: same as @p input
*/
void configure(const ICLTensor *input, ICLTensor *output);
Expand Down
23 changes: 18 additions & 5 deletions arm_compute/core/Helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -183,24 +183,37 @@ inline void for_each(F &&func, T &&arg, Ts &&... args)
for_each(func, args...);
}

/** Base case of foldl. Return value. */
/** Base case of foldl.
*
* @return value.
*/
template <typename F, typename T>
inline T foldl(F &&, T &&value)
inline T foldl(F &&, const T &value)
{
return value;
}

/** Two-value case of foldl: a single application of the function.
 *
 * Both values are perfectly forwarded to @p func, so arguments received as rvalues are
 * moved into the call instead of being copied (move-only arguments are therefore accepted).
 *
 * @param[in] func   Function to be called
 * @param[in] value1 First argument passed to the function
 * @param[in] value2 Second argument passed to the function
 *
 * @return Function evaluation for value1 and value2
 */
template <typename F, typename T, typename U>
inline auto foldl(F &&func, T &&value1, U &&value2) -> decltype(func(std::forward<T>(value1), std::forward<U>(value2)))
{
    return func(std::forward<T>(value1), std::forward<U>(value2));
}

/** Fold left.
*
* @param[in] func Function to be called
* @param[in] initial Initial value
* @param[in] value Argument passed to the function
* @param[in] values Remaining arguments
*/
template <typename F, typename I, typename T, typename... Ts>
inline I foldl(F &&func, I &&initial, T &&value, Ts &&... values)
template <typename F, typename I, typename T, typename... Vs>
inline I foldl(F &&func, I &&initial, T &&value, Vs &&... values)
{
return foldl(func, func(initial, value), values...);
return foldl(std::forward<F>(func), func(std::forward<I>(initial), std::forward<T>(value)), std::forward<Vs>(values)...);
}
}

Expand Down
31 changes: 18 additions & 13 deletions arm_compute/core/NEON/NEMath.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ const std::array<float32x4_t, 8> log_tab =
*
* @return The calculated inverse square root.
*/
inline float32x4_t vinvsqrt_f32(float32x4_t x)
inline float32x4_t vinvsqrtq_f32(float32x4_t x)
{
float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
Expand All @@ -79,7 +79,7 @@ inline float32x4_t vinvsqrt_f32(float32x4_t x)
*
* @return The calculated reciprocal.
*/
inline float32x4_t vinv_f32(const float32x4_t &x)
inline float32x4_t vinvq_f32(const float32x4_t &x)
{
float32x4_t recip = vrecpeq_f32(x);
recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
Expand All @@ -94,7 +94,7 @@ inline float32x4_t vinv_f32(const float32x4_t &x)
*
* @return The calculated approximation.
*/
inline float32x4_t vtaylor_poly_f32(const float32x4_t &x, const std::array<float32x4_t, 8> &coeffs)
inline float32x4_t vtaylor_polyq_f32(const float32x4_t &x, const std::array<float32x4_t, 8> &coeffs)
{
float32x4_t A = vmlaq_f32(coeffs[0], coeffs[4], x);
float32x4_t B = vmlaq_f32(coeffs[2], coeffs[6], x);
Expand All @@ -112,7 +112,7 @@ inline float32x4_t vtaylor_poly_f32(const float32x4_t &x, const std::array<float
*
* @return The calculated exponent.
*/
inline float32x4_t vexp_f32(const float32x4_t &x)
inline float32x4_t vexpq_f32(const float32x4_t &x)
{
static const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)
static const float32x4_t CONST_INV_LN2 = vdupq_n_f32(1.4426950408f); // 1/ln(2)
Expand All @@ -122,7 +122,7 @@ inline float32x4_t vexp_f32(const float32x4_t &x)
float32x4_t val = vmlsq_f32(x, vcvtq_f32_s32(m), CONST_LN2);

// Polynomial Approximation
float32x4_t poly = vtaylor_poly_f32(val, exp_tab);
float32x4_t poly = vtaylor_polyq_f32(val, exp_tab);

// Reconstruct
poly = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(poly), vshlq_n_s32(m, 23)));
Expand All @@ -136,7 +136,7 @@ inline float32x4_t vexp_f32(const float32x4_t &x)
*
* @return The calculated logarithm.
*/
inline float32x4_t vlog_f32(const float32x4_t &x)
inline float32x4_t vlogq_f32(const float32x4_t &x)
{
static const int32x4_t CONST_127 = vdupq_n_s32(127); // 127
static const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)
Expand All @@ -146,7 +146,7 @@ inline float32x4_t vlog_f32(const float32x4_t &x)
float32x4_t val = vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));

// Polynomial Approximation
float32x4_t poly = vtaylor_poly_f32(val, log_tab);
float32x4_t poly = vtaylor_polyq_f32(val, log_tab);

// Reconstruct
poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2);
Expand All @@ -158,19 +158,24 @@ inline float32x4_t vlog_f32(const float32x4_t &x)
*
* tanh(x) = (e^2x - 1)/(e^2x + 1)
*
* @note We clamp x to [-5,5] to avoid overflowing issues.
*
* @param val Input vector value in F32 format.
*
* @return The calculated Hyperbolic Tangent.
*/
inline float32x4_t vtanh_f32(const float32x4_t &val)
inline float32x4_t vtanhq_f32(const float32x4_t &val)
{
static const float32x4_t CONST_1 = vdupq_n_f32(1.f); // 1.f
static const float32x4_t CONST_2 = vdupq_n_f32(2.f); // 2.f
static const float32x4_t CONST_1 = vdupq_n_f32(1.f); // 1.f
static const float32x4_t CONST_2 = vdupq_n_f32(2.f); // 2.f
static const float32x4_t CONST_MIN_TANH = vdupq_n_f32(-5.f); // -5.f
static const float32x4_t CONST_MAX_TANH = vdupq_n_f32(5.f); // 5.f

float32x4_t exp2x = vexp_f32(vmulq_f32(CONST_2, val));
float32x4_t x = vminq_f32(vmaxq_f32(val, CONST_MIN_TANH), CONST_MAX_TANH);
float32x4_t exp2x = vexpq_f32(vmulq_f32(CONST_2, x));
float32x4_t num = vsubq_f32(exp2x, CONST_1);
float32x4_t den = vaddq_f32(exp2x, CONST_1);
float32x4_t tanh = vmulq_f32(num, vinv_f32(den));
float32x4_t tanh = vmulq_f32(num, vinvq_f32(den));
return tanh;
}

Expand All @@ -185,7 +190,7 @@ inline float32x4_t vtanh_f32(const float32x4_t &val)
*/
inline float32x4_t vpowq_f32(const float32x4_t &val, const float32x4_t &n)
{
return vexp_f32(vmulq_f32(n, vlog_f32(val)));
return vexpq_f32(vmulq_f32(n, vlogq_f32(val)));
}
}

Expand Down
1 change: 0 additions & 1 deletion arm_compute/core/NEON/kernels/NEColorConvertKernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@ class NEColorConvertKernel : public INEKernel
using ColorConvertFunction = void(const void *__restrict input_ptr, void *__restrict output_ptr, const Window &win);
const void *_input;
void *_output;
unsigned int _num_elems_processed_per_iteration;
ColorConvertFunction *_func;
};
}
Expand Down
81 changes: 4 additions & 77 deletions arm_compute/core/NEON/kernels/NEHistogramKernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,13 +97,13 @@ class NEHistogramKernel : public INEKernel
*
* @param[in] win Region on which to execute the kernel
*/
void histogram_U8(const Window &win);
void histogram_U8(Window win);
/** Function to perform histogram on the given window where histogram is
* of fixed size 256 without ranges and offsets.
*
* @param[in] win Region on which to execute the kernel
*/
void histogram_fixed_U8(const Window &win);
void histogram_fixed_U8(Window win);
/** Pre-calculate the pixel windowing for every possible pixel
*
* Calculate (V - offset) * numBins / range where V is every possible pixel value.
Expand All @@ -115,88 +115,15 @@ class NEHistogramKernel : public INEKernel
*
* @param[in] window Region on which to execute the kernel.
*/
using HistogramFunction = void (NEHistogramKernel::*)(const Window &window);
/** Histogram function to use for the particular image types passed to configure() */
HistogramFunction _func;
using HistogramFunctionPtr = void (NEHistogramKernel::*)(Window window);

private:
HistogramFunctionPtr _func; ///< Histogram function to use for the particular image types passed to configure()
const IImage *_input;
IDistribution1D *_output;
uint32_t *_local_hist;
uint32_t *_window_lut;
std::mutex _hist_mtx;
static constexpr unsigned int _max_range_size{ 256 }; ///< 256 possible pixel values as we handle only U8 images
};

/** Interface for the histogram border handling kernel.
 *
 * @note If the image width is not a multiple of the number of elements processed by @ref NEHistogramKernel
 *       this kernel is used to handle the leftover columns.
 *
 * @note The class stores raw pointers and has a defaulted destructor, so it never frees
 *       the objects they point to.
 */
class NEHistogramBorderKernel : public INEKernel
{
public:
/** Default constructor */
NEHistogramBorderKernel();
/** Prevent instances of this class from being copy-constructed (As this class contains pointers) */
NEHistogramBorderKernel(const NEHistogramBorderKernel &) = delete;
/** Prevent instances of this class from being copy-assigned (As this class contains pointers) */
NEHistogramBorderKernel &operator=(const NEHistogramBorderKernel &) = delete;
/** Allow instances of this class to be move-constructed */
NEHistogramBorderKernel(NEHistogramBorderKernel &&) = default;
/** Allow instances of this class to be move-assigned */
NEHistogramBorderKernel &operator=(NEHistogramBorderKernel &&) = default;
/** Default destructor */
~NEHistogramBorderKernel() = default;

/** Set the input image and the distribution output.
 *
 * @param[in]  input                    Source image. Data type supported: U8.
 * @param[out] output                   Destination distribution.
 * @param[in]  window_lut               LUT with precalculated possible window values.
 * @param[in]  hist_elements_per_thread Pixels per thread that the histogram kernel computes.
 */
void configure(const IImage *input, IDistribution1D *output, uint32_t *window_lut, const unsigned int hist_elements_per_thread);
/** Set the input image and the distribution output.
 *
 * @note Used for histogram of fixed size equal to 256
 *
 * @param[in]  input                    Source image. Data type supported: U8.
 * @param[out] output                   Destination distribution.
 * @param[in]  hist_elements_per_thread Pixels per thread that the histogram kernel computes.
 */
void configure(const IImage *input, IDistribution1D *output, const unsigned int hist_elements_per_thread);

// Inherited methods overridden:
// run() receives the window to process; is_parallelisable() reports a boolean whose value
// is defined out of view (NOTE(review): confirm against the implementation).
void run(const Window &window) override;
bool is_parallelisable() const override;

private:
/** Function to perform histogram on the given window
 *
 * @param[in] win Region on which to execute the kernel
 */
void histogram_U8(const Window &win);
/** Function to perform histogram on the given window where histogram is
 * of fixed size 256 without ranges and offsets.
 *
 * @param[in] win Region on which to execute the kernel
 */
void histogram_fixed_U8(const Window &win);
/** Common signature for all the specialised Histogram functions
 *
 * Pointer to a member function of this class taking the window to process.
 *
 * @param[in] window Region on which to execute the kernel.
 */
using HistogramBorderFunction = void (NEHistogramBorderKernel::*)(const Window &window);
/** Histogram function to use for the particular image types passed to configure() */
HistogramBorderFunction _func;

private:
const IImage *_input; ///< Source image (presumably the one passed to configure() - TODO confirm)
IDistribution1D *_output; ///< Destination distribution (presumably the one passed to configure() - TODO confirm)
uint32_t *_window_lut; ///< LUT of precalculated window values (presumably the one passed to configure() - TODO confirm)
static constexpr unsigned int _max_range_size{ 256 }; ///< 256 possible pixel values as we handle only U8 images
};
}

#endif /*__ARM_COMPUTE_NEHISTOGRAMKERNEL_H__ */
Loading

0 comments on commit 46d5927

Please sign in to comment.