arm_compute v17.05

ARM-software · May 4, 2017 · 46d5927 · 46d5927
1 parent c772c0b
commit 46d5927
Show file tree

Hide file tree

Showing 1,489 changed files with 10,801 additions and 10,899 deletions.
diff --git a/LICENSE b/LICENSE
diff --git a/README.md b/README.md
@@ -1,9 +1,17 @@
 
 Please report issues here: https://github.com/ARM-software/ComputeLibrary/issues
 
-Documentation available here: [v17.04](https://arm-software.github.io/ComputeLibrary/v17.04/) [v17.03.1](https://arm-software.github.io/ComputeLibrary/v17.03.1/)
+Documentation available here: 
 
-Binaries available here: [v17.04](https://github.com/ARM-software/ComputeLibrary/releases/download/v17.04/arm_compute-v17.04-bin.tar.gz) [v17.03.1](https://github.com/ARM-software/ComputeLibrary/releases/download/v17.03.1/arm_compute-v17.03.1-bin.tar.gz)
+- [v17.05](https://arm-software.github.io/ComputeLibrary/v17.05/)
+- [v17.04](https://arm-software.github.io/ComputeLibrary/v17.04/)
+- [v17.03.1](https://arm-software.github.io/ComputeLibrary/v17.03.1/)
+
+Binaries available here: 
+
+- [v17.05](https://github.com/ARM-software/ComputeLibrary/releases/download/v17.05/arm_compute-v17.05-bin.tar.gz)
+- [v17.04](https://github.com/ARM-software/ComputeLibrary/releases/download/v17.04/arm_compute-v17.04-bin.tar.gz)
+- [v17.03.1](https://github.com/ARM-software/ComputeLibrary/releases/download/v17.03.1/arm_compute-v17.03.1-bin.tar.gz)
 
 Support: developer@arm.com
 

diff --git a/SConstruct b/SConstruct
@@ -20,4 +20,32 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-SConscript('sconscript', variant_dir='build', duplicate=0)
+import os
+
+vars = Variables("scons")  # saved option values are also read from a file named "scons"
+vars.AddVariables(
+    BoolVariable("debug", "Debug", False),
+    BoolVariable("asserts", "Enable asserts (this flag is forced to 1 for debug=1)", False),
+    EnumVariable("arch", "Target Architecture", "armv7a", allowed_values=("armv7a", "arm64-v8a", "arm64-v8.2-a", "x86_32", "x86_64")),
+    EnumVariable("os", "Target OS", "linux", allowed_values=("linux", "android", "bare_metal")),
+    EnumVariable("build", "Build type", "cross_compile", allowed_values=("native", "cross_compile")),
+    BoolVariable("Werror", "Enable/disable the -Werror compilation flag", True),
+    BoolVariable("opencl", "Enable OpenCL support", True),
+    BoolVariable("neon", "Enable Neon support", False),
+    BoolVariable("embed_kernels", "Embed OpenCL kernels in library binary", False),
+    BoolVariable("set_soname", "Set the library's soname and shlibversion (requires SCons 2.4 or above)", False),
+    BoolVariable("openmp", "Enable OpenMP backend", False),
+    BoolVariable("cppthreads", "Enable C++11 threads backend", True),
+    PathVariable("build_dir", "Specify sub-folder for the build", ".", PathVariable.PathIsDirCreate),
+    ("extra_cxx_flags", "Extra CXX flags to be appended to the build command", "")
+)
+
+env = Environment(platform='posix', variables = vars, ENV = os.environ)  # build commands inherit the caller's environment
+
+Help(vars.GenerateHelpText(env))  # list the options declared above in the scons help output
+
+Export('vars')  # shared with the sconscript loaded below
+Export('env')  # shared with the sconscript loaded below
+
+if not GetOption("help"):  # skip loading the build script when only the help text was requested
+    SConscript('sconscript', variant_dir='#build/%s/arm_compute' % env['build_dir'], duplicate=0)  # leading '#' anchors the path at the top-level directory
diff --git a/arm_compute/core/CL/ICLTensor.h b/arm_compute/core/CL/ICLTensor.h
@@ -43,6 +43,9 @@ class ICLTensor : public ITensor
     ICLTensor();
     ICLTensor(const ICLTensor &) = delete;
     ICLTensor &operator=(const ICLTensor &) = delete;
+    ICLTensor(ICLTensor &&)                 = default;
+    ICLTensor &operator=(ICLTensor &&) = default;
+    virtual ~ICLTensor()               = default;
 
     /** Interface to be implemented by the child class to return a reference to the OpenCL buffer containing the image's data.
      *

diff --git a/arm_compute/core/CL/OpenCL.h b/arm_compute/core/CL/OpenCL.h
@@ -35,4 +35,9 @@ namespace cl
 {
 static const NDRange Range_128_1 = NDRange(128, 1);
 }
+
+namespace arm_compute
+{
+bool opencl_is_available();
+}
 #endif /* __ARM_COMPUTE_OPENCL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h b/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
@@ -47,7 +47,7 @@ class ICLTensor;
  * \end{array} \right)
  * @f]
  *
- * After this operation, the output matrix will have the following shape: [ height * 4, width / 4 ]
+ * After this operation, the output matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ]
  */
 class CLGEMMInterleave4x4Kernel : public ICLKernel
 {
@@ -64,7 +64,7 @@ class CLGEMMInterleave4x4Kernel : public ICLKernel
     CLGEMMInterleave4x4Kernel &operator=(CLGEMMInterleave4x4Kernel &&) = default;
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  input  Input tensor. Data types supported: U8/F16/F32
+     * @param[in]  input  Input tensor. Data types supported: U8/S8/U16/S16/F16/U32/S32/F32
      * @param[out] output Output tensor. Data type supported: same as @p input
      */
     void configure(const ICLTensor *input, ICLTensor *output);

diff --git a/arm_compute/core/Helpers.h b/arm_compute/core/Helpers.h
@@ -183,24 +183,37 @@ inline void for_each(F &&func, T &&arg, Ts &&... args)
     for_each(func, args...);
 }
 
-/** Base case of foldl. Return value. */
+/** Base case of foldl: with a single value there is nothing left to combine.
+ *
+ * @return A copy of @p value; the function argument is ignored.
+ */
 template <typename F, typename T>
-inline T foldl(F &&, T &&value)
+inline T foldl(F &&, const T &value)
 {
     return value;
 }
 
+/** Two-operand case of foldl: the function is applied exactly once.
+ *
+ * @return Result of calling @p func with @p value1 and @p value2.
+ */
+template <typename F, typename T, typename U>
+inline auto foldl(F &&func, T &&value1, U &&value2) -> decltype(func(value1, value2))
+{
+    return func(value1, value2);
+}
+
 /** Fold left.
  *
  * @param[in] func    Function to be called
  * @param[in] initial Initial value
  * @param[in] value   Argument passed to the function
  * @param[in] values  Remaining arguments
  */
-template <typename F, typename I, typename T, typename... Ts>
-inline I foldl(F &&func, I &&initial, T &&value, Ts &&... values)
+template <typename F, typename I, typename T, typename... Vs>
+inline I foldl(F &&func, I &&initial, T &&value, Vs &&... values) // NOTE(review): I is a reference type when initial is an lvalue, so this returns by reference - confirm callers only pass rvalues
 {
-    return foldl(func, func(initial, value), values...);
+    return foldl(std::forward<F>(func), func(std::forward<I>(initial), std::forward<T>(value)), std::forward<Vs>(values)...); // combine the first two operands, then fold the result with the remaining ones
 }
 }
 

diff --git a/arm_compute/core/NEON/NEMath.h b/arm_compute/core/NEON/NEMath.h
@@ -64,7 +64,7 @@ const std::array<float32x4_t, 8> log_tab =
  *
  * @return The calculated inverse square root.
  */
-inline float32x4_t vinvsqrt_f32(float32x4_t x)
+inline float32x4_t vinvsqrtq_f32(float32x4_t x)
 {
     float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
     sqrt_reciprocal             = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
@@ -79,7 +79,7 @@ inline float32x4_t vinvsqrt_f32(float32x4_t x)
  *
  * @return The calculated reciprocal.
  */
-inline float32x4_t vinv_f32(const float32x4_t &x)
+inline float32x4_t vinvq_f32(const float32x4_t &x)
 {
     float32x4_t recip = vrecpeq_f32(x);
     recip             = vmulq_f32(vrecpsq_f32(x, recip), recip);
@@ -94,7 +94,7 @@ inline float32x4_t vinv_f32(const float32x4_t &x)
  *
  * @return The calculated approximation.
  */
-inline float32x4_t vtaylor_poly_f32(const float32x4_t &x, const std::array<float32x4_t, 8> &coeffs)
+inline float32x4_t vtaylor_polyq_f32(const float32x4_t &x, const std::array<float32x4_t, 8> &coeffs)
 {
     float32x4_t A   = vmlaq_f32(coeffs[0], coeffs[4], x);
     float32x4_t B   = vmlaq_f32(coeffs[2], coeffs[6], x);
@@ -112,7 +112,7 @@ inline float32x4_t vtaylor_poly_f32(const float32x4_t &x, const std::array<float
  *
  * @return The calculated exponent.
  */
-inline float32x4_t vexp_f32(const float32x4_t &x)
+inline float32x4_t vexpq_f32(const float32x4_t &x)
 {
     static const float32x4_t CONST_LN2     = vdupq_n_f32(0.6931471805f); // ln(2)
     static const float32x4_t CONST_INV_LN2 = vdupq_n_f32(1.4426950408f); // 1/ln(2)
@@ -122,7 +122,7 @@ inline float32x4_t vexp_f32(const float32x4_t &x)
     float32x4_t val = vmlsq_f32(x, vcvtq_f32_s32(m), CONST_LN2);
 
     // Polynomial Approximation
-    float32x4_t poly = vtaylor_poly_f32(val, exp_tab);
+    float32x4_t poly = vtaylor_polyq_f32(val, exp_tab);
 
     // Reconstruct
     poly = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(poly), vshlq_n_s32(m, 23)));
@@ -136,7 +136,7 @@ inline float32x4_t vexp_f32(const float32x4_t &x)
  *
  * @return The calculated logarithm.
  */
-inline float32x4_t vlog_f32(const float32x4_t &x)
+inline float32x4_t vlogq_f32(const float32x4_t &x)
 {
     static const int32x4_t   CONST_127 = vdupq_n_s32(127);           // 127
     static const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)
@@ -146,7 +146,7 @@ inline float32x4_t vlog_f32(const float32x4_t &x)
     float32x4_t val = vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));
 
     // Polynomial Approximation
-    float32x4_t poly = vtaylor_poly_f32(val, log_tab);
+    float32x4_t poly = vtaylor_polyq_f32(val, log_tab);
 
     // Reconstruct
     poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2);
@@ -158,19 +158,24 @@ inline float32x4_t vlog_f32(const float32x4_t &x)
  *
  * tanh(x) = (e^2x - 1)/(e^2x + 1)
  *
+ * @note x is clamped to [-5,5] to avoid overflow issues.
+ *
  * @param val Input vector value in F32 format.
  *
  * @return The calculated Hyperbolic Tangent.
  */
-inline float32x4_t vtanh_f32(const float32x4_t &val)
+inline float32x4_t vtanhq_f32(const float32x4_t &val)
 {
-    static const float32x4_t CONST_1 = vdupq_n_f32(1.f); // 1.f
-    static const float32x4_t CONST_2 = vdupq_n_f32(2.f); // 2.f
+    static const float32x4_t CONST_1        = vdupq_n_f32(1.f);  // 1.f
+    static const float32x4_t CONST_2        = vdupq_n_f32(2.f);  // 2.f
+    static const float32x4_t CONST_MIN_TANH = vdupq_n_f32(-5.f); // Lower bound the input is clamped to
+    static const float32x4_t CONST_MAX_TANH = vdupq_n_f32(5.f);  // Upper bound the input is clamped to
 
-    float32x4_t exp2x = vexp_f32(vmulq_f32(CONST_2, val));
+    float32x4_t x     = vminq_f32(vmaxq_f32(val, CONST_MIN_TANH), CONST_MAX_TANH); // Clamp val to [-5, 5]
+    float32x4_t exp2x = vexpq_f32(vmulq_f32(CONST_2, x)); // e^(2x) of the clamped input
     float32x4_t num   = vsubq_f32(exp2x, CONST_1);
     float32x4_t den   = vaddq_f32(exp2x, CONST_1);
-    float32x4_t tanh  = vmulq_f32(num, vinv_f32(den));
+    float32x4_t tanh  = vmulq_f32(num, vinvq_f32(den)); // num / den, computed as num times the reciprocal of den
     return tanh;
 }
 
@@ -185,7 +190,7 @@ inline float32x4_t vtanh_f32(const float32x4_t &val)
  */
 inline float32x4_t vpowq_f32(const float32x4_t &val, const float32x4_t &n)
 {
-    return vexp_f32(vmulq_f32(n, vlog_f32(val)));
+    return vexpq_f32(vmulq_f32(n, vlogq_f32(val))); // val^n evaluated as exp(n * log(val))
 }
 }
 

diff --git a/arm_compute/core/NEON/kernels/NEColorConvertKernel.h b/arm_compute/core/NEON/kernels/NEColorConvertKernel.h
@@ -82,7 +82,6 @@ class NEColorConvertKernel : public INEKernel
     using ColorConvertFunction = void(const void *__restrict input_ptr, void *__restrict output_ptr, const Window &win);
     const void           *_input;
     void                 *_output;
-    unsigned int          _num_elems_processed_per_iteration;
     ColorConvertFunction *_func;
 };
 }

diff --git a/arm_compute/core/NEON/kernels/NEHistogramKernel.h b/arm_compute/core/NEON/kernels/NEHistogramKernel.h
@@ -97,13 +97,13 @@ class NEHistogramKernel : public INEKernel
       *
      *  @param[in] win Region on which to execute the kernel
      */
-    void histogram_U8(const Window &win);
+    void histogram_U8(Window win);
     /** Function to perform histogram on the given window where histogram is
      *         of fixed size 256 without ranges and offsets.
      *
      *  @param[in] win Region on which to execute the kernel
      */
-    void histogram_fixed_U8(const Window &win);
+    void histogram_fixed_U8(Window win);
     /** Pre-calculate the pixel windowing for every possible pixel
      *
      * Calculate (V - offset) * numBins / range where V is every possible pixel value.
@@ -115,88 +115,15 @@ class NEHistogramKernel : public INEKernel
      *
      * @param[in] window Region on which to execute the kernel.
      */
-    using HistogramFunction = void (NEHistogramKernel::*)(const Window &window);
-    /** Histogram function to use for the particular image types passed to configure() */
-    HistogramFunction _func;
+    using HistogramFunctionPtr = void (NEHistogramKernel::*)(Window window);
 
-private:
+    HistogramFunctionPtr          _func; ///< Histogram function to use for the particular image types passed to configure()
     const IImage                 *_input;
     IDistribution1D              *_output;
     uint32_t                     *_local_hist;
     uint32_t                     *_window_lut;
     std::mutex                    _hist_mtx;
     static constexpr unsigned int _max_range_size{ 256 }; ///< 256 possible pixel values as we handle only U8 images
 };
-
-/** Interface for the histogram border handling kernel.
- *
- * @note If the image width is not a multiple of the number of elements processed by @ref NEHistogramKernel
- * this kernel is used to handle the leftover columns.
- */
-class NEHistogramBorderKernel : public INEKernel
-{
-public:
-    /** Default constructor */
-    NEHistogramBorderKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEHistogramBorderKernel(const NEHistogramBorderKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEHistogramBorderKernel &operator=(const NEHistogramBorderKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEHistogramBorderKernel(NEHistogramBorderKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEHistogramBorderKernel &operator=(NEHistogramBorderKernel &&) = default;
-    /** Default destructor */
-    ~NEHistogramBorderKernel() = default;
-
-    /** Set the input image and the distribution output.
-     *
-     * @param[in]  input                    Source image. Data type supported: U8.
-     * @param[out] output                   Destination distribution.
-     * @param[in]  window_lut               LUT with precalculated possible window values.
-     * @param[in]  hist_elements_per_thread Pixels per thread that the histogram kernel computes.
-     */
-    void configure(const IImage *input, IDistribution1D *output, uint32_t *window_lut, const unsigned int hist_elements_per_thread);
-    /** Set the input image and the distribution output.
-     *
-     * @note Used for histogram of fixed size equal to 256
-     *
-     * @param[in]  input                    Source image. Data type supported: U8.
-     * @param[out] output                   Destination distribution.
-     * @param[in]  hist_elements_per_thread Pixels per thread that the histogram kernel computes.
-     */
-    void configure(const IImage *input, IDistribution1D *output, const unsigned int hist_elements_per_thread);
-
-    // Inherited methods overridden:
-    void run(const Window &window) override;
-    bool is_parallelisable() const override;
-
-private:
-    /** Function to perform histogram on the given window
-      *
-     *  @param[in] win Region on which to execute the kernel
-     */
-    void histogram_U8(const Window &win);
-    /** Function to perform histogram on the given window where histogram is
-     *  of fixed size 256 without ranges and offsets.
-     *
-     *  @param[in] win Region on which to execute the kernel
-     */
-    void histogram_fixed_U8(const Window &win);
-    /** Common signature for all the specialised Histogram functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using HistogramBorderFunction = void (NEHistogramBorderKernel::*)(const Window &window);
-    /** Histogram function to use for the particular image types passed to configure() */
-    HistogramBorderFunction _func;
-
-private:
-    const IImage                 *_input;
-    IDistribution1D              *_output;
-    uint32_t                     *_window_lut;
-    static constexpr unsigned int _max_range_size{ 256 }; ///< 256 possible pixel values as we handle only U8 images
-};
 }
-
 #endif /*__ARM_COMPUTE_NEHISTOGRAMKERNEL_H__ */